Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/async.c | 6
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/capability.c | 2
-rw-r--r--  kernel/cgroup.c | 57
-rw-r--r--  kernel/cgroup_freezer.c | 13
-rw-r--r--  kernel/compat.c | 1
-rw-r--r--  kernel/cpu.c | 76
-rw-r--r--  kernel/cpu_pm.c | 233
-rw-r--r--  kernel/cpuset.c | 11
-rw-r--r--  kernel/crash_dump.c | 13
-rw-r--r--  kernel/cred.c | 26
-rw-r--r--  kernel/debug/gdbstub.c | 12
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 1
-rw-r--r--  kernel/dma.c | 2
-rw-r--r--  kernel/events/core.c | 78
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/fork.c | 19
-rw-r--r--  kernel/freezer.c | 2
-rw-r--r--  kernel/futex.c | 12
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hrtimer.c | 2
-rw-r--r--  kernel/hung_task.c | 2
-rw-r--r--  kernel/irq/chip.c | 66
-rw-r--r--  kernel/irq/generic-chip.c | 9
-rw-r--r--  kernel/irq/internals.h | 19
-rw-r--r--  kernel/irq/irqdesc.c | 71
-rw-r--r--  kernel/irq/irqdomain.c | 18
-rw-r--r--  kernel/irq/manage.c | 240
-rw-r--r--  kernel/irq/pm.c | 48
-rw-r--r--  kernel/irq/settings.h | 7
-rw-r--r--  kernel/irq/spurious.c | 6
-rw-r--r--  kernel/irq_work.c | 95
-rw-r--r--  kernel/jump_label.c | 37
-rw-r--r--  kernel/kexec.c | 41
-rw-r--r--  kernel/kfifo.c | 2
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 36
-rw-r--r--  kernel/ksysfs.c | 3
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/latencytop.c | 16
-rw-r--r--  kernel/lockdep.c | 248
-rw-r--r--  kernel/lockdep_proc.c | 2
-rw-r--r--  kernel/module.c | 60
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex.c | 2
-rw-r--r--  kernel/notifier.c | 2
-rw-r--r--  kernel/nsproxy.c | 2
-rw-r--r--  kernel/padata.c | 2
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 23
-rw-r--r--  kernel/pid.c | 6
-rw-r--r--  kernel/posix-cpu-timers.c | 17
-rw-r--r--  kernel/posix-timers.c | 2
-rw-r--r--  kernel/power/Kconfig | 12
-rw-r--r--  kernel/power/Makefile | 4
-rw-r--r--  kernel/power/console.c | 4
-rw-r--r--  kernel/power/hibernate.c | 87
-rw-r--r--  kernel/power/main.c | 106
-rw-r--r--  kernel/power/power.h | 4
-rw-r--r--  kernel/power/process.c | 30
-rw-r--r--  kernel/power/qos.c (renamed from kernel/pm_qos_params.c) | 278
-rw-r--r--  kernel/power/snapshot.c | 18
-rw-r--r--  kernel/power/suspend.c | 19
-rw-r--r--  kernel/power/swap.c | 818
-rw-r--r--  kernel/power/user.c | 1
-rw-r--r--  kernel/printk.c | 64
-rw-r--r--  kernel/profile.c | 2
-rw-r--r--  kernel/ptrace.c | 25
-rw-r--r--  kernel/range.c | 2
-rw-r--r--  kernel/rcu.h | 85
-rw-r--r--  kernel/rcupdate.c | 28
-rw-r--r--  kernel/rcutiny.c | 120
-rw-r--r--  kernel/rcutiny_plugin.h | 135
-rw-r--r--  kernel/rcutorture.c | 77
-rw-r--r--  kernel/rcutree.c | 292
-rw-r--r--  kernel/rcutree.h | 17
-rw-r--r--  kernel/rcutree_plugin.h | 150
-rw-r--r--  kernel/rcutree_trace.c | 13
-rw-r--r--  kernel/relay.c | 2
-rw-r--r--  kernel/resource.c | 9
-rw-r--r--  kernel/rtmutex-debug.c | 79
-rw-r--r--  kernel/rtmutex-tester.c | 2
-rw-r--r--  kernel/rtmutex.c | 10
-rw-r--r--  kernel/rwsem.c | 2
-rw-r--r--  kernel/sched.c | 743
-rw-r--r--  kernel/sched_clock.c | 2
-rw-r--r--  kernel/sched_cpupri.c | 89
-rw-r--r--  kernel/sched_cpupri.h | 7
-rw-r--r--  kernel/sched_fair.c | 916
-rw-r--r--  kernel/sched_features.h | 6
-rw-r--r--  kernel/sched_rt.c | 106
-rw-r--r--  kernel/sched_stats.h | 12
-rw-r--r--  kernel/sched_stoptask.c | 2
-rw-r--r--  kernel/semaphore.c | 30
-rw-r--r--  kernel/signal.c | 26
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/spinlock.c | 2
-rw-r--r--  kernel/srcu.c | 2
-rw-r--r--  kernel/stacktrace.c | 2
-rw-r--r--  kernel/stop_machine.c | 24
-rw-r--r--  kernel/sys.c | 59
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 27
-rw-r--r--  kernel/sysctl_binary.c | 4
-rw-r--r--  kernel/sysctl_check.c | 2
-rw-r--r--  kernel/taskstats.c | 1
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/posix-clock.c | 1
-rw-r--r--  kernel/time/tick-sched.c | 6
-rw-r--r--  kernel/time/timer_stats.c | 6
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/Makefile | 5
-rw-r--r--  kernel/trace/blktrace.c | 22
-rw-r--r--  kernel/trace/ftrace.c | 9
-rw-r--r--  kernel/trace/ring_buffer.c | 122
-rw-r--r--  kernel/trace/rpm-traces.c | 20
-rw-r--r--  kernel/trace/trace.c | 191
-rw-r--r--  kernel/trace/trace.h | 16
-rw-r--r--  kernel/trace/trace_clock.c | 12
-rw-r--r--  kernel/trace/trace_events_filter.c | 795
-rw-r--r--  kernel/trace/trace_events_filter_test.h | 50
-rw-r--r--  kernel/trace/trace_irqsoff.c | 10
-rw-r--r--  kernel/trace/trace_kprobe.c | 58
-rw-r--r--  kernel/trace/trace_printk.c | 19
-rw-r--r--  kernel/trace/trace_syscalls.c | 1
-rw-r--r--  kernel/tracepoint.c | 169
-rw-r--r--  kernel/tsacct.c | 15
-rw-r--r--  kernel/up.c | 2
-rw-r--r--  kernel/user-return-notifier.c | 2
-rw-r--r--  kernel/user.c | 2
-rw-r--r--  kernel/user_namespace.c | 2
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/utsname_sysctl.c | 25
-rw-r--r--  kernel/wait.c | 2
-rw-r--r--  kernel/watchdog.c | 11
-rw-r--r--  kernel/workqueue.c | 9
140 files changed, 5711 insertions, 2117 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index d06467fc8f7..e898c5b9d02 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,8 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o sched_clock.o cred.o \
13 async.o range.o jump_label.o 13 async.o range.o
14obj-y += groups.o 14obj-y += groups.o
15 15
16ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -101,12 +101,14 @@ obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o 102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_IRQ_WORK) += irq_work.o 103obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_CPU_PM) += cpu_pm.o
104 105
105obj-$(CONFIG_PERF_EVENTS) += events/ 106obj-$(CONFIG_PERF_EVENTS) += events/
106 107
107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
108obj-$(CONFIG_PADATA) += padata.o 109obj-$(CONFIG_PADATA) += padata.o
109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
111obj-$(CONFIG_JUMP_LABEL) += jump_label.o
110 112
111ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 113ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
112# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 114# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/async.c b/kernel/async.c
index d5fe7af0de2..80b74b88fef 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -51,7 +51,7 @@ asynchronous and synchronous parts of the kernel.
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h> 52#include <linux/atomic.h>
53#include <linux/ktime.h> 53#include <linux/ktime.h>
54#include <linux/module.h> 54#include <linux/export.h>
55#include <linux/wait.h> 55#include <linux/wait.h>
56#include <linux/sched.h> 56#include <linux/sched.h>
57#include <linux/slab.h> 57#include <linux/slab.h>
@@ -120,7 +120,7 @@ static void async_run_entry_fn(struct work_struct *work)
120 struct async_entry *entry = 120 struct async_entry *entry =
121 container_of(work, struct async_entry, work); 121 container_of(work, struct async_entry, work);
122 unsigned long flags; 122 unsigned long flags;
123 ktime_t calltime, delta, rettime; 123 ktime_t uninitialized_var(calltime), delta, rettime;
124 124
125 /* 1) move self to the running queue */ 125 /* 1) move self to the running queue */
126 spin_lock_irqsave(&async_lock, flags); 126 spin_lock_irqsave(&async_lock, flags);
@@ -269,7 +269,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
269void async_synchronize_cookie_domain(async_cookie_t cookie, 269void async_synchronize_cookie_domain(async_cookie_t cookie,
270 struct list_head *running) 270 struct list_head *running)
271{ 271{
272 ktime_t starttime, delta, endtime; 272 ktime_t uninitialized_var(starttime), delta, endtime;
273 273
274 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
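A large share of the hunks in this series, like the one above, only swap #include <linux/module.h> for the lighter <linux/export.h>. As a rough, illustrative sketch (hypothetical file and function names, not part of this patch): a source file that merely exports symbols needs nothing more than export.h, while the full module.h is only required when the file itself is modular code (module_init()/module_exit(), MODULE_LICENSE(), and so on).

/* Hypothetical kernel/foo.c -- exports a symbol but is not itself a module,
 * so <linux/export.h> is sufficient. */
#include <linux/export.h>

int foo_do_work(void)
{
	return 0;
}
EXPORT_SYMBOL_GPL(foo_do_work);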
diff --git a/kernel/audit.c b/kernel/audit.c
index 0a1355ca3d7..09fae2677a4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -45,7 +45,7 @@
45#include <asm/types.h> 45#include <asm/types.h>
46#include <linux/atomic.h> 46#include <linux/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/export.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
50#include <linux/err.h> 50#include <linux/err.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ce4b054acee..47b7fc1ea89 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -48,7 +48,7 @@
48#include <linux/fs.h> 48#include <linux/fs.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/module.h> 51#include <linux/export.h>
52#include <linux/slab.h> 52#include <linux/slab.h>
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/socket.h> 54#include <linux/socket.h>
diff --git a/kernel/capability.c b/kernel/capability.c
index 283c529f8b1..b463871a4e6 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -10,7 +10,7 @@
10#include <linux/audit.h> 10#include <linux/audit.h>
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d2b6ceea95..d9d5648f3cd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -265,7 +265,7 @@ list_for_each_entry(_root, &roots, root_list)
265/* the list of cgroups eligible for automatic release. Protected by 265/* the list of cgroups eligible for automatic release. Protected by
266 * release_list_lock */ 266 * release_list_lock */
267static LIST_HEAD(release_list); 267static LIST_HEAD(release_list);
268static DEFINE_SPINLOCK(release_list_lock); 268static DEFINE_RAW_SPINLOCK(release_list_lock);
269static void cgroup_release_agent(struct work_struct *work); 269static void cgroup_release_agent(struct work_struct *work);
270static DECLARE_WORK(release_agent_work, cgroup_release_agent); 270static DECLARE_WORK(release_agent_work, cgroup_release_agent);
271static void check_for_release(struct cgroup *cgrp); 271static void check_for_release(struct cgroup *cgrp);
@@ -2027,7 +2027,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2027 goto out_free_group_list; 2027 goto out_free_group_list;
2028 2028
2029 /* prevent changes to the threadgroup list while we take a snapshot. */ 2029 /* prevent changes to the threadgroup list while we take a snapshot. */
2030 rcu_read_lock(); 2030 read_lock(&tasklist_lock);
2031 if (!thread_group_leader(leader)) { 2031 if (!thread_group_leader(leader)) {
2032 /* 2032 /*
2033 * a race with de_thread from another thread's exec() may strip 2033 * a race with de_thread from another thread's exec() may strip
@@ -2036,7 +2036,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2036 * throw this task away and try again (from cgroup_procs_write); 2036 * throw this task away and try again (from cgroup_procs_write);
2037 * this is "double-double-toil-and-trouble-check locking". 2037 * this is "double-double-toil-and-trouble-check locking".
2038 */ 2038 */
2039 rcu_read_unlock(); 2039 read_unlock(&tasklist_lock);
2040 retval = -EAGAIN; 2040 retval = -EAGAIN;
2041 goto out_free_group_list; 2041 goto out_free_group_list;
2042 } 2042 }
@@ -2057,7 +2057,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2057 } while_each_thread(leader, tsk); 2057 } while_each_thread(leader, tsk);
2058 /* remember the number of threads in the array for later. */ 2058 /* remember the number of threads in the array for later. */
2059 group_size = i; 2059 group_size = i;
2060 rcu_read_unlock(); 2060 read_unlock(&tasklist_lock);
2061 2061
2062 /* 2062 /*
2063 * step 1: check that we can legitimately attach to the cgroup. 2063 * step 1: check that we can legitimately attach to the cgroup.
@@ -2135,14 +2135,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2135 oldcgrp = task_cgroup_from_root(tsk, root); 2135 oldcgrp = task_cgroup_from_root(tsk, root);
2136 if (cgrp == oldcgrp) 2136 if (cgrp == oldcgrp)
2137 continue; 2137 continue;
2138 /* attach each task to each subsystem */
2139 for_each_subsys(root, ss) {
2140 if (ss->attach_task)
2141 ss->attach_task(cgrp, tsk);
2142 }
2143 /* if the thread is PF_EXITING, it can just get skipped. */ 2138 /* if the thread is PF_EXITING, it can just get skipped. */
2144 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); 2139 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2145 BUG_ON(retval != 0 && retval != -ESRCH); 2140 if (retval == 0) {
2141 /* attach each task to each subsystem */
2142 for_each_subsys(root, ss) {
2143 if (ss->attach_task)
2144 ss->attach_task(cgrp, tsk);
2145 }
2146 } else {
2147 BUG_ON(retval != -ESRCH);
2148 }
2146 } 2149 }
2147 /* nothing is sensitive to fork() after this point. */ 2150 /* nothing is sensitive to fork() after this point. */
2148 2151
@@ -4014,11 +4017,11 @@ again:
4014 finish_wait(&cgroup_rmdir_waitq, &wait); 4017 finish_wait(&cgroup_rmdir_waitq, &wait);
4015 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4018 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4016 4019
4017 spin_lock(&release_list_lock); 4020 raw_spin_lock(&release_list_lock);
4018 set_bit(CGRP_REMOVED, &cgrp->flags); 4021 set_bit(CGRP_REMOVED, &cgrp->flags);
4019 if (!list_empty(&cgrp->release_list)) 4022 if (!list_empty(&cgrp->release_list))
4020 list_del_init(&cgrp->release_list); 4023 list_del_init(&cgrp->release_list);
4021 spin_unlock(&release_list_lock); 4024 raw_spin_unlock(&release_list_lock);
4022 4025
4023 cgroup_lock_hierarchy(cgrp->root); 4026 cgroup_lock_hierarchy(cgrp->root);
4024 /* delete this cgroup from parent->children */ 4027 /* delete this cgroup from parent->children */
@@ -4671,13 +4674,13 @@ static void check_for_release(struct cgroup *cgrp)
4671 * already queued for a userspace notification, queue 4674 * already queued for a userspace notification, queue
4672 * it now */ 4675 * it now */
4673 int need_schedule_work = 0; 4676 int need_schedule_work = 0;
4674 spin_lock(&release_list_lock); 4677 raw_spin_lock(&release_list_lock);
4675 if (!cgroup_is_removed(cgrp) && 4678 if (!cgroup_is_removed(cgrp) &&
4676 list_empty(&cgrp->release_list)) { 4679 list_empty(&cgrp->release_list)) {
4677 list_add(&cgrp->release_list, &release_list); 4680 list_add(&cgrp->release_list, &release_list);
4678 need_schedule_work = 1; 4681 need_schedule_work = 1;
4679 } 4682 }
4680 spin_unlock(&release_list_lock); 4683 raw_spin_unlock(&release_list_lock);
4681 if (need_schedule_work) 4684 if (need_schedule_work)
4682 schedule_work(&release_agent_work); 4685 schedule_work(&release_agent_work);
4683 } 4686 }
@@ -4729,7 +4732,7 @@ static void cgroup_release_agent(struct work_struct *work)
4729{ 4732{
4730 BUG_ON(work != &release_agent_work); 4733 BUG_ON(work != &release_agent_work);
4731 mutex_lock(&cgroup_mutex); 4734 mutex_lock(&cgroup_mutex);
4732 spin_lock(&release_list_lock); 4735 raw_spin_lock(&release_list_lock);
4733 while (!list_empty(&release_list)) { 4736 while (!list_empty(&release_list)) {
4734 char *argv[3], *envp[3]; 4737 char *argv[3], *envp[3];
4735 int i; 4738 int i;
@@ -4738,7 +4741,7 @@ static void cgroup_release_agent(struct work_struct *work)
4738 struct cgroup, 4741 struct cgroup,
4739 release_list); 4742 release_list);
4740 list_del_init(&cgrp->release_list); 4743 list_del_init(&cgrp->release_list);
4741 spin_unlock(&release_list_lock); 4744 raw_spin_unlock(&release_list_lock);
4742 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4745 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4743 if (!pathbuf) 4746 if (!pathbuf)
4744 goto continue_free; 4747 goto continue_free;
@@ -4768,9 +4771,9 @@ static void cgroup_release_agent(struct work_struct *work)
4768 continue_free: 4771 continue_free:
4769 kfree(pathbuf); 4772 kfree(pathbuf);
4770 kfree(agentbuf); 4773 kfree(agentbuf);
4771 spin_lock(&release_list_lock); 4774 raw_spin_lock(&release_list_lock);
4772 } 4775 }
4773 spin_unlock(&release_list_lock); 4776 raw_spin_unlock(&release_list_lock);
4774 mutex_unlock(&cgroup_mutex); 4777 mutex_unlock(&cgroup_mutex);
4775} 4778}
4776 4779
@@ -4880,9 +4883,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4880 4883
4881 rcu_assign_pointer(id->css, NULL); 4884 rcu_assign_pointer(id->css, NULL);
4882 rcu_assign_pointer(css->id, NULL); 4885 rcu_assign_pointer(css->id, NULL);
4883 spin_lock(&ss->id_lock); 4886 write_lock(&ss->id_lock);
4884 idr_remove(&ss->idr, id->id); 4887 idr_remove(&ss->idr, id->id);
4885 spin_unlock(&ss->id_lock); 4888 write_unlock(&ss->id_lock);
4886 kfree_rcu(id, rcu_head); 4889 kfree_rcu(id, rcu_head);
4887} 4890}
4888EXPORT_SYMBOL_GPL(free_css_id); 4891EXPORT_SYMBOL_GPL(free_css_id);
@@ -4908,10 +4911,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4908 error = -ENOMEM; 4911 error = -ENOMEM;
4909 goto err_out; 4912 goto err_out;
4910 } 4913 }
4911 spin_lock(&ss->id_lock); 4914 write_lock(&ss->id_lock);
4912 /* Don't use 0. allocates an ID of 1-65535 */ 4915 /* Don't use 0. allocates an ID of 1-65535 */
4913 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 4916 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
4914 spin_unlock(&ss->id_lock); 4917 write_unlock(&ss->id_lock);
4915 4918
4916 /* Returns error when there are no free spaces for new ID.*/ 4919 /* Returns error when there are no free spaces for new ID.*/
4917 if (error) { 4920 if (error) {
@@ -4926,9 +4929,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4926 return newid; 4929 return newid;
4927remove_idr: 4930remove_idr:
4928 error = -ENOSPC; 4931 error = -ENOSPC;
4929 spin_lock(&ss->id_lock); 4932 write_lock(&ss->id_lock);
4930 idr_remove(&ss->idr, myid); 4933 idr_remove(&ss->idr, myid);
4931 spin_unlock(&ss->id_lock); 4934 write_unlock(&ss->id_lock);
4932err_out: 4935err_out:
4933 kfree(newid); 4936 kfree(newid);
4934 return ERR_PTR(error); 4937 return ERR_PTR(error);
@@ -4940,7 +4943,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4940{ 4943{
4941 struct css_id *newid; 4944 struct css_id *newid;
4942 4945
4943 spin_lock_init(&ss->id_lock); 4946 rwlock_init(&ss->id_lock);
4944 idr_init(&ss->idr); 4947 idr_init(&ss->idr);
4945 4948
4946 newid = get_new_cssid(ss, 0); 4949 newid = get_new_cssid(ss, 0);
@@ -5035,9 +5038,9 @@ css_get_next(struct cgroup_subsys *ss, int id,
5035 * scan next entry from bitmap(tree), tmpid is updated after 5038 * scan next entry from bitmap(tree), tmpid is updated after
5036 * idr_get_next(). 5039 * idr_get_next().
5037 */ 5040 */
5038 spin_lock(&ss->id_lock); 5041 read_lock(&ss->id_lock);
5039 tmp = idr_get_next(&ss->idr, &tmpid); 5042 tmp = idr_get_next(&ss->idr, &tmpid);
5040 spin_unlock(&ss->id_lock); 5043 read_unlock(&ss->id_lock);
5041 5044
5042 if (!tmp) 5045 if (!tmp)
5043 break; 5046 break;
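The ss->id_lock conversion above turns a plain spinlock into an rwlock so that the css_get_next() lookup can take the lock for reading while idr insert/remove paths take it for writing. A minimal sketch of that reader/writer split, using invented names (example_lock, example_idr) rather than the cgroup code itself:

#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_RWLOCK(example_lock);
static DEFINE_IDR(example_idr);

/* Lookup path: readers may run concurrently. */
static void *example_get_next(int *id)
{
	void *entry;

	read_lock(&example_lock);
	entry = idr_get_next(&example_idr, id);
	read_unlock(&example_lock);
	return entry;
}

/* Update path: writers are exclusive. */
static void example_remove(int id)
{
	write_lock(&example_lock);
	idr_remove(&example_idr, id);
	write_unlock(&example_lock);
}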
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e691818d7e4..213c0351dad 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -14,7 +14,7 @@
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
15 */ 15 */
16 16
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/cgroup.h> 19#include <linux/cgroup.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
@@ -153,6 +153,13 @@ static void freezer_destroy(struct cgroup_subsys *ss,
153 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
154} 154}
155 155
156/* task is frozen or will freeze immediately when next it gets woken */
157static bool is_task_frozen_enough(struct task_struct *task)
158{
159 return frozen(task) ||
160 (task_is_stopped_or_traced(task) && freezing(task));
161}
162
156/* 163/*
157 * The call to cgroup_lock() in the freezer.state write method prevents 164 * The call to cgroup_lock() in the freezer.state write method prevents
158 * a write to that file racing against an attach, and hence the 165 * a write to that file racing against an attach, and hence the
@@ -231,7 +238,7 @@ static void update_if_frozen(struct cgroup *cgroup,
231 cgroup_iter_start(cgroup, &it); 238 cgroup_iter_start(cgroup, &it);
232 while ((task = cgroup_iter_next(cgroup, &it))) { 239 while ((task = cgroup_iter_next(cgroup, &it))) {
233 ntotal++; 240 ntotal++;
234 if (frozen(task)) 241 if (is_task_frozen_enough(task))
235 nfrozen++; 242 nfrozen++;
236 } 243 }
237 244
@@ -284,7 +291,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
284 while ((task = cgroup_iter_next(cgroup, &it))) { 291 while ((task = cgroup_iter_next(cgroup, &it))) {
285 if (!freeze_task(task, true)) 292 if (!freeze_task(task, true))
286 continue; 293 continue;
287 if (frozen(task)) 294 if (is_task_frozen_enough(task))
288 continue; 295 continue;
289 if (!freezing(task) && !freezer_should_skip(task)) 296 if (!freezing(task) && !freezer_should_skip(task))
290 num_cant_freeze_now++; 297 num_cant_freeze_now++;
diff --git a/kernel/compat.c b/kernel/compat.c
index e2435ee9993..f346cedfe24 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
21#include <linux/unistd.h> 21#include <linux/unistd.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/export.h>
24#include <linux/migrate.h> 25#include <linux/migrate.h>
25#include <linux/posix-timers.h> 26#include <linux/posix-timers.h>
26#include <linux/times.h> 27#include <linux/times.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 12b7458f23b..563f1360947 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,11 +10,12 @@
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/unistd.h> 11#include <linux/unistd.h>
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h> 17#include <linux/gfp.h>
18#include <linux/suspend.h>
18 19
19#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
20/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 21/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -476,6 +477,79 @@ static int alloc_frozen_cpus(void)
476 return 0; 477 return 0;
477} 478}
478core_initcall(alloc_frozen_cpus); 479core_initcall(alloc_frozen_cpus);
480
481/*
482 * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
483 * hotplug when tasks are about to be frozen. Also, don't allow the freezer
484 * to continue until any currently running CPU hotplug operation gets
485 * completed.
486 * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
487 * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
488 * CPU hotplug path and released only after it is complete. Thus, we
489 * (and hence the freezer) will block here until any currently running CPU
490 * hotplug operation gets completed.
491 */
492void cpu_hotplug_disable_before_freeze(void)
493{
494 cpu_maps_update_begin();
495 cpu_hotplug_disabled = 1;
496 cpu_maps_update_done();
497}
498
499
500/*
501 * When tasks have been thawed, re-enable regular CPU hotplug (which had been
502 * disabled while beginning to freeze tasks).
503 */
504void cpu_hotplug_enable_after_thaw(void)
505{
506 cpu_maps_update_begin();
507 cpu_hotplug_disabled = 0;
508 cpu_maps_update_done();
509}
510
511/*
512 * When callbacks for CPU hotplug notifications are being executed, we must
513 * ensure that the state of the system with respect to the tasks being frozen
514 * or not, as reported by the notification, remains unchanged *throughout the
515 * duration* of the execution of the callbacks.
516 * Hence we need to prevent the freezer from racing with regular CPU hotplug.
517 *
518 * This synchronization is implemented by mutually excluding regular CPU
519 * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
520 * Hibernate notifications.
521 */
522static int
523cpu_hotplug_pm_callback(struct notifier_block *nb,
524 unsigned long action, void *ptr)
525{
526 switch (action) {
527
528 case PM_SUSPEND_PREPARE:
529 case PM_HIBERNATION_PREPARE:
530 cpu_hotplug_disable_before_freeze();
531 break;
532
533 case PM_POST_SUSPEND:
534 case PM_POST_HIBERNATION:
535 cpu_hotplug_enable_after_thaw();
536 break;
537
538 default:
539 return NOTIFY_DONE;
540 }
541
542 return NOTIFY_OK;
543}
544
545
546int cpu_hotplug_pm_sync_init(void)
547{
548 pm_notifier(cpu_hotplug_pm_callback, 0);
549 return 0;
550}
551core_initcall(cpu_hotplug_pm_sync_init);
552
479#endif /* CONFIG_PM_SLEEP_SMP */ 553#endif /* CONFIG_PM_SLEEP_SMP */
480 554
481/** 555/**
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
new file mode 100644
index 00000000000..249152e1530
--- /dev/null
+++ b/kernel/cpu_pm.c
@@ -0,0 +1,233 @@
1/*
2 * Copyright (C) 2011 Google, Inc.
3 *
4 * Author:
5 * Colin Cross <ccross@android.com>
6 *
7 * This software is licensed under the terms of the GNU General Public
8 * License version 2, as published by the Free Software Foundation, and
9 * may be copied, distributed, and modified under those terms.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/cpu_pm.h>
20#include <linux/module.h>
21#include <linux/notifier.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24
25static DEFINE_RWLOCK(cpu_pm_notifier_lock);
26static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
27
28static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
29{
30 int ret;
31
32 ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
33 nr_to_call, nr_calls);
34
35 return notifier_to_errno(ret);
36}
37
38/**
39 * cpu_pm_register_notifier - register a driver with cpu_pm
40 * @nb: notifier block to register
41 *
42 * Add a driver to a list of drivers that are notified about
43 * CPU and CPU cluster low power entry and exit.
44 *
45 * This function may sleep, and has the same return conditions as
46 * raw_notifier_chain_register.
47 */
48int cpu_pm_register_notifier(struct notifier_block *nb)
49{
50 unsigned long flags;
51 int ret;
52
53 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
54 ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
55 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
60
61/**
62 * cpu_pm_unregister_notifier - unregister a driver with cpu_pm
63 * @nb: notifier block to be unregistered
64 *
65 * Remove a driver from the CPU PM notifier list.
66 *
67 * This function may sleep, and has the same return conditions as
68 * raw_notifier_chain_unregister.
69 */
70int cpu_pm_unregister_notifier(struct notifier_block *nb)
71{
72 unsigned long flags;
73 int ret;
74
75 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
76 ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
77 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
78
79 return ret;
80}
81EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
82
83/**
84 * cpu_pm_enter - CPU low power entry notifier
85 *
86 * Notifies listeners that a single CPU is entering a low power state that may
87 * cause some blocks in the same power domain as the cpu to reset.
88 *
89 * Must be called on the affected CPU with interrupts disabled. Platform is
90 * responsible for ensuring that cpu_pm_enter is not called twice on the same
91 * CPU before cpu_pm_exit is called. Notified drivers can include VFP
92 * co-processor, interrupt controller and its PM extensions, local CPU
93 * timers context save/restore which shouldn't be interrupted. Hence it
94 * must be called with interrupts disabled.
95 *
96 * Return conditions are same as __raw_notifier_call_chain.
97 */
98int cpu_pm_enter(void)
99{
100 int nr_calls;
101 int ret = 0;
102
103 read_lock(&cpu_pm_notifier_lock);
104 ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
105 if (ret)
106 /*
107 * Inform listeners (nr_calls - 1) about failure of CPU PM
108 * PM entry who are notified earlier to prepare for it.
109 */
110 cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
111 read_unlock(&cpu_pm_notifier_lock);
112
113 return ret;
114}
115EXPORT_SYMBOL_GPL(cpu_pm_enter);
116
117/**
118 * cpu_pm_exit - CPU low power exit notifier
119 *
120 * Notifies listeners that a single CPU is exiting a low power state that may
121 * have caused some blocks in the same power domain as the cpu to reset.
122 *
123 * Notified drivers can include VFP co-processor, interrupt controller
124 * and its PM extensions, local CPU timers context save/restore which
125 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
126 *
127 * Return conditions are same as __raw_notifier_call_chain.
128 */
129int cpu_pm_exit(void)
130{
131 int ret;
132
133 read_lock(&cpu_pm_notifier_lock);
134 ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
135 read_unlock(&cpu_pm_notifier_lock);
136
137 return ret;
138}
139EXPORT_SYMBOL_GPL(cpu_pm_exit);
140
141/**
142 * cpu_cluster_pm_enter - CPU cluster low power entry notifier
143 *
144 * Notifies listeners that all cpus in a power domain are entering a low power
145 * state that may cause some blocks in the same power domain to reset.
146 *
147 * Must be called after cpu_pm_enter has been called on all cpus in the power
148 * domain, and before cpu_pm_exit has been called on any cpu in the power
149 * domain. Notified drivers can include VFP co-processor, interrupt controller
150 * and its PM extensions, local CPU timers context save/restore which
151 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
152 *
153 * Must be called with interrupts disabled.
154 *
155 * Return conditions are same as __raw_notifier_call_chain.
156 */
157int cpu_cluster_pm_enter(void)
158{
159 int nr_calls;
160 int ret = 0;
161
162 read_lock(&cpu_pm_notifier_lock);
163 ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
164 if (ret)
165 /*
166 * Inform listeners (nr_calls - 1) about failure of CPU cluster
167 * PM entry who are notified earlier to prepare for it.
168 */
169 cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
170 read_unlock(&cpu_pm_notifier_lock);
171
172 return ret;
173}
174EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
175
176/**
177 * cpu_cluster_pm_exit - CPU cluster low power exit notifier
178 *
179 * Notifies listeners that all cpus in a power domain are exiting from a
180 * low power state that may have caused some blocks in the same power domain
181 * to reset.
182 *
183 * Must be called after cpu_cluster_pm_enter has been called for the power
184 * domain, and before cpu_pm_exit has been called on any cpu in the power
185 * domain. Notified drivers can include VFP co-processor, interrupt controller
186 * and its PM extensions, local CPU timers context save/restore which
187 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
188 *
189 * Return conditions are same as __raw_notifier_call_chain.
190 */
191int cpu_cluster_pm_exit(void)
192{
193 int ret;
194
195 read_lock(&cpu_pm_notifier_lock);
196 ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
197 read_unlock(&cpu_pm_notifier_lock);
198
199 return ret;
200}
201EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
202
203#ifdef CONFIG_PM
204static int cpu_pm_suspend(void)
205{
206 int ret;
207
208 ret = cpu_pm_enter();
209 if (ret)
210 return ret;
211
212 ret = cpu_cluster_pm_enter();
213 return ret;
214}
215
216static void cpu_pm_resume(void)
217{
218 cpu_cluster_pm_exit();
219 cpu_pm_exit();
220}
221
222static struct syscore_ops cpu_pm_syscore_ops = {
223 .suspend = cpu_pm_suspend,
224 .resume = cpu_pm_resume,
225};
226
227static int cpu_pm_init(void)
228{
229 register_syscore_ops(&cpu_pm_syscore_ops);
230 return 0;
231}
232core_initcall(cpu_pm_init);
233#endif
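The new cpu_pm notifier chain is consumed by drivers whose state lives in the CPU's power domain (VFP, the interrupt controller, local timers). A hedged usage sketch, with invented driver names, assuming only the API added above (cpu_pm_register_notifier() and the CPU_PM_* events):

#include <linux/cpu_pm.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int mydrv_cpu_pm_notify(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	switch (action) {
	case CPU_PM_ENTER:
		/* save per-CPU hardware context before the domain powers down */
		break;
	case CPU_PM_ENTER_FAILED:
	case CPU_PM_EXIT:
		/* restore per-CPU hardware context */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mydrv_cpu_pm_nb = {
	.notifier_call = mydrv_cpu_pm_notify,
};

static int __init mydrv_init(void)
{
	return cpu_pm_register_notifier(&mydrv_cpu_pm_nb);
}
core_initcall(mydrv_init);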
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 10131fdaff7..9fe58c46a42 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -37,7 +37,7 @@
37#include <linux/mempolicy.h> 37#include <linux/mempolicy.h>
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/memory.h> 39#include <linux/memory.h>
40#include <linux/module.h> 40#include <linux/export.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/namei.h> 42#include <linux/namei.h>
43#include <linux/pagemap.h> 43#include <linux/pagemap.h>
@@ -949,6 +949,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
949static void cpuset_change_task_nodemask(struct task_struct *tsk, 949static void cpuset_change_task_nodemask(struct task_struct *tsk,
950 nodemask_t *newmems) 950 nodemask_t *newmems)
951{ 951{
952 bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed);
953
952repeat: 954repeat:
953 /* 955 /*
954 * Allow tasks that have access to memory reserves because they have 956 * Allow tasks that have access to memory reserves because they have
@@ -963,7 +965,6 @@ repeat:
963 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 965 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
964 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 966 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
965 967
966
967 /* 968 /*
968 * ensure checking ->mems_allowed_change_disable after setting all new 969 * ensure checking ->mems_allowed_change_disable after setting all new
969 * allowed nodes. 970 * allowed nodes.
@@ -980,9 +981,11 @@ repeat:
980 981
981 /* 982 /*
982 * Allocation of memory is very fast, we needn't sleep when waiting 983 * Allocation of memory is very fast, we needn't sleep when waiting
983 * for the read-side. 984 * for the read-side. No wait is necessary, however, if at least one
985 * node remains unchanged.
984 */ 986 */
985 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { 987 while (masks_disjoint &&
988 ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
986 task_unlock(tsk); 989 task_unlock(tsk);
987 if (!task_curr(tsk)) 990 if (!task_curr(tsk))
988 yield(); 991 yield();
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 5f85690285d..c766ee54c0b 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -2,7 +2,7 @@
2#include <linux/crash_dump.h> 2#include <linux/crash_dump.h>
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/errno.h> 4#include <linux/errno.h>
5#include <linux/module.h> 5#include <linux/export.h>
6 6
7/* 7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need 8 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -20,8 +20,15 @@ unsigned long saved_max_pfn;
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21 21
22/* 22/*
23 * stores the size of elf header of crash image
24 */
25unsigned long long elfcorehdr_size;
26
27/*
23 * elfcorehdr= specifies the location of elf core header stored by the crashed 28 * elfcorehdr= specifies the location of elf core header stored by the crashed
24 * kernel. This option will be passed by kexec loader to the capture kernel. 29 * kernel. This option will be passed by kexec loader to the capture kernel.
30 *
31 * Syntax: elfcorehdr=[size[KMG]@]offset[KMG]
25 */ 32 */
26static int __init setup_elfcorehdr(char *arg) 33static int __init setup_elfcorehdr(char *arg)
27{ 34{
@@ -29,6 +36,10 @@ static int __init setup_elfcorehdr(char *arg)
29 if (!arg) 36 if (!arg)
30 return -EINVAL; 37 return -EINVAL;
31 elfcorehdr_addr = memparse(arg, &end); 38 elfcorehdr_addr = memparse(arg, &end);
39 if (*end == '@') {
40 elfcorehdr_size = elfcorehdr_addr;
41 elfcorehdr_addr = memparse(end + 1, &end);
42 }
32 return end > arg ? 0 : -EINVAL; 43 return end > arg ? 0 : -EINVAL;
33} 44}
34early_param("elfcorehdr", setup_elfcorehdr); 45early_param("elfcorehdr", setup_elfcorehdr);
diff --git a/kernel/cred.c b/kernel/cred.c
index 174fa84eca3..5791612a404 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -8,7 +8,7 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11#include <linux/module.h> 11#include <linux/export.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
@@ -508,10 +508,8 @@ int commit_creds(struct cred *new)
508 key_fsgid_changed(task); 508 key_fsgid_changed(task);
509 509
510 /* do it 510 /* do it
511 * - What if a process setreuid()'s and this brings the 511 * RLIMIT_NPROC limits on user->processes have already been checked
512 * new uid over his NPROC rlimit? We can check this now 512 * in set_user().
513 * cheaply with the new uid cache, so if it matters
514 * we should be checking for it. -DaveM
515 */ 513 */
516 alter_cred_subscribers(new, 2); 514 alter_cred_subscribers(new, 2);
517 if (new->user != old->user) 515 if (new->user != old->user)
@@ -646,6 +644,9 @@ void __init cred_init(void)
646 */ 644 */
647struct cred *prepare_kernel_cred(struct task_struct *daemon) 645struct cred *prepare_kernel_cred(struct task_struct *daemon)
648{ 646{
647#ifdef CONFIG_KEYS
648 struct thread_group_cred *tgcred;
649#endif
649 const struct cred *old; 650 const struct cred *old;
650 struct cred *new; 651 struct cred *new;
651 652
@@ -653,6 +654,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
653 if (!new) 654 if (!new)
654 return NULL; 655 return NULL;
655 656
657#ifdef CONFIG_KEYS
658 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
659 if (!tgcred) {
660 kmem_cache_free(cred_jar, new);
661 return NULL;
662 }
663#endif
664
656 kdebug("prepare_kernel_cred() alloc %p", new); 665 kdebug("prepare_kernel_cred() alloc %p", new);
657 666
658 if (daemon) 667 if (daemon)
@@ -669,8 +678,11 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
669 get_group_info(new->group_info); 678 get_group_info(new->group_info);
670 679
671#ifdef CONFIG_KEYS 680#ifdef CONFIG_KEYS
672 atomic_inc(&init_tgcred.usage); 681 atomic_set(&tgcred->usage, 1);
673 new->tgcred = &init_tgcred; 682 spin_lock_init(&tgcred->lock);
683 tgcred->process_keyring = NULL;
684 tgcred->session_keyring = NULL;
685 new->tgcred = tgcred;
674 new->request_key_auth = NULL; 686 new->request_key_auth = NULL;
675 new->thread_keyring = NULL; 687 new->thread_keyring = NULL;
676 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 688 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 34872482315..c22d8c28ad8 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -217,7 +217,7 @@ void gdbstub_msg_write(const char *s, int len)
217 217
218 /* Pack in hex chars */ 218 /* Pack in hex chars */
219 for (i = 0; i < wcount; i++) 219 for (i = 0; i < wcount; i++)
220 bufptr = pack_hex_byte(bufptr, s[i]); 220 bufptr = hex_byte_pack(bufptr, s[i]);
221 *bufptr = '\0'; 221 *bufptr = '\0';
222 222
223 /* Move up */ 223 /* Move up */
@@ -249,7 +249,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count)
249 if (err) 249 if (err)
250 return NULL; 250 return NULL;
251 while (count > 0) { 251 while (count > 0) {
252 buf = pack_hex_byte(buf, *tmp); 252 buf = hex_byte_pack(buf, *tmp);
253 tmp++; 253 tmp++;
254 count--; 254 count--;
255 } 255 }
@@ -411,14 +411,14 @@ static char *pack_threadid(char *pkt, unsigned char *id)
411 limit = id + (BUF_THREAD_ID_SIZE / 2); 411 limit = id + (BUF_THREAD_ID_SIZE / 2);
412 while (id < limit) { 412 while (id < limit) {
413 if (!lzero || *id != 0) { 413 if (!lzero || *id != 0) {
414 pkt = pack_hex_byte(pkt, *id); 414 pkt = hex_byte_pack(pkt, *id);
415 lzero = 0; 415 lzero = 0;
416 } 416 }
417 id++; 417 id++;
418 } 418 }
419 419
420 if (lzero) 420 if (lzero)
421 pkt = pack_hex_byte(pkt, 0); 421 pkt = hex_byte_pack(pkt, 0);
422 422
423 return pkt; 423 return pkt;
424} 424}
@@ -486,7 +486,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
486 dbg_remove_all_break(); 486 dbg_remove_all_break();
487 487
488 remcom_out_buffer[0] = 'S'; 488 remcom_out_buffer[0] = 'S';
489 pack_hex_byte(&remcom_out_buffer[1], ks->signo); 489 hex_byte_pack(&remcom_out_buffer[1], ks->signo);
490} 490}
491 491
492static void gdb_get_regs_helper(struct kgdb_state *ks) 492static void gdb_get_regs_helper(struct kgdb_state *ks)
@@ -954,7 +954,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
954 /* Reply to host that an exception has occurred */ 954 /* Reply to host that an exception has occurred */
955 ptr = remcom_out_buffer; 955 ptr = remcom_out_buffer;
956 *ptr++ = 'T'; 956 *ptr++ = 'T';
957 ptr = pack_hex_byte(ptr, ks->signo); 957 ptr = hex_byte_pack(ptr, ks->signo);
958 ptr += strlen(strcpy(ptr, "thread:")); 958 ptr += strlen(strcpy(ptr, "thread:"));
959 int_to_threadref(thref, shadow_pid(current->pid)); 959 int_to_threadref(thref, shadow_pid(current->pid));
960 ptr = pack_threadid(ptr, thref); 960 ptr = pack_threadid(ptr, thref);
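The gdbstub hunks above are a mechanical rename from pack_hex_byte() to hex_byte_pack(); the kernel's helper simply emits the two hex digits of a byte and returns the advanced buffer pointer. A standalone, purely illustrative equivalent (not the kernel's definition):

/* Illustrative only -- mirrors what hex_byte_pack() does. */
static char *example_hex_byte_pack(char *buf, unsigned char byte)
{
	static const char hex_digits[] = "0123456789abcdef";

	*buf++ = hex_digits[(byte >> 4) & 0x0f];
	*buf++ = hex_digits[byte & 0x0f];
	return buf;
}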
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index d9ca9aa481e..8b68ce78ff1 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -11,6 +11,7 @@
11#include <linux/kgdb.h> 11#include <linux/kgdb.h>
12#include <linux/kdb.h> 12#include <linux/kdb.h>
13#include <linux/kdebug.h> 13#include <linux/kdebug.h>
14#include <linux/export.h>
14#include "kdb_private.h" 15#include "kdb_private.h"
15#include "../debug_core.h" 16#include "../debug_core.h"
16 17
diff --git a/kernel/dma.c b/kernel/dma.c
index f903189c530..68a2306522c 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -9,7 +9,7 @@
9 * [It also happened to remove the sizeof(char *) == sizeof(int) 9 * [It also happened to remove the sizeof(char *) == sizeof(int)
10 * assumption introduced because of those /proc/dma patches. -- Hennus] 10 * assumption introduced because of those /proc/dma patches. -- Hennus]
11 */ 11 */
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b8785e26ee1..0e8457da6f9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -25,6 +25,7 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h> 27#include <linux/device.h>
28#include <linux/export.h>
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
29#include <linux/hardirq.h> 30#include <linux/hardirq.h>
30#include <linux/rculist.h> 31#include <linux/rculist.h>
@@ -399,14 +400,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
399 local_irq_restore(flags); 400 local_irq_restore(flags);
400} 401}
401 402
402static inline void perf_cgroup_sched_out(struct task_struct *task) 403static inline void perf_cgroup_sched_out(struct task_struct *task,
404 struct task_struct *next)
403{ 405{
404 perf_cgroup_switch(task, PERF_CGROUP_SWOUT); 406 struct perf_cgroup *cgrp1;
407 struct perf_cgroup *cgrp2 = NULL;
408
409 /*
410 * we come here when we know perf_cgroup_events > 0
411 */
412 cgrp1 = perf_cgroup_from_task(task);
413
414 /*
415 * next is NULL when called from perf_event_enable_on_exec()
416 * that will systematically cause a cgroup_switch()
417 */
418 if (next)
419 cgrp2 = perf_cgroup_from_task(next);
420
421 /*
422 * only schedule out current cgroup events if we know
423 * that we are switching to a different cgroup. Otherwise,
424 * do not touch the cgroup events.
425 */
426 if (cgrp1 != cgrp2)
427 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
405} 428}
406 429
407static inline void perf_cgroup_sched_in(struct task_struct *task) 430static inline void perf_cgroup_sched_in(struct task_struct *prev,
431 struct task_struct *task)
408{ 432{
409 perf_cgroup_switch(task, PERF_CGROUP_SWIN); 433 struct perf_cgroup *cgrp1;
434 struct perf_cgroup *cgrp2 = NULL;
435
436 /*
437 * we come here when we know perf_cgroup_events > 0
438 */
439 cgrp1 = perf_cgroup_from_task(task);
440
441 /* prev can never be NULL */
442 cgrp2 = perf_cgroup_from_task(prev);
443
444 /*
445 * only need to schedule in cgroup events if we are changing
446 * cgroup during ctxsw. Cgroup events were not scheduled
447 * out of ctxsw out if that was not the case.
448 */
449 if (cgrp1 != cgrp2)
450 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
410} 451}
411 452
412static inline int perf_cgroup_connect(int fd, struct perf_event *event, 453static inline int perf_cgroup_connect(int fd, struct perf_event *event,
@@ -518,11 +559,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
518{ 559{
519} 560}
520 561
521static inline void perf_cgroup_sched_out(struct task_struct *task) 562static inline void perf_cgroup_sched_out(struct task_struct *task,
563 struct task_struct *next)
522{ 564{
523} 565}
524 566
525static inline void perf_cgroup_sched_in(struct task_struct *task) 567static inline void perf_cgroup_sched_in(struct task_struct *prev,
568 struct task_struct *task)
526{ 569{
527} 570}
528 571
@@ -1988,7 +2031,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
1988 * cgroup event are system-wide mode only 2031 * cgroup event are system-wide mode only
1989 */ 2032 */
1990 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2033 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1991 perf_cgroup_sched_out(task); 2034 perf_cgroup_sched_out(task, next);
1992} 2035}
1993 2036
1994static void task_ctx_sched_out(struct perf_event_context *ctx) 2037static void task_ctx_sched_out(struct perf_event_context *ctx)
@@ -2153,7 +2196,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2153 * accessing the event control register. If a NMI hits, then it will 2196 * accessing the event control register. If a NMI hits, then it will
2154 * keep the event running. 2197 * keep the event running.
2155 */ 2198 */
2156void __perf_event_task_sched_in(struct task_struct *task) 2199void __perf_event_task_sched_in(struct task_struct *prev,
2200 struct task_struct *task)
2157{ 2201{
2158 struct perf_event_context *ctx; 2202 struct perf_event_context *ctx;
2159 int ctxn; 2203 int ctxn;
@@ -2171,7 +2215,7 @@ void __perf_event_task_sched_in(struct task_struct *task)
2171 * cgroup event are system-wide mode only 2215 * cgroup event are system-wide mode only
2172 */ 2216 */
2173 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2217 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2174 perf_cgroup_sched_in(task); 2218 perf_cgroup_sched_in(prev, task);
2175} 2219}
2176 2220
2177static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2221static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2427,7 +2471,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2427 * ctxswin cgroup events which are already scheduled 2471 * ctxswin cgroup events which are already scheduled
2428 * in. 2472 * in.
2429 */ 2473 */
2430 perf_cgroup_sched_out(current); 2474 perf_cgroup_sched_out(current, NULL);
2431 2475
2432 raw_spin_lock(&ctx->lock); 2476 raw_spin_lock(&ctx->lock);
2433 task_ctx_sched_out(ctx); 2477 task_ctx_sched_out(ctx);
@@ -3353,8 +3397,8 @@ static int perf_event_index(struct perf_event *event)
3353} 3397}
3354 3398
3355static void calc_timer_values(struct perf_event *event, 3399static void calc_timer_values(struct perf_event *event,
3356 u64 *running, 3400 u64 *enabled,
3357 u64 *enabled) 3401 u64 *running)
3358{ 3402{
3359 u64 now, ctx_time; 3403 u64 now, ctx_time;
3360 3404
@@ -3500,7 +3544,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3500 struct ring_buffer *rb = event->rb; 3544 struct ring_buffer *rb = event->rb;
3501 3545
3502 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3546 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3503 vma->vm_mm->locked_vm -= event->mmap_locked; 3547 vma->vm_mm->pinned_vm -= event->mmap_locked;
3504 rcu_assign_pointer(event->rb, NULL); 3548 rcu_assign_pointer(event->rb, NULL);
3505 mutex_unlock(&event->mmap_mutex); 3549 mutex_unlock(&event->mmap_mutex);
3506 3550
@@ -3581,7 +3625,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3581 3625
3582 lock_limit = rlimit(RLIMIT_MEMLOCK); 3626 lock_limit = rlimit(RLIMIT_MEMLOCK);
3583 lock_limit >>= PAGE_SHIFT; 3627 lock_limit >>= PAGE_SHIFT;
3584 locked = vma->vm_mm->locked_vm + extra; 3628 locked = vma->vm_mm->pinned_vm + extra;
3585 3629
3586 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && 3630 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3587 !capable(CAP_IPC_LOCK)) { 3631 !capable(CAP_IPC_LOCK)) {
@@ -3607,7 +3651,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3607 atomic_long_add(user_extra, &user->locked_vm); 3651 atomic_long_add(user_extra, &user->locked_vm);
3608 event->mmap_locked = extra; 3652 event->mmap_locked = extra;
3609 event->mmap_user = get_current_user(); 3653 event->mmap_user = get_current_user();
3610 vma->vm_mm->locked_vm += event->mmap_locked; 3654 vma->vm_mm->pinned_vm += event->mmap_locked;
3611 3655
3612unlock: 3656unlock:
3613 if (!ret) 3657 if (!ret)
@@ -5715,6 +5759,7 @@ struct pmu *perf_init_event(struct perf_event *event)
5715 pmu = idr_find(&pmu_idr, event->attr.type); 5759 pmu = idr_find(&pmu_idr, event->attr.type);
5716 rcu_read_unlock(); 5760 rcu_read_unlock();
5717 if (pmu) { 5761 if (pmu) {
5762 event->pmu = pmu;
5718 ret = pmu->event_init(event); 5763 ret = pmu->event_init(event);
5719 if (ret) 5764 if (ret)
5720 pmu = ERR_PTR(ret); 5765 pmu = ERR_PTR(ret);
@@ -5722,6 +5767,7 @@ struct pmu *perf_init_event(struct perf_event *event)
5722 } 5767 }
5723 5768
5724 list_for_each_entry_rcu(pmu, &pmus, entry) { 5769 list_for_each_entry_rcu(pmu, &pmus, entry) {
5770 event->pmu = pmu;
5725 ret = pmu->event_init(event); 5771 ret = pmu->event_init(event);
5726 if (!ret) 5772 if (!ret)
5727 goto unlock; 5773 goto unlock;
@@ -5848,8 +5894,6 @@ done:
5848 return ERR_PTR(err); 5894 return ERR_PTR(err);
5849 } 5895 }
5850 5896
5851 event->pmu = pmu;
5852
5853 if (!event->parent) { 5897 if (!event->parent) {
5854 if (event->attach_state & PERF_ATTACH_TASK) 5898 if (event->attach_state & PERF_ATTACH_TASK)
5855 jump_label_inc(&perf_sched_events); 5899 jump_label_inc(&perf_sched_events);
diff --git a/kernel/exit.c b/kernel/exit.c
index 2913b3509d4..d0b7d988f87 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -681,8 +681,6 @@ static void exit_mm(struct task_struct * tsk)
681 enter_lazy_tlb(mm, current); 681 enter_lazy_tlb(mm, current);
682 /* We don't want this task to be frozen prematurely */ 682 /* We don't want this task to be frozen prematurely */
683 clear_freeze_flag(tsk); 683 clear_freeze_flag(tsk);
684 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
685 atomic_dec(&mm->oom_disable_count);
686 task_unlock(tsk); 684 task_unlock(tsk);
687 mm_update_next_owner(mm); 685 mm_update_next_owner(mm);
688 mmput(mm); 686 mmput(mm);
diff --git a/kernel/fork.c b/kernel/fork.c
index e7ceaca8960..da4a6a10d08 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -162,7 +162,6 @@ static void account_kernel_stack(struct thread_info *ti, int account)
162 162
163void free_task(struct task_struct *tsk) 163void free_task(struct task_struct *tsk)
164{ 164{
165 prop_local_destroy_single(&tsk->dirties);
166 account_kernel_stack(tsk->stack, -1); 165 account_kernel_stack(tsk->stack, -1);
167 free_thread_info(tsk->stack); 166 free_thread_info(tsk->stack);
168 rt_mutex_debug_task_free(tsk); 167 rt_mutex_debug_task_free(tsk);
@@ -274,10 +273,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
274 273
275 tsk->stack = ti; 274 tsk->stack = ti;
276 275
277 err = prop_local_init_single(&tsk->dirties);
278 if (err)
279 goto out;
280
281 setup_thread_stack(tsk, orig); 276 setup_thread_stack(tsk, orig);
282 clear_user_return_notifier(tsk); 277 clear_user_return_notifier(tsk);
283 clear_tsk_need_resched(tsk); 278 clear_tsk_need_resched(tsk);
@@ -501,7 +496,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
501 mm->cached_hole_size = ~0UL; 496 mm->cached_hole_size = ~0UL;
502 mm_init_aio(mm); 497 mm_init_aio(mm);
503 mm_init_owner(mm, p); 498 mm_init_owner(mm, p);
504 atomic_set(&mm->oom_disable_count, 0);
505 499
506 if (likely(!mm_alloc_pgd(mm))) { 500 if (likely(!mm_alloc_pgd(mm))) {
507 mm->def_flags = 0; 501 mm->def_flags = 0;
@@ -816,8 +810,6 @@ good_mm:
816 /* Initializing for Swap token stuff */ 810 /* Initializing for Swap token stuff */
817 mm->token_priority = 0; 811 mm->token_priority = 0;
818 mm->last_interval = 0; 812 mm->last_interval = 0;
819 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
820 atomic_inc(&mm->oom_disable_count);
821 813
822 tsk->mm = mm; 814 tsk->mm = mm;
823 tsk->active_mm = mm; 815 tsk->active_mm = mm;
@@ -1111,6 +1103,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1111 p->real_cred->user != INIT_USER) 1103 p->real_cred->user != INIT_USER)
1112 goto bad_fork_free; 1104 goto bad_fork_free;
1113 } 1105 }
1106 current->flags &= ~PF_NPROC_EXCEEDED;
1114 1107
1115 retval = copy_creds(p, clone_flags); 1108 retval = copy_creds(p, clone_flags);
1116 if (retval < 0) 1109 if (retval < 0)
@@ -1301,6 +1294,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1301 p->pdeath_signal = 0; 1294 p->pdeath_signal = 0;
1302 p->exit_state = 0; 1295 p->exit_state = 0;
1303 1296
1297 p->nr_dirtied = 0;
1298 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1299
1304 /* 1300 /*
1305 * Ok, make it visible to the rest of the system. 1301 * Ok, make it visible to the rest of the system.
1306 * We dont wake it up yet. 1302 * We dont wake it up yet.
@@ -1390,13 +1386,8 @@ bad_fork_cleanup_io:
1390bad_fork_cleanup_namespaces: 1386bad_fork_cleanup_namespaces:
1391 exit_task_namespaces(p); 1387 exit_task_namespaces(p);
1392bad_fork_cleanup_mm: 1388bad_fork_cleanup_mm:
1393 if (p->mm) { 1389 if (p->mm)
1394 task_lock(p);
1395 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1396 atomic_dec(&p->mm->oom_disable_count);
1397 task_unlock(p);
1398 mmput(p->mm); 1390 mmput(p->mm);
1399 }
1400bad_fork_cleanup_signal: 1391bad_fork_cleanup_signal:
1401 if (!(clone_flags & CLONE_THREAD)) 1392 if (!(clone_flags & CLONE_THREAD))
1402 free_signal_struct(p->signal); 1393 free_signal_struct(p->signal);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7b01de98bb6..7be56c53439 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -6,7 +6,7 @@
6 6
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/suspend.h> 8#include <linux/suspend.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/freezer.h> 11#include <linux/freezer.h>
12 12
diff --git a/kernel/futex.c b/kernel/futex.c
index 11cbe052b2e..ea87f4d2f45 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -55,7 +55,7 @@
55#include <linux/pagemap.h> 55#include <linux/pagemap.h>
56#include <linux/syscalls.h> 56#include <linux/syscalls.h>
57#include <linux/signal.h> 57#include <linux/signal.h>
58#include <linux/module.h> 58#include <linux/export.h>
59#include <linux/magic.h> 59#include <linux/magic.h>
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
@@ -854,7 +854,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
854{ 854{
855 struct task_struct *new_owner; 855 struct task_struct *new_owner;
856 struct futex_pi_state *pi_state = this->pi_state; 856 struct futex_pi_state *pi_state = this->pi_state;
857 u32 curval, newval; 857 u32 uninitialized_var(curval), newval;
858 858
859 if (!pi_state) 859 if (!pi_state)
860 return -EINVAL; 860 return -EINVAL;
@@ -916,7 +916,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
916 916
917static int unlock_futex_pi(u32 __user *uaddr, u32 uval) 917static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
918{ 918{
919 u32 oldval; 919 u32 uninitialized_var(oldval);
920 920
921 /* 921 /*
922 * There is no waiter, so we unlock the futex. The owner died 922 * There is no waiter, so we unlock the futex. The owner died
@@ -1576,7 +1576,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1576 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1576 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1577 struct futex_pi_state *pi_state = q->pi_state; 1577 struct futex_pi_state *pi_state = q->pi_state;
1578 struct task_struct *oldowner = pi_state->owner; 1578 struct task_struct *oldowner = pi_state->owner;
1579 u32 uval, curval, newval; 1579 u32 uval, uninitialized_var(curval), newval;
1580 int ret; 1580 int ret;
1581 1581
1582 /* Owner died? */ 1582 /* Owner died? */
@@ -1793,7 +1793,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1793 * 1793 *
1794 * Returns: 1794 * Returns:
1795 * 0 - uaddr contains val and hb has been locked 1795 * 0 - uaddr contains val and hb has been locked
1796 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked 1796 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1797 */ 1797 */
1798static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1798static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1799 struct futex_q *q, struct futex_hash_bucket **hb) 1799 struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2481,7 +2481,7 @@ err_unlock:
2481 */ 2481 */
2482int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 2482int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2483{ 2483{
2484 u32 uval, nval, mval; 2484 u32 uval, uninitialized_var(nval), mval;
2485 2485
2486retry: 2486retry:
2487 if (get_user(uval, uaddr)) 2487 if (get_user(uval, uaddr))
diff --git a/kernel/groups.c b/kernel/groups.c
index 1cc476d52dd..99b53d1eb7e 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -2,7 +2,7 @@
2 * Supplementary group IDs 2 * Supplementary group IDs
3 */ 3 */
4#include <linux/cred.h> 4#include <linux/cred.h>
5#include <linux/module.h> 5#include <linux/export.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2043c08d36c..ae34bf51682 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -32,7 +32,7 @@
32 */ 32 */
33 33
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/module.h> 35#include <linux/export.h>
36#include <linux/percpu.h> 36#include <linux/percpu.h>
37#include <linux/hrtimer.h> 37#include <linux/hrtimer.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ea640120ab8..8b1748d0172 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -13,7 +13,7 @@
13#include <linux/freezer.h> 13#include <linux/freezer.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/lockdep.h> 15#include <linux/lockdep.h>
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18 18
19/* 19/*
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index d5a3009da71..f7c543a801d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -26,7 +26,7 @@
26int irq_set_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
27{ 27{
28 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
30 30
31 if (!desc) 31 if (!desc)
32 return -EINVAL; 32 return -EINVAL;
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_set_chip);
54int irq_set_irq_type(unsigned int irq, unsigned int type) 54int irq_set_irq_type(unsigned int irq, unsigned int type)
55{ 55{
56 unsigned long flags; 56 unsigned long flags;
57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
58 int ret = 0; 58 int ret = 0;
59 59
60 if (!desc) 60 if (!desc)
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(irq_set_irq_type);
78int irq_set_handler_data(unsigned int irq, void *data) 78int irq_set_handler_data(unsigned int irq, void *data)
79{ 79{
80 unsigned long flags; 80 unsigned long flags;
81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
82 82
83 if (!desc) 83 if (!desc)
84 return -EINVAL; 84 return -EINVAL;
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_set_handler_data);
98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) 98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
99{ 99{
100 unsigned long flags; 100 unsigned long flags;
101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
102 102
103 if (!desc) 103 if (!desc)
104 return -EINVAL; 104 return -EINVAL;
@@ -119,7 +119,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
119int irq_set_chip_data(unsigned int irq, void *data) 119int irq_set_chip_data(unsigned int irq, void *data)
120{ 120{
121 unsigned long flags; 121 unsigned long flags;
122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
123 123
124 if (!desc) 124 if (!desc)
125 return -EINVAL; 125 return -EINVAL;
@@ -178,7 +178,7 @@ void irq_shutdown(struct irq_desc *desc)
178 desc->depth = 1; 178 desc->depth = 1;
179 if (desc->irq_data.chip->irq_shutdown) 179 if (desc->irq_data.chip->irq_shutdown)
180 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 180 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
181 if (desc->irq_data.chip->irq_disable) 181 else if (desc->irq_data.chip->irq_disable)
182 desc->irq_data.chip->irq_disable(&desc->irq_data); 182 desc->irq_data.chip->irq_disable(&desc->irq_data);
183 else 183 else
184 desc->irq_data.chip->irq_mask(&desc->irq_data); 184 desc->irq_data.chip->irq_mask(&desc->irq_data);
@@ -204,6 +204,24 @@ void irq_disable(struct irq_desc *desc)
204 } 204 }
205} 205}
206 206
207void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
208{
209 if (desc->irq_data.chip->irq_enable)
210 desc->irq_data.chip->irq_enable(&desc->irq_data);
211 else
212 desc->irq_data.chip->irq_unmask(&desc->irq_data);
213 cpumask_set_cpu(cpu, desc->percpu_enabled);
214}
215
216void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
217{
218 if (desc->irq_data.chip->irq_disable)
219 desc->irq_data.chip->irq_disable(&desc->irq_data);
220 else
221 desc->irq_data.chip->irq_mask(&desc->irq_data);
222 cpumask_clear_cpu(cpu, desc->percpu_enabled);
223}
224
207static inline void mask_ack_irq(struct irq_desc *desc) 225static inline void mask_ack_irq(struct irq_desc *desc)
208{ 226{
209 if (desc->irq_data.chip->irq_mask_ack) 227 if (desc->irq_data.chip->irq_mask_ack)
@@ -544,12 +562,44 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
544 chip->irq_eoi(&desc->irq_data); 562 chip->irq_eoi(&desc->irq_data);
545} 563}
546 564
565/**
566 * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
567 * @irq: the interrupt number
568 * @desc: the interrupt description structure for this irq
569 *
570 * Per CPU interrupts on SMP machines without locking requirements. Same as
571 * handle_percpu_irq() above but with the following extras:
572 *
573 * action->percpu_dev_id is a pointer to percpu variables which
574 * contain the real device id for the cpu on which this handler is
575 * called
576 */
577void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
578{
579 struct irq_chip *chip = irq_desc_get_chip(desc);
580 struct irqaction *action = desc->action;
581 void *dev_id = __this_cpu_ptr(action->percpu_dev_id);
582 irqreturn_t res;
583
584 kstat_incr_irqs_this_cpu(irq, desc);
585
586 if (chip->irq_ack)
587 chip->irq_ack(&desc->irq_data);
588
589 trace_irq_handler_entry(irq, action);
590 res = action->handler(irq, dev_id);
591 trace_irq_handler_exit(irq, action, res);
592
593 if (chip->irq_eoi)
594 chip->irq_eoi(&desc->irq_data);
595}
596
547void 597void
548__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 598__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
549 const char *name) 599 const char *name)
550{ 600{
551 unsigned long flags; 601 unsigned long flags;
552 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 602 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
553 603
554 if (!desc) 604 if (!desc)
555 return; 605 return;
@@ -593,7 +643,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
593void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 643void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
594{ 644{
595 unsigned long flags; 645 unsigned long flags;
596 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 646 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
597 647
598 if (!desc) 648 if (!desc)
599 return; 649 return;
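
[Editor's note] The new per-CPU flow handler above is only half of the picture: an interrupt controller driver still has to mark the line as per-CPU-devid and install handle_percpu_devid_irq itself. A minimal sketch of that wiring, assuming a GIC-like controller; my_irq_chip and my_chip_setup_ppi are invented names, not part of this patch:

        #include <linux/irq.h>

        static struct irq_chip my_irq_chip;     /* assumed populated elsewhere */

        static void my_chip_setup_ppi(unsigned int irq)
        {
                /* allocates desc->percpu_enabled and sets IRQ_PER_CPU_DEVID
                 * (see the kernel/irq/irqdesc.c hunk later in this diff) */
                irq_set_percpu_devid(irq);
                irq_set_chip_and_handler(irq, &my_irq_chip,
                                         handle_percpu_devid_irq);
        }

With that in place, each CPU's handler invocation receives its own instance of the percpu dev_id registered through request_percpu_irq() (kernel/irq/manage.c below).
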
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 3a2cab407b9..c89295a8f66 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -6,6 +6,7 @@
6#include <linux/io.h> 6#include <linux/io.h>
7#include <linux/irq.h> 7#include <linux/irq.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/export.h>
9#include <linux/interrupt.h> 10#include <linux/interrupt.h>
10#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h> 12#include <linux/syscore_ops.h>
@@ -211,6 +212,7 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
211 } 212 }
212 return gc; 213 return gc;
213} 214}
215EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
214 216
215/* 217/*
216 * Separate lockdep class for interrupt chip which can nest irq_desc 218 * Separate lockdep class for interrupt chip which can nest irq_desc
@@ -246,7 +248,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
246 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); 248 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
247 249
248 for (i = gc->irq_base; msk; msk >>= 1, i++) { 250 for (i = gc->irq_base; msk; msk >>= 1, i++) {
249 if (!msk & 0x01) 251 if (!(msk & 0x01))
250 continue; 252 continue;
251 253
252 if (flags & IRQ_GC_INIT_NESTED_LOCK) 254 if (flags & IRQ_GC_INIT_NESTED_LOCK)
@@ -258,6 +260,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
258 } 260 }
259 gc->irq_cnt = i - gc->irq_base; 261 gc->irq_cnt = i - gc->irq_base;
260} 262}
263EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
261 264
262/** 265/**
263 * irq_setup_alt_chip - Switch to alternative chip 266 * irq_setup_alt_chip - Switch to alternative chip
@@ -281,6 +284,7 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
281 } 284 }
282 return -EINVAL; 285 return -EINVAL;
283} 286}
287EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
284 288
285/** 289/**
286 * irq_remove_generic_chip - Remove a chip 290 * irq_remove_generic_chip - Remove a chip
@@ -301,7 +305,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
301 raw_spin_unlock(&gc_lock); 305 raw_spin_unlock(&gc_lock);
302 306
303 for (; msk; msk >>= 1, i++) { 307 for (; msk; msk >>= 1, i++) {
304 if (!msk & 0x01) 308 if (!(msk & 0x01))
305 continue; 309 continue;
306 310
307 /* Remove handler first. That will mask the irq line */ 311 /* Remove handler first. That will mask the irq line */
@@ -311,6 +315,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
311 irq_modify_status(i, clr, set); 315 irq_modify_status(i, clr, set);
312 } 316 }
313} 317}
318EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
314 319
315#ifdef CONFIG_PM 320#ifdef CONFIG_PM
316static int irq_gc_suspend(void) 321static int irq_gc_suspend(void)
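
[Editor's note] Exporting these four helpers is what allows interrupt-chip code built as a module to use the generic-chip machinery at all. A rough sketch of the usual call sequence, with the device, register offset and mask values invented for illustration (the irq_gc_* callbacks are the stock helpers from this file and are assumed to be visible to the caller):

        #include <linux/irq.h>

        static int my_driver_setup_irqs(void __iomem *base, unsigned int irq_base)
        {
                struct irq_chip_generic *gc;
                struct irq_chip_type *ct;

                gc = irq_alloc_generic_chip("my-gpio", 1, irq_base, base,
                                            handle_level_irq);
                if (!gc)
                        return -ENOMEM;

                ct = gc->chip_types;
                /* in this made-up device, writing 1 to the mask register masks */
                ct->chip.irq_mask   = irq_gc_mask_set_bit;
                ct->chip.irq_unmask = irq_gc_mask_clr_bit;
                ct->regs.mask       = 0x04;            /* assumed offset */

                /* eight lines, nested lockdep class, make them requestable */
                irq_setup_generic_chip(gc, IRQ_MSK(8), IRQ_GC_INIT_NESTED_LOCK,
                                       IRQ_NOREQUEST, 0);
                return 0;
        }
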
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 6546431447d..a73dd6c7372 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -71,6 +71,8 @@ extern int irq_startup(struct irq_desc *desc);
71extern void irq_shutdown(struct irq_desc *desc); 71extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc); 72extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc); 73extern void irq_disable(struct irq_desc *desc);
74extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
75extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
74extern void mask_irq(struct irq_desc *desc); 76extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc); 77extern void unmask_irq(struct irq_desc *desc);
76 78
@@ -114,14 +116,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
114 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 116 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
115} 117}
116 118
119#define _IRQ_DESC_CHECK (1 << 0)
120#define _IRQ_DESC_PERCPU (1 << 1)
121
122#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
123#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
124
117struct irq_desc * 125struct irq_desc *
118__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); 126__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
127 unsigned int check);
119void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); 128void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
120 129
121static inline struct irq_desc * 130static inline struct irq_desc *
122irq_get_desc_buslock(unsigned int irq, unsigned long *flags) 131irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
123{ 132{
124 return __irq_get_desc_lock(irq, flags, true); 133 return __irq_get_desc_lock(irq, flags, true, check);
125} 134}
126 135
127static inline void 136static inline void
@@ -131,9 +140,9 @@ irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
131} 140}
132 141
133static inline struct irq_desc * 142static inline struct irq_desc *
134irq_get_desc_lock(unsigned int irq, unsigned long *flags) 143irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
135{ 144{
136 return __irq_get_desc_lock(irq, flags, false); 145 return __irq_get_desc_lock(irq, flags, false, check);
137} 146}
138 147
139static inline void 148static inline void
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4c60a50e66b..d86e254b95e 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -9,7 +9,7 @@
9 */ 9 */
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h> 15#include <linux/radix-tree.h>
@@ -70,7 +70,8 @@ static inline void desc_smp_init(struct irq_desc *desc, int node) { }
70static inline int desc_node(struct irq_desc *desc) { return 0; } 70static inline int desc_node(struct irq_desc *desc) { return 0; }
71#endif 71#endif
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
74 struct module *owner)
74{ 75{
75 int cpu; 76 int cpu;
76 77
@@ -86,6 +87,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
86 desc->irq_count = 0; 87 desc->irq_count = 0;
87 desc->irqs_unhandled = 0; 88 desc->irqs_unhandled = 0;
88 desc->name = NULL; 89 desc->name = NULL;
90 desc->owner = owner;
89 for_each_possible_cpu(cpu) 91 for_each_possible_cpu(cpu)
90 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; 92 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
91 desc_smp_init(desc, node); 93 desc_smp_init(desc, node);
@@ -128,7 +130,7 @@ static void free_masks(struct irq_desc *desc)
128static inline void free_masks(struct irq_desc *desc) { } 130static inline void free_masks(struct irq_desc *desc) { }
129#endif 131#endif
130 132
131static struct irq_desc *alloc_desc(int irq, int node) 133static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
132{ 134{
133 struct irq_desc *desc; 135 struct irq_desc *desc;
134 gfp_t gfp = GFP_KERNEL; 136 gfp_t gfp = GFP_KERNEL;
@@ -147,7 +149,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
147 raw_spin_lock_init(&desc->lock); 149 raw_spin_lock_init(&desc->lock);
148 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 150 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
149 151
150 desc_set_defaults(irq, desc, node); 152 desc_set_defaults(irq, desc, node, owner);
151 153
152 return desc; 154 return desc;
153 155
@@ -173,13 +175,14 @@ static void free_desc(unsigned int irq)
173 kfree(desc); 175 kfree(desc);
174} 176}
175 177
176static int alloc_descs(unsigned int start, unsigned int cnt, int node) 178static int alloc_descs(unsigned int start, unsigned int cnt, int node,
179 struct module *owner)
177{ 180{
178 struct irq_desc *desc; 181 struct irq_desc *desc;
179 int i; 182 int i;
180 183
181 for (i = 0; i < cnt; i++) { 184 for (i = 0; i < cnt; i++) {
182 desc = alloc_desc(start + i, node); 185 desc = alloc_desc(start + i, node, owner);
183 if (!desc) 186 if (!desc)
184 goto err; 187 goto err;
185 mutex_lock(&sparse_irq_lock); 188 mutex_lock(&sparse_irq_lock);
@@ -227,7 +230,7 @@ int __init early_irq_init(void)
227 nr_irqs = initcnt; 230 nr_irqs = initcnt;
228 231
229 for (i = 0; i < initcnt; i++) { 232 for (i = 0; i < initcnt; i++) {
230 desc = alloc_desc(i, node); 233 desc = alloc_desc(i, node, NULL);
231 set_bit(i, allocated_irqs); 234 set_bit(i, allocated_irqs);
232 irq_insert_desc(i, desc); 235 irq_insert_desc(i, desc);
233 } 236 }
@@ -261,7 +264,7 @@ int __init early_irq_init(void)
261 alloc_masks(&desc[i], GFP_KERNEL, node); 264 alloc_masks(&desc[i], GFP_KERNEL, node);
262 raw_spin_lock_init(&desc[i].lock); 265 raw_spin_lock_init(&desc[i].lock);
263 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 266 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
264 desc_set_defaults(i, &desc[i], node); 267 desc_set_defaults(i, &desc[i], node, NULL);
265 } 268 }
266 return arch_early_irq_init(); 269 return arch_early_irq_init();
267} 270}
@@ -276,8 +279,16 @@ static void free_desc(unsigned int irq)
276 dynamic_irq_cleanup(irq); 279 dynamic_irq_cleanup(irq);
277} 280}
278 281
279static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 282static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
283 struct module *owner)
280{ 284{
285 u32 i;
286
287 for (i = 0; i < cnt; i++) {
288 struct irq_desc *desc = irq_to_desc(start + i);
289
290 desc->owner = owner;
291 }
281 return start; 292 return start;
282} 293}
283 294
@@ -333,11 +344,13 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
333 * @from: Start the search from this irq number 344 * @from: Start the search from this irq number
334 * @cnt: Number of consecutive irqs to allocate. 345 * @cnt: Number of consecutive irqs to allocate.
335 * @node: Preferred node on which the irq descriptor should be allocated 346 * @node: Preferred node on which the irq descriptor should be allocated
347 * @owner: Owning module (can be NULL)
336 * 348 *
337 * Returns the first irq number or error code 349 * Returns the first irq number or error code
338 */ 350 */
339int __ref 351int __ref
340irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) 352__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
353 struct module *owner)
341{ 354{
342 int start, ret; 355 int start, ret;
343 356
@@ -366,13 +379,13 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
366 379
367 bitmap_set(allocated_irqs, start, cnt); 380 bitmap_set(allocated_irqs, start, cnt);
368 mutex_unlock(&sparse_irq_lock); 381 mutex_unlock(&sparse_irq_lock);
369 return alloc_descs(start, cnt, node); 382 return alloc_descs(start, cnt, node, owner);
370 383
371err: 384err:
372 mutex_unlock(&sparse_irq_lock); 385 mutex_unlock(&sparse_irq_lock);
373 return ret; 386 return ret;
374} 387}
375EXPORT_SYMBOL_GPL(irq_alloc_descs); 388EXPORT_SYMBOL_GPL(__irq_alloc_descs);
376 389
377/** 390/**
378 * irq_reserve_irqs - mark irqs allocated 391 * irq_reserve_irqs - mark irqs allocated
@@ -411,11 +424,22 @@ unsigned int irq_get_next_irq(unsigned int offset)
411} 424}
412 425
413struct irq_desc * 426struct irq_desc *
414__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) 427__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
428 unsigned int check)
415{ 429{
416 struct irq_desc *desc = irq_to_desc(irq); 430 struct irq_desc *desc = irq_to_desc(irq);
417 431
418 if (desc) { 432 if (desc) {
433 if (check & _IRQ_DESC_CHECK) {
434 if ((check & _IRQ_DESC_PERCPU) &&
435 !irq_settings_is_per_cpu_devid(desc))
436 return NULL;
437
438 if (!(check & _IRQ_DESC_PERCPU) &&
439 irq_settings_is_per_cpu_devid(desc))
440 return NULL;
441 }
442
419 if (bus) 443 if (bus)
420 chip_bus_lock(desc); 444 chip_bus_lock(desc);
421 raw_spin_lock_irqsave(&desc->lock, *flags); 445 raw_spin_lock_irqsave(&desc->lock, *flags);
@@ -430,6 +454,25 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
430 chip_bus_sync_unlock(desc); 454 chip_bus_sync_unlock(desc);
431} 455}
432 456
457int irq_set_percpu_devid(unsigned int irq)
458{
459 struct irq_desc *desc = irq_to_desc(irq);
460
461 if (!desc)
462 return -EINVAL;
463
464 if (desc->percpu_enabled)
465 return -EINVAL;
466
467 desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
468
469 if (!desc->percpu_enabled)
470 return -ENOMEM;
471
472 irq_set_percpu_devid_flags(irq);
473 return 0;
474}
475
433/** 476/**
434 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 477 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
435 * @irq: irq number to initialize 478 * @irq: irq number to initialize
@@ -440,7 +483,7 @@ void dynamic_irq_cleanup(unsigned int irq)
440 unsigned long flags; 483 unsigned long flags;
441 484
442 raw_spin_lock_irqsave(&desc->lock, flags); 485 raw_spin_lock_irqsave(&desc->lock, flags);
443 desc_set_defaults(irq, desc, desc_node(desc)); 486 desc_set_defaults(irq, desc, desc_node(desc), NULL);
444 raw_spin_unlock_irqrestore(&desc->lock, flags); 487 raw_spin_unlock_irqrestore(&desc->lock, flags);
445} 488}
446 489
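
[Editor's note] Two driver-visible pieces fall out of this file. First, the owner argument: together with the matching header change (not shown here), which is assumed to make irq_alloc_descs() pass THIS_MODULE into __irq_alloc_descs(), a loadable interrupt controller is pinned for as long as any of its descriptors has a handler installed. Second, irq_set_percpu_devid() is what controller code calls on banked per-CPU lines before they can be claimed with the per-CPU request API. A hedged sketch with invented names:

        #include <linux/irq.h>

        static int __init my_intc_init(int node)
        {
                /* 32 descriptors, search from 16; owner = THIS_MODULE (assumed) */
                int base = irq_alloc_descs(-1, 16, 32, node);

                if (base < 0)
                        return base;

                /* e.g. the banked per-CPU timer line of this controller */
                irq_set_percpu_devid(base);
                return base;
        }
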
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d5828da3fd3..200ce832c58 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -20,16 +20,20 @@ static DEFINE_MUTEX(irq_domain_mutex);
20void irq_domain_add(struct irq_domain *domain) 20void irq_domain_add(struct irq_domain *domain)
21{ 21{
22 struct irq_data *d; 22 struct irq_data *d;
23 int hwirq; 23 int hwirq, irq;
24 24
25 /* 25 /*
26 * This assumes that the irq_domain owner has already allocated 26 * This assumes that the irq_domain owner has already allocated
27 * the irq_descs. This block will be removed when support for dynamic 27 * the irq_descs. This block will be removed when support for dynamic
28 * allocation of irq_descs is added to irq_domain. 28 * allocation of irq_descs is added to irq_domain.
29 */ 29 */
30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { 30 irq_domain_for_each_irq(domain, hwirq, irq) {
31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); 31 d = irq_get_irq_data(irq);
32 if (d || d->domain) { 32 if (!d) {
33 WARN(1, "error: assigning domain to non existant irq_desc");
34 return;
35 }
36 if (d->domain) {
33 /* things are broken; just report, don't clean up */ 37 /* things are broken; just report, don't clean up */
34 WARN(1, "error: irq_desc already assigned to a domain"); 38 WARN(1, "error: irq_desc already assigned to a domain");
35 return; 39 return;
@@ -50,15 +54,15 @@ void irq_domain_add(struct irq_domain *domain)
50void irq_domain_del(struct irq_domain *domain) 54void irq_domain_del(struct irq_domain *domain)
51{ 55{
52 struct irq_data *d; 56 struct irq_data *d;
53 int hwirq; 57 int hwirq, irq;
54 58
55 mutex_lock(&irq_domain_mutex); 59 mutex_lock(&irq_domain_mutex);
56 list_del(&domain->list); 60 list_del(&domain->list);
57 mutex_unlock(&irq_domain_mutex); 61 mutex_unlock(&irq_domain_mutex);
58 62
59 /* Clear the irq_domain assignments */ 63 /* Clear the irq_domain assignments */
60 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { 64 irq_domain_for_each_irq(domain, hwirq, irq) {
61 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); 65 d = irq_get_irq_data(irq);
62 d->domain = NULL; 66 d->domain = NULL;
63 } 67 }
64} 68}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0a7840aeb0f..1da999f5e74 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -195,7 +195,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
196{ 196{
197 unsigned long flags; 197 unsigned long flags;
198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
199 199
200 if (!desc) 200 if (!desc)
201 return -EINVAL; 201 return -EINVAL;
@@ -356,7 +356,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
356static int __disable_irq_nosync(unsigned int irq) 356static int __disable_irq_nosync(unsigned int irq)
357{ 357{
358 unsigned long flags; 358 unsigned long flags;
359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
360 360
361 if (!desc) 361 if (!desc)
362 return -EINVAL; 362 return -EINVAL;
@@ -448,7 +448,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
448void enable_irq(unsigned int irq) 448void enable_irq(unsigned int irq)
449{ 449{
450 unsigned long flags; 450 unsigned long flags;
451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
452 452
453 if (!desc) 453 if (!desc)
454 return; 454 return;
@@ -467,6 +467,9 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
467 struct irq_desc *desc = irq_to_desc(irq); 467 struct irq_desc *desc = irq_to_desc(irq);
468 int ret = -ENXIO; 468 int ret = -ENXIO;
469 469
470 if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE)
471 return 0;
472
470 if (desc->irq_data.chip->irq_set_wake) 473 if (desc->irq_data.chip->irq_set_wake)
471 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); 474 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
472 475
@@ -488,7 +491,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
488int irq_set_irq_wake(unsigned int irq, unsigned int on) 491int irq_set_irq_wake(unsigned int irq, unsigned int on)
489{ 492{
490 unsigned long flags; 493 unsigned long flags;
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 494 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
492 int ret = 0; 495 int ret = 0;
493 496
494 if (!desc) 497 if (!desc)
@@ -529,7 +532,7 @@ EXPORT_SYMBOL(irq_set_irq_wake);
529int can_request_irq(unsigned int irq, unsigned long irqflags) 532int can_request_irq(unsigned int irq, unsigned long irqflags)
530{ 533{
531 unsigned long flags; 534 unsigned long flags;
532 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 535 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
533 int canrequest = 0; 536 int canrequest = 0;
534 537
535 if (!desc) 538 if (!desc)
@@ -620,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
620 623
621static int irq_wait_for_interrupt(struct irqaction *action) 624static int irq_wait_for_interrupt(struct irqaction *action)
622{ 625{
626 set_current_state(TASK_INTERRUPTIBLE);
627
623 while (!kthread_should_stop()) { 628 while (!kthread_should_stop()) {
624 set_current_state(TASK_INTERRUPTIBLE);
625 629
626 if (test_and_clear_bit(IRQTF_RUNTHREAD, 630 if (test_and_clear_bit(IRQTF_RUNTHREAD,
627 &action->thread_flags)) { 631 &action->thread_flags)) {
@@ -629,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action)
629 return 0; 633 return 0;
630 } 634 }
631 schedule(); 635 schedule();
636 set_current_state(TASK_INTERRUPTIBLE);
632 } 637 }
638 __set_current_state(TASK_RUNNING);
633 return -1; 639 return -1;
634} 640}
635 641
@@ -883,6 +889,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
883 889
884 if (desc->irq_data.chip == &no_irq_chip) 890 if (desc->irq_data.chip == &no_irq_chip)
885 return -ENOSYS; 891 return -ENOSYS;
892 if (!try_module_get(desc->owner))
893 return -ENODEV;
886 /* 894 /*
887 * Some drivers like serial.c use request_irq() heavily, 895 * Some drivers like serial.c use request_irq() heavily,
888 * so we have to be careful not to interfere with a 896 * so we have to be careful not to interfere with a
@@ -906,8 +914,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
906 */ 914 */
907 nested = irq_settings_is_nested_thread(desc); 915 nested = irq_settings_is_nested_thread(desc);
908 if (nested) { 916 if (nested) {
909 if (!new->thread_fn) 917 if (!new->thread_fn) {
910 return -EINVAL; 918 ret = -EINVAL;
919 goto out_mput;
920 }
911 /* 921 /*
912 * Replace the primary handler which was provided from 922 * Replace the primary handler which was provided from
913 * the driver for non nested interrupt handling by the 923 * the driver for non nested interrupt handling by the
@@ -929,8 +939,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
929 939
930 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 940 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
931 new->name); 941 new->name);
932 if (IS_ERR(t)) 942 if (IS_ERR(t)) {
933 return PTR_ERR(t); 943 ret = PTR_ERR(t);
944 goto out_mput;
945 }
934 /* 946 /*
935 * We keep the reference to the task struct even if 947 * We keep the reference to the task struct even if
936 * the thread dies to avoid that the interrupt code 948 * the thread dies to avoid that the interrupt code
@@ -1095,6 +1107,8 @@ out_thread:
1095 kthread_stop(t); 1107 kthread_stop(t);
1096 put_task_struct(t); 1108 put_task_struct(t);
1097 } 1109 }
1110out_mput:
1111 module_put(desc->owner);
1098 return ret; 1112 return ret;
1099} 1113}
1100 1114
@@ -1110,6 +1124,8 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1110 int retval; 1124 int retval;
1111 struct irq_desc *desc = irq_to_desc(irq); 1125 struct irq_desc *desc = irq_to_desc(irq);
1112 1126
1127 if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1128 return -EINVAL;
1113 chip_bus_lock(desc); 1129 chip_bus_lock(desc);
1114 retval = __setup_irq(irq, desc, act); 1130 retval = __setup_irq(irq, desc, act);
1115 chip_bus_sync_unlock(desc); 1131 chip_bus_sync_unlock(desc);
@@ -1118,7 +1134,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1118} 1134}
1119EXPORT_SYMBOL_GPL(setup_irq); 1135EXPORT_SYMBOL_GPL(setup_irq);
1120 1136
1121 /* 1137/*
1122 * Internal function to unregister an irqaction - used to free 1138 * Internal function to unregister an irqaction - used to free
1123 * regular and special interrupts that are part of the architecture. 1139 * regular and special interrupts that are part of the architecture.
1124 */ 1140 */
@@ -1203,6 +1219,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1203 put_task_struct(action->thread); 1219 put_task_struct(action->thread);
1204 } 1220 }
1205 1221
1222 module_put(desc->owner);
1206 return action; 1223 return action;
1207} 1224}
1208 1225
@@ -1215,7 +1232,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1215 */ 1232 */
1216void remove_irq(unsigned int irq, struct irqaction *act) 1233void remove_irq(unsigned int irq, struct irqaction *act)
1217{ 1234{
1218 __free_irq(irq, act->dev_id); 1235 struct irq_desc *desc = irq_to_desc(irq);
1236
1237 if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1238 __free_irq(irq, act->dev_id);
1219} 1239}
1220EXPORT_SYMBOL_GPL(remove_irq); 1240EXPORT_SYMBOL_GPL(remove_irq);
1221 1241
@@ -1237,7 +1257,7 @@ void free_irq(unsigned int irq, void *dev_id)
1237{ 1257{
1238 struct irq_desc *desc = irq_to_desc(irq); 1258 struct irq_desc *desc = irq_to_desc(irq);
1239 1259
1240 if (!desc) 1260 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1241 return; 1261 return;
1242 1262
1243#ifdef CONFIG_SMP 1263#ifdef CONFIG_SMP
@@ -1315,7 +1335,8 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1315 if (!desc) 1335 if (!desc)
1316 return -EINVAL; 1336 return -EINVAL;
1317 1337
1318 if (!irq_settings_can_request(desc)) 1338 if (!irq_settings_can_request(desc) ||
1339 WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1319 return -EINVAL; 1340 return -EINVAL;
1320 1341
1321 if (!handler) { 1342 if (!handler) {
@@ -1400,3 +1421,194 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1400 return !ret ? IRQC_IS_HARDIRQ : ret; 1421 return !ret ? IRQC_IS_HARDIRQ : ret;
1401} 1422}
1402EXPORT_SYMBOL_GPL(request_any_context_irq); 1423EXPORT_SYMBOL_GPL(request_any_context_irq);
1424
1425void enable_percpu_irq(unsigned int irq, unsigned int type)
1426{
1427 unsigned int cpu = smp_processor_id();
1428 unsigned long flags;
1429 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1430
1431 if (!desc)
1432 return;
1433
1434 type &= IRQ_TYPE_SENSE_MASK;
1435 if (type != IRQ_TYPE_NONE) {
1436 int ret;
1437
1438 ret = __irq_set_trigger(desc, irq, type);
1439
1440 if (ret) {
1441 WARN(1, "failed to set type for IRQ%d\n", irq);
1442 goto out;
1443 }
1444 }
1445
1446 irq_percpu_enable(desc, cpu);
1447out:
1448 irq_put_desc_unlock(desc, flags);
1449}
1450
1451void disable_percpu_irq(unsigned int irq)
1452{
1453 unsigned int cpu = smp_processor_id();
1454 unsigned long flags;
1455 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1456
1457 if (!desc)
1458 return;
1459
1460 irq_percpu_disable(desc, cpu);
1461 irq_put_desc_unlock(desc, flags);
1462}
1463
1464/*
1465 * Internal function to unregister a percpu irqaction.
1466 */
1467static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1468{
1469 struct irq_desc *desc = irq_to_desc(irq);
1470 struct irqaction *action;
1471 unsigned long flags;
1472
1473 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
1474
1475 if (!desc)
1476 return NULL;
1477
1478 raw_spin_lock_irqsave(&desc->lock, flags);
1479
1480 action = desc->action;
1481 if (!action || action->percpu_dev_id != dev_id) {
1482 WARN(1, "Trying to free already-free IRQ %d\n", irq);
1483 goto bad;
1484 }
1485
1486 if (!cpumask_empty(desc->percpu_enabled)) {
1487 WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
1488 irq, cpumask_first(desc->percpu_enabled));
1489 goto bad;
1490 }
1491
1492 /* Found it - now remove it from the list of entries: */
1493 desc->action = NULL;
1494
1495 raw_spin_unlock_irqrestore(&desc->lock, flags);
1496
1497 unregister_handler_proc(irq, action);
1498
1499 module_put(desc->owner);
1500 return action;
1501
1502bad:
1503 raw_spin_unlock_irqrestore(&desc->lock, flags);
1504 return NULL;
1505}
1506
1507/**
1508 * remove_percpu_irq - free a per-cpu interrupt
1509 * @irq: Interrupt line to free
1510 * @act: irqaction for the interrupt
1511 *
1512 * Used to remove interrupts statically setup by the early boot process.
1513 */
1514void remove_percpu_irq(unsigned int irq, struct irqaction *act)
1515{
1516 struct irq_desc *desc = irq_to_desc(irq);
1517
1518 if (desc && irq_settings_is_per_cpu_devid(desc))
1519 __free_percpu_irq(irq, act->percpu_dev_id);
1520}
1521
1522/**
1523 * free_percpu_irq - free an interrupt allocated with request_percpu_irq
1524 * @irq: Interrupt line to free
1525 * @dev_id: Device identity to free
1526 *
1527 * Remove a percpu interrupt handler. The handler is removed, but
1528 * the interrupt line is not disabled. This must be done on each
1529 * CPU before calling this function. The function does not return
1530 * until any executing interrupts for this IRQ have completed.
1531 *
1532 * This function must not be called from interrupt context.
1533 */
1534void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1535{
1536 struct irq_desc *desc = irq_to_desc(irq);
1537
1538 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1539 return;
1540
1541 chip_bus_lock(desc);
1542 kfree(__free_percpu_irq(irq, dev_id));
1543 chip_bus_sync_unlock(desc);
1544}
1545
1546/**
1547 * setup_percpu_irq - setup a per-cpu interrupt
1548 * @irq: Interrupt line to setup
1549 * @act: irqaction for the interrupt
1550 *
1551 * Used to statically setup per-cpu interrupts in the early boot process.
1552 */
1553int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1554{
1555 struct irq_desc *desc = irq_to_desc(irq);
1556 int retval;
1557
1558 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1559 return -EINVAL;
1560 chip_bus_lock(desc);
1561 retval = __setup_irq(irq, desc, act);
1562 chip_bus_sync_unlock(desc);
1563
1564 return retval;
1565}
1566
1567/**
1568 * request_percpu_irq - allocate a percpu interrupt line
1569 * @irq: Interrupt line to allocate
1570 * @handler: Function to be called when the IRQ occurs.
1571 * @devname: An ascii name for the claiming device
1572 * @dev_id: A percpu cookie passed back to the handler function
1573 *
1574 * This call allocates interrupt resources, but doesn't
1575 * automatically enable the interrupt. It has to be done on each
1576 * CPU using enable_percpu_irq().
1577 *
1578 * Dev_id must be globally unique. It is a per-cpu variable, and
1579 * the handler gets called with the interrupted CPU's instance of
1580 * that variable.
1581 */
1582int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1583 const char *devname, void __percpu *dev_id)
1584{
1585 struct irqaction *action;
1586 struct irq_desc *desc;
1587 int retval;
1588
1589 if (!dev_id)
1590 return -EINVAL;
1591
1592 desc = irq_to_desc(irq);
1593 if (!desc || !irq_settings_can_request(desc) ||
1594 !irq_settings_is_per_cpu_devid(desc))
1595 return -EINVAL;
1596
1597 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1598 if (!action)
1599 return -ENOMEM;
1600
1601 action->handler = handler;
1602 action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
1603 action->name = devname;
1604 action->percpu_dev_id = dev_id;
1605
1606 chip_bus_lock(desc);
1607 retval = __setup_irq(irq, desc, action);
1608 chip_bus_sync_unlock(desc);
1609
1610 if (retval)
1611 kfree(action);
1612
1613 return retval;
1614}
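
[Editor's note] The kerneldoc above states the contract; for orientation, this is roughly how a driver for a banked per-CPU device (an ARM local timer, say) would use the new API end to end. Every my_* name below is invented for the sketch:

        #include <linux/interrupt.h>
        #include <linux/percpu.h>

        struct my_timer { void __iomem *base; };      /* illustrative only */
        static DEFINE_PER_CPU(struct my_timer, my_timers);

        static irqreturn_t my_timer_isr(int irq, void *dev_id)
        {
                struct my_timer *t = dev_id;   /* this CPU's instance of my_timers */

                /* ... ack and reprogram the hardware via t->base ... */
                return IRQ_HANDLED;
        }

        static int __init my_timer_register(unsigned int irq)
        {
                int err;

                /* one registration covers all CPUs; the line stays masked ... */
                err = request_percpu_irq(irq, my_timer_isr, "my-timer", &my_timers);
                if (err)
                        return err;

                /* ... until each CPU unmasks its own copy, typically from the
                 * secondary-CPU bringup path or a hotplug notifier on that CPU */
                enable_percpu_irq(irq, IRQ_TYPE_NONE);
                return 0;
        }

The matching teardown is disable_percpu_irq() on every CPU followed by a single free_percpu_irq(); __free_percpu_irq() above warns if any CPU still has the line enabled.
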
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f76fc00c987..15e53b1766a 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,6 +9,7 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/syscore_ops.h>
12 13
13#include "internals.h" 14#include "internals.h"
14 15
@@ -39,25 +40,58 @@ void suspend_device_irqs(void)
39} 40}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 41EXPORT_SYMBOL_GPL(suspend_device_irqs);
41 42
42/** 43static void resume_irqs(bool want_early)
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQS_SUSPENDED flag set.
47 */
48void resume_device_irqs(void)
49{ 44{
50 struct irq_desc *desc; 45 struct irq_desc *desc;
51 int irq; 46 int irq;
52 47
53 for_each_irq_desc(irq, desc) { 48 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 49 unsigned long flags;
50 bool is_early = desc->action &&
51 desc->action->flags & IRQF_EARLY_RESUME;
52
53 if (is_early != want_early)
54 continue;
55 55
56 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
57 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
58 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
59 } 59 }
60} 60}
61
62/**
63 * irq_pm_syscore_ops - enable interrupt lines early
64 *
65 * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
66 */
67static void irq_pm_syscore_resume(void)
68{
69 resume_irqs(true);
70}
71
72static struct syscore_ops irq_pm_syscore_ops = {
73 .resume = irq_pm_syscore_resume,
74};
75
76static int __init irq_pm_init_ops(void)
77{
78 register_syscore_ops(&irq_pm_syscore_ops);
79 return 0;
80}
81
82device_initcall(irq_pm_init_ops);
83
84/**
85 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
86 *
87 * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
88 * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
89 * set as well as those with %IRQF_FORCE_RESUME.
90 */
91void resume_device_irqs(void)
92{
93 resume_irqs(false);
94}
61EXPORT_SYMBOL_GPL(resume_device_irqs); 95EXPORT_SYMBOL_GPL(resume_device_irqs);
62 96
63/** 97/**
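
[Editor's note] The driver-visible knob behind this split is the IRQF_EARLY_RESUME action flag tested in resume_irqs(). A driver whose interrupt must be live before ordinary device resume runs would request it roughly like this (names are placeholders):

        #include <linux/interrupt.h>

        static irqreturn_t my_evt_handler(int irq, void *dev_id)
        {
                /* can fire again as soon as syscore resume re-enables the line */
                return IRQ_HANDLED;
        }

        static int my_setup(unsigned int irq)
        {
                /* suspended normally, but re-enabled from irq_pm_syscore_resume()
                 * rather than from resume_device_irqs() */
                return request_irq(irq, my_evt_handler, IRQF_EARLY_RESUME,
                                   "my-early-irq", NULL);
        }
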
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index f1667833d44..1162f1030f1 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -13,6 +13,7 @@ enum {
13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, 13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, 15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
16 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 17 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17}; 18};
18 19
@@ -24,6 +25,7 @@ enum {
24#define IRQ_NOTHREAD GOT_YOU_MORON 25#define IRQ_NOTHREAD GOT_YOU_MORON
25#define IRQ_NOAUTOEN GOT_YOU_MORON 26#define IRQ_NOAUTOEN GOT_YOU_MORON
26#define IRQ_NESTED_THREAD GOT_YOU_MORON 27#define IRQ_NESTED_THREAD GOT_YOU_MORON
28#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
27#undef IRQF_MODIFY_MASK 29#undef IRQF_MODIFY_MASK
28#define IRQF_MODIFY_MASK GOT_YOU_MORON 30#define IRQF_MODIFY_MASK GOT_YOU_MORON
29 31
@@ -39,6 +41,11 @@ static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
39 return desc->status_use_accessors & _IRQ_PER_CPU; 41 return desc->status_use_accessors & _IRQ_PER_CPU;
40} 42}
41 43
44static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
45{
46 return desc->status_use_accessors & _IRQ_PER_CPU_DEVID;
47}
48
42static inline void irq_settings_set_per_cpu(struct irq_desc *desc) 49static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
43{ 50{
44 desc->status_use_accessors |= _IRQ_PER_CPU; 51 desc->status_use_accessors |= _IRQ_PER_CPU;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index aa57d5da18c..dc813a948be 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next) 87 (action->flags & __IRQF_TIMER) ||
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
88 goto out; 90 goto out;
89 91
90 /* Already running on another processor */ 92 /* Already running on another processor */
@@ -115,7 +117,7 @@ static int misrouted_irq(int irq)
115 struct irq_desc *desc; 117 struct irq_desc *desc;
116 int i, ok = 0; 118 int i, ok = 0;
117 119
118 if (atomic_inc_return(&irq_poll_active) == 1) 120 if (atomic_inc_return(&irq_poll_active) != 1)
119 goto out; 121 goto out;
120 122
121 irq_poll_cpu = smp_processor_id(); 123 irq_poll_cpu = smp_processor_id();
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c58fa7da8ae..c3c46c72046 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -6,9 +6,11 @@
6 */ 6 */
7 7
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/irq_work.h> 10#include <linux/irq_work.h>
11#include <linux/percpu.h>
11#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <asm/processor.h>
12 14
13/* 15/*
14 * An entry can be in one of four states: 16 * An entry can be in one of four states:
@@ -17,54 +19,34 @@
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued 19 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback 20 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 21 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */ 22 */
24 23
25#define IRQ_WORK_PENDING 1UL 24#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL 25#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL 26#define IRQ_WORK_FLAGS 3UL
28 27
29static inline bool irq_work_is_set(struct irq_work *entry, int flags) 28static DEFINE_PER_CPU(struct llist_head, irq_work_list);
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49 29
50/* 30/*
51 * Claim the entry so that no one else will poke at it. 31 * Claim the entry so that no one else will poke at it.
52 */ 32 */
53static bool irq_work_claim(struct irq_work *entry) 33static bool irq_work_claim(struct irq_work *work)
54{ 34{
55 struct irq_work *next, *nflags; 35 unsigned long flags, nflags;
56 36
57 do { 37 for (;;) {
58 next = entry->next; 38 flags = work->flags;
59 if ((unsigned long)next & IRQ_WORK_PENDING) 39 if (flags & IRQ_WORK_PENDING)
60 return false; 40 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS); 41 nflags = flags | IRQ_WORK_FLAGS;
62 } while (cmpxchg(&entry->next, next, nflags) != next); 42 if (cmpxchg(&work->flags, flags, nflags) == flags)
43 break;
44 cpu_relax();
45 }
63 46
64 return true; 47 return true;
65} 48}
66 49
67
68void __weak arch_irq_work_raise(void) 50void __weak arch_irq_work_raise(void)
69{ 51{
70 /* 52 /*
@@ -75,20 +57,15 @@ void __weak arch_irq_work_raise(void)
75/* 57/*
76 * Queue the entry and raise the IPI if needed. 58 * Queue the entry and raise the IPI if needed.
77 */ 59 */
78static void __irq_work_queue(struct irq_work *entry) 60static void __irq_work_queue(struct irq_work *work)
79{ 61{
80 struct irq_work *next; 62 bool empty;
81 63
82 preempt_disable(); 64 preempt_disable();
83 65
84 do { 66 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */ 67 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 68 if (empty)
92 arch_irq_work_raise(); 69 arch_irq_work_raise();
93 70
94 preempt_enable(); 71 preempt_enable();
@@ -100,16 +77,16 @@ static void __irq_work_queue(struct irq_work *entry)
100 * 77 *
101 * Can be re-enqueued while the callback is still in progress. 78 * Can be re-enqueued while the callback is still in progress.
102 */ 79 */
103bool irq_work_queue(struct irq_work *entry) 80bool irq_work_queue(struct irq_work *work)
104{ 81{
105 if (!irq_work_claim(entry)) { 82 if (!irq_work_claim(work)) {
106 /* 83 /*
107 * Already enqueued, can't do! 84 * Already enqueued, can't do!
108 */ 85 */
109 return false; 86 return false;
110 } 87 }
111 88
112 __irq_work_queue(entry); 89 __irq_work_queue(work);
113 return true; 90 return true;
114} 91}
115EXPORT_SYMBOL_GPL(irq_work_queue); 92EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +97,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 97 */
121void irq_work_run(void) 98void irq_work_run(void)
122{ 99{
123 struct irq_work *list; 100 struct irq_work *work;
101 struct llist_head *this_list;
102 struct llist_node *llnode;
124 103
125 if (this_cpu_read(irq_work_list) == NULL) 104 this_list = &__get_cpu_var(irq_work_list);
105 if (llist_empty(this_list))
126 return; 106 return;
127 107
128 BUG_ON(!in_irq()); 108 BUG_ON(!in_irq());
129 BUG_ON(!irqs_disabled()); 109 BUG_ON(!irqs_disabled());
130 110
131 list = this_cpu_xchg(irq_work_list, NULL); 111 llnode = llist_del_all(this_list);
132 112 while (llnode != NULL) {
133 while (list != NULL) { 113 work = llist_entry(llnode, struct irq_work, llnode);
134 struct irq_work *entry = list;
135 114
136 list = irq_work_next(list); 115 llnode = llist_next(llnode);
137 116
138 /* 117 /*
139 * Clear the PENDING bit, after this point the @entry 118 * Clear the PENDING bit, after this point the @work
140 * can be re-used. 119 * can be re-used.
141 */ 120 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY); 121 work->flags = IRQ_WORK_BUSY;
143 entry->func(entry); 122 work->func(work);
144 /* 123 /*
145 * Clear the BUSY bit and return to the free state if 124 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 125 * no-one else claimed it meanwhile.
147 */ 126 */
148 (void)cmpxchg(&entry->next, 127 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
151 } 128 }
152} 129}
153EXPORT_SYMBOL_GPL(irq_work_run); 130EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +133,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
156 * Synchronize against the irq_work @entry, ensures the entry is not 133 * Synchronize against the irq_work @entry, ensures the entry is not
157 * currently in use. 134 * currently in use.
158 */ 135 */
159void irq_work_sync(struct irq_work *entry) 136void irq_work_sync(struct irq_work *work)
160{ 137{
161 WARN_ON_ONCE(irqs_disabled()); 138 WARN_ON_ONCE(irqs_disabled());
162 139
163 while (irq_work_is_set(entry, IRQ_WORK_BUSY)) 140 while (work->flags & IRQ_WORK_BUSY)
164 cpu_relax(); 141 cpu_relax();
165} 142}
166EXPORT_SYMBOL_GPL(irq_work_sync); 143EXPORT_SYMBOL_GPL(irq_work_sync);
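
[Editor's note] The llist conversion does not change the user-facing API. For context, a typical caller that has to defer work out of NMI or hard-irq context looks roughly like this; init_irq_work() is the header-side initializer and the my_* names are placeholders:

        #include <linux/irq_work.h>

        static void my_deferred(struct irq_work *work)
        {
                /* runs later in hard-irq context, from the self-IPI on
                 * architectures that implement arch_irq_work_raise()
                 * (otherwise from the next timer tick, per the weak default) */
        }

        static struct irq_work my_work;

        static void __init my_init(void)
        {
                init_irq_work(&my_work, my_deferred);
        }

        static void from_nmi_path(void)
        {
                /* NMI-safe after this patch: only a cmpxchg on work->flags
                 * plus an llist_add onto the per-CPU irq_work_list */
                irq_work_queue(&my_work);
        }
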
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a8ce45097f3..bbdfe2a462a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -104,6 +104,18 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
104 return 0; 104 return 0;
105} 105}
106 106
107/*
108 * Update code which is definitely not currently executing.
109 * Architectures which need heavyweight synchronization to modify
110 * running code can override this to make the non-live update case
111 * cheaper.
112 */
113void __weak arch_jump_label_transform_static(struct jump_entry *entry,
114 enum jump_label_type type)
115{
116 arch_jump_label_transform(entry, type);
117}
118
107static void __jump_label_update(struct jump_label_key *key, 119static void __jump_label_update(struct jump_label_key *key,
108 struct jump_entry *entry, 120 struct jump_entry *entry,
109 struct jump_entry *stop, int enable) 121 struct jump_entry *stop, int enable)
@@ -121,14 +133,7 @@ static void __jump_label_update(struct jump_label_key *key,
121 } 133 }
122} 134}
123 135
124/* 136void __init jump_label_init(void)
125 * Not all archs need this.
126 */
127void __weak arch_jump_label_text_poke_early(jump_label_t addr)
128{
129}
130
131static __init int jump_label_init(void)
132{ 137{
133 struct jump_entry *iter_start = __start___jump_table; 138 struct jump_entry *iter_start = __start___jump_table;
134 struct jump_entry *iter_stop = __stop___jump_table; 139 struct jump_entry *iter_stop = __stop___jump_table;
@@ -139,22 +144,22 @@ static __init int jump_label_init(void)
139 jump_label_sort_entries(iter_start, iter_stop); 144 jump_label_sort_entries(iter_start, iter_stop);
140 145
141 for (iter = iter_start; iter < iter_stop; iter++) { 146 for (iter = iter_start; iter < iter_stop; iter++) {
142 arch_jump_label_text_poke_early(iter->code); 147 struct jump_label_key *iterk;
143 if (iter->key == (jump_label_t)(unsigned long)key) 148
149 iterk = (struct jump_label_key *)(unsigned long)iter->key;
150 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
151 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
152 if (iterk == key)
144 continue; 153 continue;
145 154
146 key = (struct jump_label_key *)(unsigned long)iter->key; 155 key = iterk;
147 atomic_set(&key->enabled, 0);
148 key->entries = iter; 156 key->entries = iter;
149#ifdef CONFIG_MODULES 157#ifdef CONFIG_MODULES
150 key->next = NULL; 158 key->next = NULL;
151#endif 159#endif
152 } 160 }
153 jump_label_unlock(); 161 jump_label_unlock();
154
155 return 0;
156} 162}
157early_initcall(jump_label_init);
158 163
159#ifdef CONFIG_MODULES 164#ifdef CONFIG_MODULES
160 165
@@ -212,7 +217,7 @@ void jump_label_apply_nops(struct module *mod)
212 return; 217 return;
213 218
214 for (iter = iter_start; iter < iter_stop; iter++) 219 for (iter = iter_start; iter < iter_stop; iter++)
215 arch_jump_label_text_poke_early(iter->code); 220 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
216} 221}
217 222
218static int jump_label_add_module(struct module *mod) 223static int jump_label_add_module(struct module *mod)
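
[Editor's note] For orientation, the consumer side that these boot and module paths patch into place is the static_branch() interface of this kernel series. A minimal sketch of a key that starts disabled and is flipped at runtime; the feature names are invented:

        #include <linux/jump_label.h>

        static struct jump_label_key my_feature_key;   /* sites start as NOPs */

        static void hot_path(void)
        {
                if (static_branch(&my_feature_key)) {
                        /* rare work; not even branched to until enabled */
                }
        }

        static void enable_my_feature(void)
        {
                /* rewrites every site via arch_jump_label_transform() */
                jump_label_inc(&my_feature_key);
        }
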
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 296fbc84d65..dc7bc082928 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -498,7 +498,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
498 while (hole_end <= crashk_res.end) { 498 while (hole_end <= crashk_res.end) {
499 unsigned long i; 499 unsigned long i;
500 500
501 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) 501 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
502 break; 502 break;
503 if (hole_end > crashk_res.end) 503 if (hole_end > crashk_res.end)
504 break; 504 break;
@@ -999,6 +999,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
999 kimage_free(xchg(&kexec_crash_image, NULL)); 999 kimage_free(xchg(&kexec_crash_image, NULL));
1000 result = kimage_crash_alloc(&image, entry, 1000 result = kimage_crash_alloc(&image, entry,
1001 nr_segments, segments); 1001 nr_segments, segments);
1002 crash_map_reserved_pages();
1002 } 1003 }
1003 if (result) 1004 if (result)
1004 goto out; 1005 goto out;
@@ -1015,6 +1016,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
1015 goto out; 1016 goto out;
1016 } 1017 }
1017 kimage_terminate(image); 1018 kimage_terminate(image);
1019 if (flags & KEXEC_ON_CRASH)
1020 crash_unmap_reserved_pages();
1018 } 1021 }
1019 /* Install the new kernel, and Uninstall the old */ 1022 /* Install the new kernel, and Uninstall the old */
1020 image = xchg(dest_image, image); 1023 image = xchg(dest_image, image);
@@ -1026,6 +1029,18 @@ out:
1026 return result; 1029 return result;
1027} 1030}
1028 1031
1032/*
1033 * Add and remove page tables for crashkernel memory
1034 *
1035 * Provide an empty default implementation here -- architecture
1036 * code may override this
1037 */
1038void __weak crash_map_reserved_pages(void)
1039{}
1040
1041void __weak crash_unmap_reserved_pages(void)
1042{}
1043
1029#ifdef CONFIG_COMPAT 1044#ifdef CONFIG_COMPAT
1030asmlinkage long compat_sys_kexec_load(unsigned long entry, 1045asmlinkage long compat_sys_kexec_load(unsigned long entry,
1031 unsigned long nr_segments, 1046 unsigned long nr_segments,
@@ -1134,14 +1149,16 @@ int crash_shrink_memory(unsigned long new_size)
1134 goto unlock; 1149 goto unlock;
1135 } 1150 }
1136 1151
1137 start = roundup(start, PAGE_SIZE); 1152 start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
1138 end = roundup(start + new_size, PAGE_SIZE); 1153 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
1139 1154
1155 crash_map_reserved_pages();
1140 crash_free_reserved_phys_range(end, crashk_res.end); 1156 crash_free_reserved_phys_range(end, crashk_res.end);
1141 1157
1142 if ((start == end) && (crashk_res.parent != NULL)) 1158 if ((start == end) && (crashk_res.parent != NULL))
1143 release_resource(&crashk_res); 1159 release_resource(&crashk_res);
1144 crashk_res.end = end - 1; 1160 crashk_res.end = end - 1;
1161 crash_unmap_reserved_pages();
1145 1162
1146unlock: 1163unlock:
1147 mutex_unlock(&kexec_mutex); 1164 mutex_unlock(&kexec_mutex);
@@ -1380,24 +1397,23 @@ int __init parse_crashkernel(char *cmdline,
1380} 1397}
1381 1398
1382 1399
1383 1400static void update_vmcoreinfo_note(void)
1384void crash_save_vmcoreinfo(void)
1385{ 1401{
1386 u32 *buf; 1402 u32 *buf = vmcoreinfo_note;
1387 1403
1388 if (!vmcoreinfo_size) 1404 if (!vmcoreinfo_size)
1389 return; 1405 return;
1390
1391 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1392
1393 buf = (u32 *)vmcoreinfo_note;
1394
1395 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, 1406 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1396 vmcoreinfo_size); 1407 vmcoreinfo_size);
1397
1398 final_note(buf); 1408 final_note(buf);
1399} 1409}
1400 1410
1411void crash_save_vmcoreinfo(void)
1412{
1413 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1414 update_vmcoreinfo_note();
1415}
1416
1401void vmcoreinfo_append_str(const char *fmt, ...) 1417void vmcoreinfo_append_str(const char *fmt, ...)
1402{ 1418{
1403 va_list args; 1419 va_list args;
@@ -1483,6 +1499,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1483 VMCOREINFO_NUMBER(PG_swapcache); 1499 VMCOREINFO_NUMBER(PG_swapcache);
1484 1500
1485 arch_crash_save_vmcoreinfo(); 1501 arch_crash_save_vmcoreinfo();
1502 update_vmcoreinfo_note();
1486 1503
1487 return 0; 1504 return 0;
1488} 1505}
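The new weak crash_map_reserved_pages()/crash_unmap_reserved_pages() hooks let an architecture that keeps the crashkernel region unmapped bring it into the page tables only around the operations that actually touch it (loading a crash image, shrinking the reservation). A hedged sketch of such an override; arch_remap_crashkernel() and arch_unmap_crashkernel() are illustrative names, not helpers introduced by this patch:

#include <linux/ioport.h>
#include <linux/kexec.h>

/* Hypothetical architecture override: map the reserved crashkernel
 * range before generic kexec code writes to it, unmap it afterwards. */
void crash_map_reserved_pages(void)
{
	arch_remap_crashkernel(crashk_res.start, resource_size(&crashk_res));
}

void crash_unmap_reserved_pages(void)
{
	arch_unmap_crashkernel(crashk_res.start, resource_size(&crashk_res));
}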
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 01a0700e873..c744b88c44e 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/module.h> 23#include <linux/export.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/log2.h> 26#include <linux/log2.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ddc7644c130..a4bea97c75b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...)
114 atomic_inc(&kmod_concurrent); 114 atomic_inc(&kmod_concurrent);
115 if (atomic_read(&kmod_concurrent) > max_modprobes) { 115 if (atomic_read(&kmod_concurrent) > max_modprobes) {
116 /* We may be blaming an innocent here, but unlikely */ 116 /* We may be blaming an innocent here, but unlikely */
117 if (kmod_loop_msg++ < 5) 117 if (kmod_loop_msg < 5) {
118 printk(KERN_ERR 118 printk(KERN_ERR
119 "request_module: runaway loop modprobe %s\n", 119 "request_module: runaway loop modprobe %s\n",
120 module_name); 120 module_name);
121 kmod_loop_msg++;
122 }
121 atomic_dec(&kmod_concurrent); 123 atomic_dec(&kmod_concurrent);
122 return -ENOMEM; 124 return -ENOMEM;
123 } 125 }
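The kmod.c change looks cosmetic but tightens the cap: the old form, if (kmod_loop_msg++ < 5), kept incrementing the counter on every call, so it could eventually wrap and re-enable the message. Moving the increment inside the branch keeps the warning at five prints for the lifetime of the system. The same idiom in isolation, plain C for illustration only:

#include <stdio.h>

/* Capped-warning idiom: increment only while below the limit, so the
 * counter can never overflow and resurrect the message. */
static int loop_msg;

static void warn_runaway(const char *name)
{
	if (loop_msg < 5) {
		fprintf(stderr,
			"request_module: runaway loop modprobe %s\n", name);
		loop_msg++;
	}
}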
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b30fd54eb98..e5d84644823 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -36,7 +36,7 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/stddef.h> 38#include <linux/stddef.h>
39#include <linux/module.h> 39#include <linux/export.h>
40#include <linux/moduleloader.h> 40#include <linux/moduleloader.h>
41#include <linux/kallsyms.h> 41#include <linux/kallsyms.h>
42#include <linux/freezer.h> 42#include <linux/freezer.h>
@@ -78,10 +78,10 @@ static bool kprobes_all_disarmed;
78static DEFINE_MUTEX(kprobe_mutex); 78static DEFINE_MUTEX(kprobe_mutex);
79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
80static struct { 80static struct {
81 spinlock_t lock ____cacheline_aligned_in_smp; 81 raw_spinlock_t lock ____cacheline_aligned_in_smp;
82} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 82} kretprobe_table_locks[KPROBE_TABLE_SIZE];
83 83
84static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 84static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
85{ 85{
86 return &(kretprobe_table_locks[hash].lock); 86 return &(kretprobe_table_locks[hash].lock);
87} 87}
@@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
1013 hlist_del(&ri->hlist); 1013 hlist_del(&ri->hlist);
1014 INIT_HLIST_NODE(&ri->hlist); 1014 INIT_HLIST_NODE(&ri->hlist);
1015 if (likely(rp)) { 1015 if (likely(rp)) {
1016 spin_lock(&rp->lock); 1016 raw_spin_lock(&rp->lock);
1017 hlist_add_head(&ri->hlist, &rp->free_instances); 1017 hlist_add_head(&ri->hlist, &rp->free_instances);
1018 spin_unlock(&rp->lock); 1018 raw_spin_unlock(&rp->lock);
1019 } else 1019 } else
1020 /* Unregistering */ 1020 /* Unregistering */
1021 hlist_add_head(&ri->hlist, head); 1021 hlist_add_head(&ri->hlist, head);
@@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
1026__acquires(hlist_lock) 1026__acquires(hlist_lock)
1027{ 1027{
1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1029 spinlock_t *hlist_lock; 1029 raw_spinlock_t *hlist_lock;
1030 1030
1031 *head = &kretprobe_inst_table[hash]; 1031 *head = &kretprobe_inst_table[hash];
1032 hlist_lock = kretprobe_table_lock_ptr(hash); 1032 hlist_lock = kretprobe_table_lock_ptr(hash);
1033 spin_lock_irqsave(hlist_lock, *flags); 1033 raw_spin_lock_irqsave(hlist_lock, *flags);
1034} 1034}
1035 1035
1036static void __kprobes kretprobe_table_lock(unsigned long hash, 1036static void __kprobes kretprobe_table_lock(unsigned long hash,
1037 unsigned long *flags) 1037 unsigned long *flags)
1038__acquires(hlist_lock) 1038__acquires(hlist_lock)
1039{ 1039{
1040 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1040 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1041 spin_lock_irqsave(hlist_lock, *flags); 1041 raw_spin_lock_irqsave(hlist_lock, *flags);
1042} 1042}
1043 1043
1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
@@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
1046__releases(hlist_lock) 1046__releases(hlist_lock)
1047{ 1047{
1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1049 spinlock_t *hlist_lock; 1049 raw_spinlock_t *hlist_lock;
1050 1050
1051 hlist_lock = kretprobe_table_lock_ptr(hash); 1051 hlist_lock = kretprobe_table_lock_ptr(hash);
1052 spin_unlock_irqrestore(hlist_lock, *flags); 1052 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1053} 1053}
1054 1054
1055static void __kprobes kretprobe_table_unlock(unsigned long hash, 1055static void __kprobes kretprobe_table_unlock(unsigned long hash,
1056 unsigned long *flags) 1056 unsigned long *flags)
1057__releases(hlist_lock) 1057__releases(hlist_lock)
1058{ 1058{
1059 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1059 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1060 spin_unlock_irqrestore(hlist_lock, *flags); 1060 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1061} 1061}
1062 1062
1063/* 1063/*
@@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1663 1663
1664 /*TODO: consider to only swap the RA after the last pre_handler fired */ 1664 /*TODO: consider to only swap the RA after the last pre_handler fired */
1665 hash = hash_ptr(current, KPROBE_HASH_BITS); 1665 hash = hash_ptr(current, KPROBE_HASH_BITS);
1666 spin_lock_irqsave(&rp->lock, flags); 1666 raw_spin_lock_irqsave(&rp->lock, flags);
1667 if (!hlist_empty(&rp->free_instances)) { 1667 if (!hlist_empty(&rp->free_instances)) {
1668 ri = hlist_entry(rp->free_instances.first, 1668 ri = hlist_entry(rp->free_instances.first,
1669 struct kretprobe_instance, hlist); 1669 struct kretprobe_instance, hlist);
1670 hlist_del(&ri->hlist); 1670 hlist_del(&ri->hlist);
1671 spin_unlock_irqrestore(&rp->lock, flags); 1671 raw_spin_unlock_irqrestore(&rp->lock, flags);
1672 1672
1673 ri->rp = rp; 1673 ri->rp = rp;
1674 ri->task = current; 1674 ri->task = current;
@@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1685 kretprobe_table_unlock(hash, &flags); 1685 kretprobe_table_unlock(hash, &flags);
1686 } else { 1686 } else {
1687 rp->nmissed++; 1687 rp->nmissed++;
1688 spin_unlock_irqrestore(&rp->lock, flags); 1688 raw_spin_unlock_irqrestore(&rp->lock, flags);
1689 } 1689 }
1690 return 0; 1690 return 0;
1691} 1691}
@@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1721 rp->maxactive = num_possible_cpus(); 1721 rp->maxactive = num_possible_cpus();
1722#endif 1722#endif
1723 } 1723 }
1724 spin_lock_init(&rp->lock); 1724 raw_spin_lock_init(&rp->lock);
1725 INIT_HLIST_HEAD(&rp->free_instances); 1725 INIT_HLIST_HEAD(&rp->free_instances);
1726 for (i = 0; i < rp->maxactive; i++) { 1726 for (i = 0; i < rp->maxactive; i++) {
1727 inst = kmalloc(sizeof(struct kretprobe_instance) + 1727 inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -1959,7 +1959,7 @@ static int __init init_kprobes(void)
1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1960 INIT_HLIST_HEAD(&kprobe_table[i]); 1960 INIT_HLIST_HEAD(&kprobe_table[i]);
1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
1962 spin_lock_init(&(kretprobe_table_locks[i].lock)); 1962 raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
1963 } 1963 }
1964 1964
1965 /* 1965 /*
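The kretprobe hash-table locks and per-kretprobe locks move from spinlock_t to raw_spinlock_t because they are taken from probe handlers, which can fire in contexts where a sleeping lock (what spinlock_t becomes under PREEMPT_RT) is not acceptable. The pattern in isolation, with an illustrative lock name:

#include <linux/spinlock.h>

/* A raw_spinlock_t stays a real spinning lock even on PREEMPT_RT. */
static DEFINE_RAW_SPINLOCK(example_lock);

static void example_touch_shared_state(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	/* ... data also reached from kprobe/kretprobe handlers ... */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}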
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3b053c04dd8..4e316e1acf5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -11,10 +11,11 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/sysfs.h> 13#include <linux/sysfs.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/stat.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/capability.h> 20#include <linux/capability.h>
20 21
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ba7cccb499..b6d216a9263 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,7 +12,7 @@
12#include <linux/cpuset.h> 12#include <linux/cpuset.h>
13#include <linux/unistd.h> 13#include <linux/unistd.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 376066e1041..a462b317f9a 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -53,12 +53,12 @@
53#include <linux/notifier.h> 53#include <linux/notifier.h>
54#include <linux/spinlock.h> 54#include <linux/spinlock.h>
55#include <linux/proc_fs.h> 55#include <linux/proc_fs.h>
56#include <linux/module.h> 56#include <linux/export.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
60 60
61static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_RAW_SPINLOCK(latency_lock);
62 62
63#define MAXLR 128 63#define MAXLR 128
64static struct latency_record latency_record[MAXLR]; 64static struct latency_record latency_record[MAXLR];
@@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p)
72 if (!latencytop_enabled) 72 if (!latencytop_enabled)
73 return; 73 return;
74 74
75 spin_lock_irqsave(&latency_lock, flags); 75 raw_spin_lock_irqsave(&latency_lock, flags);
76 memset(&p->latency_record, 0, sizeof(p->latency_record)); 76 memset(&p->latency_record, 0, sizeof(p->latency_record));
77 p->latency_record_count = 0; 77 p->latency_record_count = 0;
78 spin_unlock_irqrestore(&latency_lock, flags); 78 raw_spin_unlock_irqrestore(&latency_lock, flags);
79} 79}
80 80
81static void clear_global_latency_tracing(void) 81static void clear_global_latency_tracing(void)
82{ 82{
83 unsigned long flags; 83 unsigned long flags;
84 84
85 spin_lock_irqsave(&latency_lock, flags); 85 raw_spin_lock_irqsave(&latency_lock, flags);
86 memset(&latency_record, 0, sizeof(latency_record)); 86 memset(&latency_record, 0, sizeof(latency_record));
87 spin_unlock_irqrestore(&latency_lock, flags); 87 raw_spin_unlock_irqrestore(&latency_lock, flags);
88} 88}
89 89
90static void __sched 90static void __sched
@@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
190 lat.max = usecs; 190 lat.max = usecs;
191 store_stacktrace(tsk, &lat); 191 store_stacktrace(tsk, &lat);
192 192
193 spin_lock_irqsave(&latency_lock, flags); 193 raw_spin_lock_irqsave(&latency_lock, flags);
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
@@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
232 232
233out_unlock: 233out_unlock:
234 spin_unlock_irqrestore(&latency_lock, flags); 234 raw_spin_unlock_irqrestore(&latency_lock, flags);
235} 235}
236 236
237static int lstats_show(struct seq_file *m, void *v) 237static int lstats_show(struct seq_file *m, void *v)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8c24294e477..e69434b070d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -96,8 +96,13 @@ static int graph_lock(void)
96 96
97static inline int graph_unlock(void) 97static inline int graph_unlock(void)
98{ 98{
99 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) 99 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
100 /*
101 * The lockdep graph lock isn't locked while we expect it to
102 * be, we're confused now, bye!
103 */
100 return DEBUG_LOCKS_WARN_ON(1); 104 return DEBUG_LOCKS_WARN_ON(1);
105 }
101 106
102 current->lockdep_recursion--; 107 current->lockdep_recursion--;
103 arch_spin_unlock(&lockdep_lock); 108 arch_spin_unlock(&lockdep_lock);
@@ -134,6 +139,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
134static inline struct lock_class *hlock_class(struct held_lock *hlock) 139static inline struct lock_class *hlock_class(struct held_lock *hlock)
135{ 140{
136 if (!hlock->class_idx) { 141 if (!hlock->class_idx) {
142 /*
143 * Someone passed in garbage, we give up.
144 */
137 DEBUG_LOCKS_WARN_ON(1); 145 DEBUG_LOCKS_WARN_ON(1);
138 return NULL; 146 return NULL;
139 } 147 }
@@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
687 */ 695 */
688 list_for_each_entry(class, hash_head, hash_entry) { 696 list_for_each_entry(class, hash_head, hash_entry) {
689 if (class->key == key) { 697 if (class->key == key) {
698 /*
699 * Huh! same key, different name? Did someone trample
700 * on some memory? We're most confused.
701 */
690 WARN_ON_ONCE(class->name != lock->name); 702 WARN_ON_ONCE(class->name != lock->name);
691 return class; 703 return class;
692 } 704 }
@@ -800,6 +812,10 @@ out_unlock_set:
800 else if (subclass < NR_LOCKDEP_CACHING_CLASSES) 812 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
801 lock->class_cache[subclass] = class; 813 lock->class_cache[subclass] = class;
802 814
815 /*
816 * Hash collision, did we smoke some? We found a class with a matching
817 * hash but the subclass -- which is hashed in -- didn't match.
818 */
803 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 819 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
804 return NULL; 820 return NULL;
805 821
@@ -926,7 +942,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
926 unsigned long nr; 942 unsigned long nr;
927 943
928 nr = lock - list_entries; 944 nr = lock - list_entries;
929 WARN_ON(nr >= nr_list_entries); 945 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
930 lock->parent = parent; 946 lock->parent = parent;
931 lock->class->dep_gen_id = lockdep_dependency_gen_id; 947 lock->class->dep_gen_id = lockdep_dependency_gen_id;
932} 948}
@@ -936,7 +952,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
936 unsigned long nr; 952 unsigned long nr;
937 953
938 nr = lock - list_entries; 954 nr = lock - list_entries;
939 WARN_ON(nr >= nr_list_entries); 955 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
940 return lock->class->dep_gen_id == lockdep_dependency_gen_id; 956 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
941} 957}
942 958
@@ -1129,10 +1145,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1129 if (debug_locks_silent) 1145 if (debug_locks_silent)
1130 return 0; 1146 return 0;
1131 1147
1132 printk("\n=======================================================\n"); 1148 printk("\n");
1133 printk( "[ INFO: possible circular locking dependency detected ]\n"); 1149 printk("======================================================\n");
1150 printk("[ INFO: possible circular locking dependency detected ]\n");
1134 print_kernel_version(); 1151 print_kernel_version();
1135 printk( "-------------------------------------------------------\n"); 1152 printk("-------------------------------------------------------\n");
1136 printk("%s/%d is trying to acquire lock:\n", 1153 printk("%s/%d is trying to acquire lock:\n",
1137 curr->comm, task_pid_nr(curr)); 1154 curr->comm, task_pid_nr(curr));
1138 print_lock(check_src); 1155 print_lock(check_src);
@@ -1196,6 +1213,9 @@ static noinline int print_bfs_bug(int ret)
1196 if (!debug_locks_off_graph_unlock()) 1213 if (!debug_locks_off_graph_unlock())
1197 return 0; 1214 return 0;
1198 1215
1216 /*
1217 * Breadth-first-search failed, graph got corrupted?
1218 */
1199 WARN(1, "lockdep bfs error:%d\n", ret); 1219 WARN(1, "lockdep bfs error:%d\n", ret);
1200 1220
1201 return 0; 1221 return 0;
@@ -1463,11 +1483,12 @@ print_bad_irq_dependency(struct task_struct *curr,
1463 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1483 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1464 return 0; 1484 return 0;
1465 1485
1466 printk("\n======================================================\n"); 1486 printk("\n");
1467 printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1487 printk("======================================================\n");
1488 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1468 irqclass, irqclass); 1489 irqclass, irqclass);
1469 print_kernel_version(); 1490 print_kernel_version();
1470 printk( "------------------------------------------------------\n"); 1491 printk("------------------------------------------------------\n");
1471 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1492 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1472 curr->comm, task_pid_nr(curr), 1493 curr->comm, task_pid_nr(curr),
1473 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1494 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1692,10 +1713,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1692 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1713 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1693 return 0; 1714 return 0;
1694 1715
1695 printk("\n=============================================\n"); 1716 printk("\n");
1696 printk( "[ INFO: possible recursive locking detected ]\n"); 1717 printk("=============================================\n");
1718 printk("[ INFO: possible recursive locking detected ]\n");
1697 print_kernel_version(); 1719 print_kernel_version();
1698 printk( "---------------------------------------------\n"); 1720 printk("---------------------------------------------\n");
1699 printk("%s/%d is trying to acquire lock:\n", 1721 printk("%s/%d is trying to acquire lock:\n",
1700 curr->comm, task_pid_nr(curr)); 1722 curr->comm, task_pid_nr(curr));
1701 print_lock(next); 1723 print_lock(next);
@@ -1944,6 +1966,11 @@ out_bug:
1944 if (!debug_locks_off_graph_unlock()) 1966 if (!debug_locks_off_graph_unlock())
1945 return 0; 1967 return 0;
1946 1968
1969 /*
1970 * Clearly we all shouldn't be here, but since we made it we
1971 * can reliable say we messed up our state. See the above two
1972 * gotos for reasons why we could possibly end up here.
1973 */
1947 WARN_ON(1); 1974 WARN_ON(1);
1948 1975
1949 return 0; 1976 return 0;
@@ -1975,6 +2002,11 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1975 struct held_lock *hlock_curr, *hlock_next; 2002 struct held_lock *hlock_curr, *hlock_next;
1976 int i, j; 2003 int i, j;
1977 2004
2005 /*
2006 * We might need to take the graph lock, ensure we've got IRQs
2007 * disabled to make this an IRQ-safe lock.. for recursion reasons
2008 * lockdep won't complain about its own locking errors.
2009 */
1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2010 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1979 return 0; 2011 return 0;
1980 /* 2012 /*
@@ -2126,6 +2158,10 @@ static void check_chain_key(struct task_struct *curr)
2126 hlock = curr->held_locks + i; 2158 hlock = curr->held_locks + i;
2127 if (chain_key != hlock->prev_chain_key) { 2159 if (chain_key != hlock->prev_chain_key) {
2128 debug_locks_off(); 2160 debug_locks_off();
2161 /*
2162 * We got mighty confused, our chain keys don't match
2163 * with what we expect, someone trample on our task state?
2164 */
2129 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", 2165 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
2130 curr->lockdep_depth, i, 2166 curr->lockdep_depth, i,
2131 (unsigned long long)chain_key, 2167 (unsigned long long)chain_key,
@@ -2133,6 +2169,9 @@ static void check_chain_key(struct task_struct *curr)
2133 return; 2169 return;
2134 } 2170 }
2135 id = hlock->class_idx - 1; 2171 id = hlock->class_idx - 1;
2172 /*
2173 * Whoops ran out of static storage again?
2174 */
2136 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 2175 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
2137 return; 2176 return;
2138 2177
@@ -2144,6 +2183,10 @@ static void check_chain_key(struct task_struct *curr)
2144 } 2183 }
2145 if (chain_key != curr->curr_chain_key) { 2184 if (chain_key != curr->curr_chain_key) {
2146 debug_locks_off(); 2185 debug_locks_off();
2186 /*
2187 * More smoking hash instead of calculating it, damn see these
2188 * numbers float.. I bet that a pink elephant stepped on my memory.
2189 */
2147 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", 2190 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
2148 curr->lockdep_depth, i, 2191 curr->lockdep_depth, i,
2149 (unsigned long long)chain_key, 2192 (unsigned long long)chain_key,
@@ -2177,10 +2220,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2177 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2220 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2178 return 0; 2221 return 0;
2179 2222
2180 printk("\n=================================\n"); 2223 printk("\n");
2181 printk( "[ INFO: inconsistent lock state ]\n"); 2224 printk("=================================\n");
2225 printk("[ INFO: inconsistent lock state ]\n");
2182 print_kernel_version(); 2226 print_kernel_version();
2183 printk( "---------------------------------\n"); 2227 printk("---------------------------------\n");
2184 2228
2185 printk("inconsistent {%s} -> {%s} usage.\n", 2229 printk("inconsistent {%s} -> {%s} usage.\n",
2186 usage_str[prev_bit], usage_str[new_bit]); 2230 usage_str[prev_bit], usage_str[new_bit]);
@@ -2241,10 +2285,11 @@ print_irq_inversion_bug(struct task_struct *curr,
2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2285 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2242 return 0; 2286 return 0;
2243 2287
2244 printk("\n=========================================================\n"); 2288 printk("\n");
2245 printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); 2289 printk("=========================================================\n");
2290 printk("[ INFO: possible irq lock inversion dependency detected ]\n");
2246 print_kernel_version(); 2291 print_kernel_version();
2247 printk( "---------------------------------------------------------\n"); 2292 printk("---------------------------------------------------------\n");
2248 printk("%s/%d just changed the state of lock:\n", 2293 printk("%s/%d just changed the state of lock:\n",
2249 curr->comm, task_pid_nr(curr)); 2294 curr->comm, task_pid_nr(curr));
2250 print_lock(this); 2295 print_lock(this);
@@ -2525,12 +2570,24 @@ void trace_hardirqs_on_caller(unsigned long ip)
2525 return; 2570 return;
2526 } 2571 }
2527 2572
2573 /*
2574 * We're enabling irqs and according to our state above irqs weren't
2575 * already enabled, yet we find the hardware thinks they are in fact
2576 * enabled.. someone messed up their IRQ state tracing.
2577 */
2528 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2578 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2529 return; 2579 return;
2530 2580
2581 /*
2582 * See the fine text that goes along with this variable definition.
2583 */
2531 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 2584 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2532 return; 2585 return;
2533 2586
2587 /*
2588 * Can't allow enabling interrupts while in an interrupt handler,
2589 * that's general bad form and such. Recursion, limited stack etc..
2590 */
2534 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) 2591 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2535 return; 2592 return;
2536 2593
@@ -2558,6 +2615,10 @@ void trace_hardirqs_off_caller(unsigned long ip)
2558 if (unlikely(!debug_locks || current->lockdep_recursion)) 2615 if (unlikely(!debug_locks || current->lockdep_recursion))
2559 return; 2616 return;
2560 2617
2618 /*
2619 * So we're supposed to get called after you mask local IRQs, but for
2620 * some reason the hardware doesn't quite think you did a proper job.
2621 */
2561 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2622 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2562 return; 2623 return;
2563 2624
@@ -2590,6 +2651,10 @@ void trace_softirqs_on(unsigned long ip)
2590 if (unlikely(!debug_locks || current->lockdep_recursion)) 2651 if (unlikely(!debug_locks || current->lockdep_recursion))
2591 return; 2652 return;
2592 2653
2654 /*
2655 * We fancy IRQs being disabled here, see softirq.c, avoids
2656 * funny state and nesting things.
2657 */
2593 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2658 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2594 return; 2659 return;
2595 2660
@@ -2626,6 +2691,9 @@ void trace_softirqs_off(unsigned long ip)
2626 if (unlikely(!debug_locks || current->lockdep_recursion)) 2691 if (unlikely(!debug_locks || current->lockdep_recursion))
2627 return; 2692 return;
2628 2693
2694 /*
2695 * We fancy IRQs being disabled here, see softirq.c
2696 */
2629 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2697 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2630 return; 2698 return;
2631 2699
@@ -2637,6 +2705,9 @@ void trace_softirqs_off(unsigned long ip)
2637 curr->softirq_disable_ip = ip; 2705 curr->softirq_disable_ip = ip;
2638 curr->softirq_disable_event = ++curr->irq_events; 2706 curr->softirq_disable_event = ++curr->irq_events;
2639 debug_atomic_inc(softirqs_off_events); 2707 debug_atomic_inc(softirqs_off_events);
2708 /*
2709 * Whoops, we wanted softirqs off, so why aren't they?
2710 */
2640 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2711 DEBUG_LOCKS_WARN_ON(!softirq_count());
2641 } else 2712 } else
2642 debug_atomic_inc(redundant_softirqs_off); 2713 debug_atomic_inc(redundant_softirqs_off);
@@ -2661,6 +2732,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2661 if (!(gfp_mask & __GFP_FS)) 2732 if (!(gfp_mask & __GFP_FS))
2662 return; 2733 return;
2663 2734
2735 /*
2736 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
2737 */
2664 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) 2738 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2665 return; 2739 return;
2666 2740
@@ -2773,13 +2847,13 @@ static int separate_irq_context(struct task_struct *curr,
2773 return 0; 2847 return 0;
2774} 2848}
2775 2849
2776#else 2850#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
2777 2851
2778static inline 2852static inline
2779int mark_lock_irq(struct task_struct *curr, struct held_lock *this, 2853int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
2780 enum lock_usage_bit new_bit) 2854 enum lock_usage_bit new_bit)
2781{ 2855{
2782 WARN_ON(1); 2856 WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
2783 return 1; 2857 return 1;
2784} 2858}
2785 2859
@@ -2799,7 +2873,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
2799{ 2873{
2800} 2874}
2801 2875
2802#endif 2876#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
2803 2877
2804/* 2878/*
2805 * Mark a lock with a usage bit, and validate the state transition: 2879 * Mark a lock with a usage bit, and validate the state transition:
@@ -2880,6 +2954,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2880 lock->cpu = raw_smp_processor_id(); 2954 lock->cpu = raw_smp_processor_id();
2881#endif 2955#endif
2882 2956
2957 /*
2958 * Can't be having no nameless bastards around this place!
2959 */
2883 if (DEBUG_LOCKS_WARN_ON(!name)) { 2960 if (DEBUG_LOCKS_WARN_ON(!name)) {
2884 lock->name = "NULL"; 2961 lock->name = "NULL";
2885 return; 2962 return;
@@ -2887,6 +2964,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2887 2964
2888 lock->name = name; 2965 lock->name = name;
2889 2966
2967 /*
2968 * No key, no joy, we need to hash something.
2969 */
2890 if (DEBUG_LOCKS_WARN_ON(!key)) 2970 if (DEBUG_LOCKS_WARN_ON(!key))
2891 return; 2971 return;
2892 /* 2972 /*
@@ -2894,6 +2974,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2894 */ 2974 */
2895 if (!static_obj(key)) { 2975 if (!static_obj(key)) {
2896 printk("BUG: key %p not in .data!\n", key); 2976 printk("BUG: key %p not in .data!\n", key);
2977 /*
2978 * What it says above ^^^^^, I suggest you read it.
2979 */
2897 DEBUG_LOCKS_WARN_ON(1); 2980 DEBUG_LOCKS_WARN_ON(1);
2898 return; 2981 return;
2899 } 2982 }
@@ -2932,6 +3015,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2932 if (unlikely(!debug_locks)) 3015 if (unlikely(!debug_locks))
2933 return 0; 3016 return 0;
2934 3017
3018 /*
3019 * Lockdep should run with IRQs disabled, otherwise we could
3020 * get an interrupt which would want to take locks, which would
3021 * end up in lockdep and have you got a head-ache already?
3022 */
2935 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3023 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2936 return 0; 3024 return 0;
2937 3025
@@ -2963,6 +3051,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2963 * dependency checks are done) 3051 * dependency checks are done)
2964 */ 3052 */
2965 depth = curr->lockdep_depth; 3053 depth = curr->lockdep_depth;
3054 /*
3055 * Ran out of static storage for our per-task lock stack again have we?
3056 */
2966 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 3057 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2967 return 0; 3058 return 0;
2968 3059
@@ -2981,6 +3072,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2981 } 3072 }
2982 3073
2983 hlock = curr->held_locks + depth; 3074 hlock = curr->held_locks + depth;
3075 /*
3076 * Plain impossible, we just registered it and checked it weren't no
3077 * NULL like.. I bet this mushroom I ate was good!
3078 */
2984 if (DEBUG_LOCKS_WARN_ON(!class)) 3079 if (DEBUG_LOCKS_WARN_ON(!class))
2985 return 0; 3080 return 0;
2986 hlock->class_idx = class_idx; 3081 hlock->class_idx = class_idx;
@@ -3015,11 +3110,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3015 * the hash, not class->key. 3110 * the hash, not class->key.
3016 */ 3111 */
3017 id = class - lock_classes; 3112 id = class - lock_classes;
3113 /*
3114 * Whoops, we did it again.. ran straight out of our static allocation.
3115 */
3018 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 3116 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
3019 return 0; 3117 return 0;
3020 3118
3021 chain_key = curr->curr_chain_key; 3119 chain_key = curr->curr_chain_key;
3022 if (!depth) { 3120 if (!depth) {
3121 /*
3122 * How can we have a chain hash when we ain't got no keys?!
3123 */
3023 if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) 3124 if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
3024 return 0; 3125 return 0;
3025 chain_head = 1; 3126 chain_head = 1;
@@ -3065,9 +3166,10 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3065 if (debug_locks_silent) 3166 if (debug_locks_silent)
3066 return 0; 3167 return 0;
3067 3168
3068 printk("\n=====================================\n"); 3169 printk("\n");
3069 printk( "[ BUG: bad unlock balance detected! ]\n"); 3170 printk("=====================================\n");
3070 printk( "-------------------------------------\n"); 3171 printk("[ BUG: bad unlock balance detected! ]\n");
3172 printk("-------------------------------------\n");
3071 printk("%s/%d is trying to release lock (", 3173 printk("%s/%d is trying to release lock (",
3072 curr->comm, task_pid_nr(curr)); 3174 curr->comm, task_pid_nr(curr));
3073 print_lockdep_cache(lock); 3175 print_lockdep_cache(lock);
@@ -3091,6 +3193,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3091{ 3193{
3092 if (unlikely(!debug_locks)) 3194 if (unlikely(!debug_locks))
3093 return 0; 3195 return 0;
3196 /*
3197 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
3198 */
3094 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3199 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3095 return 0; 3200 return 0;
3096 3201
@@ -3111,9 +3216,20 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3111 if (!class) 3216 if (!class)
3112 class = look_up_lock_class(lock, 0); 3217 class = look_up_lock_class(lock, 0);
3113 3218
3114 if (DEBUG_LOCKS_WARN_ON(!class)) 3219 /*
3220 * If look_up_lock_class() failed to find a class, we're trying
3221 * to test if we hold a lock that has never yet been acquired.
3222 * Clearly if the lock hasn't been acquired _ever_, we're not
3223 * holding it either, so report failure.
3224 */
3225 if (!class)
3115 return 0; 3226 return 0;
3116 3227
3228 /*
3229 * References, but not a lock we're actually ref-counting?
3230 * State got messed up, follow the sites that change ->references
3231 * and try to make sense of it.
3232 */
3117 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) 3233 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
3118 return 0; 3234 return 0;
3119 3235
@@ -3136,6 +3252,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3136 int i; 3252 int i;
3137 3253
3138 depth = curr->lockdep_depth; 3254 depth = curr->lockdep_depth;
3255 /*
3256 * This function is about (re)setting the class of a held lock,
3257 * yet we're not actually holding any locks. Naughty user!
3258 */
3139 if (DEBUG_LOCKS_WARN_ON(!depth)) 3259 if (DEBUG_LOCKS_WARN_ON(!depth))
3140 return 0; 3260 return 0;
3141 3261
@@ -3171,6 +3291,10 @@ found_it:
3171 return 0; 3291 return 0;
3172 } 3292 }
3173 3293
3294 /*
3295 * I took it apart and put it back together again, except now I have
3296 * these 'spare' parts.. where shall I put them.
3297 */
3174 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) 3298 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
3175 return 0; 3299 return 0;
3176 return 1; 3300 return 1;
@@ -3195,6 +3319,10 @@ lock_release_non_nested(struct task_struct *curr,
3195 * of held locks: 3319 * of held locks:
3196 */ 3320 */
3197 depth = curr->lockdep_depth; 3321 depth = curr->lockdep_depth;
3322 /*
3323 * So we're all set to release this lock.. wait what lock? We don't
3324 * own any locks, you've been drinking again?
3325 */
3198 if (DEBUG_LOCKS_WARN_ON(!depth)) 3326 if (DEBUG_LOCKS_WARN_ON(!depth))
3199 return 0; 3327 return 0;
3200 3328
@@ -3247,6 +3375,10 @@ found_it:
3247 return 0; 3375 return 0;
3248 } 3376 }
3249 3377
3378 /*
3379 * We had N bottles of beer on the wall, we drank one, but now
3380 * there's not N-1 bottles of beer left on the wall...
3381 */
3250 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) 3382 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
3251 return 0; 3383 return 0;
3252 return 1; 3384 return 1;
@@ -3277,6 +3409,9 @@ static int lock_release_nested(struct task_struct *curr,
3277 return lock_release_non_nested(curr, lock, ip); 3409 return lock_release_non_nested(curr, lock, ip);
3278 curr->lockdep_depth--; 3410 curr->lockdep_depth--;
3279 3411
3412 /*
3413 * No more locks, but somehow we've got hash left over, who left it?
3414 */
3280 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) 3415 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
3281 return 0; 3416 return 0;
3282 3417
@@ -3359,10 +3494,13 @@ static void check_flags(unsigned long flags)
3359 * check if not in hardirq contexts: 3494 * check if not in hardirq contexts:
3360 */ 3495 */
3361 if (!hardirq_count()) { 3496 if (!hardirq_count()) {
3362 if (softirq_count()) 3497 if (softirq_count()) {
3498 /* like the above, but with softirqs */
3363 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); 3499 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
3364 else 3500 } else {
3501 /* lick the above, does it taste good? */
3365 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); 3502 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
3503 }
3366 } 3504 }
3367 3505
3368 if (!debug_locks) 3506 if (!debug_locks)
@@ -3472,9 +3610,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3472 if (debug_locks_silent) 3610 if (debug_locks_silent)
3473 return 0; 3611 return 0;
3474 3612
3475 printk("\n=================================\n"); 3613 printk("\n");
3476 printk( "[ BUG: bad contention detected! ]\n"); 3614 printk("=================================\n");
3477 printk( "---------------------------------\n"); 3615 printk("[ BUG: bad contention detected! ]\n");
3616 printk("---------------------------------\n");
3478 printk("%s/%d is trying to contend lock (", 3617 printk("%s/%d is trying to contend lock (",
3479 curr->comm, task_pid_nr(curr)); 3618 curr->comm, task_pid_nr(curr));
3480 print_lockdep_cache(lock); 3619 print_lockdep_cache(lock);
@@ -3500,6 +3639,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3500 int i, contention_point, contending_point; 3639 int i, contention_point, contending_point;
3501 3640
3502 depth = curr->lockdep_depth; 3641 depth = curr->lockdep_depth;
3642 /*
3643 * Whee, we contended on this lock, except it seems we're not
3644 * actually trying to acquire anything much at all..
3645 */
3503 if (DEBUG_LOCKS_WARN_ON(!depth)) 3646 if (DEBUG_LOCKS_WARN_ON(!depth))
3504 return; 3647 return;
3505 3648
@@ -3549,6 +3692,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3549 int i, cpu; 3692 int i, cpu;
3550 3693
3551 depth = curr->lockdep_depth; 3694 depth = curr->lockdep_depth;
3695 /*
3696 * Yay, we acquired ownership of this lock we didn't try to
3697 * acquire, how the heck did that happen?
3698 */
3552 if (DEBUG_LOCKS_WARN_ON(!depth)) 3699 if (DEBUG_LOCKS_WARN_ON(!depth))
3553 return; 3700 return;
3554 3701
@@ -3753,8 +3900,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3753 match |= class == lock->class_cache[j]; 3900 match |= class == lock->class_cache[j];
3754 3901
3755 if (unlikely(match)) { 3902 if (unlikely(match)) {
3756 if (debug_locks_off_graph_unlock()) 3903 if (debug_locks_off_graph_unlock()) {
3904 /*
3905 * We all just reset everything, how did it match?
3906 */
3757 WARN_ON(1); 3907 WARN_ON(1);
3908 }
3758 goto out_restore; 3909 goto out_restore;
3759 } 3910 }
3760 } 3911 }
@@ -3833,9 +3984,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3833 if (debug_locks_silent) 3984 if (debug_locks_silent)
3834 return; 3985 return;
3835 3986
3836 printk("\n=========================\n"); 3987 printk("\n");
3837 printk( "[ BUG: held lock freed! ]\n"); 3988 printk("=========================\n");
3838 printk( "-------------------------\n"); 3989 printk("[ BUG: held lock freed! ]\n");
3990 printk("-------------------------\n");
3839 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 3991 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3840 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 3992 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
3841 print_lock(hlock); 3993 print_lock(hlock);
@@ -3889,9 +4041,10 @@ static void print_held_locks_bug(struct task_struct *curr)
3889 if (debug_locks_silent) 4041 if (debug_locks_silent)
3890 return; 4042 return;
3891 4043
3892 printk("\n=====================================\n"); 4044 printk("\n");
3893 printk( "[ BUG: lock held at task exit time! ]\n"); 4045 printk("=====================================\n");
3894 printk( "-------------------------------------\n"); 4046 printk("[ BUG: lock held at task exit time! ]\n");
4047 printk("-------------------------------------\n");
3895 printk("%s/%d is exiting with locks still held!\n", 4048 printk("%s/%d is exiting with locks still held!\n",
3896 curr->comm, task_pid_nr(curr)); 4049 curr->comm, task_pid_nr(curr));
3897 lockdep_print_held_locks(curr); 4050 lockdep_print_held_locks(curr);
@@ -3985,16 +4138,17 @@ void lockdep_sys_exit(void)
3985 if (unlikely(curr->lockdep_depth)) { 4138 if (unlikely(curr->lockdep_depth)) {
3986 if (!debug_locks_off()) 4139 if (!debug_locks_off())
3987 return; 4140 return;
3988 printk("\n================================================\n"); 4141 printk("\n");
3989 printk( "[ BUG: lock held when returning to user space! ]\n"); 4142 printk("================================================\n");
3990 printk( "------------------------------------------------\n"); 4143 printk("[ BUG: lock held when returning to user space! ]\n");
4144 printk("------------------------------------------------\n");
3991 printk("%s/%d is leaving the kernel with locks still held!\n", 4145 printk("%s/%d is leaving the kernel with locks still held!\n",
3992 curr->comm, curr->pid); 4146 curr->comm, curr->pid);
3993 lockdep_print_held_locks(curr); 4147 lockdep_print_held_locks(curr);
3994 } 4148 }
3995} 4149}
3996 4150
3997void lockdep_rcu_dereference(const char *file, const int line) 4151void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
3998{ 4152{
3999 struct task_struct *curr = current; 4153 struct task_struct *curr = current;
4000 4154
@@ -4003,15 +4157,15 @@ void lockdep_rcu_dereference(const char *file, const int line)
4003 return; 4157 return;
4004#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ 4158#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4005 /* Note: the following can be executed concurrently, so be careful. */ 4159 /* Note: the following can be executed concurrently, so be careful. */
4006 printk("\n===================================================\n"); 4160 printk("\n");
4007 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 4161 printk("===============================\n");
4008 printk( "---------------------------------------------------\n"); 4162 printk("[ INFO: suspicious RCU usage. ]\n");
4009 printk("%s:%d invoked rcu_dereference_check() without protection!\n", 4163 printk("-------------------------------\n");
4010 file, line); 4164 printk("%s:%d %s!\n", file, line, s);
4011 printk("\nother info that might help us debug this:\n\n"); 4165 printk("\nother info that might help us debug this:\n\n");
4012 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4166 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4013 lockdep_print_held_locks(curr); 4167 lockdep_print_held_locks(curr);
4014 printk("\nstack backtrace:\n"); 4168 printk("\nstack backtrace:\n");
4015 dump_stack(); 4169 dump_stack();
4016} 4170}
4017EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); 4171EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
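Renaming lockdep_rcu_dereference() to lockdep_rcu_suspicious() and adding a message argument lets each RCU debugging check report its own explanation instead of the fixed rcu_dereference_check() text. Roughly, and only as a sketch rather than the exact definition used in this tree, the RCU side can wrap the hook like this:

#include <linux/lockdep.h>
#include <linux/types.h>

/* Sketch of a caller-side assertion built on the new hook; the
 * one-shot __warned guard is illustrative. */
#define rcu_lockdep_assert(c, s)					\
	do {								\
		static bool __warned;					\
		if (!__warned && !(c)) {				\
			__warned = true;				\
			lockdep_rcu_suspicious(__FILE__, __LINE__, s);	\
		}							\
	} while (0)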
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 71edd2f60c0..91c32a0b612 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -11,7 +11,7 @@
11 * Code for /proc/lockdep and /proc/lockdep_stats: 11 * Code for /proc/lockdep and /proc/lockdep_stats:
12 * 12 *
13 */ 13 */
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/proc_fs.h> 15#include <linux/proc_fs.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/kallsyms.h> 17#include <linux/kallsyms.h>
diff --git a/kernel/module.c b/kernel/module.c
index 04379f92f84..178333c48d1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -16,7 +16,7 @@
16 along with this program; if not, write to the Free Software 16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/ 18*/
19#include <linux/module.h> 19#include <linux/export.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/ftrace_event.h> 21#include <linux/ftrace_event.h>
22#include <linux/init.h> 22#include <linux/init.h>
@@ -2487,6 +2487,9 @@ static int check_modinfo(struct module *mod, struct load_info *info)
2487 return -ENOEXEC; 2487 return -ENOEXEC;
2488 } 2488 }
2489 2489
2490 if (!get_modinfo(info, "intree"))
2491 add_taint_module(mod, TAINT_OOT_MODULE);
2492
2490 if (get_modinfo(info, "staging")) { 2493 if (get_modinfo(info, "staging")) {
2491 add_taint_module(mod, TAINT_CRAP); 2494 add_taint_module(mod, TAINT_CRAP);
2492 printk(KERN_WARNING "%s: module is from the staging directory," 2495 printk(KERN_WARNING "%s: module is from the staging directory,"
@@ -2878,8 +2881,7 @@ static struct module *load_module(void __user *umod,
2878 } 2881 }
2879 2882
2880 /* This has to be done once we're sure module name is unique. */ 2883 /* This has to be done once we're sure module name is unique. */
2881 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) 2884 dynamic_debug_setup(info.debug, info.num_debug);
2882 dynamic_debug_setup(info.debug, info.num_debug);
2883 2885
2884 /* Find duplicate symbols */ 2886 /* Find duplicate symbols */
2885 err = verify_export_symbols(mod); 2887 err = verify_export_symbols(mod);
@@ -2915,8 +2917,7 @@ static struct module *load_module(void __user *umod,
2915 module_bug_cleanup(mod); 2917 module_bug_cleanup(mod);
2916 2918
2917 ddebug: 2919 ddebug:
2918 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) 2920 dynamic_debug_remove(info.debug);
2919 dynamic_debug_remove(info.debug);
2920 unlock: 2921 unlock:
2921 mutex_unlock(&module_mutex); 2922 mutex_unlock(&module_mutex);
2922 synchronize_sched(); 2923 synchronize_sched();
@@ -3257,6 +3258,8 @@ static char *module_flags(struct module *mod, char *buf)
3257 buf[bx++] = '('; 3258 buf[bx++] = '(';
3258 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) 3259 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
3259 buf[bx++] = 'P'; 3260 buf[bx++] = 'P';
3261 else if (mod->taints & (1 << TAINT_OOT_MODULE))
3262 buf[bx++] = 'O';
3260 if (mod->taints & (1 << TAINT_FORCED_MODULE)) 3263 if (mod->taints & (1 << TAINT_FORCED_MODULE))
3261 buf[bx++] = 'F'; 3264 buf[bx++] = 'F';
3262 if (mod->taints & (1 << TAINT_CRAP)) 3265 if (mod->taints & (1 << TAINT_CRAP))
@@ -3487,50 +3490,3 @@ void module_layout(struct module *mod,
3487} 3490}
3488EXPORT_SYMBOL(module_layout); 3491EXPORT_SYMBOL(module_layout);
3489#endif 3492#endif
3490
3491#ifdef CONFIG_TRACEPOINTS
3492void module_update_tracepoints(void)
3493{
3494 struct module *mod;
3495
3496 mutex_lock(&module_mutex);
3497 list_for_each_entry(mod, &modules, list)
3498 if (!mod->taints)
3499 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3500 mod->tracepoints_ptrs + mod->num_tracepoints);
3501 mutex_unlock(&module_mutex);
3502}
3503
3504/*
3505 * Returns 0 if current not found.
3506 * Returns 1 if current found.
3507 */
3508int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3509{
3510 struct module *iter_mod;
3511 int found = 0;
3512
3513 mutex_lock(&module_mutex);
3514 list_for_each_entry(iter_mod, &modules, list) {
3515 if (!iter_mod->taints) {
3516 /*
3517 * Sorted module list
3518 */
3519 if (iter_mod < iter->module)
3520 continue;
3521 else if (iter_mod > iter->module)
3522 iter->tracepoint = NULL;
3523 found = tracepoint_get_iter_range(&iter->tracepoint,
3524 iter_mod->tracepoints_ptrs,
3525 iter_mod->tracepoints_ptrs
3526 + iter_mod->num_tracepoints);
3527 if (found) {
3528 iter->module = iter_mod;
3529 break;
3530 }
3531 }
3532 }
3533 mutex_unlock(&module_mutex);
3534 return found;
3535}
3536#endif
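check_modinfo() now taints the kernel with TAINT_OOT_MODULE whenever a module lacks the "intree" modinfo tag, and module_flags() above reports it as 'O'. For in-tree builds that tag is typically injected into the generated .mod.c by modpost; expressed directly as source it amounts to:

#include <linux/module.h>

/* Present in modules built from the kernel tree; its absence is what
 * sets TAINT_OOT_MODULE and shows up as 'O' in module_flags(). */
MODULE_INFO(intree, "Y");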
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 73da83aff41..7e3443fe1f4 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -14,7 +14,7 @@
14 */ 14 */
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/poison.h> 18#include <linux/poison.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d607ed5dd44..89096dd8786 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,7 +19,7 @@
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/export.h>
23#include <linux/spinlock.h> 23#include <linux/spinlock.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/debug_locks.h> 25#include <linux/debug_locks.h>
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 8d7b435806c..2d5cc4ccff7 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,6 +1,6 @@
1#include <linux/kdebug.h> 1#include <linux/kdebug.h>
2#include <linux/kprobes.h> 2#include <linux/kprobes.h>
3#include <linux/module.h> 3#include <linux/export.h>
4#include <linux/notifier.h> 4#include <linux/notifier.h>
5#include <linux/rcupdate.h> 5#include <linux/rcupdate.h>
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 9aeab4b98c6..b576f7f14bc 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,7 @@
14 */ 14 */
15 15
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
19#include <linux/init_task.h> 19#include <linux/init_task.h>
20#include <linux/mnt_namespace.h> 20#include <linux/mnt_namespace.h>
diff --git a/kernel/padata.c b/kernel/padata.c
index b91941df5e6..b4525993151 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -18,7 +18,7 @@
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */ 19 */
20 20
21#include <linux/module.h> 21#include <linux/export.h>
22#include <linux/cpumask.h> 22#include <linux/cpumask.h>
23#include <linux/err.h> 23#include <linux/err.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
diff --git a/kernel/panic.c b/kernel/panic.c
index d7bb6974efb..b2659360421 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -177,6 +177,7 @@ static const struct tnt tnts[] = {
177 { TAINT_WARN, 'W', ' ' }, 177 { TAINT_WARN, 'W', ' ' },
178 { TAINT_CRAP, 'C', ' ' }, 178 { TAINT_CRAP, 'C', ' ' },
179 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 179 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
180 { TAINT_OOT_MODULE, 'O', ' ' },
180}; 181};
181 182
182/** 183/**
@@ -194,6 +195,7 @@ static const struct tnt tnts[] = {
194 * 'W' - Taint on warning. 195 * 'W' - Taint on warning.
195 * 'C' - modules from drivers/staging are loaded. 196 * 'C' - modules from drivers/staging are loaded.
196 * 'I' - Working around severe firmware bug. 197 * 'I' - Working around severe firmware bug.
198 * 'O' - Out-of-tree module has been loaded.
197 * 199 *
198 * The string is overwritten by the next call to print_tainted(). 200 * The string is overwritten by the next call to print_tainted().
199 */ 201 */
diff --git a/kernel/params.c b/kernel/params.c
index 22df3e0d142..65aae11eb93 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,7 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/moduleparam.h> 18#include <linux/module.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/string.h> 20#include <linux/string.h>
21#include <linux/errno.h> 21#include <linux/errno.h>
@@ -67,20 +67,27 @@ static void maybe_kfree_parameter(void *param)
67 } 67 }
68} 68}
69 69
70static inline char dash2underscore(char c) 70static char dash2underscore(char c)
71{ 71{
72 if (c == '-') 72 if (c == '-')
73 return '_'; 73 return '_';
74 return c; 74 return c;
75} 75}
76 76
77static inline int parameq(const char *input, const char *paramname) 77bool parameqn(const char *a, const char *b, size_t n)
78{ 78{
79 unsigned int i; 79 size_t i;
80 for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) 80
81 if (input[i] == '\0') 81 for (i = 0; i < n; i++) {
82 return 1; 82 if (dash2underscore(a[i]) != dash2underscore(b[i]))
83 return 0; 83 return false;
84 }
85 return true;
86}
87
88bool parameq(const char *a, const char *b)
89{
90 return parameqn(a, b, strlen(a)+1);
84} 91}
85 92
86static int parse_one(char *param, 93static int parse_one(char *param,
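parameq() stops being a file-local inline and gains a length-bounded sibling, parameqn(); both normalise dashes to underscores on both sides of the comparison, so a parameter written with dashes on the command line matches its underscored name. A small usage sketch, assuming the declarations end up in <linux/moduleparam.h> as part of this series:

#include <linux/moduleparam.h>
#include <linux/types.h>

/* Dashes and underscores compare equal on both sides. */
static bool example_match(void)
{
	bool full   = parameq("log-buf-len", "log_buf_len");	 /* true */
	bool prefix = parameqn("acpi_osi", "acpi-osi=Linux", 8); /* true */

	return full && prefix;
}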
diff --git a/kernel/pid.c b/kernel/pid.c
index e432057f3b2..fa5f72227e5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -27,7 +27,7 @@
27 */ 27 */
28 28
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/rculist.h> 33#include <linux/rculist.h>
@@ -418,7 +418,9 @@ EXPORT_SYMBOL(pid_task);
418 */ 418 */
419struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 419struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
420{ 420{
421 rcu_lockdep_assert(rcu_read_lock_held()); 421 rcu_lockdep_assert(rcu_read_lock_held(),
422 "find_task_by_pid_ns() needs rcu_read_lock()"
423 " protection");
422 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 424 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
423} 425}
424 426
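find_task_by_pid_ns() now documents its locking requirement in the assertion message itself: the lookup must run inside an RCU read-side critical section. A sketch of a well-behaved caller that also pins the task before dropping the RCU lock:

#include <linux/pid_namespace.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Illustrative helper, not part of this patch. */
static struct task_struct *example_get_task(pid_t nr, struct pid_namespace *ns)
{
	struct task_struct *tsk;

	rcu_read_lock();
	tsk = find_task_by_pid_ns(nr, ns);
	if (tsk)
		get_task_struct(tsk);	/* keep it alive after rcu_read_unlock() */
	rcu_read_unlock();

	return tsk;
}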
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c8008dd58ef..e7cb76dc18f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -274,9 +274,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
274 struct task_cputime sum; 274 struct task_cputime sum;
275 unsigned long flags; 275 unsigned long flags;
276 276
277 spin_lock_irqsave(&cputimer->lock, flags);
278 if (!cputimer->running) { 277 if (!cputimer->running) {
279 cputimer->running = 1;
280 /* 278 /*
281 * The POSIX timer interface allows for absolute time expiry 279 * The POSIX timer interface allows for absolute time expiry
282 * values through the TIMER_ABSTIME flag, therefore we have 280 * values through the TIMER_ABSTIME flag, therefore we have
@@ -284,10 +282,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
284 * it. 282 * it.
285 */ 283 */
286 thread_group_cputime(tsk, &sum); 284 thread_group_cputime(tsk, &sum);
285 raw_spin_lock_irqsave(&cputimer->lock, flags);
286 cputimer->running = 1;
287 update_gt_cputime(&cputimer->cputime, &sum); 287 update_gt_cputime(&cputimer->cputime, &sum);
288 } 288 } else
289 raw_spin_lock_irqsave(&cputimer->lock, flags);
289 *times = cputimer->cputime; 290 *times = cputimer->cputime;
290 spin_unlock_irqrestore(&cputimer->lock, flags); 291 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
291} 292}
292 293
293/* 294/*
@@ -998,9 +999,9 @@ static void stop_process_timers(struct signal_struct *sig)
998 struct thread_group_cputimer *cputimer = &sig->cputimer; 999 struct thread_group_cputimer *cputimer = &sig->cputimer;
999 unsigned long flags; 1000 unsigned long flags;
1000 1001
1001 spin_lock_irqsave(&cputimer->lock, flags); 1002 raw_spin_lock_irqsave(&cputimer->lock, flags);
1002 cputimer->running = 0; 1003 cputimer->running = 0;
1003 spin_unlock_irqrestore(&cputimer->lock, flags); 1004 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
1004} 1005}
1005 1006
1006static u32 onecputick; 1007static u32 onecputick;
@@ -1290,9 +1291,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1290 if (sig->cputimer.running) { 1291 if (sig->cputimer.running) {
1291 struct task_cputime group_sample; 1292 struct task_cputime group_sample;
1292 1293
1293 spin_lock(&sig->cputimer.lock); 1294 raw_spin_lock(&sig->cputimer.lock);
1294 group_sample = sig->cputimer.cputime; 1295 group_sample = sig->cputimer.cputime;
1295 spin_unlock(&sig->cputimer.lock); 1296 raw_spin_unlock(&sig->cputimer.lock);
1296 1297
1297 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1298 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1298 return 1; 1299 return 1;
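The cputimer lock becomes a raw_spinlock_t so it stays a real spinning lock on preempt-RT kernels, and the potentially slow thread_group_cputime() summation is moved outside the locked region. The raw variant follows the familiar irqsave pattern; a minimal kernel-style sketch (the lock name and function are hypothetical):

    static DEFINE_RAW_SPINLOCK(sample_lock);    /* hypothetical lock */

    static void sample_update(void)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&sample_lock, flags);
            /* short, non-sleeping critical section only */
            raw_spin_unlock_irqrestore(&sample_lock, flags);
    }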
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4556182527f..69185ae6b70 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -46,7 +46,7 @@
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/module.h> 49#include <linux/export.h>
50 50
51/* 51/*
52 * Management arrays for POSIX timers. Timers are kept in slab memory 52 * Management arrays for POSIX timers. Timers are kept in slab memory
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index b1914cb9095..deb5461e321 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,7 @@ config HIBERNATION
27 select HIBERNATE_CALLBACKS 27 select HIBERNATE_CALLBACKS
28 select LZO_COMPRESS 28 select LZO_COMPRESS
29 select LZO_DECOMPRESS 29 select LZO_DECOMPRESS
30 select CRC32
30 ---help--- 31 ---help---
31 Enable the suspend to disk (STD) functionality, which is usually 32 Enable the suspend to disk (STD) functionality, which is usually
32 called "hibernation" in user interfaces. STD checkpoints the 33 called "hibernation" in user interfaces. STD checkpoints the
@@ -65,6 +66,9 @@ config HIBERNATION
65 66
66 For more information take a look at <file:Documentation/power/swsusp.txt>. 67 For more information take a look at <file:Documentation/power/swsusp.txt>.
67 68
69config ARCH_SAVE_PAGE_KEYS
70 bool
71
68config PM_STD_PARTITION 72config PM_STD_PARTITION
69 string "Default resume partition" 73 string "Default resume partition"
70 depends on HIBERNATION 74 depends on HIBERNATION
@@ -231,3 +235,11 @@ config PM_CLK
231config PM_GENERIC_DOMAINS 235config PM_GENERIC_DOMAINS
232 bool 236 bool
233 depends on PM 237 depends on PM
238
239config PM_GENERIC_DOMAINS_RUNTIME
240 def_bool y
241 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
242
243config CPU_PM
244 bool
245 depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c5ebc6a9064..07e0e28ffba 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,8 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o qos.o
5obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
6obj-$(CONFIG_FREEZER) += process.o 6obj-$(CONFIG_FREEZER) += process.o
7obj-$(CONFIG_SUSPEND) += suspend.o 7obj-$(CONFIG_SUSPEND) += suspend.o
8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 218e5af9015..b1dc456474b 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * drivers/power/process.c - Functions for saving/restoring console. 2 * Functions for saving/restoring console.
3 * 3 *
4 * Originally from swsusp. 4 * Originally from swsusp.
5 */ 5 */
@@ -10,7 +10,6 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
13#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 13#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 14
16static int orig_fgconsole, orig_kmsg; 15static int orig_fgconsole, orig_kmsg;
@@ -32,4 +31,3 @@ void pm_restore_console(void)
32 vt_kmsg_redirect(orig_kmsg); 31 vt_kmsg_redirect(orig_kmsg);
33 } 32 }
34} 33}
35#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8f7b1db1ece..a6b0503574e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -9,11 +9,13 @@
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
10 */ 10 */
11 11
12#include <linux/export.h>
12#include <linux/suspend.h> 13#include <linux/suspend.h>
13#include <linux/syscalls.h> 14#include <linux/syscalls.h>
14#include <linux/reboot.h> 15#include <linux/reboot.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/device.h> 17#include <linux/device.h>
18#include <linux/async.h>
17#include <linux/kmod.h> 19#include <linux/kmod.h>
18#include <linux/delay.h> 20#include <linux/delay.h>
19#include <linux/fs.h> 21#include <linux/fs.h>
@@ -29,12 +31,14 @@
29#include "power.h" 31#include "power.h"
30 32
31 33
32static int nocompress = 0; 34static int nocompress;
33static int noresume = 0; 35static int noresume;
36static int resume_wait;
37static int resume_delay;
34static char resume_file[256] = CONFIG_PM_STD_PARTITION; 38static char resume_file[256] = CONFIG_PM_STD_PARTITION;
35dev_t swsusp_resume_device; 39dev_t swsusp_resume_device;
36sector_t swsusp_resume_block; 40sector_t swsusp_resume_block;
37int in_suspend __nosavedata = 0; 41int in_suspend __nosavedata;
38 42
39enum { 43enum {
40 HIBERNATION_INVALID, 44 HIBERNATION_INVALID,
@@ -51,6 +55,8 @@ enum {
51 55
52static int hibernation_mode = HIBERNATION_SHUTDOWN; 56static int hibernation_mode = HIBERNATION_SHUTDOWN;
53 57
58static bool freezer_test_done;
59
54static const struct platform_hibernation_ops *hibernation_ops; 60static const struct platform_hibernation_ops *hibernation_ops;
55 61
56/** 62/**
@@ -334,14 +340,31 @@ int hibernation_snapshot(int platform_mode)
334 if (error) 340 if (error)
335 goto Close; 341 goto Close;
336 342
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
341 /* Preallocate image memory before shutting down devices. */ 343 /* Preallocate image memory before shutting down devices. */
342 error = hibernate_preallocate_memory(); 344 error = hibernate_preallocate_memory();
343 if (error) 345 if (error)
344 goto Complete_devices; 346 goto Close;
347
348 error = freeze_kernel_threads();
349 if (error)
350 goto Cleanup;
351
352 if (hibernation_test(TEST_FREEZER) ||
353 hibernation_testmode(HIBERNATION_TESTPROC)) {
354
355 /*
356 * Indicate to the caller that we are returning due to a
357 * successful freezer test.
358 */
359 freezer_test_done = true;
360 goto Cleanup;
361 }
362
363 error = dpm_prepare(PMSG_FREEZE);
364 if (error) {
365 dpm_complete(msg);
366 goto Cleanup;
367 }
345 368
346 suspend_console(); 369 suspend_console();
347 pm_restrict_gfp_mask(); 370 pm_restrict_gfp_mask();
@@ -370,8 +393,6 @@ int hibernation_snapshot(int platform_mode)
370 pm_restore_gfp_mask(); 393 pm_restore_gfp_mask();
371 394
372 resume_console(); 395 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg); 396 dpm_complete(msg);
376 397
377 Close: 398 Close:
@@ -381,6 +402,10 @@ int hibernation_snapshot(int platform_mode)
381 Recover_platform: 402 Recover_platform:
382 platform_recover(platform_mode); 403 platform_recover(platform_mode);
383 goto Resume_devices; 404 goto Resume_devices;
405
406 Cleanup:
407 swsusp_free();
408 goto Close;
384} 409}
385 410
386/** 411/**
@@ -463,7 +488,7 @@ static int resume_target_kernel(bool platform_mode)
463 * @platform_mode: If set, use platform driver to prepare for the transition. 488 * @platform_mode: If set, use platform driver to prepare for the transition.
464 * 489 *
465 * This routine must be called with pm_mutex held. If it is successful, control 490 * This routine must be called with pm_mutex held. If it is successful, control
466 * reappears in the restored target kernel in hibernation_snaphot(). 491 * reappears in the restored target kernel in hibernation_snapshot().
467 */ 492 */
468int hibernation_restore(int platform_mode) 493int hibernation_restore(int platform_mode)
469{ 494{
@@ -633,15 +658,13 @@ int hibernate(void)
633 if (error) 658 if (error)
634 goto Finish; 659 goto Finish;
635 660
636 if (hibernation_test(TEST_FREEZER))
637 goto Thaw;
638
639 if (hibernation_testmode(HIBERNATION_TESTPROC))
640 goto Thaw;
641
642 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 661 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
643 if (error) 662 if (error)
644 goto Thaw; 663 goto Thaw;
664 if (freezer_test_done) {
665 freezer_test_done = false;
666 goto Thaw;
667 }
645 668
646 if (in_suspend) { 669 if (in_suspend) {
647 unsigned int flags = 0; 670 unsigned int flags = 0;
@@ -650,6 +673,9 @@ int hibernate(void)
650 flags |= SF_PLATFORM_MODE; 673 flags |= SF_PLATFORM_MODE;
651 if (nocompress) 674 if (nocompress)
652 flags |= SF_NOCOMPRESS_MODE; 675 flags |= SF_NOCOMPRESS_MODE;
676 else
677 flags |= SF_CRC32_MODE;
678
653 pr_debug("PM: writing image.\n"); 679 pr_debug("PM: writing image.\n");
654 error = swsusp_write(flags); 680 error = swsusp_write(flags);
655 swsusp_free(); 681 swsusp_free();
@@ -724,6 +750,12 @@ static int software_resume(void)
724 750
725 pr_debug("PM: Checking hibernation image partition %s\n", resume_file); 751 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
726 752
753 if (resume_delay) {
754 printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
755 resume_delay);
756 ssleep(resume_delay);
757 }
758
727 /* Check if the device is there */ 759 /* Check if the device is there */
728 swsusp_resume_device = name_to_dev_t(resume_file); 760 swsusp_resume_device = name_to_dev_t(resume_file);
729 if (!swsusp_resume_device) { 761 if (!swsusp_resume_device) {
@@ -732,6 +764,13 @@ static int software_resume(void)
732 * to wait for this to finish. 764 * to wait for this to finish.
733 */ 765 */
734 wait_for_device_probe(); 766 wait_for_device_probe();
767
768 if (resume_wait) {
769 while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
770 msleep(10);
771 async_synchronize_full();
772 }
773
735 /* 774 /*
736 * We can't depend on SCSI devices being available after loading 775 * We can't depend on SCSI devices being available after loading
737 * one of their modules until scsi_complete_async_scans() is 776 * one of their modules until scsi_complete_async_scans() is
@@ -1060,7 +1099,21 @@ static int __init noresume_setup(char *str)
1060 return 1; 1099 return 1;
1061} 1100}
1062 1101
1102static int __init resumewait_setup(char *str)
1103{
1104 resume_wait = 1;
1105 return 1;
1106}
1107
1108static int __init resumedelay_setup(char *str)
1109{
1110 resume_delay = simple_strtoul(str, NULL, 0);
1111 return 1;
1112}
1113
1063__setup("noresume", noresume_setup); 1114__setup("noresume", noresume_setup);
1064__setup("resume_offset=", resume_offset_setup); 1115__setup("resume_offset=", resume_offset_setup);
1065__setup("resume=", resume_setup); 1116__setup("resume=", resume_setup);
1066__setup("hibernate=", hibernate_setup); 1117__setup("hibernate=", hibernate_setup);
1118__setup("resumewait", resumewait_setup);
1119__setup("resumedelay=", resumedelay_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6c601f87196..36e0f0903c3 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,10 +8,13 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/export.h>
11#include <linux/kobject.h> 12#include <linux/kobject.h>
12#include <linux/string.h> 13#include <linux/string.h>
13#include <linux/resume-trace.h> 14#include <linux/resume-trace.h>
14#include <linux/workqueue.h> 15#include <linux/workqueue.h>
16#include <linux/debugfs.h>
17#include <linux/seq_file.h>
15 18
16#include "power.h" 19#include "power.h"
17 20
@@ -131,6 +134,101 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
131power_attr(pm_test); 134power_attr(pm_test);
132#endif /* CONFIG_PM_DEBUG */ 135#endif /* CONFIG_PM_DEBUG */
133 136
137#ifdef CONFIG_DEBUG_FS
138static char *suspend_step_name(enum suspend_stat_step step)
139{
140 switch (step) {
141 case SUSPEND_FREEZE:
142 return "freeze";
143 case SUSPEND_PREPARE:
144 return "prepare";
145 case SUSPEND_SUSPEND:
146 return "suspend";
147 case SUSPEND_SUSPEND_NOIRQ:
148 return "suspend_noirq";
149 case SUSPEND_RESUME_NOIRQ:
150 return "resume_noirq";
151 case SUSPEND_RESUME:
152 return "resume";
153 default:
154 return "";
155 }
156}
157
158static int suspend_stats_show(struct seq_file *s, void *unused)
159{
160 int i, index, last_dev, last_errno, last_step;
161
162 last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
163 last_dev %= REC_FAILED_NUM;
164 last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_noirq",
176 suspend_stats.failed_suspend_noirq,
177 "failed_resume", suspend_stats.failed_resume,
178 "failed_resume_noirq",
179 suspend_stats.failed_resume_noirq);
180 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
181 suspend_stats.failed_devs[last_dev]);
182 for (i = 1; i < REC_FAILED_NUM; i++) {
183 index = last_dev + REC_FAILED_NUM - i;
184 index %= REC_FAILED_NUM;
185 seq_printf(s, "\t\t\t%-s\n",
186 suspend_stats.failed_devs[index]);
187 }
188 seq_printf(s, " last_failed_errno:\t%-d\n",
189 suspend_stats.errno[last_errno]);
190 for (i = 1; i < REC_FAILED_NUM; i++) {
191 index = last_errno + REC_FAILED_NUM - i;
192 index %= REC_FAILED_NUM;
193 seq_printf(s, "\t\t\t%-d\n",
194 suspend_stats.errno[index]);
195 }
196 seq_printf(s, " last_failed_step:\t%-s\n",
197 suspend_step_name(
198 suspend_stats.failed_steps[last_step]));
199 for (i = 1; i < REC_FAILED_NUM; i++) {
200 index = last_step + REC_FAILED_NUM - i;
201 index %= REC_FAILED_NUM;
202 seq_printf(s, "\t\t\t%-s\n",
203 suspend_step_name(
204 suspend_stats.failed_steps[index]));
205 }
206
207 return 0;
208}
209
210static int suspend_stats_open(struct inode *inode, struct file *file)
211{
212 return single_open(file, suspend_stats_show, NULL);
213}
214
215static const struct file_operations suspend_stats_operations = {
216 .open = suspend_stats_open,
217 .read = seq_read,
218 .llseek = seq_lseek,
219 .release = single_release,
220};
221
222static int __init pm_debugfs_init(void)
223{
224 debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
225 NULL, NULL, &suspend_stats_operations);
226 return 0;
227}
228
229late_initcall(pm_debugfs_init);
230#endif /* CONFIG_DEBUG_FS */
231
134#endif /* CONFIG_PM_SLEEP */ 232#endif /* CONFIG_PM_SLEEP */
135 233
136struct kobject *power_kobj; 234struct kobject *power_kobj;
@@ -192,8 +290,14 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
192 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
193 break; 291 break;
194 } 292 }
195 if (state < PM_SUSPEND_MAX && *s) 293 if (state < PM_SUSPEND_MAX && *s) {
196 error = enter_state(state); 294 error = enter_state(state);
295 if (error) {
296 suspend_stats.fail++;
297 dpm_save_failed_errno(error);
298 } else
299 suspend_stats.success++;
300 }
197#endif 301#endif
198 302
199 Exit: 303 Exit:
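The new debugfs file walks each REC_FAILED_NUM-entry history as a ring buffer: last_failed_* points at the next free slot, so the most recent entry lives at (index + REC_FAILED_NUM - 1) % REC_FAILED_NUM and the loop then steps backwards. A small user-space demo of the index math (hypothetical values, not kernel code):

    #include <stdio.h>

    #define REC_FAILED_NUM 2

    int main(void)
    {
            int errno_buf[REC_FAILED_NUM] = { -16, -11 };  /* filled in insertion order */
            int next_free = 0;   /* like suspend_stats.last_failed_errno after wrapping */
            int last, i, index;

            last = (next_free + REC_FAILED_NUM - 1) % REC_FAILED_NUM;
            printf("last_failed_errno: %d\n", errno_buf[last]);      /* -11, most recent */
            for (i = 1; i < REC_FAILED_NUM; i++) {
                    index = (last + REC_FAILED_NUM - i) % REC_FAILED_NUM;
                    printf("                   %d\n", errno_buf[index]);  /* older entries */
            }
            return 0;
    }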
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9a00a0a2628..23a2db1ec44 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -146,6 +146,7 @@ extern int swsusp_swap_in_use(void);
146 */ 146 */
147#define SF_PLATFORM_MODE 1 147#define SF_PLATFORM_MODE 1
148#define SF_NOCOMPRESS_MODE 2 148#define SF_NOCOMPRESS_MODE 2
149#define SF_CRC32_MODE 4
149 150
150/* kernel/power/hibernate.c */ 151/* kernel/power/hibernate.c */
151extern int swsusp_check(void); 152extern int swsusp_check(void);
@@ -228,7 +229,8 @@ extern int pm_test_level;
228#ifdef CONFIG_SUSPEND_FREEZER 229#ifdef CONFIG_SUSPEND_FREEZER
229static inline int suspend_freeze_processes(void) 230static inline int suspend_freeze_processes(void)
230{ 231{
231 return freeze_processes(); 232 int error = freeze_processes();
233 return error ? : freeze_kernel_threads();
232} 234}
233 235
234static inline void suspend_thaw_processes(void) 236static inline void suspend_thaw_processes(void)
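suspend_freeze_processes() now chains the two freeze stages with GCC's binary ?: extension: "error ? : freeze_kernel_threads()" evaluates to error when it is non-zero and otherwise to the result of freezing kernel threads, so the second stage only runs if the first succeeded. A tiny user-space demo of the operator (hypothetical, builds with GCC/Clang):

    #include <stdio.h>

    static int step2(void)
    {
            puts("step2 ran");
            return 0;
    }

    static int run(int step1_err)
    {
            return step1_err ? : step2();  /* same as: step1_err ? step1_err : step2() */
    }

    int main(void)
    {
            printf("ok path:  %d\n", run(0));   /* step2 ran, returns 0 */
            printf("err path: %d\n", run(-1));  /* step2 skipped, returns -1 */
            return 0;
    }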
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0cf3a27a6c9..addbbe5531b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -135,7 +135,7 @@ static int try_to_freeze_tasks(bool sig_only)
135} 135}
136 136
137/** 137/**
138 * freeze_processes - tell processes to enter the refrigerator 138 * freeze_processes - Signal user space processes to enter the refrigerator.
139 */ 139 */
140int freeze_processes(void) 140int freeze_processes(void)
141{ 141{
@@ -143,20 +143,30 @@ int freeze_processes(void)
143 143
144 printk("Freezing user space processes ... "); 144 printk("Freezing user space processes ... ");
145 error = try_to_freeze_tasks(true); 145 error = try_to_freeze_tasks(true);
146 if (error) 146 if (!error) {
147 goto Exit; 147 printk("done.");
148 printk("done.\n"); 148 oom_killer_disable();
149 }
150 printk("\n");
151 BUG_ON(in_atomic());
152
153 return error;
154}
155
156/**
157 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
158 */
159int freeze_kernel_threads(void)
160{
161 int error;
149 162
150 printk("Freezing remaining freezable tasks ... "); 163 printk("Freezing remaining freezable tasks ... ");
151 error = try_to_freeze_tasks(false); 164 error = try_to_freeze_tasks(false);
152 if (error) 165 if (!error)
153 goto Exit; 166 printk("done.");
154 printk("done.");
155 167
156 oom_killer_disable();
157 Exit:
158 BUG_ON(in_atomic());
159 printk("\n"); 168 printk("\n");
169 BUG_ON(in_atomic());
160 170
161 return error; 171 return error;
162} 172}
diff --git a/kernel/pm_qos_params.c b/kernel/power/qos.c
index 37f05d0f079..995e3bd3417 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/power/qos.c
@@ -29,7 +29,7 @@
29 29
30/*#define DEBUG*/ 30/*#define DEBUG*/
31 31
32#include <linux/pm_qos_params.h> 32#include <linux/pm_qos.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/spinlock.h> 34#include <linux/spinlock.h>
35#include <linux/slab.h> 35#include <linux/slab.h>
@@ -43,64 +43,61 @@
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44 44
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/export.h>
46 47
47/* 48/*
48 * locking rule: all changes to requests or notifiers lists 49 * locking rule: all changes to constraints or notifiers lists
49 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 50 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
50 * held, taken with _irqsave. One lock to rule them all 51 * held, taken with _irqsave. One lock to rule them all
51 */ 52 */
52enum pm_qos_type {
53 PM_QOS_MAX, /* return the largest value */
54 PM_QOS_MIN /* return the smallest value */
55};
56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
60 * types linux supports for 32 bit quantites
61 */
62struct pm_qos_object { 53struct pm_qos_object {
63 struct plist_head requests; 54 struct pm_qos_constraints *constraints;
64 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 55 struct miscdevice pm_qos_power_miscdev;
66 char *name; 56 char *name;
67 s32 target_value; /* Do not change to 64 bit */
68 s32 default_value;
69 enum pm_qos_type type;
70}; 57};
71 58
72static DEFINE_SPINLOCK(pm_qos_lock); 59static DEFINE_SPINLOCK(pm_qos_lock);
73 60
74static struct pm_qos_object null_pm_qos; 61static struct pm_qos_object null_pm_qos;
62
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 63static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 64static struct pm_qos_constraints cpu_dma_constraints = {
77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), 65 .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
78 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency",
80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 66 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 67 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
82 .type = PM_QOS_MIN, 68 .type = PM_QOS_MIN,
69 .notifiers = &cpu_dma_lat_notifier,
70};
71static struct pm_qos_object cpu_dma_pm_qos = {
72 .constraints = &cpu_dma_constraints,
73 .name = "cpu_dma_latency",
83}; 74};
84 75
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 76static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 77static struct pm_qos_constraints network_lat_constraints = {
87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), 78 .list = PLIST_HEAD_INIT(network_lat_constraints.list),
88 .notifiers = &network_lat_notifier,
89 .name = "network_latency",
90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 79 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 80 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
92 .type = PM_QOS_MIN 81 .type = PM_QOS_MIN,
82 .notifiers = &network_lat_notifier,
83};
84static struct pm_qos_object network_lat_pm_qos = {
85 .constraints = &network_lat_constraints,
86 .name = "network_latency",
93}; 87};
94 88
95 89
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 90static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 91static struct pm_qos_constraints network_tput_constraints = {
98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), 92 .list = PLIST_HEAD_INIT(network_tput_constraints.list),
99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput",
101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 93 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 94 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
103 .type = PM_QOS_MAX, 95 .type = PM_QOS_MAX,
96 .notifiers = &network_throughput_notifier,
97};
98static struct pm_qos_object network_throughput_pm_qos = {
99 .constraints = &network_tput_constraints,
100 .name = "network_throughput",
104}; 101};
105 102
106 103
@@ -127,17 +124,17 @@ static const struct file_operations pm_qos_power_fops = {
127}; 124};
128 125
129/* unlocked internal variant */ 126/* unlocked internal variant */
130static inline int pm_qos_get_value(struct pm_qos_object *o) 127static inline int pm_qos_get_value(struct pm_qos_constraints *c)
131{ 128{
132 if (plist_head_empty(&o->requests)) 129 if (plist_head_empty(&c->list))
133 return o->default_value; 130 return c->default_value;
134 131
135 switch (o->type) { 132 switch (c->type) {
136 case PM_QOS_MIN: 133 case PM_QOS_MIN:
137 return plist_first(&o->requests)->prio; 134 return plist_first(&c->list)->prio;
138 135
139 case PM_QOS_MAX: 136 case PM_QOS_MAX:
140 return plist_last(&o->requests)->prio; 137 return plist_last(&c->list)->prio;
141 138
142 default: 139 default:
143 /* runtime check for not using enum */ 140 /* runtime check for not using enum */
@@ -145,69 +142,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
145 } 142 }
146} 143}
147 144
148static inline s32 pm_qos_read_value(struct pm_qos_object *o) 145s32 pm_qos_read_value(struct pm_qos_constraints *c)
149{ 146{
150 return o->target_value; 147 return c->target_value;
151} 148}
152 149
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) 150static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
154{ 151{
155 o->target_value = value; 152 c->target_value = value;
156} 153}
157 154
158static void update_target(struct pm_qos_object *o, struct plist_node *node, 155/**
159 int del, int value) 156 * pm_qos_update_target - manages the constraints list and calls the notifiers
157 * if needed
158 * @c: constraints data struct
159 * @node: request to add to the list, to update or to remove
160 * @action: action to take on the constraints list
161 * @value: value of the request to add or update
162 *
163 * This function returns 1 if the aggregated constraint value has changed, 0
164 * otherwise.
165 */
166int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
167 enum pm_qos_req_action action, int value)
160{ 168{
161 unsigned long flags; 169 unsigned long flags;
162 int prev_value, curr_value; 170 int prev_value, curr_value, new_value;
163 171
164 spin_lock_irqsave(&pm_qos_lock, flags); 172 spin_lock_irqsave(&pm_qos_lock, flags);
165 prev_value = pm_qos_get_value(o); 173 prev_value = pm_qos_get_value(c);
166 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ 174 if (value == PM_QOS_DEFAULT_VALUE)
167 if (value != PM_QOS_DEFAULT_VALUE) { 175 new_value = c->default_value;
176 else
177 new_value = value;
178
179 switch (action) {
180 case PM_QOS_REMOVE_REQ:
181 plist_del(node, &c->list);
182 break;
183 case PM_QOS_UPDATE_REQ:
168 /* 184 /*
169 * to change the list, we atomically remove, reinit 185 * to change the list, we atomically remove, reinit
170 * with new value and add, then see if the extremal 186 * with new value and add, then see if the extremal
171 * changed 187 * changed
172 */ 188 */
173 plist_del(node, &o->requests); 189 plist_del(node, &c->list);
174 plist_node_init(node, value); 190 case PM_QOS_ADD_REQ:
175 plist_add(node, &o->requests); 191 plist_node_init(node, new_value);
176 } else if (del) { 192 plist_add(node, &c->list);
177 plist_del(node, &o->requests); 193 break;
178 } else { 194 default:
179 plist_add(node, &o->requests); 195 /* no action */
196 ;
180 } 197 }
181 curr_value = pm_qos_get_value(o); 198
182 pm_qos_set_value(o, curr_value); 199 curr_value = pm_qos_get_value(c);
200 pm_qos_set_value(c, curr_value);
201
183 spin_unlock_irqrestore(&pm_qos_lock, flags); 202 spin_unlock_irqrestore(&pm_qos_lock, flags);
184 203
185 if (prev_value != curr_value) 204 if (prev_value != curr_value) {
186 blocking_notifier_call_chain(o->notifiers, 205 blocking_notifier_call_chain(c->notifiers,
187 (unsigned long)curr_value, 206 (unsigned long)curr_value,
188 NULL); 207 NULL);
189} 208 return 1;
190 209 } else {
191static int register_pm_qos_misc(struct pm_qos_object *qos) 210 return 0;
192{
193 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
194 qos->pm_qos_power_miscdev.name = qos->name;
195 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
196
197 return misc_register(&qos->pm_qos_power_miscdev);
198}
199
200static int find_pm_qos_object_by_minor(int minor)
201{
202 int pm_qos_class;
203
204 for (pm_qos_class = 0;
205 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
206 if (minor ==
207 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
208 return pm_qos_class;
209 } 211 }
210 return -1;
211} 212}
212 213
213/** 214/**
@@ -218,11 +219,11 @@ static int find_pm_qos_object_by_minor(int minor)
218 */ 219 */
219int pm_qos_request(int pm_qos_class) 220int pm_qos_request(int pm_qos_class)
220{ 221{
221 return pm_qos_read_value(pm_qos_array[pm_qos_class]); 222 return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
222} 223}
223EXPORT_SYMBOL_GPL(pm_qos_request); 224EXPORT_SYMBOL_GPL(pm_qos_request);
224 225
225int pm_qos_request_active(struct pm_qos_request_list *req) 226int pm_qos_request_active(struct pm_qos_request *req)
226{ 227{
227 return req->pm_qos_class != 0; 228 return req->pm_qos_class != 0;
228} 229}
@@ -230,40 +231,36 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active);
230 231
231/** 232/**
232 * pm_qos_add_request - inserts new qos request into the list 233 * pm_qos_add_request - inserts new qos request into the list
233 * @dep: pointer to a preallocated handle 234 * @req: pointer to a preallocated handle
234 * @pm_qos_class: identifies which list of qos request to use 235 * @pm_qos_class: identifies which list of qos request to use
235 * @value: defines the qos request 236 * @value: defines the qos request
236 * 237 *
237 * This function inserts a new entry in the pm_qos_class list of requested qos 238 * This function inserts a new entry in the pm_qos_class list of requested qos
238 * performance characteristics. It recomputes the aggregate QoS expectations 239 * performance characteristics. It recomputes the aggregate QoS expectations
239 * for the pm_qos_class of parameters and initializes the pm_qos_request_list 240 * for the pm_qos_class of parameters and initializes the pm_qos_request
240 * handle. Caller needs to save this handle for later use in updates and 241 * handle. Caller needs to save this handle for later use in updates and
241 * removal. 242 * removal.
242 */ 243 */
243 244
244void pm_qos_add_request(struct pm_qos_request_list *dep, 245void pm_qos_add_request(struct pm_qos_request *req,
245 int pm_qos_class, s32 value) 246 int pm_qos_class, s32 value)
246{ 247{
247 struct pm_qos_object *o = pm_qos_array[pm_qos_class]; 248 if (!req) /*guard against callers passing in null */
248 int new_value; 249 return;
249 250
250 if (pm_qos_request_active(dep)) { 251 if (pm_qos_request_active(req)) {
251 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); 252 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
252 return; 253 return;
253 } 254 }
254 if (value == PM_QOS_DEFAULT_VALUE) 255 req->pm_qos_class = pm_qos_class;
255 new_value = o->default_value; 256 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
256 else 257 &req->node, PM_QOS_ADD_REQ, value);
257 new_value = value;
258 plist_node_init(&dep->list, new_value);
259 dep->pm_qos_class = pm_qos_class;
260 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
261} 258}
262EXPORT_SYMBOL_GPL(pm_qos_add_request); 259EXPORT_SYMBOL_GPL(pm_qos_add_request);
263 260
264/** 261/**
265 * pm_qos_update_request - modifies an existing qos request 262 * pm_qos_update_request - modifies an existing qos request
266 * @pm_qos_req : handle to list element holding a pm_qos request to use 263 * @req : handle to list element holding a pm_qos request to use
267 * @value: defines the qos request 264 * @value: defines the qos request
268 * 265 *
269 * Updates an existing qos request for the pm_qos_class of parameters along 266 * Updates an existing qos request for the pm_qos_class of parameters along
@@ -271,56 +268,47 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
271 * 268 *
272 * Attempts are made to make this code callable on hot code paths. 269 * Attempts are made to make this code callable on hot code paths.
273 */ 270 */
274void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, 271void pm_qos_update_request(struct pm_qos_request *req,
275 s32 new_value) 272 s32 new_value)
276{ 273{
277 s32 temp; 274 if (!req) /*guard against callers passing in null */
278 struct pm_qos_object *o;
279
280 if (!pm_qos_req) /*guard against callers passing in null */
281 return; 275 return;
282 276
283 if (!pm_qos_request_active(pm_qos_req)) { 277 if (!pm_qos_request_active(req)) {
284 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); 278 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
285 return; 279 return;
286 } 280 }
287 281
288 o = pm_qos_array[pm_qos_req->pm_qos_class]; 282 if (new_value != req->node.prio)
289 283 pm_qos_update_target(
290 if (new_value == PM_QOS_DEFAULT_VALUE) 284 pm_qos_array[req->pm_qos_class]->constraints,
291 temp = o->default_value; 285 &req->node, PM_QOS_UPDATE_REQ, new_value);
292 else
293 temp = new_value;
294
295 if (temp != pm_qos_req->list.prio)
296 update_target(o, &pm_qos_req->list, 0, temp);
297} 286}
298EXPORT_SYMBOL_GPL(pm_qos_update_request); 287EXPORT_SYMBOL_GPL(pm_qos_update_request);
299 288
300/** 289/**
301 * pm_qos_remove_request - modifies an existing qos request 290 * pm_qos_remove_request - modifies an existing qos request
302 * @pm_qos_req: handle to request list element 291 * @req: handle to request list element
303 * 292 *
304 * Will remove pm qos request from the list of requests and 293 * Will remove pm qos request from the list of constraints and
305 * recompute the current target value for the pm_qos_class. Call this 294 * recompute the current target value for the pm_qos_class. Call this
306 * on slow code paths. 295 * on slow code paths.
307 */ 296 */
308void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) 297void pm_qos_remove_request(struct pm_qos_request *req)
309{ 298{
310 struct pm_qos_object *o; 299 if (!req) /*guard against callers passing in null */
311
312 if (pm_qos_req == NULL)
313 return; 300 return;
314 /* silent return to keep pcm code cleaner */ 301 /* silent return to keep pcm code cleaner */
315 302
316 if (!pm_qos_request_active(pm_qos_req)) { 303 if (!pm_qos_request_active(req)) {
317 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); 304 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
318 return; 305 return;
319 } 306 }
320 307
321 o = pm_qos_array[pm_qos_req->pm_qos_class]; 308 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
322 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); 309 &req->node, PM_QOS_REMOVE_REQ,
323 memset(pm_qos_req, 0, sizeof(*pm_qos_req)); 310 PM_QOS_DEFAULT_VALUE);
311 memset(req, 0, sizeof(*req));
324} 312}
325EXPORT_SYMBOL_GPL(pm_qos_remove_request); 313EXPORT_SYMBOL_GPL(pm_qos_remove_request);
326 314
@@ -337,7 +325,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
337 int retval; 325 int retval;
338 326
339 retval = blocking_notifier_chain_register( 327 retval = blocking_notifier_chain_register(
340 pm_qos_array[pm_qos_class]->notifiers, notifier); 328 pm_qos_array[pm_qos_class]->constraints->notifiers,
329 notifier);
341 330
342 return retval; 331 return retval;
343} 332}
@@ -356,34 +345,57 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
356 int retval; 345 int retval;
357 346
358 retval = blocking_notifier_chain_unregister( 347 retval = blocking_notifier_chain_unregister(
359 pm_qos_array[pm_qos_class]->notifiers, notifier); 348 pm_qos_array[pm_qos_class]->constraints->notifiers,
349 notifier);
360 350
361 return retval; 351 return retval;
362} 352}
363EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 353EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
364 354
355/* User space interface to PM QoS classes via misc devices */
356static int register_pm_qos_misc(struct pm_qos_object *qos)
357{
358 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
359 qos->pm_qos_power_miscdev.name = qos->name;
360 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
361
362 return misc_register(&qos->pm_qos_power_miscdev);
363}
364
365static int find_pm_qos_object_by_minor(int minor)
366{
367 int pm_qos_class;
368
369 for (pm_qos_class = 0;
370 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
371 if (minor ==
372 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
373 return pm_qos_class;
374 }
375 return -1;
376}
377
365static int pm_qos_power_open(struct inode *inode, struct file *filp) 378static int pm_qos_power_open(struct inode *inode, struct file *filp)
366{ 379{
367 long pm_qos_class; 380 long pm_qos_class;
368 381
369 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 382 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
370 if (pm_qos_class >= 0) { 383 if (pm_qos_class >= 0) {
371 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); 384 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
372 if (!req) 385 if (!req)
373 return -ENOMEM; 386 return -ENOMEM;
374 387
375 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); 388 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
376 filp->private_data = req; 389 filp->private_data = req;
377 390
378 if (filp->private_data) 391 return 0;
379 return 0;
380 } 392 }
381 return -EPERM; 393 return -EPERM;
382} 394}
383 395
384static int pm_qos_power_release(struct inode *inode, struct file *filp) 396static int pm_qos_power_release(struct inode *inode, struct file *filp)
385{ 397{
386 struct pm_qos_request_list *req; 398 struct pm_qos_request *req;
387 399
388 req = filp->private_data; 400 req = filp->private_data;
389 pm_qos_remove_request(req); 401 pm_qos_remove_request(req);
@@ -398,17 +410,15 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
398{ 410{
399 s32 value; 411 s32 value;
400 unsigned long flags; 412 unsigned long flags;
401 struct pm_qos_object *o; 413 struct pm_qos_request *req = filp->private_data;
402 struct pm_qos_request_list *pm_qos_req = filp->private_data;
403 414
404 if (!pm_qos_req) 415 if (!req)
405 return -EINVAL; 416 return -EINVAL;
406 if (!pm_qos_request_active(pm_qos_req)) 417 if (!pm_qos_request_active(req))
407 return -EINVAL; 418 return -EINVAL;
408 419
409 o = pm_qos_array[pm_qos_req->pm_qos_class];
410 spin_lock_irqsave(&pm_qos_lock, flags); 420 spin_lock_irqsave(&pm_qos_lock, flags);
411 value = pm_qos_get_value(o); 421 value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
412 spin_unlock_irqrestore(&pm_qos_lock, flags); 422 spin_unlock_irqrestore(&pm_qos_lock, flags);
413 423
414 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); 424 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
@@ -418,7 +428,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
418 size_t count, loff_t *f_pos) 428 size_t count, loff_t *f_pos)
419{ 429{
420 s32 value; 430 s32 value;
421 struct pm_qos_request_list *pm_qos_req; 431 struct pm_qos_request *req;
422 432
423 if (count == sizeof(s32)) { 433 if (count == sizeof(s32)) {
424 if (copy_from_user(&value, buf, sizeof(s32))) 434 if (copy_from_user(&value, buf, sizeof(s32)))
@@ -449,8 +459,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
449 return -EINVAL; 459 return -EINVAL;
450 } 460 }
451 461
452 pm_qos_req = filp->private_data; 462 req = filp->private_data;
453 pm_qos_update_request(pm_qos_req, value); 463 pm_qos_update_request(req, value);
454 464
455 return count; 465 return count;
456} 466}
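With the rework, the per-class plists move into struct pm_qos_constraints and all list manipulation funnels through pm_qos_update_target() with an explicit add/update/remove action, while callers hold a struct pm_qos_request. From a driver's point of view the request API keeps the same shape; a hedged kernel-style sketch (the handle name and latency values are hypothetical, calls shown out of context):

    static struct pm_qos_request my_dma_req;    /* hypothetical driver-owned handle */

    /* cap CPU DMA latency at 20 us while the device is streaming */
    pm_qos_add_request(&my_dma_req, PM_QOS_CPU_DMA_LATENCY, 20);

    /* relax the bound when load drops */
    pm_qos_update_request(&my_dma_req, 200);

    /* drop the constraint entirely when the device goes idle */
    pm_qos_remove_request(&my_dma_req);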
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 06efa54f93d..cbe2c144139 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1339,6 +1339,9 @@ int hibernate_preallocate_memory(void)
1339 count += highmem; 1339 count += highmem;
1340 count -= totalreserve_pages; 1340 count -= totalreserve_pages;
1341 1341
1342 /* Add number of pages required for page keys (s390 only). */
1343 size += page_key_additional_pages(saveable);
1344
1342 /* Compute the maximum number of saveable pages to leave in memory. */ 1345 /* Compute the maximum number of saveable pages to leave in memory. */
1343 max_size = (count - (size + PAGES_FOR_IO)) / 2 1346 max_size = (count - (size + PAGES_FOR_IO)) / 2
1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); 1347 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
@@ -1662,6 +1665,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1662 buf[j] = memory_bm_next_pfn(bm); 1665 buf[j] = memory_bm_next_pfn(bm);
1663 if (unlikely(buf[j] == BM_END_OF_MAP)) 1666 if (unlikely(buf[j] == BM_END_OF_MAP))
1664 break; 1667 break;
1668 /* Save page key for data page (s390 only). */
1669 page_key_read(buf + j);
1665 } 1670 }
1666} 1671}
1667 1672
@@ -1821,6 +1826,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1821 if (unlikely(buf[j] == BM_END_OF_MAP)) 1826 if (unlikely(buf[j] == BM_END_OF_MAP))
1822 break; 1827 break;
1823 1828
1829 /* Extract and buffer page key for data page (s390 only). */
1830 page_key_memorize(buf + j);
1831
1824 if (memory_bm_pfn_present(bm, buf[j])) 1832 if (memory_bm_pfn_present(bm, buf[j]))
1825 memory_bm_set_bit(bm, buf[j]); 1833 memory_bm_set_bit(bm, buf[j]);
1826 else 1834 else
@@ -2223,6 +2231,11 @@ int snapshot_write_next(struct snapshot_handle *handle)
2223 if (error) 2231 if (error)
2224 return error; 2232 return error;
2225 2233
2234 /* Allocate buffer for page keys. */
2235 error = page_key_alloc(nr_copy_pages);
2236 if (error)
2237 return error;
2238
2226 } else if (handle->cur <= nr_meta_pages + 1) { 2239 } else if (handle->cur <= nr_meta_pages + 1) {
2227 error = unpack_orig_pfns(buffer, &copy_bm); 2240 error = unpack_orig_pfns(buffer, &copy_bm);
2228 if (error) 2241 if (error)
@@ -2243,6 +2256,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
2243 } 2256 }
2244 } else { 2257 } else {
2245 copy_last_highmem_page(); 2258 copy_last_highmem_page();
2259 /* Restore page key for data page (s390 only). */
2260 page_key_write(handle->buffer);
2246 handle->buffer = get_buffer(&orig_bm, &ca); 2261 handle->buffer = get_buffer(&orig_bm, &ca);
2247 if (IS_ERR(handle->buffer)) 2262 if (IS_ERR(handle->buffer))
2248 return PTR_ERR(handle->buffer); 2263 return PTR_ERR(handle->buffer);
@@ -2264,6 +2279,9 @@ int snapshot_write_next(struct snapshot_handle *handle)
2264void snapshot_write_finalize(struct snapshot_handle *handle) 2279void snapshot_write_finalize(struct snapshot_handle *handle)
2265{ 2280{
2266 copy_last_highmem_page(); 2281 copy_last_highmem_page();
2282 /* Restore page key for data page (s390 only). */
2283 page_key_write(handle->buffer);
2284 page_key_free();
2267 /* Free only if we have loaded the image entirely */ 2285 /* Free only if we have loaded the image entirely */
2268 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { 2286 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2269 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2287 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b6b71ad2208..4953dc054c5 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -12,6 +12,7 @@
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/kmod.h>
15#include <linux/console.h> 16#include <linux/console.h>
16#include <linux/cpu.h> 17#include <linux/cpu.h>
17#include <linux/syscalls.h> 18#include <linux/syscalls.h>
@@ -21,6 +22,7 @@
21#include <linux/list.h> 22#include <linux/list.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/export.h>
24#include <linux/suspend.h> 26#include <linux/suspend.h>
25#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
26#include <trace/events/power.h> 28#include <trace/events/power.h>
@@ -104,7 +106,10 @@ static int suspend_prepare(void)
104 goto Finish; 106 goto Finish;
105 107
106 error = suspend_freeze_processes(); 108 error = suspend_freeze_processes();
107 if (!error) 109 if (error) {
110 suspend_stats.failed_freeze++;
111 dpm_save_failed_step(SUSPEND_FREEZE);
112 } else
108 return 0; 113 return 0;
109 114
110 suspend_thaw_processes(); 115 suspend_thaw_processes();
@@ -315,8 +320,16 @@ int enter_state(suspend_state_t state)
315 */ 320 */
316int pm_suspend(suspend_state_t state) 321int pm_suspend(suspend_state_t state)
317{ 322{
318 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) 323 int ret;
319 return enter_state(state); 324 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) {
325 ret = enter_state(state);
326 if (ret) {
327 suspend_stats.fail++;
328 dpm_save_failed_errno(ret);
329 } else
330 suspend_stats.success++;
331 return ret;
332 }
320 return -EINVAL; 333 return -EINVAL;
321} 334}
322EXPORT_SYMBOL(pm_suspend); 335EXPORT_SYMBOL(pm_suspend);
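pm_suspend() now rejects PM_SUSPEND_MAX itself (state < PM_SUSPEND_MAX instead of <=), since the *_MAX enumerator is a sentinel one past the last valid state, and it records success/failure in suspend_stats. A small demo of the sentinel-range idiom (user-space, hypothetical enum names):

    #include <stdbool.h>
    #include <stdio.h>

    enum demo_state {
            DEMO_ON,        /* not a sleep state */
            DEMO_STANDBY,
            DEMO_MEM,
            DEMO_MAX        /* sentinel: one past the last valid state */
    };

    static bool valid_sleep_state(enum demo_state state)
    {
            return state > DEMO_ON && state < DEMO_MAX;
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   valid_sleep_state(DEMO_MEM),   /* 1 */
                   valid_sleep_state(DEMO_MAX),   /* 0: sentinel rejected */
                   valid_sleep_state(DEMO_ON));   /* 0 */
            return 0;
    }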
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c97c3a0eee..11a594c4ba2 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -27,6 +27,10 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/lzo.h> 28#include <linux/lzo.h>
29#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
30#include <linux/cpumask.h>
31#include <linux/atomic.h>
32#include <linux/kthread.h>
33#include <linux/crc32.h>
30 34
31#include "power.h" 35#include "power.h"
32 36
@@ -43,8 +47,7 @@
43 * allocated and populated one at a time, so we only need one memory 47 * allocated and populated one at a time, so we only need one memory
44 * page to set up the entire structure. 48 * page to set up the entire structure.
45 * 49 *
46 * During resume we also only need to use one swap_map_page structure 50 * During resume we pick up all swap_map_page structures into a list.
47 * at a time.
48 */ 51 */
49 52
50#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) 53#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
@@ -54,6 +57,11 @@ struct swap_map_page {
54 sector_t next_swap; 57 sector_t next_swap;
55}; 58};
56 59
60struct swap_map_page_list {
61 struct swap_map_page *map;
62 struct swap_map_page_list *next;
63};
64
57/** 65/**
58 * The swap_map_handle structure is used for handling swap in 66 * The swap_map_handle structure is used for handling swap in
59 * a file-alike way 67 * a file-alike way
@@ -61,13 +69,18 @@ struct swap_map_page {
61 69
62struct swap_map_handle { 70struct swap_map_handle {
63 struct swap_map_page *cur; 71 struct swap_map_page *cur;
72 struct swap_map_page_list *maps;
64 sector_t cur_swap; 73 sector_t cur_swap;
65 sector_t first_sector; 74 sector_t first_sector;
66 unsigned int k; 75 unsigned int k;
76 unsigned long nr_free_pages, written;
77 u32 crc32;
67}; 78};
68 79
69struct swsusp_header { 80struct swsusp_header {
70 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 81 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
82 sizeof(u32)];
83 u32 crc32;
71 sector_t image; 84 sector_t image;
72 unsigned int flags; /* Flags to pass to the "boot" kernel */ 85 unsigned int flags; /* Flags to pass to the "boot" kernel */
73 char orig_sig[10]; 86 char orig_sig[10];
@@ -199,6 +212,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
199 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); 212 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
200 swsusp_header->image = handle->first_sector; 213 swsusp_header->image = handle->first_sector;
201 swsusp_header->flags = flags; 214 swsusp_header->flags = flags;
215 if (flags & SF_CRC32_MODE)
216 swsusp_header->crc32 = handle->crc32;
202 error = hib_bio_write_page(swsusp_resume_block, 217 error = hib_bio_write_page(swsusp_resume_block,
203 swsusp_header, NULL); 218 swsusp_header, NULL);
204 } else { 219 } else {
@@ -245,6 +260,7 @@ static int swsusp_swap_check(void)
245static int write_page(void *buf, sector_t offset, struct bio **bio_chain) 260static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
246{ 261{
247 void *src; 262 void *src;
263 int ret;
248 264
249 if (!offset) 265 if (!offset)
250 return -ENOSPC; 266 return -ENOSPC;
@@ -254,9 +270,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
254 if (src) { 270 if (src) {
255 copy_page(src, buf); 271 copy_page(src, buf);
256 } else { 272 } else {
257 WARN_ON_ONCE(1); 273 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
258 bio_chain = NULL; /* Go synchronous */ 274 if (ret)
259 src = buf; 275 return ret;
276 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
277 if (src) {
278 copy_page(src, buf);
279 } else {
280 WARN_ON_ONCE(1);
281 bio_chain = NULL; /* Go synchronous */
282 src = buf;
283 }
260 } 284 }
261 } else { 285 } else {
262 src = buf; 286 src = buf;
@@ -293,6 +317,8 @@ static int get_swap_writer(struct swap_map_handle *handle)
293 goto err_rel; 317 goto err_rel;
294 } 318 }
295 handle->k = 0; 319 handle->k = 0;
320 handle->nr_free_pages = nr_free_pages() >> 1;
321 handle->written = 0;
296 handle->first_sector = handle->cur_swap; 322 handle->first_sector = handle->cur_swap;
297 return 0; 323 return 0;
298err_rel: 324err_rel:
@@ -316,20 +342,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
316 return error; 342 return error;
317 handle->cur->entries[handle->k++] = offset; 343 handle->cur->entries[handle->k++] = offset;
318 if (handle->k >= MAP_PAGE_ENTRIES) { 344 if (handle->k >= MAP_PAGE_ENTRIES) {
319 error = hib_wait_on_bio_chain(bio_chain);
320 if (error)
321 goto out;
322 offset = alloc_swapdev_block(root_swap); 345 offset = alloc_swapdev_block(root_swap);
323 if (!offset) 346 if (!offset)
324 return -ENOSPC; 347 return -ENOSPC;
325 handle->cur->next_swap = offset; 348 handle->cur->next_swap = offset;
326 error = write_page(handle->cur, handle->cur_swap, NULL); 349 error = write_page(handle->cur, handle->cur_swap, bio_chain);
327 if (error) 350 if (error)
328 goto out; 351 goto out;
329 clear_page(handle->cur); 352 clear_page(handle->cur);
330 handle->cur_swap = offset; 353 handle->cur_swap = offset;
331 handle->k = 0; 354 handle->k = 0;
332 } 355 }
356 if (bio_chain && ++handle->written > handle->nr_free_pages) {
357 error = hib_wait_on_bio_chain(bio_chain);
358 if (error)
359 goto out;
360 handle->written = 0;
361 }
333 out: 362 out:
334 return error; 363 return error;
335} 364}
@@ -372,6 +401,13 @@ static int swap_writer_finish(struct swap_map_handle *handle,
372 LZO_HEADER, PAGE_SIZE) 401 LZO_HEADER, PAGE_SIZE)
373#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) 402#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
374 403
404/* Maximum number of threads for compression/decompression. */
405#define LZO_THREADS 3
406
407/* Maximum number of pages for read buffering. */
408#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8)
409
410
375/** 411/**
376 * save_image - save the suspend image data 412 * save_image - save the suspend image data
377 */ 413 */
@@ -419,6 +455,92 @@ static int save_image(struct swap_map_handle *handle,
419 return ret; 455 return ret;
420} 456}
421 457
458/**
459 * Structure used for CRC32.
460 */
461struct crc_data {
462 struct task_struct *thr; /* thread */
463 atomic_t ready; /* ready to start flag */
464 atomic_t stop; /* ready to stop flag */
465 unsigned run_threads; /* nr current threads */
466 wait_queue_head_t go; /* start crc update */
467 wait_queue_head_t done; /* crc update done */
468 u32 *crc32; /* points to handle's crc32 */
469 size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
470 unsigned char *unc[LZO_THREADS]; /* uncompressed data */
471};
472
473/**
474 * CRC32 update function that runs in its own thread.
475 */
476static int crc32_threadfn(void *data)
477{
478 struct crc_data *d = data;
479 unsigned i;
480
481 while (1) {
482 wait_event(d->go, atomic_read(&d->ready) ||
483 kthread_should_stop());
484 if (kthread_should_stop()) {
485 d->thr = NULL;
486 atomic_set(&d->stop, 1);
487 wake_up(&d->done);
488 break;
489 }
490 atomic_set(&d->ready, 0);
491
492 for (i = 0; i < d->run_threads; i++)
493 *d->crc32 = crc32_le(*d->crc32,
494 d->unc[i], *d->unc_len[i]);
495 atomic_set(&d->stop, 1);
496 wake_up(&d->done);
497 }
498 return 0;
499}
500/**
501 * Structure used for LZO data compression.
502 */
503struct cmp_data {
504 struct task_struct *thr; /* thread */
505 atomic_t ready; /* ready to start flag */
506 atomic_t stop; /* ready to stop flag */
507 int ret; /* return code */
508 wait_queue_head_t go; /* start compression */
509 wait_queue_head_t done; /* compression done */
510 size_t unc_len; /* uncompressed length */
511 size_t cmp_len; /* compressed length */
512 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
513 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
514 unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
515};
516
517/**
518 * Compression function that runs in its own thread.
519 */
520static int lzo_compress_threadfn(void *data)
521{
522 struct cmp_data *d = data;
523
524 while (1) {
525 wait_event(d->go, atomic_read(&d->ready) ||
526 kthread_should_stop());
527 if (kthread_should_stop()) {
528 d->thr = NULL;
529 d->ret = -1;
530 atomic_set(&d->stop, 1);
531 wake_up(&d->done);
532 break;
533 }
534 atomic_set(&d->ready, 0);
535
536 d->ret = lzo1x_1_compress(d->unc, d->unc_len,
537 d->cmp + LZO_HEADER, &d->cmp_len,
538 d->wrk);
539 atomic_set(&d->stop, 1);
540 wake_up(&d->done);
541 }
542 return 0;
543}
422 544
423/** 545/**
424 * save_image_lzo - Save the suspend image data compressed with LZO. 546 * save_image_lzo - Save the suspend image data compressed with LZO.
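The compression and CRC32 workers share a simple handshake: the producer fills a per-thread buffer, sets the atomic ready flag and wakes the worker's go queue; the worker does its job, sets stop and wakes done; kthread_should_stop() ends the loop. A rough user-space analog of that handshake with one worker (hypothetical pthread demo, squaring a number stands in for lzo1x_1_compress(); build with -pthread):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct worker {
            pthread_t thr;
            pthread_mutex_t lock;
            pthread_cond_t go, done;
            bool ready, stop, exiting;
            int input, result;
    };

    static void *worker_fn(void *arg)
    {
            struct worker *w = arg;

            pthread_mutex_lock(&w->lock);
            for (;;) {
                    while (!w->ready && !w->exiting)        /* wait_event(d->go, ...) analog */
                            pthread_cond_wait(&w->go, &w->lock);
                    if (w->exiting)                         /* kthread_should_stop() analog */
                            break;
                    w->ready = false;
                    w->result = w->input * w->input;        /* stand-in for the real work */
                    w->stop = true;                         /* atomic_set(&d->stop, 1) analog */
                    pthread_cond_signal(&w->done);          /* wake_up(&d->done) analog */
            }
            pthread_mutex_unlock(&w->lock);
            return NULL;
    }

    int main(void)
    {
            struct worker w = { .lock = PTHREAD_MUTEX_INITIALIZER,
                                .go = PTHREAD_COND_INITIALIZER,
                                .done = PTHREAD_COND_INITIALIZER };

            pthread_create(&w.thr, NULL, worker_fn, &w);

            pthread_mutex_lock(&w.lock);
            w.input = 7;
            w.ready = true;                 /* hand work to the thread ... */
            pthread_cond_signal(&w.go);
            while (!w.stop)                 /* ... and wait for completion */
                    pthread_cond_wait(&w.done, &w.lock);
            w.stop = false;
            printf("result: %d\n", w.result);

            w.exiting = true;               /* ask the worker to terminate */
            pthread_cond_signal(&w.go);
            pthread_mutex_unlock(&w.lock);
            pthread_join(w.thr, NULL);
            return 0;
    }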
@@ -437,42 +559,93 @@ static int save_image_lzo(struct swap_map_handle *handle,
437 struct bio *bio; 559 struct bio *bio;
438 struct timeval start; 560 struct timeval start;
439 struct timeval stop; 561 struct timeval stop;
440 size_t off, unc_len, cmp_len; 562 size_t off;
441 unsigned char *unc, *cmp, *wrk, *page; 563 unsigned thr, run_threads, nr_threads;
564 unsigned char *page = NULL;
565 struct cmp_data *data = NULL;
566 struct crc_data *crc = NULL;
567
568 /*
569 * We'll limit the number of threads for compression to limit memory
570 * footprint.
571 */
572 nr_threads = num_online_cpus() - 1;
573 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
442 574
443 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 575 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
444 if (!page) { 576 if (!page) {
445 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 577 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
446 return -ENOMEM; 578 ret = -ENOMEM;
579 goto out_clean;
447 } 580 }
448 581
449 wrk = vmalloc(LZO1X_1_MEM_COMPRESS); 582 data = vmalloc(sizeof(*data) * nr_threads);
450 if (!wrk) { 583 if (!data) {
451 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); 584 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
452 free_page((unsigned long)page); 585 ret = -ENOMEM;
453 return -ENOMEM; 586 goto out_clean;
454 } 587 }
588 for (thr = 0; thr < nr_threads; thr++)
589 memset(&data[thr], 0, offsetof(struct cmp_data, go));
455 590
456 unc = vmalloc(LZO_UNC_SIZE); 591 crc = kmalloc(sizeof(*crc), GFP_KERNEL);
457 if (!unc) { 592 if (!crc) {
458 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 593 printk(KERN_ERR "PM: Failed to allocate crc\n");
459 vfree(wrk); 594 ret = -ENOMEM;
460 free_page((unsigned long)page); 595 goto out_clean;
461 return -ENOMEM; 596 }
597 memset(crc, 0, offsetof(struct crc_data, go));
598
599 /*
600 * Start the compression threads.
601 */
602 for (thr = 0; thr < nr_threads; thr++) {
603 init_waitqueue_head(&data[thr].go);
604 init_waitqueue_head(&data[thr].done);
605
606 data[thr].thr = kthread_run(lzo_compress_threadfn,
607 &data[thr],
608 "image_compress/%u", thr);
609 if (IS_ERR(data[thr].thr)) {
610 data[thr].thr = NULL;
611 printk(KERN_ERR
612 "PM: Cannot start compression threads\n");
613 ret = -ENOMEM;
614 goto out_clean;
615 }
462 } 616 }
463 617
464 cmp = vmalloc(LZO_CMP_SIZE); 618 /*
465 if (!cmp) { 619 * Adjust number of free pages after all allocations have been done.
466 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 620 * We don't want to run out of pages when writing.
467 vfree(unc); 621 */
468 vfree(wrk); 622 handle->nr_free_pages = nr_free_pages() >> 1;
469 free_page((unsigned long)page); 623
470 return -ENOMEM; 624 /*
625 * Start the CRC32 thread.
626 */
627 init_waitqueue_head(&crc->go);
628 init_waitqueue_head(&crc->done);
629
630 handle->crc32 = 0;
631 crc->crc32 = &handle->crc32;
632 for (thr = 0; thr < nr_threads; thr++) {
633 crc->unc[thr] = data[thr].unc;
634 crc->unc_len[thr] = &data[thr].unc_len;
635 }
636
637 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
638 if (IS_ERR(crc->thr)) {
639 crc->thr = NULL;
640 printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
641 ret = -ENOMEM;
642 goto out_clean;
471 } 643 }
472 644
473 printk(KERN_INFO 645 printk(KERN_INFO
646 "PM: Using %u thread(s) for compression.\n"
474 "PM: Compressing and saving image data (%u pages) ... ", 647 "PM: Compressing and saving image data (%u pages) ... ",
475 nr_to_write); 648 nr_threads, nr_to_write);
476 m = nr_to_write / 100; 649 m = nr_to_write / 100;
477 if (!m) 650 if (!m)
478 m = 1; 651 m = 1;
@@ -480,55 +653,83 @@ static int save_image_lzo(struct swap_map_handle *handle,
480 bio = NULL; 653 bio = NULL;
481 do_gettimeofday(&start); 654 do_gettimeofday(&start);
482 for (;;) { 655 for (;;) {
483 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 656 for (thr = 0; thr < nr_threads; thr++) {
484 ret = snapshot_read_next(snapshot); 657 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
485 if (ret < 0) 658 ret = snapshot_read_next(snapshot);
486 goto out_finish; 659 if (ret < 0)
487 660 goto out_finish;
488 if (!ret) 661
662 if (!ret)
663 break;
664
665 memcpy(data[thr].unc + off,
666 data_of(*snapshot), PAGE_SIZE);
667
668 if (!(nr_pages % m))
669 printk(KERN_CONT "\b\b\b\b%3d%%",
670 nr_pages / m);
671 nr_pages++;
672 }
673 if (!off)
489 break; 674 break;
490 675
491 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); 676 data[thr].unc_len = off;
492 677
493 if (!(nr_pages % m)) 678 atomic_set(&data[thr].ready, 1);
494 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 679 wake_up(&data[thr].go);
495 nr_pages++;
496 } 680 }
497 681
498 if (!off) 682 if (!thr)
499 break; 683 break;
500 684
501 unc_len = off; 685 crc->run_threads = thr;
502 ret = lzo1x_1_compress(unc, unc_len, 686 atomic_set(&crc->ready, 1);
503 cmp + LZO_HEADER, &cmp_len, wrk); 687 wake_up(&crc->go);
504 if (ret < 0) {
505 printk(KERN_ERR "PM: LZO compression failed\n");
506 break;
507 }
508 688
509 if (unlikely(!cmp_len || 689 for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
510 cmp_len > lzo1x_worst_compress(unc_len))) { 690 wait_event(data[thr].done,
511 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 691 atomic_read(&data[thr].stop));
512 ret = -1; 692 atomic_set(&data[thr].stop, 0);
513 break;
514 }
515 693
516 *(size_t *)cmp = cmp_len; 694 ret = data[thr].ret;
517 695
518 /* 696 if (ret < 0) {
519 * Given we are writing one page at a time to disk, we copy 697 printk(KERN_ERR "PM: LZO compression failed\n");
520 * that much from the buffer, although the last bit will likely 698 goto out_finish;
521 * be smaller than full page. This is OK - we saved the length 699 }
522 * of the compressed data, so any garbage at the end will be
523 * discarded when we read it.
524 */
525 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
526 memcpy(page, cmp + off, PAGE_SIZE);
527 700
528 ret = swap_write_page(handle, page, &bio); 701 if (unlikely(!data[thr].cmp_len ||
529 if (ret) 702 data[thr].cmp_len >
703 lzo1x_worst_compress(data[thr].unc_len))) {
704 printk(KERN_ERR
705 "PM: Invalid LZO compressed length\n");
706 ret = -1;
530 goto out_finish; 707 goto out_finish;
708 }
709
710 *(size_t *)data[thr].cmp = data[thr].cmp_len;
711
712 /*
713 * Given we are writing one page at a time to disk, we
714 * copy that much from the buffer, although the last
715 * bit will likely be smaller than full page. This is
716 * OK - we saved the length of the compressed data, so
717 * any garbage at the end will be discarded when we
718 * read it.
719 */
720 for (off = 0;
721 off < LZO_HEADER + data[thr].cmp_len;
722 off += PAGE_SIZE) {
723 memcpy(page, data[thr].cmp + off, PAGE_SIZE);
724
725 ret = swap_write_page(handle, page, &bio);
726 if (ret)
727 goto out_finish;
728 }
531 } 729 }
730
731 wait_event(crc->done, atomic_read(&crc->stop));
732 atomic_set(&crc->stop, 0);
532 } 733 }
533 734
534out_finish: 735out_finish:
@@ -536,16 +737,25 @@ out_finish:
536 do_gettimeofday(&stop); 737 do_gettimeofday(&stop);
537 if (!ret) 738 if (!ret)
538 ret = err2; 739 ret = err2;
539 if (!ret) 740 if (!ret) {
540 printk(KERN_CONT "\b\b\b\bdone\n"); 741 printk(KERN_CONT "\b\b\b\bdone\n");
541 else 742 } else {
542 printk(KERN_CONT "\n"); 743 printk(KERN_CONT "\n");
744 }
543 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 745 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
544 746out_clean:
545 vfree(cmp); 747 if (crc) {
546 vfree(unc); 748 if (crc->thr)
547 vfree(wrk); 749 kthread_stop(crc->thr);
548 free_page((unsigned long)page); 750 kfree(crc);
751 }
752 if (data) {
753 for (thr = 0; thr < nr_threads; thr++)
754 if (data[thr].thr)
755 kthread_stop(data[thr].thr);
756 vfree(data);
757 }
758 if (page) free_page((unsigned long)page);
549 759
550 return ret; 760 return ret;
551} 761}
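The producer/worker handshake used by save_image_lzo() above is lock-free: each compression thread sleeps on its own "go" wait queue until an atomic ready flag is set, and reports completion through a matching stop flag and "done" queue. Below is a minimal sketch of that pattern; the struct and function names are placeholders, only the wait_event()/wake_up()/atomic_* usage mirrors the code above.

/*
 * Minimal sketch of the ready/stop handshake used by the compression
 * threads above.  "work_data", worker_threadfn() and do_one_unit()
 * are hypothetical names, not part of kernel/power/swap.c.
 */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/atomic.h>

struct work_data {
        struct task_struct *thr;        /* worker thread */
        atomic_t ready;                 /* work handed off */
        atomic_t stop;                  /* work finished */
        wait_queue_head_t go;           /* producer -> worker */
        wait_queue_head_t done;         /* worker -> producer */
};

static void do_one_unit(struct work_data *d) { /* ... */ }

static int worker_threadfn(void *arg)
{
        struct work_data *d = arg;

        while (1) {
                wait_event(d->go, atomic_read(&d->ready) ||
                                  kthread_should_stop());
                if (kthread_should_stop())
                        break;
                atomic_set(&d->ready, 0);
                do_one_unit(d);
                atomic_set(&d->stop, 1);
                wake_up(&d->done);
        }
        return 0;
}

/* Producer side: hand one unit to the worker, then wait for it. */
static void kick_and_wait(struct work_data *d)
{
        atomic_set(&d->ready, 1);
        wake_up(&d->go);

        wait_event(d->done, atomic_read(&d->stop));
        atomic_set(&d->stop, 0);
}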
@@ -625,8 +835,15 @@ out_finish:
625 835
626static void release_swap_reader(struct swap_map_handle *handle) 836static void release_swap_reader(struct swap_map_handle *handle)
627{ 837{
628 if (handle->cur) 838 struct swap_map_page_list *tmp;
629 free_page((unsigned long)handle->cur); 839
840 while (handle->maps) {
841 if (handle->maps->map)
842 free_page((unsigned long)handle->maps->map);
843 tmp = handle->maps;
844 handle->maps = handle->maps->next;
845 kfree(tmp);
846 }
630 handle->cur = NULL; 847 handle->cur = NULL;
631} 848}
632 849
@@ -634,22 +851,46 @@ static int get_swap_reader(struct swap_map_handle *handle,
634 unsigned int *flags_p) 851 unsigned int *flags_p)
635{ 852{
636 int error; 853 int error;
854 struct swap_map_page_list *tmp, *last;
855 sector_t offset;
637 856
638 *flags_p = swsusp_header->flags; 857 *flags_p = swsusp_header->flags;
639 858
640 if (!swsusp_header->image) /* how can this happen? */ 859 if (!swsusp_header->image) /* how can this happen? */
641 return -EINVAL; 860 return -EINVAL;
642 861
643 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 862 handle->cur = NULL;
644 if (!handle->cur) 863 last = handle->maps = NULL;
645 return -ENOMEM; 864 offset = swsusp_header->image;
865 while (offset) {
866 tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
867 if (!tmp) {
868 release_swap_reader(handle);
869 return -ENOMEM;
870 }
871 memset(tmp, 0, sizeof(*tmp));
872 if (!handle->maps)
873 handle->maps = tmp;
874 if (last)
875 last->next = tmp;
876 last = tmp;
877
878 tmp->map = (struct swap_map_page *)
879 __get_free_page(__GFP_WAIT | __GFP_HIGH);
880 if (!tmp->map) {
881 release_swap_reader(handle);
882 return -ENOMEM;
883 }
646 884
647 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); 885 error = hib_bio_read_page(offset, tmp->map, NULL);
648 if (error) { 886 if (error) {
649 release_swap_reader(handle); 887 release_swap_reader(handle);
650 return error; 888 return error;
889 }
890 offset = tmp->map->next_swap;
651 } 891 }
652 handle->k = 0; 892 handle->k = 0;
893 handle->cur = handle->maps->map;
653 return 0; 894 return 0;
654} 895}
655 896
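With the change above, get_swap_reader() walks the on-disk chain of swap map pages once, up front, and strings them into an in-memory singly linked list; swap_read_page() can then free each node as soon as its entries are consumed instead of synchronously re-reading the next map page mid-image. A sketch of the append-to-tail construction used here, with the node layout inferred from its use above (->map page plus ->next link):

/* Node layout as used by get_swap_reader()/swap_read_page() above. */
struct swap_map_page_list {
        struct swap_map_page *map;
        struct swap_map_page_list *next;
};

/*
 * Append one node while keeping head and tail pointers consistent,
 * mirroring the "if (!handle->maps) ... if (last) last->next = tmp"
 * sequence above.  Illustrative helper only, not part of the patch.
 */
static void map_list_append(struct swap_map_page_list **head,
                            struct swap_map_page_list **tail,
                            struct swap_map_page_list *node)
{
        node->next = NULL;
        if (!*head)
                *head = node;
        if (*tail)
                (*tail)->next = node;
        *tail = node;
}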
@@ -658,6 +899,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
658{ 899{
659 sector_t offset; 900 sector_t offset;
660 int error; 901 int error;
902 struct swap_map_page_list *tmp;
661 903
662 if (!handle->cur) 904 if (!handle->cur)
663 return -EINVAL; 905 return -EINVAL;
@@ -668,13 +910,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
668 if (error) 910 if (error)
669 return error; 911 return error;
670 if (++handle->k >= MAP_PAGE_ENTRIES) { 912 if (++handle->k >= MAP_PAGE_ENTRIES) {
671 error = hib_wait_on_bio_chain(bio_chain);
672 handle->k = 0; 913 handle->k = 0;
673 offset = handle->cur->next_swap; 914 free_page((unsigned long)handle->maps->map);
674 if (!offset) 915 tmp = handle->maps;
916 handle->maps = handle->maps->next;
917 kfree(tmp);
918 if (!handle->maps)
675 release_swap_reader(handle); 919 release_swap_reader(handle);
676 else if (!error) 920 else
677 error = hib_bio_read_page(offset, handle->cur, NULL); 921 handle->cur = handle->maps->map;
678 } 922 }
679 return error; 923 return error;
680} 924}
@@ -697,7 +941,7 @@ static int load_image(struct swap_map_handle *handle,
697 unsigned int nr_to_read) 941 unsigned int nr_to_read)
698{ 942{
699 unsigned int m; 943 unsigned int m;
700 int error = 0; 944 int ret = 0;
701 struct timeval start; 945 struct timeval start;
702 struct timeval stop; 946 struct timeval stop;
703 struct bio *bio; 947 struct bio *bio;
@@ -713,15 +957,15 @@ static int load_image(struct swap_map_handle *handle,
713 bio = NULL; 957 bio = NULL;
714 do_gettimeofday(&start); 958 do_gettimeofday(&start);
715 for ( ; ; ) { 959 for ( ; ; ) {
716 error = snapshot_write_next(snapshot); 960 ret = snapshot_write_next(snapshot);
717 if (error <= 0) 961 if (ret <= 0)
718 break; 962 break;
719 error = swap_read_page(handle, data_of(*snapshot), &bio); 963 ret = swap_read_page(handle, data_of(*snapshot), &bio);
720 if (error) 964 if (ret)
721 break; 965 break;
722 if (snapshot->sync_read) 966 if (snapshot->sync_read)
723 error = hib_wait_on_bio_chain(&bio); 967 ret = hib_wait_on_bio_chain(&bio);
724 if (error) 968 if (ret)
725 break; 969 break;
726 if (!(nr_pages % m)) 970 if (!(nr_pages % m))
727 printk("\b\b\b\b%3d%%", nr_pages / m); 971 printk("\b\b\b\b%3d%%", nr_pages / m);
@@ -729,17 +973,61 @@ static int load_image(struct swap_map_handle *handle,
729 } 973 }
730 err2 = hib_wait_on_bio_chain(&bio); 974 err2 = hib_wait_on_bio_chain(&bio);
731 do_gettimeofday(&stop); 975 do_gettimeofday(&stop);
732 if (!error) 976 if (!ret)
733 error = err2; 977 ret = err2;
734 if (!error) { 978 if (!ret) {
735 printk("\b\b\b\bdone\n"); 979 printk("\b\b\b\bdone\n");
736 snapshot_write_finalize(snapshot); 980 snapshot_write_finalize(snapshot);
737 if (!snapshot_image_loaded(snapshot)) 981 if (!snapshot_image_loaded(snapshot))
738 error = -ENODATA; 982 ret = -ENODATA;
739 } else 983 } else
740 printk("\n"); 984 printk("\n");
741 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 985 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
742 return error; 986 return ret;
987}
988
989/**
990 * Structure used for LZO data decompression.
991 */
992struct dec_data {
993 struct task_struct *thr; /* thread */
994 atomic_t ready; /* ready to start flag */
995 atomic_t stop; /* ready to stop flag */
996 int ret; /* return code */
997 wait_queue_head_t go; /* start decompression */
998 wait_queue_head_t done; /* decompression done */
999 size_t unc_len; /* uncompressed length */
1000 size_t cmp_len; /* compressed length */
1001 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
1002 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
1003};
1004
1005/**
 1006 * Decompression function that runs in its own thread.
1007 */
1008static int lzo_decompress_threadfn(void *data)
1009{
1010 struct dec_data *d = data;
1011
1012 while (1) {
1013 wait_event(d->go, atomic_read(&d->ready) ||
1014 kthread_should_stop());
1015 if (kthread_should_stop()) {
1016 d->thr = NULL;
1017 d->ret = -1;
1018 atomic_set(&d->stop, 1);
1019 wake_up(&d->done);
1020 break;
1021 }
1022 atomic_set(&d->ready, 0);
1023
1024 d->unc_len = LZO_UNC_SIZE;
1025 d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
1026 d->unc, &d->unc_len);
1027 atomic_set(&d->stop, 1);
1028 wake_up(&d->done);
1029 }
1030 return 0;
743} 1031}
744 1032
745/** 1033/**
@@ -753,50 +1041,120 @@ static int load_image_lzo(struct swap_map_handle *handle,
753 unsigned int nr_to_read) 1041 unsigned int nr_to_read)
754{ 1042{
755 unsigned int m; 1043 unsigned int m;
756 int error = 0; 1044 int ret = 0;
1045 int eof = 0;
757 struct bio *bio; 1046 struct bio *bio;
758 struct timeval start; 1047 struct timeval start;
759 struct timeval stop; 1048 struct timeval stop;
760 unsigned nr_pages; 1049 unsigned nr_pages;
761 size_t i, off, unc_len, cmp_len; 1050 size_t off;
762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; 1051 unsigned i, thr, run_threads, nr_threads;
763 1052 unsigned ring = 0, pg = 0, ring_size = 0,
764 for (i = 0; i < LZO_CMP_PAGES; i++) { 1053 have = 0, want, need, asked = 0;
765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 1054 unsigned long read_pages;
766 if (!page[i]) { 1055 unsigned char **page = NULL;
767 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 1056 struct dec_data *data = NULL;
1057 struct crc_data *crc = NULL;
1058
1059 /*
1060 * We'll limit the number of threads for decompression to limit memory
1061 * footprint.
1062 */
1063 nr_threads = num_online_cpus() - 1;
1064 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
1065
1066 page = vmalloc(sizeof(*page) * LZO_READ_PAGES);
1067 if (!page) {
1068 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1069 ret = -ENOMEM;
1070 goto out_clean;
1071 }
768 1072
769 while (i) 1073 data = vmalloc(sizeof(*data) * nr_threads);
770 free_page((unsigned long)page[--i]); 1074 if (!data) {
1075 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
1076 ret = -ENOMEM;
1077 goto out_clean;
1078 }
1079 for (thr = 0; thr < nr_threads; thr++)
1080 memset(&data[thr], 0, offsetof(struct dec_data, go));
771 1081
772 return -ENOMEM; 1082 crc = kmalloc(sizeof(*crc), GFP_KERNEL);
1083 if (!crc) {
1084 printk(KERN_ERR "PM: Failed to allocate crc\n");
1085 ret = -ENOMEM;
1086 goto out_clean;
1087 }
1088 memset(crc, 0, offsetof(struct crc_data, go));
1089
1090 /*
1091 * Start the decompression threads.
1092 */
1093 for (thr = 0; thr < nr_threads; thr++) {
1094 init_waitqueue_head(&data[thr].go);
1095 init_waitqueue_head(&data[thr].done);
1096
1097 data[thr].thr = kthread_run(lzo_decompress_threadfn,
1098 &data[thr],
1099 "image_decompress/%u", thr);
1100 if (IS_ERR(data[thr].thr)) {
1101 data[thr].thr = NULL;
1102 printk(KERN_ERR
1103 "PM: Cannot start decompression threads\n");
1104 ret = -ENOMEM;
1105 goto out_clean;
773 } 1106 }
774 } 1107 }
775 1108
776 unc = vmalloc(LZO_UNC_SIZE); 1109 /*
777 if (!unc) { 1110 * Start the CRC32 thread.
778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 1111 */
779 1112 init_waitqueue_head(&crc->go);
780 for (i = 0; i < LZO_CMP_PAGES; i++) 1113 init_waitqueue_head(&crc->done);
781 free_page((unsigned long)page[i]); 1114
782 1115 handle->crc32 = 0;
783 return -ENOMEM; 1116 crc->crc32 = &handle->crc32;
1117 for (thr = 0; thr < nr_threads; thr++) {
1118 crc->unc[thr] = data[thr].unc;
1119 crc->unc_len[thr] = &data[thr].unc_len;
784 } 1120 }
785 1121
786 cmp = vmalloc(LZO_CMP_SIZE); 1122 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
787 if (!cmp) { 1123 if (IS_ERR(crc->thr)) {
788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 1124 crc->thr = NULL;
1125 printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
1126 ret = -ENOMEM;
1127 goto out_clean;
1128 }
789 1129
790 vfree(unc); 1130 /*
791 for (i = 0; i < LZO_CMP_PAGES; i++) 1131 * Adjust number of pages for read buffering, in case we are short.
792 free_page((unsigned long)page[i]); 1132 */
1133 read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1;
1134 read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES);
793 1135
794 return -ENOMEM; 1136 for (i = 0; i < read_pages; i++) {
1137 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1138 __GFP_WAIT | __GFP_HIGH :
1139 __GFP_WAIT);
1140 if (!page[i]) {
1141 if (i < LZO_CMP_PAGES) {
1142 ring_size = i;
1143 printk(KERN_ERR
1144 "PM: Failed to allocate LZO pages\n");
1145 ret = -ENOMEM;
1146 goto out_clean;
1147 } else {
1148 break;
1149 }
1150 }
795 } 1151 }
1152 want = ring_size = i;
796 1153
797 printk(KERN_INFO 1154 printk(KERN_INFO
1155 "PM: Using %u thread(s) for decompression.\n"
798 "PM: Loading and decompressing image data (%u pages) ... ", 1156 "PM: Loading and decompressing image data (%u pages) ... ",
799 nr_to_read); 1157 nr_threads, nr_to_read);
800 m = nr_to_read / 100; 1158 m = nr_to_read / 100;
801 if (!m) 1159 if (!m)
802 m = 1; 1160 m = 1;
@@ -804,85 +1162,189 @@ static int load_image_lzo(struct swap_map_handle *handle,
804 bio = NULL; 1162 bio = NULL;
805 do_gettimeofday(&start); 1163 do_gettimeofday(&start);
806 1164
807 error = snapshot_write_next(snapshot); 1165 ret = snapshot_write_next(snapshot);
808 if (error <= 0) 1166 if (ret <= 0)
809 goto out_finish; 1167 goto out_finish;
810 1168
811 for (;;) { 1169 for(;;) {
812 error = swap_read_page(handle, page[0], NULL); /* sync */ 1170 for (i = 0; !eof && i < want; i++) {
813 if (error) 1171 ret = swap_read_page(handle, page[ring], &bio);
814 break; 1172 if (ret) {
815 1173 /*
816 cmp_len = *(size_t *)page[0]; 1174 * On real read error, finish. On end of data,
817 if (unlikely(!cmp_len || 1175 * set EOF flag and just exit the read loop.
818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { 1176 */
819 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 1177 if (handle->cur &&
820 error = -1; 1178 handle->cur->entries[handle->k]) {
821 break; 1179 goto out_finish;
1180 } else {
1181 eof = 1;
1182 break;
1183 }
1184 }
1185 if (++ring >= ring_size)
1186 ring = 0;
822 } 1187 }
1188 asked += i;
1189 want -= i;
823 1190
824 for (off = PAGE_SIZE, i = 1; 1191 /*
825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { 1192 * We are out of data, wait for some more.
826 error = swap_read_page(handle, page[i], &bio); 1193 */
827 if (error) 1194 if (!have) {
1195 if (!asked)
1196 break;
1197
1198 ret = hib_wait_on_bio_chain(&bio);
1199 if (ret)
828 goto out_finish; 1200 goto out_finish;
1201 have += asked;
1202 asked = 0;
1203 if (eof)
1204 eof = 2;
829 } 1205 }
830 1206
831 error = hib_wait_on_bio_chain(&bio); /* need all data now */ 1207 if (crc->run_threads) {
832 if (error) 1208 wait_event(crc->done, atomic_read(&crc->stop));
833 goto out_finish; 1209 atomic_set(&crc->stop, 0);
834 1210 crc->run_threads = 0;
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
838 } 1211 }
839 1212
840 unc_len = LZO_UNC_SIZE; 1213 for (thr = 0; have && thr < nr_threads; thr++) {
841 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, 1214 data[thr].cmp_len = *(size_t *)page[pg];
842 unc, &unc_len); 1215 if (unlikely(!data[thr].cmp_len ||
843 if (error < 0) { 1216 data[thr].cmp_len >
844 printk(KERN_ERR "PM: LZO decompression failed\n"); 1217 lzo1x_worst_compress(LZO_UNC_SIZE))) {
845 break; 1218 printk(KERN_ERR
1219 "PM: Invalid LZO compressed length\n");
1220 ret = -1;
1221 goto out_finish;
1222 }
1223
1224 need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
1225 PAGE_SIZE);
1226 if (need > have) {
1227 if (eof > 1) {
1228 ret = -1;
1229 goto out_finish;
1230 }
1231 break;
1232 }
1233
1234 for (off = 0;
1235 off < LZO_HEADER + data[thr].cmp_len;
1236 off += PAGE_SIZE) {
1237 memcpy(data[thr].cmp + off,
1238 page[pg], PAGE_SIZE);
1239 have--;
1240 want++;
1241 if (++pg >= ring_size)
1242 pg = 0;
1243 }
1244
1245 atomic_set(&data[thr].ready, 1);
1246 wake_up(&data[thr].go);
846 } 1247 }
847 1248
848 if (unlikely(!unc_len || 1249 /*
849 unc_len > LZO_UNC_SIZE || 1250 * Wait for more data while we are decompressing.
850 unc_len & (PAGE_SIZE - 1))) { 1251 */
851 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); 1252 if (have < LZO_CMP_PAGES && asked) {
852 error = -1; 1253 ret = hib_wait_on_bio_chain(&bio);
853 break; 1254 if (ret)
1255 goto out_finish;
1256 have += asked;
1257 asked = 0;
1258 if (eof)
1259 eof = 2;
854 } 1260 }
855 1261
856 for (off = 0; off < unc_len; off += PAGE_SIZE) { 1262 for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
857 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); 1263 wait_event(data[thr].done,
1264 atomic_read(&data[thr].stop));
1265 atomic_set(&data[thr].stop, 0);
1266
1267 ret = data[thr].ret;
858 1268
859 if (!(nr_pages % m)) 1269 if (ret < 0) {
860 printk("\b\b\b\b%3d%%", nr_pages / m); 1270 printk(KERN_ERR
861 nr_pages++; 1271 "PM: LZO decompression failed\n");
1272 goto out_finish;
1273 }
862 1274
863 error = snapshot_write_next(snapshot); 1275 if (unlikely(!data[thr].unc_len ||
864 if (error <= 0) 1276 data[thr].unc_len > LZO_UNC_SIZE ||
1277 data[thr].unc_len & (PAGE_SIZE - 1))) {
1278 printk(KERN_ERR
1279 "PM: Invalid LZO uncompressed length\n");
1280 ret = -1;
865 goto out_finish; 1281 goto out_finish;
1282 }
1283
1284 for (off = 0;
1285 off < data[thr].unc_len; off += PAGE_SIZE) {
1286 memcpy(data_of(*snapshot),
1287 data[thr].unc + off, PAGE_SIZE);
1288
1289 if (!(nr_pages % m))
1290 printk("\b\b\b\b%3d%%", nr_pages / m);
1291 nr_pages++;
1292
1293 ret = snapshot_write_next(snapshot);
1294 if (ret <= 0) {
1295 crc->run_threads = thr + 1;
1296 atomic_set(&crc->ready, 1);
1297 wake_up(&crc->go);
1298 goto out_finish;
1299 }
1300 }
866 } 1301 }
1302
1303 crc->run_threads = thr;
1304 atomic_set(&crc->ready, 1);
1305 wake_up(&crc->go);
867 } 1306 }
868 1307
869out_finish: 1308out_finish:
1309 if (crc->run_threads) {
1310 wait_event(crc->done, atomic_read(&crc->stop));
1311 atomic_set(&crc->stop, 0);
1312 }
870 do_gettimeofday(&stop); 1313 do_gettimeofday(&stop);
871 if (!error) { 1314 if (!ret) {
872 printk("\b\b\b\bdone\n"); 1315 printk("\b\b\b\bdone\n");
873 snapshot_write_finalize(snapshot); 1316 snapshot_write_finalize(snapshot);
874 if (!snapshot_image_loaded(snapshot)) 1317 if (!snapshot_image_loaded(snapshot))
875 error = -ENODATA; 1318 ret = -ENODATA;
1319 if (!ret) {
1320 if (swsusp_header->flags & SF_CRC32_MODE) {
1321 if(handle->crc32 != swsusp_header->crc32) {
1322 printk(KERN_ERR
1323 "PM: Invalid image CRC32!\n");
1324 ret = -ENODATA;
1325 }
1326 }
1327 }
876 } else 1328 } else
877 printk("\n"); 1329 printk("\n");
878 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1330 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
879 1331out_clean:
880 vfree(cmp); 1332 for (i = 0; i < ring_size; i++)
881 vfree(unc);
882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]); 1333 free_page((unsigned long)page[i]);
1334 if (crc) {
1335 if (crc->thr)
1336 kthread_stop(crc->thr);
1337 kfree(crc);
1338 }
1339 if (data) {
1340 for (thr = 0; thr < nr_threads; thr++)
1341 if (data[thr].thr)
1342 kthread_stop(data[thr].thr);
1343 vfree(data);
1344 }
1345 if (page) vfree(page);
884 1346
885 return error; 1347 return ret;
886} 1348}
887 1349
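load_image_lzo() above keeps the compressed stream in a ring of up to LZO_READ_PAGES pre-allocated pages: "ring" is where the next page read from swap lands, "pg" is where the next compressed block is copied out, and "have"/"asked"/"want" count pages available, submitted but not yet completed, and still to be requested. A stripped-down model of that bookkeeping, with the I/O and error paths elided and the helpers hypothetical:

/*
 * Skeleton of the read-ahead ring used by load_image_lzo() above.
 * start_async_read() stands in for swap_read_page(); all error
 * handling is omitted.
 */
static unsigned ring, pg, have, want, asked, ring_size;

static void start_async_read(void *page);       /* assumed async I/O */

/* Producer: queue reads for every page we still want. */
static void submit_reads(void **page)
{
        unsigned i;

        for (i = 0; i < want; i++) {
                start_async_read(page[ring]);
                if (++ring >= ring_size)
                        ring = 0;
        }
        asked += i;
        want -= i;
}

/* Called after waiting for the outstanding bio chain to complete. */
static void reads_completed(void)
{
        have += asked;
        asked = 0;
}

/* Consumer: hand out the oldest page and recycle its slot. */
static void *consume_page(void **page)
{
        void *p = page[pg];

        have--;
        want++;
        if (++pg >= ring_size)
                pg = 0;
        return p;
}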
888/** 1350/**
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 42ddbc6f0de..6d8f535c2b8 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -12,6 +12,7 @@
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/kmod.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/device.h> 17#include <linux/device.h>
17#include <linux/miscdevice.h> 18#include <linux/miscdevice.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index 37dff3429ad..1455a0d4eed 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -100,7 +100,7 @@ static int console_locked, console_suspended;
100 * It is also used in interesting ways to provide interlocking in 100 * It is also used in interesting ways to provide interlocking in
101 * console_unlock();. 101 * console_unlock();.
102 */ 102 */
103static DEFINE_SPINLOCK(logbuf_lock); 103static DEFINE_RAW_SPINLOCK(logbuf_lock);
104 104
105#define LOG_BUF_MASK (log_buf_len-1) 105#define LOG_BUF_MASK (log_buf_len-1)
106#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) 106#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
@@ -212,7 +212,7 @@ void __init setup_log_buf(int early)
212 return; 212 return;
213 } 213 }
214 214
215 spin_lock_irqsave(&logbuf_lock, flags); 215 raw_spin_lock_irqsave(&logbuf_lock, flags);
216 log_buf_len = new_log_buf_len; 216 log_buf_len = new_log_buf_len;
217 log_buf = new_log_buf; 217 log_buf = new_log_buf;
218 new_log_buf_len = 0; 218 new_log_buf_len = 0;
@@ -230,7 +230,7 @@ void __init setup_log_buf(int early)
230 log_start -= offset; 230 log_start -= offset;
231 con_start -= offset; 231 con_start -= offset;
232 log_end -= offset; 232 log_end -= offset;
233 spin_unlock_irqrestore(&logbuf_lock, flags); 233 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
234 234
235 pr_info("log_buf_len: %d\n", log_buf_len); 235 pr_info("log_buf_len: %d\n", log_buf_len);
236 pr_info("early log buf free: %d(%d%%)\n", 236 pr_info("early log buf free: %d(%d%%)\n",
@@ -318,8 +318,10 @@ static int check_syslog_permissions(int type, bool from_file)
318 return 0; 318 return 0;
319 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ 319 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
320 if (capable(CAP_SYS_ADMIN)) { 320 if (capable(CAP_SYS_ADMIN)) {
321 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " 321 printk_once(KERN_WARNING "%s (%d): "
322 "but no CAP_SYSLOG (deprecated).\n"); 322 "Attempt to access syslog with CAP_SYS_ADMIN "
323 "but no CAP_SYSLOG (deprecated).\n",
324 current->comm, task_pid_nr(current));
323 return 0; 325 return 0;
324 } 326 }
325 return -EPERM; 327 return -EPERM;
@@ -363,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
363 if (error) 365 if (error)
364 goto out; 366 goto out;
365 i = 0; 367 i = 0;
366 spin_lock_irq(&logbuf_lock); 368 raw_spin_lock_irq(&logbuf_lock);
367 while (!error && (log_start != log_end) && i < len) { 369 while (!error && (log_start != log_end) && i < len) {
368 c = LOG_BUF(log_start); 370 c = LOG_BUF(log_start);
369 log_start++; 371 log_start++;
370 spin_unlock_irq(&logbuf_lock); 372 raw_spin_unlock_irq(&logbuf_lock);
371 error = __put_user(c,buf); 373 error = __put_user(c,buf);
372 buf++; 374 buf++;
373 i++; 375 i++;
374 cond_resched(); 376 cond_resched();
375 spin_lock_irq(&logbuf_lock); 377 raw_spin_lock_irq(&logbuf_lock);
376 } 378 }
377 spin_unlock_irq(&logbuf_lock); 379 raw_spin_unlock_irq(&logbuf_lock);
378 if (!error) 380 if (!error)
379 error = i; 381 error = i;
380 break; 382 break;
@@ -397,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
397 count = len; 399 count = len;
398 if (count > log_buf_len) 400 if (count > log_buf_len)
399 count = log_buf_len; 401 count = log_buf_len;
400 spin_lock_irq(&logbuf_lock); 402 raw_spin_lock_irq(&logbuf_lock);
401 if (count > logged_chars) 403 if (count > logged_chars)
402 count = logged_chars; 404 count = logged_chars;
403 if (do_clear) 405 if (do_clear)
@@ -414,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
414 if (j + log_buf_len < log_end) 416 if (j + log_buf_len < log_end)
415 break; 417 break;
416 c = LOG_BUF(j); 418 c = LOG_BUF(j);
417 spin_unlock_irq(&logbuf_lock); 419 raw_spin_unlock_irq(&logbuf_lock);
418 error = __put_user(c,&buf[count-1-i]); 420 error = __put_user(c,&buf[count-1-i]);
419 cond_resched(); 421 cond_resched();
420 spin_lock_irq(&logbuf_lock); 422 raw_spin_lock_irq(&logbuf_lock);
421 } 423 }
422 spin_unlock_irq(&logbuf_lock); 424 raw_spin_unlock_irq(&logbuf_lock);
423 if (error) 425 if (error)
424 break; 426 break;
425 error = i; 427 error = i;
@@ -530,6 +532,9 @@ static int __init ignore_loglevel_setup(char *str)
530} 532}
531 533
532early_param("ignore_loglevel", ignore_loglevel_setup); 534early_param("ignore_loglevel", ignore_loglevel_setup);
535module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR);
 536MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to "
537 "print all kernel messages to the console.");
533 538
534/* 539/*
535 * Write out chars from start to end - 1 inclusive 540 * Write out chars from start to end - 1 inclusive
@@ -590,9 +595,6 @@ static size_t log_prefix(const char *p, unsigned int *level, char *special)
590 /* multi digit including the level and facility number */ 595 /* multi digit including the level and facility number */
591 char *endp = NULL; 596 char *endp = NULL;
592 597
593 if (p[1] < '0' && p[1] > '9')
594 return 0;
595
596 lev = (simple_strtoul(&p[1], &endp, 10) & 7); 598 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
597 if (endp == NULL || endp[0] != '>') 599 if (endp == NULL || endp[0] != '>')
598 return 0; 600 return 0;
@@ -687,7 +689,7 @@ static void zap_locks(void)
687 oops_timestamp = jiffies; 689 oops_timestamp = jiffies;
688 690
689 /* If a crash is occurring, make sure we can't deadlock */ 691 /* If a crash is occurring, make sure we can't deadlock */
690 spin_lock_init(&logbuf_lock); 692 raw_spin_lock_init(&logbuf_lock);
691 /* And make sure that we print immediately */ 693 /* And make sure that we print immediately */
692 sema_init(&console_sem, 1); 694 sema_init(&console_sem, 1);
693} 695}
@@ -800,9 +802,9 @@ static int console_trylock_for_printk(unsigned int cpu)
800 } 802 }
801 } 803 }
802 printk_cpu = UINT_MAX; 804 printk_cpu = UINT_MAX;
803 spin_unlock(&logbuf_lock);
804 if (wake) 805 if (wake)
805 up(&console_sem); 806 up(&console_sem);
807 raw_spin_unlock(&logbuf_lock);
806 return retval; 808 return retval;
807} 809}
808static const char recursion_bug_msg [] = 810static const char recursion_bug_msg [] =
@@ -862,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
862 } 864 }
863 865
864 lockdep_off(); 866 lockdep_off();
865 spin_lock(&logbuf_lock); 867 raw_spin_lock(&logbuf_lock);
866 printk_cpu = this_cpu; 868 printk_cpu = this_cpu;
867 869
868 if (recursion_bug) { 870 if (recursion_bug) {
@@ -1106,6 +1108,10 @@ static int __init console_suspend_disable(char *str)
1106 return 1; 1108 return 1;
1107} 1109}
1108__setup("no_console_suspend", console_suspend_disable); 1110__setup("no_console_suspend", console_suspend_disable);
1111module_param_named(console_suspend, console_suspend_enabled,
1112 bool, S_IRUGO | S_IWUSR);
1113MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
1114 " and hibernate operations");
1109 1115
1110/** 1116/**
1111 * suspend_console - suspend the console subsystem 1117 * suspend_console - suspend the console subsystem
@@ -1255,14 +1261,14 @@ void console_unlock(void)
1255 1261
1256again: 1262again:
1257 for ( ; ; ) { 1263 for ( ; ; ) {
1258 spin_lock_irqsave(&logbuf_lock, flags); 1264 raw_spin_lock_irqsave(&logbuf_lock, flags);
1259 wake_klogd |= log_start - log_end; 1265 wake_klogd |= log_start - log_end;
1260 if (con_start == log_end) 1266 if (con_start == log_end)
1261 break; /* Nothing to print */ 1267 break; /* Nothing to print */
1262 _con_start = con_start; 1268 _con_start = con_start;
1263 _log_end = log_end; 1269 _log_end = log_end;
1264 con_start = log_end; /* Flush */ 1270 con_start = log_end; /* Flush */
1265 spin_unlock(&logbuf_lock); 1271 raw_spin_unlock(&logbuf_lock);
1266 stop_critical_timings(); /* don't trace print latency */ 1272 stop_critical_timings(); /* don't trace print latency */
1267 call_console_drivers(_con_start, _log_end); 1273 call_console_drivers(_con_start, _log_end);
1268 start_critical_timings(); 1274 start_critical_timings();
@@ -1274,7 +1280,7 @@ again:
1274 if (unlikely(exclusive_console)) 1280 if (unlikely(exclusive_console))
1275 exclusive_console = NULL; 1281 exclusive_console = NULL;
1276 1282
1277 spin_unlock(&logbuf_lock); 1283 raw_spin_unlock(&logbuf_lock);
1278 1284
1279 up(&console_sem); 1285 up(&console_sem);
1280 1286
@@ -1284,13 +1290,13 @@ again:
1284 * there's a new owner and the console_unlock() from them will do the 1290 * there's a new owner and the console_unlock() from them will do the
1285 * flush, no worries. 1291 * flush, no worries.
1286 */ 1292 */
1287 spin_lock(&logbuf_lock); 1293 raw_spin_lock(&logbuf_lock);
1288 if (con_start != log_end) 1294 if (con_start != log_end)
1289 retry = 1; 1295 retry = 1;
1290 spin_unlock_irqrestore(&logbuf_lock, flags);
1291 if (retry && console_trylock()) 1296 if (retry && console_trylock())
1292 goto again; 1297 goto again;
1293 1298
1299 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1294 if (wake_klogd) 1300 if (wake_klogd)
1295 wake_up_klogd(); 1301 wake_up_klogd();
1296} 1302}
@@ -1520,9 +1526,9 @@ void register_console(struct console *newcon)
1520 * console_unlock(); will print out the buffered messages 1526 * console_unlock(); will print out the buffered messages
1521 * for us. 1527 * for us.
1522 */ 1528 */
1523 spin_lock_irqsave(&logbuf_lock, flags); 1529 raw_spin_lock_irqsave(&logbuf_lock, flags);
1524 con_start = log_start; 1530 con_start = log_start;
1525 spin_unlock_irqrestore(&logbuf_lock, flags); 1531 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1526 /* 1532 /*
1527 * We're about to replay the log buffer. Only do this to the 1533 * We're about to replay the log buffer. Only do this to the
1528 * just-registered console to avoid excessive message spam to 1534 * just-registered console to avoid excessive message spam to
@@ -1602,7 +1608,7 @@ static int __init printk_late_init(void)
1602 struct console *con; 1608 struct console *con;
1603 1609
1604 for_each_console(con) { 1610 for_each_console(con) {
1605 if (con->flags & CON_BOOT) { 1611 if (!keep_bootcon && con->flags & CON_BOOT) {
1606 printk(KERN_INFO "turn off boot console %s%d\n", 1612 printk(KERN_INFO "turn off boot console %s%d\n",
1607 con->name, con->index); 1613 con->name, con->index);
1608 unregister_console(con); 1614 unregister_console(con);
@@ -1729,10 +1735,10 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1729 /* Theoretically, the log could move on after we do this, but 1735 /* Theoretically, the log could move on after we do this, but
1730 there's not a lot we can do about that. The new messages 1736 there's not a lot we can do about that. The new messages
1731 will overwrite the start of what we dump. */ 1737 will overwrite the start of what we dump. */
1732 spin_lock_irqsave(&logbuf_lock, flags); 1738 raw_spin_lock_irqsave(&logbuf_lock, flags);
1733 end = log_end & LOG_BUF_MASK; 1739 end = log_end & LOG_BUF_MASK;
1734 chars = logged_chars; 1740 chars = logged_chars;
1735 spin_unlock_irqrestore(&logbuf_lock, flags); 1741 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1736 1742
1737 if (chars > end) { 1743 if (chars > end) {
1738 s1 = log_buf + log_buf_len - chars + end; 1744 s1 = log_buf + log_buf_len - chars + end;
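The printk changes above convert logbuf_lock from spinlock_t to raw_spinlock_t, so it keeps spinning (and disabling interrupts where asked) even on configurations where ordinary spinlocks can become sleeping locks; every acquire/release site has to switch to the raw_spin_* API at the same time. The conversion pattern, reduced to a self-contained sketch in which the lock name and critical section are placeholders:

#include <linux/spinlock.h>

/* Sketch only: "buf_lock" is a placeholder, not the real logbuf_lock. */
static DEFINE_RAW_SPINLOCK(buf_lock);

static void touch_shared_buffer(void)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&buf_lock, flags);
        /* ... critical section that must never sleep ... */
        raw_spin_unlock_irqrestore(&buf_lock, flags);
}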
diff --git a/kernel/profile.c b/kernel/profile.c
index 961b389fe52..76b8e77773e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -13,7 +13,7 @@
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/bootmem.h> 18#include <linux/bootmem.h>
19#include <linux/notifier.h> 19#include <linux/notifier.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 9de3ecfd20f..24d04477b25 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -8,7 +8,7 @@
8 */ 8 */
9 9
10#include <linux/capability.h> 10#include <linux/capability.h>
11#include <linux/module.h> 11#include <linux/export.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
@@ -744,20 +744,17 @@ int ptrace_request(struct task_struct *child, long request,
744 break; 744 break;
745 745
746 si = child->last_siginfo; 746 si = child->last_siginfo;
747 if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) 747 if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) {
748 break; 748 child->jobctl |= JOBCTL_LISTENING;
749 749 /*
750 child->jobctl |= JOBCTL_LISTENING; 750 * If NOTIFY is set, it means event happened between
751 751 * start of this trap and now. Trigger re-trap.
752 /* 752 */
753 * If NOTIFY is set, it means event happened between start 753 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
754 * of this trap and now. Trigger re-trap immediately. 754 signal_wake_up(child, true);
755 */ 755 ret = 0;
756 if (child->jobctl & JOBCTL_TRAP_NOTIFY) 756 }
757 signal_wake_up(child, true);
758
759 unlock_task_sighand(child, &flags); 757 unlock_task_sighand(child, &flags);
760 ret = 0;
761 break; 758 break;
762 759
763 case PTRACE_DETACH: /* detach a process that was attached. */ 760 case PTRACE_DETACH: /* detach a process that was attached. */
diff --git a/kernel/range.c b/kernel/range.c
index 37fa9b99ad5..9b8ae2d6ed6 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Range add and subtract 2 * Range add and subtract
3 */ 3 */
4#include <linux/module.h> 4#include <linux/kernel.h>
5#include <linux/init.h> 5#include <linux/init.h>
6#include <linux/sort.h> 6#include <linux/sort.h>
7 7
diff --git a/kernel/rcu.h b/kernel/rcu.h
new file mode 100644
index 00000000000..f600868d550
--- /dev/null
+++ b/kernel/rcu.h
@@ -0,0 +1,85 @@
1/*
2 * Read-Copy Update definitions shared among RCU implementations.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2011
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#ifndef __LINUX_RCU_H
24#define __LINUX_RCU_H
25
26#ifdef CONFIG_RCU_TRACE
27#define RCU_TRACE(stmt) stmt
28#else /* #ifdef CONFIG_RCU_TRACE */
29#define RCU_TRACE(stmt)
30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31
32/*
33 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
34 * by call_rcu() and rcu callback execution, and are therefore not part of the
35 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
36 */
37
38#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
39# define STATE_RCU_HEAD_READY 0
40# define STATE_RCU_HEAD_QUEUED 1
41
42extern struct debug_obj_descr rcuhead_debug_descr;
43
44static inline void debug_rcu_head_queue(struct rcu_head *head)
45{
46 WARN_ON_ONCE((unsigned long)head & 0x3);
47 debug_object_activate(head, &rcuhead_debug_descr);
48 debug_object_active_state(head, &rcuhead_debug_descr,
49 STATE_RCU_HEAD_READY,
50 STATE_RCU_HEAD_QUEUED);
51}
52
53static inline void debug_rcu_head_unqueue(struct rcu_head *head)
54{
55 debug_object_active_state(head, &rcuhead_debug_descr,
56 STATE_RCU_HEAD_QUEUED,
57 STATE_RCU_HEAD_READY);
58 debug_object_deactivate(head, &rcuhead_debug_descr);
59}
60#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
61static inline void debug_rcu_head_queue(struct rcu_head *head)
62{
63}
64
65static inline void debug_rcu_head_unqueue(struct rcu_head *head)
66{
67}
68#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
69
70extern void kfree(const void *);
71
72static inline void __rcu_reclaim(char *rn, struct rcu_head *head)
73{
74 unsigned long offset = (unsigned long)head->func;
75
76 if (__is_kfree_rcu_offset(offset)) {
77 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
78 kfree((void *)head - offset);
79 } else {
80 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
81 head->func(head);
82 }
83}
84
85#endif /* __LINUX_RCU_H */
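__rcu_reclaim() above relies on the kfree_rcu() convention: instead of a real callback, the rcu_head's func field carries the byte offset of the rcu_head inside the enclosing object, which __is_kfree_rcu_offset() recognizes so the object can be kfree()d directly. A usage sketch from the caller's side; struct foo is hypothetical:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        int data;
        struct rcu_head rcu;    /* reclaimed via the offset path */
};

static void foo_release(struct foo *p)
{
        /*
         * Roughly equivalent to call_rcu(&p->rcu, <free callback>),
         * except that the stored "callback" is offsetof(struct foo, rcu),
         * which __rcu_reclaim() turns back into kfree(p) once the grace
         * period has elapsed.
         */
        kfree_rcu(p, rcu);
}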
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ddddb320be6..c5b98e565ae 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -43,9 +43,14 @@
43#include <linux/notifier.h> 43#include <linux/notifier.h>
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/rcu.h>
51
52#include "rcu.h"
53
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 54#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 55static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 56struct lockdep_map rcu_lock_map =
@@ -94,11 +99,16 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
94 99
95#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 100#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
96 101
102struct rcu_synchronize {
103 struct rcu_head head;
104 struct completion completion;
105};
106
97/* 107/*
98 * Awaken the corresponding synchronize_rcu() instance now that a 108 * Awaken the corresponding synchronize_rcu() instance now that a
99 * grace period has elapsed. 109 * grace period has elapsed.
100 */ 110 */
101void wakeme_after_rcu(struct rcu_head *head) 111static void wakeme_after_rcu(struct rcu_head *head)
102{ 112{
103 struct rcu_synchronize *rcu; 113 struct rcu_synchronize *rcu;
104 114
@@ -106,6 +116,20 @@ void wakeme_after_rcu(struct rcu_head *head)
106 complete(&rcu->completion); 116 complete(&rcu->completion);
107} 117}
108 118
119void wait_rcu_gp(call_rcu_func_t crf)
120{
121 struct rcu_synchronize rcu;
122
123 init_rcu_head_on_stack(&rcu.head);
124 init_completion(&rcu.completion);
125 /* Will wake me after RCU finished. */
126 crf(&rcu.head, wakeme_after_rcu);
127 /* Wait for it. */
128 wait_for_completion(&rcu.completion);
129 destroy_rcu_head_on_stack(&rcu.head);
130}
131EXPORT_SYMBOL_GPL(wait_rcu_gp);
132
109#ifdef CONFIG_PROVE_RCU 133#ifdef CONFIG_PROVE_RCU
110/* 134/*
111 * wrapper function to avoid #include problems. 135 * wrapper function to avoid #include problems.
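wait_rcu_gp() above factors the open-coded "post a callback that completes a completion, then block on it" idiom out of the individual RCU flavors; any call_rcu()-style primitive can be plugged in. A sketch of how a synchronize_*() wrapper can sit on top of it, assuming wait_rcu_gp() is declared in rcupdate.h as the EXPORT_SYMBOL_GPL above suggests:

#include <linux/rcupdate.h>

/* Hypothetical wrapper; the real flavors follow the same shape. */
static void my_synchronize_rcu_bh(void)
{
        /* Block until an rcu_bh grace period has elapsed. */
        wait_rcu_gp(call_rcu_bh);
}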
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 7bbac7d0f5a..636af6d9c6e 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -22,13 +22,12 @@
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 */ 24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h> 25#include <linux/completion.h>
27#include <linux/interrupt.h> 26#include <linux/interrupt.h>
28#include <linux/notifier.h> 27#include <linux/notifier.h>
29#include <linux/rcupdate.h> 28#include <linux/rcupdate.h>
30#include <linux/kernel.h> 29#include <linux/kernel.h>
31#include <linux/module.h> 30#include <linux/export.h>
32#include <linux/mutex.h> 31#include <linux/mutex.h>
33#include <linux/sched.h> 32#include <linux/sched.h>
34#include <linux/types.h> 33#include <linux/types.h>
@@ -37,16 +36,17 @@
37#include <linux/cpu.h> 36#include <linux/cpu.h>
38#include <linux/prefetch.h> 37#include <linux/prefetch.h>
39 38
40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ 39#ifdef CONFIG_RCU_TRACE
41static struct task_struct *rcu_kthread_task; 40#include <trace/events/rcu.h>
42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 41#endif /* #else #ifdef CONFIG_RCU_TRACE */
43static unsigned long have_rcu_kthread_work; 42
43#include "rcu.h"
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void); 47static void invoke_rcu_callbacks(void);
48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static int rcu_kthread(void *arg); 49static void rcu_process_callbacks(struct softirq_action *unused);
50static void __call_rcu(struct rcu_head *head, 50static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 52 struct rcu_ctrlblk *rcp);
@@ -96,16 +96,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
96} 96}
97 97
98/* 98/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 99 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
110 * are at it, given that any rcu quiescent state is also an rcu_bh 100 * are at it, given that any rcu quiescent state is also an rcu_bh
111 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 101 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
@@ -117,7 +107,7 @@ void rcu_sched_qs(int cpu)
117 local_irq_save(flags); 107 local_irq_save(flags);
118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 108 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
119 rcu_qsctr_help(&rcu_bh_ctrlblk)) 109 rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 110 invoke_rcu_callbacks();
121 local_irq_restore(flags); 111 local_irq_restore(flags);
122} 112}
123 113
@@ -130,7 +120,7 @@ void rcu_bh_qs(int cpu)
130 120
131 local_irq_save(flags); 121 local_irq_save(flags);
132 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 122 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
133 invoke_rcu_kthread(); 123 invoke_rcu_callbacks();
134 local_irq_restore(flags); 124 local_irq_restore(flags);
135} 125}
136 126
@@ -154,18 +144,23 @@ void rcu_check_callbacks(int cpu, int user)
 154 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure 144 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
 155 * whose grace period has elapsed. 145 * whose grace period has elapsed.
156 */ 146 */
157static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) 147static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
158{ 148{
149 char *rn = NULL;
159 struct rcu_head *next, *list; 150 struct rcu_head *next, *list;
160 unsigned long flags; 151 unsigned long flags;
161 RCU_TRACE(int cb_count = 0); 152 RCU_TRACE(int cb_count = 0);
162 153
163 /* If no RCU callbacks ready to invoke, just return. */ 154 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 155 if (&rcp->rcucblist == rcp->donetail) {
156 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
157 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0));
165 return; 158 return;
159 }
166 160
167 /* Move the ready-to-invoke callbacks to a local list. */ 161 /* Move the ready-to-invoke callbacks to a local list. */
168 local_irq_save(flags); 162 local_irq_save(flags);
163 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
169 list = rcp->rcucblist; 164 list = rcp->rcucblist;
170 rcp->rcucblist = *rcp->donetail; 165 rcp->rcucblist = *rcp->donetail;
171 *rcp->donetail = NULL; 166 *rcp->donetail = NULL;
@@ -176,49 +171,26 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
176 local_irq_restore(flags); 171 local_irq_restore(flags);
177 172
178 /* Invoke the callbacks on the local list. */ 173 /* Invoke the callbacks on the local list. */
174 RCU_TRACE(rn = rcp->name);
179 while (list) { 175 while (list) {
180 next = list->next; 176 next = list->next;
181 prefetch(next); 177 prefetch(next);
182 debug_rcu_head_unqueue(list); 178 debug_rcu_head_unqueue(list);
183 local_bh_disable(); 179 local_bh_disable();
184 __rcu_reclaim(list); 180 __rcu_reclaim(rn, list);
185 local_bh_enable(); 181 local_bh_enable();
186 list = next; 182 list = next;
187 RCU_TRACE(cb_count++); 183 RCU_TRACE(cb_count++);
188 } 184 }
189 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 185 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count));
190} 187}
191 188
192/* 189static void rcu_process_callbacks(struct softirq_action *unused)
193 * This kthread invokes RCU callbacks whose grace periods have
194 * elapsed. It is awakened as needed, and takes the place of the
195 * RCU_SOFTIRQ that was used previously for this purpose.
196 * This is a kthread, but it is never stopped, at least not until
197 * the system goes down.
198 */
199static int rcu_kthread(void *arg)
200{ 190{
201 unsigned long work; 191 __rcu_process_callbacks(&rcu_sched_ctrlblk);
202 unsigned long morework; 192 __rcu_process_callbacks(&rcu_bh_ctrlblk);
203 unsigned long flags; 193 rcu_preempt_process_callbacks();
204
205 for (;;) {
206 wait_event_interruptible(rcu_kthread_wq,
207 have_rcu_kthread_work != 0);
208 morework = rcu_boost();
209 local_irq_save(flags);
210 work = have_rcu_kthread_work;
211 have_rcu_kthread_work = morework;
212 local_irq_restore(flags);
213 if (work) {
214 rcu_process_callbacks(&rcu_sched_ctrlblk);
215 rcu_process_callbacks(&rcu_bh_ctrlblk);
216 rcu_preempt_process_callbacks();
217 }
218 schedule_timeout_interruptible(1); /* Leave CPU for others. */
219 }
220
221 return 0; /* Not reached, but needed to shut gcc up. */
222} 194}
223 195
224/* 196/*
@@ -280,45 +252,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
280 __call_rcu(head, func, &rcu_bh_ctrlblk); 252 __call_rcu(head, func, &rcu_bh_ctrlblk);
281} 253}
282EXPORT_SYMBOL_GPL(call_rcu_bh); 254EXPORT_SYMBOL_GPL(call_rcu_bh);
283
284void rcu_barrier_bh(void)
285{
286 struct rcu_synchronize rcu;
287
288 init_rcu_head_on_stack(&rcu.head);
289 init_completion(&rcu.completion);
290 /* Will wake me after RCU finished. */
291 call_rcu_bh(&rcu.head, wakeme_after_rcu);
292 /* Wait for it. */
293 wait_for_completion(&rcu.completion);
294 destroy_rcu_head_on_stack(&rcu.head);
295}
296EXPORT_SYMBOL_GPL(rcu_barrier_bh);
297
298void rcu_barrier_sched(void)
299{
300 struct rcu_synchronize rcu;
301
302 init_rcu_head_on_stack(&rcu.head);
303 init_completion(&rcu.completion);
304 /* Will wake me after RCU finished. */
305 call_rcu_sched(&rcu.head, wakeme_after_rcu);
306 /* Wait for it. */
307 wait_for_completion(&rcu.completion);
308 destroy_rcu_head_on_stack(&rcu.head);
309}
310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
311
312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
316{
317 struct sched_param sp;
318
319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
323}
324early_initcall(rcu_spawn_kthreads);
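With the removal above, TINY_RCU no longer runs a dedicated rcu_kthread unless RCU priority boosting is configured; in the plain case, invoke_rcu_callbacks() simply raises RCU_SOFTIRQ and rcu_init() registers rcu_process_callbacks() as its handler (see rcutiny_plugin.h below). The softirq registration/raise pattern, reduced to a sketch with placeholder names:

#include <linux/interrupt.h>

static void my_process_callbacks(struct softirq_action *unused)
{
        /* ... invoke whatever callbacks became ready ... */
}

static void my_kick(void)
{
        /* Handler runs soon, in softirq context, on this CPU. */
        raise_softirq(RCU_SOFTIRQ);
}

static void my_init(void)
{
        open_softirq(RCU_SOFTIRQ, my_process_callbacks);
}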
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f259c676195..2b0484a5dc2 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -23,32 +23,30 @@
23 */ 23 */
24 24
25#include <linux/kthread.h> 25#include <linux/kthread.h>
26#include <linux/module.h>
26#include <linux/debugfs.h> 27#include <linux/debugfs.h>
27#include <linux/seq_file.h> 28#include <linux/seq_file.h>
28 29
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */ 30/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk { 31struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 32 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */ 34 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */ 35 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(char *name); /* Name of RCU type. */
41}; 37};
42 38
43/* Definition for rcupdate control block. */ 39/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = { 40static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist, 41 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist, 42 .curtail = &rcu_sched_ctrlblk.rcucblist,
43 RCU_TRACE(.name = "rcu_sched")
47}; 44};
48 45
49static struct rcu_ctrlblk rcu_bh_ctrlblk = { 46static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist, 47 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist, 48 .curtail = &rcu_bh_ctrlblk.rcucblist,
49 RCU_TRACE(.name = "rcu_bh")
52}; 50};
53 51
54#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -131,6 +129,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
131 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, 129 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
132 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, 130 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
133 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), 131 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
132 RCU_TRACE(.rcb.name = "rcu_preempt")
134}; 133};
135 134
136static int rcu_preempted_readers_exp(void); 135static int rcu_preempted_readers_exp(void);
@@ -247,6 +246,13 @@ static void show_tiny_preempt_stats(struct seq_file *m)
247 246
248#include "rtmutex_common.h" 247#include "rtmutex_common.h"
249 248
249#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
250
251/* Controls for rcu_kthread() kthread. */
252static struct task_struct *rcu_kthread_task;
253static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
254static unsigned long have_rcu_kthread_work;
255
250/* 256/*
251 * Carry out RCU priority boosting on the task indicated by ->boost_tasks, 257 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
252 * and advance ->boost_tasks to the next task in the ->blkd_tasks list. 258 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
@@ -334,7 +340,7 @@ static int rcu_initiate_boost(void)
334 if (rcu_preempt_ctrlblk.exp_tasks == NULL) 340 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks = 341 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks; 342 rcu_preempt_ctrlblk.gp_tasks;
337 invoke_rcu_kthread(); 343 invoke_rcu_callbacks();
338 } else 344 } else
339 RCU_TRACE(rcu_initiate_boost_trace()); 345 RCU_TRACE(rcu_initiate_boost_trace());
340 return 1; 346 return 1;
@@ -353,14 +359,6 @@ static void rcu_preempt_boost_start_gp(void)
353#else /* #ifdef CONFIG_RCU_BOOST */ 359#else /* #ifdef CONFIG_RCU_BOOST */
354 360
355/* 361/*
356 * If there is no RCU priority boosting, we don't boost.
357 */
358static int rcu_boost(void)
359{
360 return 0;
361}
362
363/*
364 * If there is no RCU priority boosting, we don't initiate boosting, 362 * If there is no RCU priority boosting, we don't initiate boosting,
365 * but we do indicate whether there are blocked readers blocking the 363 * but we do indicate whether there are blocked readers blocking the
366 * current grace period. 364 * current grace period.
@@ -427,7 +425,7 @@ static void rcu_preempt_cpu_qs(void)
427 425
428 /* If there are done callbacks, cause them to be invoked. */ 426 /* If there are done callbacks, cause them to be invoked. */
429 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 427 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
430 invoke_rcu_kthread(); 428 invoke_rcu_callbacks();
431} 429}
432 430
433/* 431/*
@@ -648,7 +646,7 @@ static void rcu_preempt_check_callbacks(void)
648 rcu_preempt_cpu_qs(); 646 rcu_preempt_cpu_qs();
649 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 647 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
650 rcu_preempt_ctrlblk.rcb.donetail) 648 rcu_preempt_ctrlblk.rcb.donetail)
651 invoke_rcu_kthread(); 649 invoke_rcu_callbacks();
652 if (rcu_preempt_gp_in_progress() && 650 if (rcu_preempt_gp_in_progress() &&
653 rcu_cpu_blocking_cur_gp() && 651 rcu_cpu_blocking_cur_gp() &&
654 rcu_preempt_running_reader()) 652 rcu_preempt_running_reader())
@@ -674,7 +672,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
674 */ 672 */
675static void rcu_preempt_process_callbacks(void) 673static void rcu_preempt_process_callbacks(void)
676{ 674{
677 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 675 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
678} 676}
679 677
680/* 678/*
@@ -697,20 +695,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
697} 695}
698EXPORT_SYMBOL_GPL(call_rcu); 696EXPORT_SYMBOL_GPL(call_rcu);
699 697
700void rcu_barrier(void)
701{
702 struct rcu_synchronize rcu;
703
704 init_rcu_head_on_stack(&rcu.head);
705 init_completion(&rcu.completion);
706 /* Will wake me after RCU finished. */
707 call_rcu(&rcu.head, wakeme_after_rcu);
708 /* Wait for it. */
709 wait_for_completion(&rcu.completion);
710 destroy_rcu_head_on_stack(&rcu.head);
711}
712EXPORT_SYMBOL_GPL(rcu_barrier);
713
714/* 698/*
715 * synchronize_rcu - wait until a grace period has elapsed. 699 * synchronize_rcu - wait until a grace period has elapsed.
716 * 700 *
@@ -864,15 +848,6 @@ static void show_tiny_preempt_stats(struct seq_file *m)
864#endif /* #ifdef CONFIG_RCU_TRACE */ 848#endif /* #ifdef CONFIG_RCU_TRACE */
865 849
866/* 850/*
867 * Because preemptible RCU does not exist, it is never necessary to
868 * boost preempted RCU readers.
869 */
870static int rcu_boost(void)
871{
872 return 0;
873}
874
875/*
876 * Because preemptible RCU does not exist, it never has any callbacks 851 * Because preemptible RCU does not exist, it never has any callbacks
877 * to check. 852 * to check.
878 */ 853 */
@@ -898,6 +873,78 @@ static void rcu_preempt_process_callbacks(void)
898 873
899#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 874#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
900 875
876#ifdef CONFIG_RCU_BOOST
877
878/*
879 * Wake up rcu_kthread() to process callbacks now eligible for invocation
880 * or to boost readers.
881 */
882static void invoke_rcu_callbacks(void)
883{
884 have_rcu_kthread_work = 1;
885 wake_up(&rcu_kthread_wq);
886}
887
888/*
889 * This kthread invokes RCU callbacks whose grace periods have
890 * elapsed. It is awakened as needed, and takes the place of the
891 * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
892 * This is a kthread, but it is never stopped, at least not until
893 * the system goes down.
894 */
895static int rcu_kthread(void *arg)
896{
897 unsigned long work;
898 unsigned long morework;
899 unsigned long flags;
900
901 for (;;) {
902 wait_event_interruptible(rcu_kthread_wq,
903 have_rcu_kthread_work != 0);
904 morework = rcu_boost();
905 local_irq_save(flags);
906 work = have_rcu_kthread_work;
907 have_rcu_kthread_work = morework;
908 local_irq_restore(flags);
909 if (work)
910 rcu_process_callbacks(NULL);
911 schedule_timeout_interruptible(1); /* Leave CPU for others. */
912 }
913
914 return 0; /* Not reached, but needed to shut gcc up. */
915}
916
917/*
918 * Spawn the kthread that invokes RCU callbacks.
919 */
920static int __init rcu_spawn_kthreads(void)
921{
922 struct sched_param sp;
923
924 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
925 sp.sched_priority = RCU_BOOST_PRIO;
926 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
927 return 0;
928}
929early_initcall(rcu_spawn_kthreads);
930
931#else /* #ifdef CONFIG_RCU_BOOST */
932
933/*
934 * Start up softirq processing of callbacks.
935 */
936void invoke_rcu_callbacks(void)
937{
938 raise_softirq(RCU_SOFTIRQ);
939}
940
941void rcu_init(void)
942{
943 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
944}
945
946#endif /* #else #ifdef CONFIG_RCU_BOOST */
947
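The block above is the heart of this rcutiny change: when CONFIG_RCU_BOOST is set, invoke_rcu_callbacks() records work in have_rcu_kthread_work and wakes rcu_kthread(), which may also boost readers; otherwise the same call simply raises RCU_SOFTIRQ and rcu_process_callbacks() runs from softirq context. The user-space sketch below is only an analogy for that hand-off (a flag consumed by a woken worker, with a direct call as the fallback build); the pthread names are invented for the example and none of this is kernel code.

    /* Analogy only: hand work to a woken worker thread, or just do it directly. */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static void process_callbacks(void)
    {
        printf("processing callbacks\n");
    }

    #ifdef USE_WORKER                       /* analogue of CONFIG_RCU_BOOST */

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cv   = PTHREAD_COND_INITIALIZER;
    static bool have_work;                  /* plays the role of have_rcu_kthread_work */

    static void *worker(void *arg)          /* plays the role of rcu_kthread() */
    {
        (void)arg;
        for (;;) {
            pthread_mutex_lock(&lock);
            while (!have_work)              /* analogue of wait_event_interruptible() */
                pthread_cond_wait(&cv, &lock);
            have_work = false;
            pthread_mutex_unlock(&lock);
            process_callbacks();
        }
        return NULL;                        /* not reached */
    }

    static void invoke_callbacks(void)      /* set the flag and wake the worker */
    {
        pthread_mutex_lock(&lock);
        have_work = true;
        pthread_cond_signal(&cv);
        pthread_mutex_unlock(&lock);
    }

    #else  /* !USE_WORKER */

    static void invoke_callbacks(void)      /* analogue of raise_softirq(RCU_SOFTIRQ) */
    {
        process_callbacks();
    }

    #endif

    int main(void)
    {
    #ifdef USE_WORKER
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
    #endif
        invoke_callbacks();
        sleep(1);                           /* let the worker run before exiting */
        return 0;
    }

Build with cc -pthread; add -DUSE_WORKER to exercise the kthread-style path.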
901#ifdef CONFIG_DEBUG_LOCK_ALLOC 948#ifdef CONFIG_DEBUG_LOCK_ALLOC
902#include <linux/kernel_stat.h> 949#include <linux/kernel_stat.h>
903 950
@@ -913,12 +960,6 @@ void __init rcu_scheduler_starting(void)
913 960
914#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 961#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
915 962
916#ifdef CONFIG_RCU_BOOST
917#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
918#else /* #ifdef CONFIG_RCU_BOOST */
919#define RCU_BOOST_PRIO 1
920#endif /* #else #ifdef CONFIG_RCU_BOOST */
921
922#ifdef CONFIG_RCU_TRACE 963#ifdef CONFIG_RCU_TRACE
923 964
924#ifdef CONFIG_RCU_BOOST 965#ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 98f51b13bb7..764825c2685 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -73,7 +73,7 @@ module_param(nreaders, int, 0444);
73MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 73MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
74module_param(nfakewriters, int, 0444); 74module_param(nfakewriters, int, 0444);
75MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 75MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
76module_param(stat_interval, int, 0444); 76module_param(stat_interval, int, 0644);
77MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 77MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
78module_param(verbose, bool, 0444); 78module_param(verbose, bool, 0444);
79MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 79MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
@@ -480,30 +480,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
481} 481}
482 482
483struct rcu_bh_torture_synchronize {
484 struct rcu_head head;
485 struct completion completion;
486};
487
488static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
489{
490 struct rcu_bh_torture_synchronize *rcu;
491
492 rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
493 complete(&rcu->completion);
494}
495
496static void rcu_bh_torture_synchronize(void)
497{
498 struct rcu_bh_torture_synchronize rcu;
499
500 init_rcu_head_on_stack(&rcu.head);
501 init_completion(&rcu.completion);
502 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
503 wait_for_completion(&rcu.completion);
504 destroy_rcu_head_on_stack(&rcu.head);
505}
506
507static struct rcu_torture_ops rcu_bh_ops = { 483static struct rcu_torture_ops rcu_bh_ops = {
508 .init = NULL, 484 .init = NULL,
509 .cleanup = NULL, 485 .cleanup = NULL,
@@ -512,7 +488,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
512 .readunlock = rcu_bh_torture_read_unlock, 488 .readunlock = rcu_bh_torture_read_unlock,
513 .completed = rcu_bh_torture_completed, 489 .completed = rcu_bh_torture_completed,
514 .deferred_free = rcu_bh_torture_deferred_free, 490 .deferred_free = rcu_bh_torture_deferred_free,
515 .sync = rcu_bh_torture_synchronize, 491 .sync = synchronize_rcu_bh,
516 .cb_barrier = rcu_barrier_bh, 492 .cb_barrier = rcu_barrier_bh,
517 .fqs = rcu_bh_force_quiescent_state, 493 .fqs = rcu_bh_force_quiescent_state,
518 .stats = NULL, 494 .stats = NULL,
@@ -528,7 +504,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
528 .readunlock = rcu_bh_torture_read_unlock, 504 .readunlock = rcu_bh_torture_read_unlock,
529 .completed = rcu_bh_torture_completed, 505 .completed = rcu_bh_torture_completed,
530 .deferred_free = rcu_sync_torture_deferred_free, 506 .deferred_free = rcu_sync_torture_deferred_free,
531 .sync = rcu_bh_torture_synchronize, 507 .sync = synchronize_rcu_bh,
532 .cb_barrier = NULL, 508 .cb_barrier = NULL,
533 .fqs = rcu_bh_force_quiescent_state, 509 .fqs = rcu_bh_force_quiescent_state,
534 .stats = NULL, 510 .stats = NULL,
@@ -536,6 +512,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
536 .name = "rcu_bh_sync" 512 .name = "rcu_bh_sync"
537}; 513};
538 514
515static struct rcu_torture_ops rcu_bh_expedited_ops = {
516 .init = rcu_sync_torture_init,
517 .cleanup = NULL,
518 .readlock = rcu_bh_torture_read_lock,
519 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
520 .readunlock = rcu_bh_torture_read_unlock,
521 .completed = rcu_bh_torture_completed,
522 .deferred_free = rcu_sync_torture_deferred_free,
523 .sync = synchronize_rcu_bh_expedited,
524 .cb_barrier = NULL,
525 .fqs = rcu_bh_force_quiescent_state,
526 .stats = NULL,
527 .irq_capable = 1,
528 .name = "rcu_bh_expedited"
529};
530
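The new rcu_bh_expedited_ops above gives synchronize_rcu_bh_expedited() the same rcu_torture_ops shape as the existing flavors, so the only other change needed is adding it to the torture_ops[] array near the end of the file. A toy stand-alone version of that ops-table pattern, with names invented for the sketch, just to show why adding a flavor stays this small:

    /* An ops table: each flavor supplies the same hooks; the test iterates over them. */
    #include <stdio.h>

    struct torture_ops {
        void (*sync)(void);
        const char *name;
    };

    static void sync_a(void) { puts("flavor A sync"); }
    static void sync_b(void) { puts("flavor B sync"); }

    static struct torture_ops a_ops = { .sync = sync_a, .name = "flavor_a" };
    static struct torture_ops b_ops = { .sync = sync_b, .name = "flavor_b" };

    int main(void)
    {
        /* Adding a flavor is just adding its ops structure to this array. */
        struct torture_ops *all[] = { &a_ops, &b_ops };

        for (unsigned i = 0; i < sizeof(all) / sizeof(all[0]); i++) {
            printf("testing %s: ", all[i]->name);
            all[i]->sync();
        }
        return 0;
    }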
539/* 531/*
540 * Definitions for srcu torture testing. 532 * Definitions for srcu torture testing.
541 */ 533 */
@@ -659,11 +651,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
659 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 651 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
660} 652}
661 653
662static void sched_torture_synchronize(void)
663{
664 synchronize_sched();
665}
666
667static struct rcu_torture_ops sched_ops = { 654static struct rcu_torture_ops sched_ops = {
668 .init = rcu_sync_torture_init, 655 .init = rcu_sync_torture_init,
669 .cleanup = NULL, 656 .cleanup = NULL,
@@ -672,7 +659,7 @@ static struct rcu_torture_ops sched_ops = {
672 .readunlock = sched_torture_read_unlock, 659 .readunlock = sched_torture_read_unlock,
673 .completed = rcu_no_completed, 660 .completed = rcu_no_completed,
674 .deferred_free = rcu_sched_torture_deferred_free, 661 .deferred_free = rcu_sched_torture_deferred_free,
675 .sync = sched_torture_synchronize, 662 .sync = synchronize_sched,
676 .cb_barrier = rcu_barrier_sched, 663 .cb_barrier = rcu_barrier_sched,
677 .fqs = rcu_sched_force_quiescent_state, 664 .fqs = rcu_sched_force_quiescent_state,
678 .stats = NULL, 665 .stats = NULL,
@@ -688,7 +675,7 @@ static struct rcu_torture_ops sched_sync_ops = {
688 .readunlock = sched_torture_read_unlock, 675 .readunlock = sched_torture_read_unlock,
689 .completed = rcu_no_completed, 676 .completed = rcu_no_completed,
690 .deferred_free = rcu_sync_torture_deferred_free, 677 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = sched_torture_synchronize, 678 .sync = synchronize_sched,
692 .cb_barrier = NULL, 679 .cb_barrier = NULL,
693 .fqs = rcu_sched_force_quiescent_state, 680 .fqs = rcu_sched_force_quiescent_state,
694 .stats = NULL, 681 .stats = NULL,
@@ -754,7 +741,7 @@ static int rcu_torture_boost(void *arg)
754 do { 741 do {
755 /* Wait for the next test interval. */ 742 /* Wait for the next test interval. */
756 oldstarttime = boost_starttime; 743 oldstarttime = boost_starttime;
757 while (jiffies - oldstarttime > ULONG_MAX / 2) { 744 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
758 schedule_timeout_uninterruptible(1); 745 schedule_timeout_uninterruptible(1);
759 rcu_stutter_wait("rcu_torture_boost"); 746 rcu_stutter_wait("rcu_torture_boost");
760 if (kthread_should_stop() || 747 if (kthread_should_stop() ||
@@ -765,7 +752,7 @@ static int rcu_torture_boost(void *arg)
765 /* Do one boost-test interval. */ 752 /* Do one boost-test interval. */
766 endtime = oldstarttime + test_boost_duration * HZ; 753 endtime = oldstarttime + test_boost_duration * HZ;
767 call_rcu_time = jiffies; 754 call_rcu_time = jiffies;
768 while (jiffies - endtime > ULONG_MAX / 2) { 755 while (ULONG_CMP_LT(jiffies, endtime)) {
769 /* If we don't have a callback in flight, post one. */ 756 /* If we don't have a callback in flight, post one. */
770 if (!rbi.inflight) { 757 if (!rbi.inflight) {
771 smp_mb(); /* RCU core before ->inflight = 1. */ 758 smp_mb(); /* RCU core before ->inflight = 1. */
@@ -792,7 +779,8 @@ static int rcu_torture_boost(void *arg)
792 * interval. Besides, we are running at RT priority, 779 * interval. Besides, we are running at RT priority,
793 * so delays should be relatively rare. 780 * so delays should be relatively rare.
794 */ 781 */
795 while (oldstarttime == boost_starttime) { 782 while (oldstarttime == boost_starttime &&
783 !kthread_should_stop()) {
796 if (mutex_trylock(&boost_mutex)) { 784 if (mutex_trylock(&boost_mutex)) {
797 boost_starttime = jiffies + 785 boost_starttime = jiffies +
798 test_boost_interval * HZ; 786 test_boost_interval * HZ;
@@ -809,11 +797,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
809 797
810 /* Clean up and exit. */ 798 /* Clean up and exit. */
811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 799 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 800 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 801 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 802 schedule_timeout_uninterruptible(1);
816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 803 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
804 destroy_rcu_head_on_stack(&rbi.rcu);
817 return 0; 805 return 0;
818} 806}
819 807
@@ -831,11 +819,13 @@ rcu_torture_fqs(void *arg)
831 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 819 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
832 do { 820 do {
833 fqs_resume_time = jiffies + fqs_stutter * HZ; 821 fqs_resume_time = jiffies + fqs_stutter * HZ;
834 while (jiffies - fqs_resume_time > LONG_MAX) { 822 while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
823 !kthread_should_stop()) {
835 schedule_timeout_interruptible(1); 824 schedule_timeout_interruptible(1);
836 } 825 }
837 fqs_burst_remaining = fqs_duration; 826 fqs_burst_remaining = fqs_duration;
838 while (fqs_burst_remaining > 0) { 827 while (fqs_burst_remaining > 0 &&
828 !kthread_should_stop()) {
839 cur_ops->fqs(); 829 cur_ops->fqs();
840 udelay(fqs_holdoff); 830 udelay(fqs_holdoff);
841 fqs_burst_remaining -= fqs_holdoff; 831 fqs_burst_remaining -= fqs_holdoff;
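The loops above drop the open-coded `jiffies - oldstarttime > ULONG_MAX / 2` test in favor of ULONG_CMP_LT(), the kernel's wraparound-safe "a is before b" comparison for free-running unsigned counters, and also make the waits bail out when the kthread is asked to stop. The stand-alone sketch below demonstrates why the modular comparison is needed; the CMP_LT macro here is a local stand-in written for the example, not quoted from the kernel headers.

    /* Wraparound-safe ordering for free-running unsigned counters such as jiffies. */
    #include <assert.h>
    #include <limits.h>
    #include <stdio.h>

    /* Local stand-in for ULONG_CMP_LT(): true if a is "before" b, treating the
     * counters as points on a circle of size ULONG_MAX + 1. */
    #define CMP_LT(a, b)    (ULONG_MAX / 2 < (a) - (b))

    int main(void)
    {
        unsigned long now  = ULONG_MAX - 5;   /* counter about to wrap */
        unsigned long then = now + 10;        /* deadline lands after the wrap */

        /* A naive "now < then" is false here (then wrapped around to 4), but the
         * modular comparison still says now is before the deadline. */
        assert(now >= then);                  /* plain compare gives the wrong answer */
        assert(CMP_LT(now, then));            /* modular compare gets it right */

        printf("now=%lu then=%lu: now is still before the deadline\n", now, then);
        return 0;
    }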
@@ -1280,8 +1270,9 @@ static int rcutorture_booster_init(int cpu)
1280 /* Don't allow time recalculation while creating a new task. */ 1270 /* Don't allow time recalculation while creating a new task. */
1281 mutex_lock(&boost_mutex); 1271 mutex_lock(&boost_mutex);
1282 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1272 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1283 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, 1273 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
1284 "rcu_torture_boost"); 1274 cpu_to_node(cpu),
1275 "rcu_torture_boost");
1285 if (IS_ERR(boost_tasks[cpu])) { 1276 if (IS_ERR(boost_tasks[cpu])) {
1286 retval = PTR_ERR(boost_tasks[cpu]); 1277 retval = PTR_ERR(boost_tasks[cpu]);
1287 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1278 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
@@ -1424,7 +1415,7 @@ rcu_torture_init(void)
1424 int firsterr = 0; 1415 int firsterr = 0;
1425 static struct rcu_torture_ops *torture_ops[] = 1416 static struct rcu_torture_ops *torture_ops[] =
1426 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1417 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1427 &rcu_bh_ops, &rcu_bh_sync_ops, 1418 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1428 &srcu_ops, &srcu_expedited_ops, 1419 &srcu_ops, &srcu_expedited_ops,
1429 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1420 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1430 1421
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ba06207b1dd..6b76d812740 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -38,7 +38,7 @@
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <linux/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/module.h> 41#include <linux/export.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/percpu.h> 44#include <linux/percpu.h>
@@ -52,13 +52,16 @@
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53 53
54#include "rcutree.h" 54#include "rcutree.h"
55#include <trace/events/rcu.h>
56
57#include "rcu.h"
55 58
56/* Data structures. */ 59/* Data structures. */
57 60
58static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 61static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
59 62
60#define RCU_STATE_INITIALIZER(structname) { \ 63#define RCU_STATE_INITIALIZER(structname) { \
61 .level = { &structname.node[0] }, \ 64 .level = { &structname##_state.node[0] }, \
62 .levelcnt = { \ 65 .levelcnt = { \
63 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 66 NUM_RCU_LVL_0, /* root of hierarchy. */ \
64 NUM_RCU_LVL_1, \ 67 NUM_RCU_LVL_1, \
@@ -69,17 +72,17 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
69 .signaled = RCU_GP_IDLE, \ 72 .signaled = RCU_GP_IDLE, \
70 .gpnum = -300, \ 73 .gpnum = -300, \
71 .completed = -300, \ 74 .completed = -300, \
72 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 76 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
74 .n_force_qs = 0, \ 77 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 78 .n_force_qs_ngp = 0, \
76 .name = #structname, \ 79 .name = #structname, \
77} 80}
78 81
79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 82struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
80DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81 84
82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
84 87
85static struct rcu_state *rcu_state; 88static struct rcu_state *rcu_state;
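The RCU_STATE_INITIALIZER() rework above pastes "_state" onto the argument for the self-referential fields, so callers now pass the short flavor name ("rcu_sched", "rcu_bh") and #structname yields exactly that short string for the new trace events. A minimal illustration of the pasting-plus-stringification pattern; the struct and field names below are invented for the sketch:

    /* "##" pastes tokens to form the variable name; "#" stringizes the short name. */
    #include <stdio.h>

    struct flavor {
        struct flavor *self;        /* stands in for .level = { &structname##_state.node[0] } */
        const char    *name;        /* stands in for .name = #structname */
    };

    #define FLAVOR_INITIALIZER(n) { \
        .self = &n##_state,         \
        .name = #n,                 \
    }

    struct flavor rcu_sched_state = FLAVOR_INITIALIZER(rcu_sched);
    struct flavor rcu_bh_state    = FLAVOR_INITIALIZER(rcu_bh);

    int main(void)
    {
        /* The trace-friendly names are the short ones, not "..._state". */
        printf("%s %s\n", rcu_sched_state.name, rcu_bh_state.name);
        return 0;
    }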
@@ -128,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
128static void invoke_rcu_core(void); 131static void invoke_rcu_core(void);
129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 132static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
130 133
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
133/* 134/*
134 * Track the rcutorture test sequence number and the update version 135 * Track the rcutorture test sequence number and the update version
135 * number within a given test. The rcutorture_testseq is incremented 136 * number within a given test. The rcutorture_testseq is incremented
@@ -156,33 +157,41 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
156 * Note a quiescent state. Because we do not need to know 157 * Note a quiescent state. Because we do not need to know
157 * how many quiescent states passed, just if there was at least 158 * how many quiescent states passed, just if there was at least
158 * one since the start of the grace period, this just sets a flag. 159 * one since the start of the grace period, this just sets a flag.
160 * The caller must have disabled preemption.
159 */ 161 */
160void rcu_sched_qs(int cpu) 162void rcu_sched_qs(int cpu)
161{ 163{
162 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 164 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
163 165
164 rdp->passed_quiesc_completed = rdp->gpnum - 1; 166 rdp->passed_quiesce_gpnum = rdp->gpnum;
165 barrier(); 167 barrier();
166 rdp->passed_quiesc = 1; 168 if (rdp->passed_quiesce == 0)
169 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
170 rdp->passed_quiesce = 1;
167} 171}
168 172
169void rcu_bh_qs(int cpu) 173void rcu_bh_qs(int cpu)
170{ 174{
171 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 175 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
172 176
173 rdp->passed_quiesc_completed = rdp->gpnum - 1; 177 rdp->passed_quiesce_gpnum = rdp->gpnum;
174 barrier(); 178 barrier();
175 rdp->passed_quiesc = 1; 179 if (rdp->passed_quiesce == 0)
180 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
181 rdp->passed_quiesce = 1;
176} 182}
177 183
178/* 184/*
179 * Note a context switch. This is a quiescent state for RCU-sched, 185 * Note a context switch. This is a quiescent state for RCU-sched,
180 * and requires special handling for preemptible RCU. 186 * and requires special handling for preemptible RCU.
187 * The caller must have disabled preemption.
181 */ 188 */
182void rcu_note_context_switch(int cpu) 189void rcu_note_context_switch(int cpu)
183{ 190{
191 trace_rcu_utilization("Start context switch");
184 rcu_sched_qs(cpu); 192 rcu_sched_qs(cpu);
185 rcu_preempt_note_context_switch(cpu); 193 rcu_preempt_note_context_switch(cpu);
194 trace_rcu_utilization("End context switch");
186} 195}
187EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196EXPORT_SYMBOL_GPL(rcu_note_context_switch);
188 197
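rcu_sched_qs() and rcu_bh_qs() above record the grace-period number first and only then set passed_quiesce, with barrier() preventing the compiler from reordering the two stores, so any code that later observes the flag can trust the recorded number. The sketch below shows the same publish-after-payload discipline with C11 atomics in user space; it illustrates the ordering idea only and is not a model of the kernel's per-CPU accesses.

    /* Publish a value, then set a flag; consumers that see the flag see the value. */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static unsigned long recorded_gpnum;          /* payload: which GP the QS belongs to */
    static atomic_int    passed_qs;               /* flag: a quiescent state was recorded */

    static void *report_qs(void *arg)
    {
        recorded_gpnum = (unsigned long)(uintptr_t)arg;
        /* Release ordering plays the role of barrier(): the gpnum store cannot be
         * reordered after the flag store. */
        atomic_store_explicit(&passed_qs, 1, memory_order_release);
        return NULL;
    }

    int main(void)
    {
        pthread_t tid;

        pthread_create(&tid, NULL, report_qs, (void *)(uintptr_t)42);

        /* Acquire ordering pairs with the release above. */
        while (!atomic_load_explicit(&passed_qs, memory_order_acquire))
            ;                                     /* spin until the flag is published */
        printf("quiescent state recorded for gp %lu\n", recorded_gpnum);

        pthread_join(tid, NULL);
        return 0;
    }

Build with cc -std=c11 -pthread.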
@@ -193,7 +202,7 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
193}; 202};
194#endif /* #ifdef CONFIG_NO_HZ */ 203#endif /* #ifdef CONFIG_NO_HZ */
195 204
196static int blimit = 10; /* Maximum callbacks per softirq. */ 205static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
197static int qhimark = 10000; /* If this many pending, ignore blimit. */ 206static int qhimark = 10000; /* If this many pending, ignore blimit. */
198static int qlowmark = 100; /* Once only this many pending, use blimit. */ 207static int qlowmark = 100; /* Once only this many pending, use blimit. */
199 208
@@ -314,6 +323,7 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
314 * trust its state not to change because interrupts are disabled. 323 * trust its state not to change because interrupts are disabled.
315 */ 324 */
316 if (cpu_is_offline(rdp->cpu)) { 325 if (cpu_is_offline(rdp->cpu)) {
326 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
317 rdp->offline_fqs++; 327 rdp->offline_fqs++;
318 return 1; 328 return 1;
319 } 329 }
@@ -354,19 +364,13 @@ void rcu_enter_nohz(void)
354 local_irq_restore(flags); 364 local_irq_restore(flags);
355 return; 365 return;
356 } 366 }
367 trace_rcu_dyntick("Start");
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 368 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */ 369 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks); 370 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ 371 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 372 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
362 local_irq_restore(flags); 373 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
370} 374}
371 375
372/* 376/*
@@ -391,6 +395,7 @@ void rcu_exit_nohz(void)
391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 395 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
392 smp_mb__after_atomic_inc(); /* See above. */ 396 smp_mb__after_atomic_inc(); /* See above. */
393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 397 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
398 trace_rcu_dyntick("End");
394 local_irq_restore(flags); 399 local_irq_restore(flags);
395} 400}
396 401
@@ -481,11 +486,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
481 */ 486 */
482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 487static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
483{ 488{
484 unsigned long curr; 489 unsigned int curr;
485 unsigned long snap; 490 unsigned int snap;
486 491
487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); 492 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
488 snap = (unsigned long)rdp->dynticks_snap; 493 snap = (unsigned int)rdp->dynticks_snap;
489 494
490 /* 495 /*
491 * If the CPU passed through or entered a dynticks idle phase with 496 * If the CPU passed through or entered a dynticks idle phase with
@@ -495,7 +500,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
495 * read-side critical section that started before the beginning 500 * read-side critical section that started before the beginning
496 * of the current RCU grace period. 501 * of the current RCU grace period.
497 */ 502 */
498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { 503 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
504 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
499 rdp->dynticks_fqs++; 505 rdp->dynticks_fqs++;
500 return 1; 506 return 1;
501 } 507 }
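The hunk above narrows the dyntick snapshot comparison to unsigned int but keeps the two-part test: ->dynticks is incremented on every idle entry and exit (the WARN_ON_ONCE() checks in rcu_enter_nohz()/rcu_exit_nohz() above enforce the parity), so an even value means the CPU is idle right now, and a counter that moved by at least two since the snapshot means the CPU passed through idle in the meantime; either way it cannot be inside a read-side critical section that predates the snapshot. A small single-threaded model of that check, with illustrative names:

    /* Model of the dyntick-idle quiescent-state check: the counter is bumped on every
     * idle entry and exit, so "even" means idle now, and "advanced by >= 2" means the
     * CPU went through idle since the snapshot was taken. */
    #include <assert.h>
    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define CMP_GE(a, b)    (UINT_MAX / 2 >= (unsigned int)((a) - (b)))

    static unsigned int dynticks;       /* odd = CPU active, even = CPU idle */

    static void enter_idle(void) { dynticks++; assert((dynticks & 0x1) == 0); }
    static void exit_idle(void)  { dynticks++; assert((dynticks & 0x1) == 1); }

    static bool in_quiescent_state(unsigned int curr, unsigned int snap)
    {
        return (curr & 0x1) == 0 || CMP_GE(curr, snap + 2);
    }

    int main(void)
    {
        exit_idle();                            /* leave boot-time idle: CPU now running */
        unsigned int snap = dynticks;           /* snapshot while the CPU is active */

        assert(!in_quiescent_state(dynticks, snap));  /* nothing has happened yet */

        enter_idle();                           /* even counter: idle right now */
        assert(in_quiescent_state(dynticks, snap));

        exit_idle();                            /* back to work, but it was idle... */
        assert(in_quiescent_state(dynticks, snap));   /* ...so the counter moved by >= 2 */

        printf("quiescent state observed via dyntick counter\n");
        return 0;
    }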
@@ -537,6 +543,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
537 int cpu; 543 int cpu;
538 long delta; 544 long delta;
539 unsigned long flags; 545 unsigned long flags;
546 int ndetected;
540 struct rcu_node *rnp = rcu_get_root(rsp); 547 struct rcu_node *rnp = rcu_get_root(rsp);
541 548
542 /* Only let one CPU complain about others per time interval. */ 549 /* Only let one CPU complain about others per time interval. */
@@ -553,7 +560,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
553 * Now rat on any tasks that got kicked up to the root rcu_node 560 * Now rat on any tasks that got kicked up to the root rcu_node
554 * due to CPU offlining. 561 * due to CPU offlining.
555 */ 562 */
556 rcu_print_task_stall(rnp); 563 ndetected = rcu_print_task_stall(rnp);
557 raw_spin_unlock_irqrestore(&rnp->lock, flags); 564 raw_spin_unlock_irqrestore(&rnp->lock, flags);
558 565
559 /* 566 /*
@@ -565,17 +572,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
565 rsp->name); 572 rsp->name);
566 rcu_for_each_leaf_node(rsp, rnp) { 573 rcu_for_each_leaf_node(rsp, rnp) {
567 raw_spin_lock_irqsave(&rnp->lock, flags); 574 raw_spin_lock_irqsave(&rnp->lock, flags);
568 rcu_print_task_stall(rnp); 575 ndetected += rcu_print_task_stall(rnp);
569 raw_spin_unlock_irqrestore(&rnp->lock, flags); 576 raw_spin_unlock_irqrestore(&rnp->lock, flags);
570 if (rnp->qsmask == 0) 577 if (rnp->qsmask == 0)
571 continue; 578 continue;
572 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 579 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
573 if (rnp->qsmask & (1UL << cpu)) 580 if (rnp->qsmask & (1UL << cpu)) {
574 printk(" %d", rnp->grplo + cpu); 581 printk(" %d", rnp->grplo + cpu);
582 ndetected++;
583 }
575 } 584 }
576 printk("} (detected by %d, t=%ld jiffies)\n", 585 printk("} (detected by %d, t=%ld jiffies)\n",
577 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 586 smp_processor_id(), (long)(jiffies - rsp->gp_start));
578 trigger_all_cpu_backtrace(); 587 if (ndetected == 0)
588 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
589 else if (!trigger_all_cpu_backtrace())
590 dump_stack();
579 591
580 /* If so configured, complain about tasks blocking the grace period. */ 592 /* If so configured, complain about tasks blocking the grace period. */
581 593
@@ -596,7 +608,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
596 */ 608 */
597 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 609 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
598 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 610 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
599 trigger_all_cpu_backtrace(); 611 if (!trigger_all_cpu_backtrace())
612 dump_stack();
600 613
601 raw_spin_lock_irqsave(&rnp->lock, flags); 614 raw_spin_lock_irqsave(&rnp->lock, flags);
602 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 615 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
@@ -678,9 +691,10 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
678 * go looking for one. 691 * go looking for one.
679 */ 692 */
680 rdp->gpnum = rnp->gpnum; 693 rdp->gpnum = rnp->gpnum;
694 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
681 if (rnp->qsmask & rdp->grpmask) { 695 if (rnp->qsmask & rdp->grpmask) {
682 rdp->qs_pending = 1; 696 rdp->qs_pending = 1;
683 rdp->passed_quiesc = 0; 697 rdp->passed_quiesce = 0;
684 } else 698 } else
685 rdp->qs_pending = 0; 699 rdp->qs_pending = 0;
686 } 700 }
@@ -741,6 +755,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
741 755
742 /* Remember that we saw this grace-period completion. */ 756 /* Remember that we saw this grace-period completion. */
743 rdp->completed = rnp->completed; 757 rdp->completed = rnp->completed;
758 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
744 759
745 /* 760 /*
746 * If we were in an extended quiescent state, we may have 761 * If we were in an extended quiescent state, we may have
@@ -826,31 +841,31 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
826 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 841 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
827 struct rcu_node *rnp = rcu_get_root(rsp); 842 struct rcu_node *rnp = rcu_get_root(rsp);
828 843
829 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 844 if (!rcu_scheduler_fully_active ||
830 if (cpu_needs_another_gp(rsp, rdp)) 845 !cpu_needs_another_gp(rsp, rdp)) {
831 rsp->fqs_need_gp = 1; 846 /*
832 if (rnp->completed == rsp->completed) { 847 * Either the scheduler hasn't yet spawned the first
833 raw_spin_unlock_irqrestore(&rnp->lock, flags); 848 * non-idle task or this CPU does not need another
834 return; 849 * grace period. Either way, don't start a new grace
835 } 850 * period.
836 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 851 */
852 raw_spin_unlock_irqrestore(&rnp->lock, flags);
853 return;
854 }
837 855
856 if (rsp->fqs_active) {
838 /* 857 /*
839 * Propagate new ->completed value to rcu_node structures 858 * This CPU needs a grace period, but force_quiescent_state()
840 * so that other CPUs don't have to wait until the start 859 * is running. Tell it to start one on this CPU's behalf.
841 * of the next grace period to process their callbacks.
842 */ 860 */
843 rcu_for_each_node_breadth_first(rsp, rnp) { 861 rsp->fqs_need_gp = 1;
844 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 862 raw_spin_unlock_irqrestore(&rnp->lock, flags);
845 rnp->completed = rsp->completed;
846 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
847 }
848 local_irq_restore(flags);
849 return; 863 return;
850 } 864 }
851 865
852 /* Advance to a new grace period and initialize state. */ 866 /* Advance to a new grace period and initialize state. */
853 rsp->gpnum++; 867 rsp->gpnum++;
868 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
854 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 869 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
855 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 870 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
856 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 871 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
@@ -865,6 +880,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 880 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
866 rcu_start_gp_per_cpu(rsp, rnp, rdp); 881 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp); 882 rcu_preempt_boost_start_gp(rnp);
883 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
884 rnp->level, rnp->grplo,
885 rnp->grphi, rnp->qsmask);
868 raw_spin_unlock_irqrestore(&rnp->lock, flags); 886 raw_spin_unlock_irqrestore(&rnp->lock, flags);
869 return; 887 return;
870 } 888 }
@@ -901,6 +919,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
901 if (rnp == rdp->mynode) 919 if (rnp == rdp->mynode)
902 rcu_start_gp_per_cpu(rsp, rnp, rdp); 920 rcu_start_gp_per_cpu(rsp, rnp, rdp);
903 rcu_preempt_boost_start_gp(rnp); 921 rcu_preempt_boost_start_gp(rnp);
922 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
923 rnp->level, rnp->grplo,
924 rnp->grphi, rnp->qsmask);
904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 925 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
905 } 926 }
906 927
@@ -922,6 +943,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
922 __releases(rcu_get_root(rsp)->lock) 943 __releases(rcu_get_root(rsp)->lock)
923{ 944{
924 unsigned long gp_duration; 945 unsigned long gp_duration;
946 struct rcu_node *rnp = rcu_get_root(rsp);
947 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
925 948
926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 949 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
927 950
@@ -933,7 +956,41 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
933 gp_duration = jiffies - rsp->gp_start; 956 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max) 957 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration; 958 rsp->gp_max = gp_duration;
936 rsp->completed = rsp->gpnum; 959
960 /*
961 * We know the grace period is complete, but to everyone else
962 * it appears to still be ongoing. But it is also the case
963 * that to everyone else it looks like there is nothing that
964 * they can do to advance the grace period. It is therefore
965 * safe for us to drop the lock in order to mark the grace
966 * period as completed in all of the rcu_node structures.
967 *
968 * But if this CPU needs another grace period, it will take
969 * care of this while initializing the next grace period.
970 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
971 * because the callbacks have not yet been advanced: Those
972 * callbacks are waiting on the grace period that just now
973 * completed.
974 */
975 if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
976 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
977
978 /*
979 * Propagate new ->completed value to rcu_node structures
980 * so that other CPUs don't have to wait until the start
981 * of the next grace period to process their callbacks.
982 */
983 rcu_for_each_node_breadth_first(rsp, rnp) {
984 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
985 rnp->completed = rsp->gpnum;
986 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
987 }
988 rnp = rcu_get_root(rsp);
989 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
990 }
991
992 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
993 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
937 rsp->signaled = RCU_GP_IDLE; 994 rsp->signaled = RCU_GP_IDLE;
938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 995 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
939} 996}
@@ -962,6 +1019,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
962 return; 1019 return;
963 } 1020 }
964 rnp->qsmask &= ~mask; 1021 rnp->qsmask &= ~mask;
1022 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
1023 mask, rnp->qsmask, rnp->level,
1024 rnp->grplo, rnp->grphi,
1025 !!rnp->gp_tasks);
965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 1026 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
966 1027
967 /* Other bits still set at this level, so done. */ 1028 /* Other bits still set at this level, so done. */
@@ -1000,7 +1061,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1000 * based on quiescent states detected in an earlier grace period! 1061 * based on quiescent states detected in an earlier grace period!
1001 */ 1062 */
1002static void 1063static void
1003rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 1064rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
1004{ 1065{
1005 unsigned long flags; 1066 unsigned long flags;
1006 unsigned long mask; 1067 unsigned long mask;
@@ -1008,17 +1069,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
1008 1069
1009 rnp = rdp->mynode; 1070 rnp = rdp->mynode;
1010 raw_spin_lock_irqsave(&rnp->lock, flags); 1071 raw_spin_lock_irqsave(&rnp->lock, flags);
1011 if (lastcomp != rnp->completed) { 1072 if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
1012 1073
1013 /* 1074 /*
1014 * Someone beat us to it for this grace period, so leave. 1075 * The grace period in which this quiescent state was
1015 * The race with GP start is resolved by the fact that we 1076 * recorded has ended, so don't report it upwards.
1016 * hold the leaf rcu_node lock, so that the per-CPU bits 1077 * We will instead need a new quiescent state that lies
1017 * cannot yet be initialized -- so we would simply find our 1078 * within the current grace period.
1018 * CPU's bit already cleared in rcu_report_qs_rnp() if this
1019 * race occurred.
1020 */ 1079 */
1021 rdp->passed_quiesc = 0; /* try again later! */ 1080 rdp->passed_quiesce = 0; /* need qs for new gp. */
1022 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1081 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1023 return; 1082 return;
1024 } 1083 }
@@ -1062,14 +1121,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1062 * Was there a quiescent state since the beginning of the grace 1121 * Was there a quiescent state since the beginning of the grace
1063 * period? If no, then exit and wait for the next call. 1122 * period? If no, then exit and wait for the next call.
1064 */ 1123 */
1065 if (!rdp->passed_quiesc) 1124 if (!rdp->passed_quiesce)
1066 return; 1125 return;
1067 1126
1068 /* 1127 /*
1069 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 1128 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1070 * judge of that). 1129 * judge of that).
1071 */ 1130 */
1072 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 1131 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
1073} 1132}
1074 1133
1075#ifdef CONFIG_HOTPLUG_CPU 1134#ifdef CONFIG_HOTPLUG_CPU
@@ -1130,11 +1189,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1130 if (rnp->qsmaskinit != 0) { 1189 if (rnp->qsmaskinit != 0) {
1131 if (rnp != rdp->mynode) 1190 if (rnp != rdp->mynode)
1132 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1191 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1192 else
1193 trace_rcu_grace_period(rsp->name,
1194 rnp->gpnum + 1 -
1195 !!(rnp->qsmask & mask),
1196 "cpuofl");
1133 break; 1197 break;
1134 } 1198 }
1135 if (rnp == rdp->mynode) 1199 if (rnp == rdp->mynode) {
1200 trace_rcu_grace_period(rsp->name,
1201 rnp->gpnum + 1 -
1202 !!(rnp->qsmask & mask),
1203 "cpuofl");
1136 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1204 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
1137 else 1205 } else
1138 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1206 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1139 mask = rnp->grpmask; 1207 mask = rnp->grpmask;
1140 rnp = rnp->parent; 1208 rnp = rnp->parent;
@@ -1190,17 +1258,22 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1190{ 1258{
1191 unsigned long flags; 1259 unsigned long flags;
1192 struct rcu_head *next, *list, **tail; 1260 struct rcu_head *next, *list, **tail;
1193 int count; 1261 int bl, count;
1194 1262
1195 /* If no callbacks are ready, just return.*/ 1263 /* If no callbacks are ready, just return.*/
1196 if (!cpu_has_callbacks_ready_to_invoke(rdp)) 1264 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1265 trace_rcu_batch_start(rsp->name, 0, 0);
1266 trace_rcu_batch_end(rsp->name, 0);
1197 return; 1267 return;
1268 }
1198 1269
1199 /* 1270 /*
1200 * Extract the list of ready callbacks, disabling to prevent 1271 * Extract the list of ready callbacks, disabling to prevent
1201 * races with call_rcu() from interrupt handlers. 1272 * races with call_rcu() from interrupt handlers.
1202 */ 1273 */
1203 local_irq_save(flags); 1274 local_irq_save(flags);
1275 bl = rdp->blimit;
1276 trace_rcu_batch_start(rsp->name, rdp->qlen, bl);
1204 list = rdp->nxtlist; 1277 list = rdp->nxtlist;
1205 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1278 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1206 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1279 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1216,13 +1289,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1216 next = list->next; 1289 next = list->next;
1217 prefetch(next); 1290 prefetch(next);
1218 debug_rcu_head_unqueue(list); 1291 debug_rcu_head_unqueue(list);
1219 __rcu_reclaim(list); 1292 __rcu_reclaim(rsp->name, list);
1220 list = next; 1293 list = next;
1221 if (++count >= rdp->blimit) 1294 if (++count >= bl)
1222 break; 1295 break;
1223 } 1296 }
1224 1297
1225 local_irq_save(flags); 1298 local_irq_save(flags);
1299 trace_rcu_batch_end(rsp->name, count);
1226 1300
1227 /* Update count, and requeue any remaining callbacks. */ 1301 /* Update count, and requeue any remaining callbacks. */
1228 rdp->qlen -= count; 1302 rdp->qlen -= count;
@@ -1250,7 +1324,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1250 1324
1251 local_irq_restore(flags); 1325 local_irq_restore(flags);
1252 1326
1253 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1327 /* Re-invoke RCU core processing if there are callbacks remaining. */
1254 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1328 if (cpu_has_callbacks_ready_to_invoke(rdp))
1255 invoke_rcu_core(); 1329 invoke_rcu_core();
1256} 1330}
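rcu_do_batch() above now snapshots ->blimit into a local variable and brackets the work with trace_rcu_batch_start()/trace_rcu_batch_end(), but the shape of the loop is unchanged: detach the ready callbacks, invoke at most the batch limit, and requeue the remainder so a flood of callbacks cannot monopolize the CPU. A self-contained sketch of that batching pattern on a plain singly linked list; the types here are invented for the example, and the limit of 10 simply mirrors the blimit default seen earlier in this diff.

    /* Drain at most "blimit" callbacks per pass; requeue the remainder. */
    #include <stdio.h>
    #include <stdlib.h>

    struct cb {
        struct cb *next;
        void (*func)(struct cb *);
    };

    static struct cb *ready_list;            /* callbacks whose grace period has ended */

    static int do_batch(int blimit)
    {
        struct cb *list = ready_list;        /* detach the whole ready list */
        int count = 0;

        ready_list = NULL;
        while (list) {
            struct cb *next = list->next;    /* fetch next before the callback frees us */

            list->func(list);                /* invoke one callback */
            list = next;
            if (++count >= blimit)           /* honor the batch limit */
                break;
        }
        /* Requeue anything we did not get to; a later pass will finish it. */
        if (list) {
            struct cb *tail = list;

            while (tail->next)
                tail = tail->next;
            tail->next = ready_list;
            ready_list = list;
        }
        return count;
    }

    static void free_cb(struct cb *cb) { free(cb); }

    int main(void)
    {
        for (int i = 0; i < 25; i++) {       /* queue 25 dummy callbacks */
            struct cb *cb = malloc(sizeof(*cb));

            cb->func = free_cb;
            cb->next = ready_list;
            ready_list = cb;
        }
        while (ready_list)
            printf("invoked %d callbacks this pass\n", do_batch(10));
        return 0;
    }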
@@ -1258,7 +1332,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1258/* 1332/*
1259 * Check to see if this CPU is in a non-context-switch quiescent state 1333 * Check to see if this CPU is in a non-context-switch quiescent state
1260 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1334 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1261 * Also schedule the RCU softirq handler. 1335 * Also schedule RCU core processing.
1262 * 1336 *
1263 * This function must be called with hardirqs disabled. It is normally 1337 * This function must be called with hardirqs disabled. It is normally
1264 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1338 * invoked from the scheduling-clock interrupt. If rcu_pending returns
@@ -1266,6 +1340,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1266 */ 1340 */
1267void rcu_check_callbacks(int cpu, int user) 1341void rcu_check_callbacks(int cpu, int user)
1268{ 1342{
1343 trace_rcu_utilization("Start scheduler-tick");
1269 if (user || 1344 if (user ||
1270 (idle_cpu(cpu) && rcu_scheduler_active && 1345 (idle_cpu(cpu) && rcu_scheduler_active &&
1271 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1346 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1299,6 +1374,7 @@ void rcu_check_callbacks(int cpu, int user)
1299 rcu_preempt_check_callbacks(cpu); 1374 rcu_preempt_check_callbacks(cpu);
1300 if (rcu_pending(cpu)) 1375 if (rcu_pending(cpu))
1301 invoke_rcu_core(); 1376 invoke_rcu_core();
1377 trace_rcu_utilization("End scheduler-tick");
1302} 1378}
1303 1379
1304#ifdef CONFIG_SMP 1380#ifdef CONFIG_SMP
@@ -1360,10 +1436,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1360 unsigned long flags; 1436 unsigned long flags;
1361 struct rcu_node *rnp = rcu_get_root(rsp); 1437 struct rcu_node *rnp = rcu_get_root(rsp);
1362 1438
1363 if (!rcu_gp_in_progress(rsp)) 1439 trace_rcu_utilization("Start fqs");
1440 if (!rcu_gp_in_progress(rsp)) {
1441 trace_rcu_utilization("End fqs");
1364 return; /* No grace period in progress, nothing to force. */ 1442 return; /* No grace period in progress, nothing to force. */
1443 }
1365 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { 1444 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1366 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1445 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1446 trace_rcu_utilization("End fqs");
1367 return; /* Someone else is already on the job. */ 1447 return; /* Someone else is already on the job. */
1368 } 1448 }
1369 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) 1449 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
@@ -1412,11 +1492,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1412 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ 1492 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1413 rsp->fqs_need_gp = 0; 1493 rsp->fqs_need_gp = 0;
1414 rcu_start_gp(rsp, flags); /* releases rnp->lock */ 1494 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1495 trace_rcu_utilization("End fqs");
1415 return; 1496 return;
1416 } 1497 }
1417 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1498 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1418unlock_fqs_ret: 1499unlock_fqs_ret:
1419 raw_spin_unlock_irqrestore(&rsp->fqslock, flags); 1500 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1501 trace_rcu_utilization("End fqs");
1420} 1502}
1421 1503
1422#else /* #ifdef CONFIG_SMP */ 1504#else /* #ifdef CONFIG_SMP */
@@ -1429,9 +1511,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1429#endif /* #else #ifdef CONFIG_SMP */ 1511#endif /* #else #ifdef CONFIG_SMP */
1430 1512
1431/* 1513/*
1432 * This does the RCU processing work from softirq context for the 1514 * This does the RCU core processing work for the specified rcu_state
1433 * specified rcu_state and rcu_data structures. This may be called 1515 * and rcu_data structures. This may be called only from the CPU to
1434 * only from the CPU to whom the rdp belongs. 1516 * whom the rdp belongs.
1435 */ 1517 */
1436static void 1518static void
1437__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1519__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -1468,24 +1550,24 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1468} 1550}
1469 1551
1470/* 1552/*
1471 * Do softirq processing for the current CPU. 1553 * Do RCU core processing for the current CPU.
1472 */ 1554 */
1473static void rcu_process_callbacks(struct softirq_action *unused) 1555static void rcu_process_callbacks(struct softirq_action *unused)
1474{ 1556{
1557 trace_rcu_utilization("Start RCU core");
1475 __rcu_process_callbacks(&rcu_sched_state, 1558 __rcu_process_callbacks(&rcu_sched_state,
1476 &__get_cpu_var(rcu_sched_data)); 1559 &__get_cpu_var(rcu_sched_data));
1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1560 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1478 rcu_preempt_process_callbacks(); 1561 rcu_preempt_process_callbacks();
1479 1562 trace_rcu_utilization("End RCU core");
1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1481 rcu_needs_cpu_flush();
1482} 1563}
1483 1564
1484/* 1565/*
1485 * Wake up the current CPU's kthread. This replaces raise_softirq() 1566 * Schedule RCU callback invocation. If the specified type of RCU
1486 * in earlier versions of RCU. Note that because we are running on 1567 * does not support RCU priority boosting, just do a direct call,
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task 1568 * otherwise wake up the per-CPU kernel kthread. Note that because we
1488 * cannot disappear out from under us. 1569 * are running on the current CPU with interrupts disabled, the
1570 * rcu_cpu_kthread_task cannot disappear out from under us.
1489 */ 1571 */
1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1572static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1491{ 1573{
@@ -1530,6 +1612,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1612 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++; 1613 rdp->qlen++;
1532 1614
1615 if (__is_kfree_rcu_offset((unsigned long)func))
1616 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
1617 rdp->qlen);
1618 else
1619 trace_rcu_callback(rsp->name, head, rdp->qlen);
1620
1533 /* If interrupts were disabled, don't dive into RCU core. */ 1621 /* If interrupts were disabled, don't dive into RCU core. */
1534 if (irqs_disabled_flags(flags)) { 1622 if (irqs_disabled_flags(flags)) {
1535 local_irq_restore(flags); 1623 local_irq_restore(flags);
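The new tracing in __call_rcu() above distinguishes kfree_rcu()-style callbacks by inspecting the "function pointer": kfree_rcu() passes the offset of the rcu_head within its enclosing structure through that slot, and __is_kfree_rcu_offset() treats small values (roughly, anything below a page) as such offsets rather than code addresses. The stand-alone sketch below shows the encoding trick in isolation; the container type, the 4096 threshold, and the reclaim() helper are assumptions made for the example, not the kernel definitions.

    /* Encode "free the enclosing object" as a small offset stored in the callback slot. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct rcu_head_model {
        struct rcu_head_model *next;
        void (*func)(struct rcu_head_model *);
    };

    struct widget {
        int payload;
        struct rcu_head_model rh;             /* embedded callback head */
    };

    /* Small values cannot be valid code addresses, so treat them as offsets. */
    static int is_kfree_offset(uintptr_t v) { return v < 4096; }

    static void reclaim(struct rcu_head_model *head)
    {
        uintptr_t f = (uintptr_t)head->func;

        if (is_kfree_offset(f))
            free((char *)head - f);            /* back up to the enclosing object */
        else
            head->func(head);                  /* a real callback: just call it */
    }

    int main(void)
    {
        struct widget *w = malloc(sizeof(*w));

        /* The same trick kfree_rcu() uses: store offsetof(...) as the "function". */
        w->rh.func = (void (*)(struct rcu_head_model *))(uintptr_t)
                     offsetof(struct widget, rh);
        reclaim(&w->rh);                       /* frees the whole widget */
        printf("widget reclaimed via offset-encoded callback\n");
        return 0;
    }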
@@ -1613,18 +1701,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1613 */ 1701 */
1614void synchronize_sched(void) 1702void synchronize_sched(void)
1615{ 1703{
1616 struct rcu_synchronize rcu;
1617
1618 if (rcu_blocking_is_gp()) 1704 if (rcu_blocking_is_gp())
1619 return; 1705 return;
1620 1706 wait_rcu_gp(call_rcu_sched);
1621 init_rcu_head_on_stack(&rcu.head);
1622 init_completion(&rcu.completion);
1623 /* Will wake me after RCU finished. */
1624 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1625 /* Wait for it. */
1626 wait_for_completion(&rcu.completion);
1627 destroy_rcu_head_on_stack(&rcu.head);
1628} 1707}
1629EXPORT_SYMBOL_GPL(synchronize_sched); 1708EXPORT_SYMBOL_GPL(synchronize_sched);
1630 1709
@@ -1639,18 +1718,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
1639 */ 1718 */
1640void synchronize_rcu_bh(void) 1719void synchronize_rcu_bh(void)
1641{ 1720{
1642 struct rcu_synchronize rcu;
1643
1644 if (rcu_blocking_is_gp()) 1721 if (rcu_blocking_is_gp())
1645 return; 1722 return;
1646 1723 wait_rcu_gp(call_rcu_bh);
1647 init_rcu_head_on_stack(&rcu.head);
1648 init_completion(&rcu.completion);
1649 /* Will wake me after RCU finished. */
1650 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1651 /* Wait for it. */
1652 wait_for_completion(&rcu.completion);
1653 destroy_rcu_head_on_stack(&rcu.head);
1654} 1724}
1655EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1725EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1656 1726
@@ -1671,7 +1741,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1671 check_cpu_stall(rsp, rdp); 1741 check_cpu_stall(rsp, rdp);
1672 1742
1673 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1743 /* Is the RCU core waiting for a quiescent state from this CPU? */
1674 if (rdp->qs_pending && !rdp->passed_quiesc) { 1744 if (rcu_scheduler_fully_active &&
1745 rdp->qs_pending && !rdp->passed_quiesce) {
1675 1746
1676 /* 1747 /*
1677 * If force_quiescent_state() coming soon and this CPU 1748 * If force_quiescent_state() coming soon and this CPU
@@ -1683,7 +1754,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1754 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1684 jiffies)) 1755 jiffies))
1685 set_need_resched(); 1756 set_need_resched();
1686 } else if (rdp->qs_pending && rdp->passed_quiesc) { 1757 } else if (rdp->qs_pending && rdp->passed_quiesce) {
1687 rdp->n_rp_report_qs++; 1758 rdp->n_rp_report_qs++;
1688 return 1; 1759 return 1;
1689 } 1760 }
@@ -1846,6 +1917,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1846 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1917 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1847#endif /* #ifdef CONFIG_NO_HZ */ 1918#endif /* #ifdef CONFIG_NO_HZ */
1848 rdp->cpu = cpu; 1919 rdp->cpu = cpu;
1920 rdp->rsp = rsp;
1849 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1921 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1850} 1922}
1851 1923
@@ -1865,8 +1937,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1865 1937
1866 /* Set up local state, ensuring consistent view of global state. */ 1938 /* Set up local state, ensuring consistent view of global state. */
1867 raw_spin_lock_irqsave(&rnp->lock, flags); 1939 raw_spin_lock_irqsave(&rnp->lock, flags);
1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1870 rdp->beenonline = 1; /* We have now been online. */ 1940 rdp->beenonline = 1; /* We have now been online. */
1871 rdp->preemptible = preemptible; 1941 rdp->preemptible = preemptible;
1872 rdp->qlen_last_fqs_check = 0; 1942 rdp->qlen_last_fqs_check = 0;
@@ -1891,9 +1961,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1891 rnp->qsmaskinit |= mask; 1961 rnp->qsmaskinit |= mask;
1892 mask = rnp->grpmask; 1962 mask = rnp->grpmask;
1893 if (rnp == rdp->mynode) { 1963 if (rnp == rdp->mynode) {
1894 rdp->gpnum = rnp->completed; /* if GP in progress... */ 1964 /*
1965 * If there is a grace period in progress, we will
1966 * set up to wait for it next time we run the
1967 * RCU core code.
1968 */
1969 rdp->gpnum = rnp->completed;
1895 rdp->completed = rnp->completed; 1970 rdp->completed = rnp->completed;
1896 rdp->passed_quiesc_completed = rnp->completed - 1; 1971 rdp->passed_quiesce = 0;
1972 rdp->qs_pending = 0;
1973 rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
1974 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
1897 } 1975 }
1898 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 1976 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1899 rnp = rnp->parent; 1977 rnp = rnp->parent;
@@ -1919,6 +1997,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1997 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1920 struct rcu_node *rnp = rdp->mynode; 1998 struct rcu_node *rnp = rdp->mynode;
1921 1999
2000 trace_rcu_utilization("Start CPU hotplug");
1922 switch (action) { 2001 switch (action) {
1923 case CPU_UP_PREPARE: 2002 case CPU_UP_PREPARE:
1924 case CPU_UP_PREPARE_FROZEN: 2003 case CPU_UP_PREPARE_FROZEN:
@@ -1954,6 +2033,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1954 default: 2033 default:
1955 break; 2034 break;
1956 } 2035 }
2036 trace_rcu_utilization("End CPU hotplug");
1957 return NOTIFY_OK; 2037 return NOTIFY_OK;
1958} 2038}
1959 2039
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 01b2ccda26f..849ce9ec51f 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -230,9 +230,9 @@ struct rcu_data {
230 /* in order to detect GP end. */ 230 /* in order to detect GP end. */
231 unsigned long gpnum; /* Highest gp number that this CPU */ 231 unsigned long gpnum; /* Highest gp number that this CPU */
232 /* is aware of having started. */ 232 /* is aware of having started. */
233 unsigned long passed_quiesc_completed; 233 unsigned long passed_quiesce_gpnum;
234 /* Value of completed at time of qs. */ 234 /* gpnum at time of quiescent state. */
235 bool passed_quiesc; /* User-mode/idle loop etc. */ 235 bool passed_quiesce; /* User-mode/idle loop etc. */
236 bool qs_pending; /* Core waits for quiesc state. */ 236 bool qs_pending; /* Core waits for quiesc state. */
237 bool beenonline; /* CPU online at least once. */ 237 bool beenonline; /* CPU online at least once. */
238 bool preemptible; /* Preemptible RCU? */ 238 bool preemptible; /* Preemptible RCU? */
@@ -299,6 +299,7 @@ struct rcu_data {
299 unsigned long n_rp_need_nothing; 299 unsigned long n_rp_need_nothing;
300 300
301 int cpu; 301 int cpu;
302 struct rcu_state *rsp;
302}; 303};
303 304
304/* Values for signaled field in struct rcu_state. */ 305/* Values for signaled field in struct rcu_state. */
@@ -417,6 +418,13 @@ extern struct rcu_state rcu_preempt_state;
417DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 418DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
418#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 419#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
419 420
421#ifdef CONFIG_RCU_BOOST
422DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
423DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
424DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
425DECLARE_PER_CPU(char, rcu_cpu_has_work);
426#endif /* #ifdef CONFIG_RCU_BOOST */
427
420#ifndef RCU_TREE_NONCORE 428#ifndef RCU_TREE_NONCORE
421 429
422/* Forward declarations for rcutree_plugin.h */ 430/* Forward declarations for rcutree_plugin.h */
@@ -430,7 +438,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
430static void rcu_stop_cpu_kthread(int cpu); 438static void rcu_stop_cpu_kthread(int cpu);
431#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 439#endif /* #ifdef CONFIG_HOTPLUG_CPU */
432static void rcu_print_detail_task_stall(struct rcu_state *rsp); 440static void rcu_print_detail_task_stall(struct rcu_state *rsp);
433static void rcu_print_task_stall(struct rcu_node *rnp); 441static int rcu_print_task_stall(struct rcu_node *rnp);
434static void rcu_preempt_stall_reset(void); 442static void rcu_preempt_stall_reset(void);
435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 443static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
436#ifdef CONFIG_HOTPLUG_CPU 444#ifdef CONFIG_HOTPLUG_CPU
@@ -450,7 +458,6 @@ static int rcu_preempt_needs_cpu(int cpu);
450static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 458static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
451static void rcu_preempt_send_cbs_to_online(void); 459static void rcu_preempt_send_cbs_to_online(void);
452static void __init __rcu_init_preempt(void); 460static void __init __rcu_init_preempt(void);
453static void rcu_needs_cpu_flush(void);
454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 461static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 462static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
456static void invoke_rcu_callbacks_kthread(void); 463static void invoke_rcu_callbacks_kthread(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8aafbb80b8b..4b9b9f8a418 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -27,6 +27,14 @@
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h> 28#include <linux/stop_machine.h>
29 29
30#define RCU_KTHREAD_PRIO 1
31
32#ifdef CONFIG_RCU_BOOST
33#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
34#else
35#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
36#endif
37
30/* 38/*
31 * Check the RCU kernel configuration parameters and print informative 39 * Check the RCU kernel configuration parameters and print informative
32 * messages about anything out of the ordinary. If you like #ifdef, you 40 * messages about anything out of the ordinary. If you like #ifdef, you
@@ -64,7 +72,7 @@ static void __init rcu_bootup_announce_oddness(void)
64 72
65#ifdef CONFIG_TREE_PREEMPT_RCU 73#ifdef CONFIG_TREE_PREEMPT_RCU
66 74
67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 75struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 76DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state; 77static struct rcu_state *rcu_state = &rcu_preempt_state;
70 78
@@ -122,9 +130,11 @@ static void rcu_preempt_qs(int cpu)
122{ 130{
123 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 131 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
124 132
125 rdp->passed_quiesc_completed = rdp->gpnum - 1; 133 rdp->passed_quiesce_gpnum = rdp->gpnum;
126 barrier(); 134 barrier();
127 rdp->passed_quiesc = 1; 135 if (rdp->passed_quiesce == 0)
136 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
137 rdp->passed_quiesce = 1;
128 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 138 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
129} 139}
130 140
@@ -190,6 +200,11 @@ static void rcu_preempt_note_context_switch(int cpu)
190 if (rnp->qsmask & rdp->grpmask) 200 if (rnp->qsmask & rdp->grpmask)
191 rnp->gp_tasks = &t->rcu_node_entry; 201 rnp->gp_tasks = &t->rcu_node_entry;
192 } 202 }
203 trace_rcu_preempt_task(rdp->rsp->name,
204 t->pid,
205 (rnp->qsmask & rdp->grpmask)
206 ? rnp->gpnum
207 : rnp->gpnum + 1);
193 raw_spin_unlock_irqrestore(&rnp->lock, flags); 208 raw_spin_unlock_irqrestore(&rnp->lock, flags);
194 } else if (t->rcu_read_lock_nesting < 0 && 209 } else if (t->rcu_read_lock_nesting < 0 &&
195 t->rcu_read_unlock_special) { 210 t->rcu_read_unlock_special) {
@@ -299,6 +314,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
299 int empty_exp; 314 int empty_exp;
300 unsigned long flags; 315 unsigned long flags;
301 struct list_head *np; 316 struct list_head *np;
317#ifdef CONFIG_RCU_BOOST
318 struct rt_mutex *rbmp = NULL;
319#endif /* #ifdef CONFIG_RCU_BOOST */
302 struct rcu_node *rnp; 320 struct rcu_node *rnp;
303 int special; 321 int special;
304 322
@@ -344,6 +362,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
344 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 362 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
345 np = rcu_next_node_entry(t, rnp); 363 np = rcu_next_node_entry(t, rnp);
346 list_del_init(&t->rcu_node_entry); 364 list_del_init(&t->rcu_node_entry);
365 t->rcu_blocked_node = NULL;
366 trace_rcu_unlock_preempted_task("rcu_preempt",
367 rnp->gpnum, t->pid);
347 if (&t->rcu_node_entry == rnp->gp_tasks) 368 if (&t->rcu_node_entry == rnp->gp_tasks)
348 rnp->gp_tasks = np; 369 rnp->gp_tasks = np;
349 if (&t->rcu_node_entry == rnp->exp_tasks) 370 if (&t->rcu_node_entry == rnp->exp_tasks)
@@ -351,30 +372,34 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
351#ifdef CONFIG_RCU_BOOST 372#ifdef CONFIG_RCU_BOOST
352 if (&t->rcu_node_entry == rnp->boost_tasks) 373 if (&t->rcu_node_entry == rnp->boost_tasks)
353 rnp->boost_tasks = np; 374 rnp->boost_tasks = np;
354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ 375 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
355 if (t->rcu_boosted) { 376 if (t->rcu_boost_mutex) {
356 special |= RCU_READ_UNLOCK_BOOSTED; 377 rbmp = t->rcu_boost_mutex;
357 t->rcu_boosted = 0; 378 t->rcu_boost_mutex = NULL;
358 } 379 }
359#endif /* #ifdef CONFIG_RCU_BOOST */ 380#endif /* #ifdef CONFIG_RCU_BOOST */
360 t->rcu_blocked_node = NULL;
361 381
362 /* 382 /*
363 * If this was the last task on the current list, and if 383 * If this was the last task on the current list, and if
364 * we aren't waiting on any CPUs, report the quiescent state. 384 * we aren't waiting on any CPUs, report the quiescent state.
365 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 385 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
366 */ 386 */
367 if (empty) 387 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
368 raw_spin_unlock_irqrestore(&rnp->lock, flags); 388 trace_rcu_quiescent_state_report("preempt_rcu",
369 else 389 rnp->gpnum,
390 0, rnp->qsmask,
391 rnp->level,
392 rnp->grplo,
393 rnp->grphi,
394 !!rnp->gp_tasks);
370 rcu_report_unblock_qs_rnp(rnp, flags); 395 rcu_report_unblock_qs_rnp(rnp, flags);
396 } else
397 raw_spin_unlock_irqrestore(&rnp->lock, flags);
371 398
372#ifdef CONFIG_RCU_BOOST 399#ifdef CONFIG_RCU_BOOST
373 /* Unboost if we were boosted. */ 400 /* Unboost if we were boosted. */
374 if (special & RCU_READ_UNLOCK_BOOSTED) { 401 if (rbmp)
375 rt_mutex_unlock(t->rcu_boost_mutex); 402 rt_mutex_unlock(rbmp);
376 t->rcu_boost_mutex = NULL;
377 }
378#endif /* #ifdef CONFIG_RCU_BOOST */ 403#endif /* #ifdef CONFIG_RCU_BOOST */
379 404
380 /* 405 /*
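
Note: the rows above interleave the old and new version of this region, which makes the control flow hard to follow. Reconstructed from the new-side lines (whitespace approximated), the tail of rcu_read_unlock_special() reads as follows once the hunk is applied:

#ifdef CONFIG_RCU_BOOST
	if (&t->rcu_node_entry == rnp->boost_tasks)
		rnp->boost_tasks = np;
	/* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
	if (t->rcu_boost_mutex) {
		rbmp = t->rcu_boost_mutex;
		t->rcu_boost_mutex = NULL;
	}
#endif /* #ifdef CONFIG_RCU_BOOST */

	/*
	 * If this was the last task on the current list, and if
	 * we aren't waiting on any CPUs, report the quiescent state.
	 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
	 */
	if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
		trace_rcu_quiescent_state_report("preempt_rcu",
						 rnp->gpnum,
						 0, rnp->qsmask,
						 rnp->level,
						 rnp->grplo,
						 rnp->grphi,
						 !!rnp->gp_tasks);
		rcu_report_unblock_qs_rnp(rnp, flags);
	} else
		raw_spin_unlock_irqrestore(&rnp->lock, flags);

#ifdef CONFIG_RCU_BOOST
	/* Unboost if we were boosted. */
	if (rbmp)
		rt_mutex_unlock(rbmp);
#endif /* #ifdef CONFIG_RCU_BOOST */

Net effect: the per-task RCU_READ_UNLOCK_BOOSTED flag is gone; the boost mutex pointer is snapshotted and cleared while rnp->lock is held, and the actual rt_mutex_unlock() (the deboost) happens only after that lock has been dropped.
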
@@ -399,10 +424,10 @@ void __rcu_read_unlock(void)
399{ 424{
400 struct task_struct *t = current; 425 struct task_struct *t = current;
401 426
402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
403 if (t->rcu_read_lock_nesting != 1) 427 if (t->rcu_read_lock_nesting != 1)
404 --t->rcu_read_lock_nesting; 428 --t->rcu_read_lock_nesting;
405 else { 429 else {
430 barrier(); /* critical section before exit code. */
406 t->rcu_read_lock_nesting = INT_MIN; 431 t->rcu_read_lock_nesting = INT_MIN;
407 barrier(); /* assign before ->rcu_read_unlock_special load */ 432 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 433 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
@@ -466,16 +491,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
466 * Scan the current list of tasks blocked within RCU read-side critical 491 * Scan the current list of tasks blocked within RCU read-side critical
467 * sections, printing out the tid of each. 492 * sections, printing out the tid of each.
468 */ 493 */
469static void rcu_print_task_stall(struct rcu_node *rnp) 494static int rcu_print_task_stall(struct rcu_node *rnp)
470{ 495{
471 struct task_struct *t; 496 struct task_struct *t;
497 int ndetected = 0;
472 498
473 if (!rcu_preempt_blocked_readers_cgp(rnp)) 499 if (!rcu_preempt_blocked_readers_cgp(rnp))
474 return; 500 return 0;
475 t = list_entry(rnp->gp_tasks, 501 t = list_entry(rnp->gp_tasks,
476 struct task_struct, rcu_node_entry); 502 struct task_struct, rcu_node_entry);
477 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 503 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
478 printk(" P%d", t->pid); 504 printk(" P%d", t->pid);
505 ndetected++;
506 }
507 return ndetected;
479} 508}
480 509
481/* 510/*
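
Note: the signature change (void to int, returning ndetected) lets the stall-warning code in kernel/rcutree.c, which this series also touches, tell whether any blocked task was actually found. A plausible caller-side aggregation, with an illustrative message, would look like:

	struct rcu_node *rnp;
	int ndetected = 0;

	rcu_for_each_leaf_node(rsp, rnp)
		ndetected += rcu_print_task_stall(rnp);

	if (ndetected == 0)
		printk(KERN_ERR "INFO: no stalled tasks found on this pass\n");
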
@@ -656,18 +685,9 @@ EXPORT_SYMBOL_GPL(call_rcu);
656 */ 685 */
657void synchronize_rcu(void) 686void synchronize_rcu(void)
658{ 687{
659 struct rcu_synchronize rcu;
660
661 if (!rcu_scheduler_active) 688 if (!rcu_scheduler_active)
662 return; 689 return;
663 690 wait_rcu_gp(call_rcu);
664 init_rcu_head_on_stack(&rcu.head);
665 init_completion(&rcu.completion);
666 /* Will wake me after RCU finished. */
667 call_rcu(&rcu.head, wakeme_after_rcu);
668 /* Wait for it. */
669 wait_for_completion(&rcu.completion);
670 destroy_rcu_head_on_stack(&rcu.head);
671} 691}
672EXPORT_SYMBOL_GPL(synchronize_rcu); 692EXPORT_SYMBOL_GPL(synchronize_rcu);
673 693
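
Note: the deleted lines are the familiar open-coded "block until one grace period elapses" idiom; synchronize_rcu() now delegates to a shared wait_rcu_gp() helper, presumably factored into kernel/rcupdate.c (which this series also modifies). For reference, the pattern being factored out is:

	struct rcu_synchronize rcu;

	init_rcu_head_on_stack(&rcu.head);
	init_completion(&rcu.completion);
	call_rcu(&rcu.head, wakeme_after_rcu);	/* wake us after a grace period */
	wait_for_completion(&rcu.completion);	/* block until the callback has run */
	destroy_rcu_head_on_stack(&rcu.head);
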
@@ -968,8 +988,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
968 * Because preemptible RCU does not exist, we never have to check for 988 * Because preemptible RCU does not exist, we never have to check for
969 * tasks blocked within RCU read-side critical sections. 989 * tasks blocked within RCU read-side critical sections.
970 */ 990 */
971static void rcu_print_task_stall(struct rcu_node *rnp) 991static int rcu_print_task_stall(struct rcu_node *rnp)
972{ 992{
993 return 0;
973} 994}
974 995
975/* 996/*
@@ -1136,6 +1157,8 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1136 1157
1137#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1158#endif /* #else #ifdef CONFIG_RCU_TRACE */
1138 1159
1160static struct lock_class_key rcu_boost_class;
1161
1139/* 1162/*
1140 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1163 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1141 * or ->boost_tasks, advancing the pointer to the next task in the 1164 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1198,8 +1221,10 @@ static int rcu_boost(struct rcu_node *rnp)
1198 */ 1221 */
1199 t = container_of(tb, struct task_struct, rcu_node_entry); 1222 t = container_of(tb, struct task_struct, rcu_node_entry);
1200 rt_mutex_init_proxy_locked(&mtx, t); 1223 rt_mutex_init_proxy_locked(&mtx, t);
1224 /* Avoid lockdep false positives. This rt_mutex is its own thing. */
1225 lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
1226 "rcu_boost_mutex");
1201 t->rcu_boost_mutex = &mtx; 1227 t->rcu_boost_mutex = &mtx;
1202 t->rcu_boosted = 1;
1203 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1228 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1229 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1230 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
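
Note: the boost rt_mutex lives on the kthread's stack and is locked/unlocked in a pattern unlike every other rt_mutex, so its internal wait_lock gets a dedicated lockdep class and name to avoid false positives (lockdep generally wants the lock_class_key itself in static storage). A minimal sketch of the same pattern for any specially-used lock; my_key, owner_task and "my_special_lock" are illustrative names:

	static struct lock_class_key my_key;	/* static: one class per use site */
	struct rt_mutex mtx;

	rt_mutex_init_proxy_locked(&mtx, owner_task);
	lockdep_set_class_and_name(&mtx.wait_lock, &my_key, "my_special_lock");
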
@@ -1228,9 +1253,12 @@ static int rcu_boost_kthread(void *arg)
1228 int spincnt = 0; 1253 int spincnt = 0;
1229 int more2boost; 1254 int more2boost;
1230 1255
1256 trace_rcu_utilization("Start boost kthread@init");
1231 for (;;) { 1257 for (;;) {
1232 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1258 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1259 trace_rcu_utilization("End boost kthread@rcu_wait");
1233 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1260 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1261 trace_rcu_utilization("Start boost kthread@rcu_wait");
1234 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1262 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1235 more2boost = rcu_boost(rnp); 1263 more2boost = rcu_boost(rnp);
1236 if (more2boost) 1264 if (more2boost)
@@ -1238,11 +1266,14 @@ static int rcu_boost_kthread(void *arg)
1238 else 1266 else
1239 spincnt = 0; 1267 spincnt = 0;
1240 if (spincnt > 10) { 1268 if (spincnt > 10) {
1269 trace_rcu_utilization("End boost kthread@rcu_yield");
1241 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); 1270 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1271 trace_rcu_utilization("Start boost kthread@rcu_yield");
1242 spincnt = 0; 1272 spincnt = 0;
1243 } 1273 }
1244 } 1274 }
1245 /* NOTREACHED */ 1275 /* NOTREACHED */
1276 trace_rcu_utilization("End boost kthread@notreached");
1246 return 0; 1277 return 0;
1247} 1278}
1248 1279
@@ -1291,11 +1322,9 @@ static void invoke_rcu_callbacks_kthread(void)
1291 1322
1292 local_irq_save(flags); 1323 local_irq_save(flags);
1293 __this_cpu_write(rcu_cpu_has_work, 1); 1324 __this_cpu_write(rcu_cpu_has_work, 1);
1294 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { 1325 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1295 local_irq_restore(flags); 1326 current != __this_cpu_read(rcu_cpu_kthread_task))
1296 return; 1327 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1297 }
1298 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1299 local_irq_restore(flags); 1328 local_irq_restore(flags);
1300} 1329}
1301 1330
@@ -1343,13 +1372,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1343 if (rnp->boost_kthread_task != NULL) 1372 if (rnp->boost_kthread_task != NULL)
1344 return 0; 1373 return 0;
1345 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1374 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1346 "rcub%d", rnp_index); 1375 "rcub/%d", rnp_index);
1347 if (IS_ERR(t)) 1376 if (IS_ERR(t))
1348 return PTR_ERR(t); 1377 return PTR_ERR(t);
1349 raw_spin_lock_irqsave(&rnp->lock, flags); 1378 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rnp->boost_kthread_task = t; 1379 rnp->boost_kthread_task = t;
1351 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1380 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1352 sp.sched_priority = RCU_KTHREAD_PRIO; 1381 sp.sched_priority = RCU_BOOST_PRIO;
1353 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1382 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1354 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1383 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1355 return 0; 1384 return 0;
@@ -1444,6 +1473,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1444{ 1473{
1445 struct sched_param sp; 1474 struct sched_param sp;
1446 struct timer_list yield_timer; 1475 struct timer_list yield_timer;
1476 int prio = current->rt_priority;
1447 1477
1448 setup_timer_on_stack(&yield_timer, f, arg); 1478 setup_timer_on_stack(&yield_timer, f, arg);
1449 mod_timer(&yield_timer, jiffies + 2); 1479 mod_timer(&yield_timer, jiffies + 2);
@@ -1451,7 +1481,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1451 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); 1481 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452 set_user_nice(current, 19); 1482 set_user_nice(current, 19);
1453 schedule(); 1483 schedule();
1454 sp.sched_priority = RCU_KTHREAD_PRIO; 1484 set_user_nice(current, 0);
1485 sp.sched_priority = prio;
1455 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1486 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456 del_timer(&yield_timer); 1487 del_timer(&yield_timer);
1457} 1488}
@@ -1489,7 +1520,8 @@ static int rcu_cpu_kthread_should_stop(int cpu)
1489 1520
1490/* 1521/*
1491 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the 1522 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1492 * earlier RCU softirq. 1523 * RCU softirq used in flavors and configurations of RCU that do not
1524 * support RCU priority boosting.
1493 */ 1525 */
1494static int rcu_cpu_kthread(void *arg) 1526static int rcu_cpu_kthread(void *arg)
1495{ 1527{
@@ -1500,9 +1532,12 @@ static int rcu_cpu_kthread(void *arg)
1500 char work; 1532 char work;
1501 char *workp = &per_cpu(rcu_cpu_has_work, cpu); 1533 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1502 1534
1535 trace_rcu_utilization("Start CPU kthread@init");
1503 for (;;) { 1536 for (;;) {
1504 *statusp = RCU_KTHREAD_WAITING; 1537 *statusp = RCU_KTHREAD_WAITING;
1538 trace_rcu_utilization("End CPU kthread@rcu_wait");
1505 rcu_wait(*workp != 0 || kthread_should_stop()); 1539 rcu_wait(*workp != 0 || kthread_should_stop());
1540 trace_rcu_utilization("Start CPU kthread@rcu_wait");
1506 local_bh_disable(); 1541 local_bh_disable();
1507 if (rcu_cpu_kthread_should_stop(cpu)) { 1542 if (rcu_cpu_kthread_should_stop(cpu)) {
1508 local_bh_enable(); 1543 local_bh_enable();
@@ -1523,11 +1558,14 @@ static int rcu_cpu_kthread(void *arg)
1523 spincnt = 0; 1558 spincnt = 0;
1524 if (spincnt > 10) { 1559 if (spincnt > 10) {
1525 *statusp = RCU_KTHREAD_YIELDING; 1560 *statusp = RCU_KTHREAD_YIELDING;
1561 trace_rcu_utilization("End CPU kthread@rcu_yield");
1526 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); 1562 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1563 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1527 spincnt = 0; 1564 spincnt = 0;
1528 } 1565 }
1529 } 1566 }
1530 *statusp = RCU_KTHREAD_STOPPED; 1567 *statusp = RCU_KTHREAD_STOPPED;
1568 trace_rcu_utilization("End CPU kthread@term");
1531 return 0; 1569 return 0;
1532} 1570}
1533 1571
@@ -1560,7 +1598,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1560 if (!rcu_scheduler_fully_active || 1598 if (!rcu_scheduler_fully_active ||
1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL) 1599 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562 return 0; 1600 return 0;
1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); 1601 t = kthread_create_on_node(rcu_cpu_kthread,
1602 (void *)(long)cpu,
1603 cpu_to_node(cpu),
1604 "rcuc/%d", cpu);
1564 if (IS_ERR(t)) 1605 if (IS_ERR(t))
1565 return PTR_ERR(t); 1606 return PTR_ERR(t);
1566 if (cpu_online(cpu)) 1607 if (cpu_online(cpu))
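
Note: two things change here. The thread is renamed from "rcucN" to "rcuc/N", matching the usual per-CPU kthread convention (ksoftirqd/N, migration/N), and it is created with kthread_create_on_node() so its task_struct and stack are allocated on the NUMA node of the CPU it will serve. A hedged sketch of the general per-CPU helper pattern; my_thread_fn and "my_helper" are illustrative:

	struct task_struct *t;

	t = kthread_create_on_node(my_thread_fn, (void *)(long)cpu,
				   cpu_to_node(cpu), "my_helper/%d", cpu);
	if (IS_ERR(t))
		return PTR_ERR(t);
	kthread_bind(t, cpu);		/* optional: run only on that CPU */
	wake_up_process(t);
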
@@ -1669,7 +1710,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1669 return 0; 1710 return 0;
1670 if (rnp->node_kthread_task == NULL) { 1711 if (rnp->node_kthread_task == NULL) {
1671 t = kthread_create(rcu_node_kthread, (void *)rnp, 1712 t = kthread_create(rcu_node_kthread, (void *)rnp,
1672 "rcun%d", rnp_index); 1713 "rcun/%d", rnp_index);
1673 if (IS_ERR(t)) 1714 if (IS_ERR(t))
1674 return PTR_ERR(t); 1715 return PTR_ERR(t);
1675 raw_spin_lock_irqsave(&rnp->lock, flags); 1716 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1907,15 +1948,6 @@ int rcu_needs_cpu(int cpu)
1907 return rcu_needs_cpu_quick_check(cpu); 1948 return rcu_needs_cpu_quick_check(cpu);
1908} 1949}
1909 1950
1910/*
1911 * Check to see if we need to continue a callback-flush operations to
1912 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
1913 * entry is not configured, so we never do need to.
1914 */
1915static void rcu_needs_cpu_flush(void)
1916{
1917}
1918
1919#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1951#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1920 1952
1921#define RCU_NEEDS_CPU_FLUSHES 5 1953#define RCU_NEEDS_CPU_FLUSHES 5
@@ -1991,20 +2023,4 @@ int rcu_needs_cpu(int cpu)
1991 return c; 2023 return c;
1992} 2024}
1993 2025
1994/*
1995 * Check to see if we need to continue a callback-flush operations to
1996 * allow the last CPU to enter dyntick-idle mode.
1997 */
1998static void rcu_needs_cpu_flush(void)
1999{
2000 int cpu = smp_processor_id();
2001 unsigned long flags;
2002
2003 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
2004 return;
2005 local_irq_save(flags);
2006 (void)rcu_needs_cpu(cpu);
2007 local_irq_restore(flags);
2008}
2009
2010#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2026#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 3b0c0986afc..9feffa4c069 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -48,11 +48,6 @@
48 48
49#ifdef CONFIG_RCU_BOOST 49#ifdef CONFIG_RCU_BOOST
50 50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
56static char convert_kthread_status(unsigned int kthread_status) 51static char convert_kthread_status(unsigned int kthread_status)
57{ 52{
58 if (kthread_status > RCU_KTHREAD_MAX) 53 if (kthread_status > RCU_KTHREAD_MAX)
@@ -66,11 +61,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
66{ 61{
67 if (!rdp->beenonline) 62 if (!rdp->beenonline)
68 return; 63 return;
69 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", 64 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d",
70 rdp->cpu, 65 rdp->cpu,
71 cpu_is_offline(rdp->cpu) ? '!' : ' ', 66 cpu_is_offline(rdp->cpu) ? '!' : ' ',
72 rdp->completed, rdp->gpnum, 67 rdp->completed, rdp->gpnum,
73 rdp->passed_quiesc, rdp->passed_quiesc_completed, 68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
74 rdp->qs_pending); 69 rdp->qs_pending);
75#ifdef CONFIG_NO_HZ 70#ifdef CONFIG_NO_HZ
76 seq_printf(m, " dt=%d/%d/%d df=%lu", 71 seq_printf(m, " dt=%d/%d/%d df=%lu",
@@ -144,7 +139,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
144 rdp->cpu, 139 rdp->cpu,
145 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 140 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
146 rdp->completed, rdp->gpnum, 141 rdp->completed, rdp->gpnum,
147 rdp->passed_quiesc, rdp->passed_quiesc_completed, 142 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
148 rdp->qs_pending); 143 rdp->qs_pending);
149#ifdef CONFIG_NO_HZ 144#ifdef CONFIG_NO_HZ
150 seq_printf(m, ",%d,%d,%d,%lu", 145 seq_printf(m, ",%d,%d,%d,%lu",
@@ -175,7 +170,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
175 170
176static int show_rcudata_csv(struct seq_file *m, void *unused) 171static int show_rcudata_csv(struct seq_file *m, void *unused)
177{ 172{
178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 173 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
179#ifdef CONFIG_NO_HZ 174#ifdef CONFIG_NO_HZ
180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 175 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
181#endif /* #ifdef CONFIG_NO_HZ */ 176#endif /* #ifdef CONFIG_NO_HZ */
diff --git a/kernel/relay.c b/kernel/relay.c
index 859ea5a9605..226fade4d72 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -15,7 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stddef.h> 16#include <linux/stddef.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/module.h> 18#include <linux/export.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/relay.h> 20#include <linux/relay.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
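
Note: this one-line include change (and the matching ones in resource.c, rtmutex-debug.c, rtmutex-tester.c, rtmutex.c and rwsem.c below) is part of the tree-wide module.h split: files that only export symbols, but are not modules themselves, can include the much lighter <linux/export.h>. A minimal sketch, with my_helper as an illustrative symbol:

#include <linux/export.h>

int my_helper(void)
{
	return 0;
}
EXPORT_SYMBOL_GPL(my_helper);	/* no <linux/module.h> needed just for this */
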
diff --git a/kernel/resource.c b/kernel/resource.c
index 3b3cedc5259..7640b3a947d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,7 +7,7 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/errno.h> 11#include <linux/errno.h>
12#include <linux/ioport.h> 12#include <linux/ioport.h>
13#include <linux/init.h> 13#include <linux/init.h>
@@ -419,6 +419,9 @@ static int __find_resource(struct resource *root, struct resource *old,
419 else 419 else
420 tmp.end = root->end; 420 tmp.end = root->end;
421 421
422 if (tmp.end < tmp.start)
423 goto next;
424
422 resource_clip(&tmp, constraint->min, constraint->max); 425 resource_clip(&tmp, constraint->min, constraint->max);
423 arch_remove_reservations(&tmp); 426 arch_remove_reservations(&tmp);
424 427
@@ -436,8 +439,10 @@ static int __find_resource(struct resource *root, struct resource *old,
436 return 0; 439 return 0;
437 } 440 }
438 } 441 }
439 if (!this) 442
443next: if (!this || this->end == root->end)
440 break; 444 break;
445
441 if (this != old) 446 if (this != old)
442 tmp.start = this->end + 1; 447 tmp.start = this->end + 1;
443 this = this->sibling; 448 this = this->sibling;
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 3c7cbc2c33b..8eafd1bd273 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -18,7 +18,7 @@
18 */ 18 */
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/module.h> 21#include <linux/export.h>
22#include <linux/spinlock.h> 22#include <linux/spinlock.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
@@ -29,61 +29,6 @@
29 29
30#include "rtmutex_common.h" 30#include "rtmutex_common.h"
31 31
32# define TRACE_WARN_ON(x) WARN_ON(x)
33# define TRACE_BUG_ON(x) BUG_ON(x)
34
35# define TRACE_OFF() \
36do { \
37 if (rt_trace_on) { \
38 rt_trace_on = 0; \
39 console_verbose(); \
40 if (raw_spin_is_locked(&current->pi_lock)) \
41 raw_spin_unlock(&current->pi_lock); \
42 } \
43} while (0)
44
45# define TRACE_OFF_NOLOCK() \
46do { \
47 if (rt_trace_on) { \
48 rt_trace_on = 0; \
49 console_verbose(); \
50 } \
51} while (0)
52
53# define TRACE_BUG_LOCKED() \
54do { \
55 TRACE_OFF(); \
56 BUG(); \
57} while (0)
58
59# define TRACE_WARN_ON_LOCKED(c) \
60do { \
61 if (unlikely(c)) { \
62 TRACE_OFF(); \
63 WARN_ON(1); \
64 } \
65} while (0)
66
67# define TRACE_BUG_ON_LOCKED(c) \
68do { \
69 if (unlikely(c)) \
70 TRACE_BUG_LOCKED(); \
71} while (0)
72
73#ifdef CONFIG_SMP
74# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
75#else
76# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
77#endif
78
79/*
80 * deadlock detection flag. We turn it off when we detect
81 * the first problem because we dont want to recurse back
82 * into the tracing code when doing error printk or
83 * executing a BUG():
84 */
85static int rt_trace_on = 1;
86
87static void printk_task(struct task_struct *p) 32static void printk_task(struct task_struct *p)
88{ 33{
89 if (p) 34 if (p)
@@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
111 56
112void rt_mutex_debug_task_free(struct task_struct *task) 57void rt_mutex_debug_task_free(struct task_struct *task)
113{ 58{
114 WARN_ON(!plist_head_empty(&task->pi_waiters)); 59 DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
115 WARN_ON(task->pi_blocked_on); 60 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
116} 61}
117 62
118/* 63/*
@@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
125{ 70{
126 struct task_struct *task; 71 struct task_struct *task;
127 72
128 if (!rt_trace_on || detect || !act_waiter) 73 if (!debug_locks || detect || !act_waiter)
129 return; 74 return;
130 75
131 task = rt_mutex_owner(act_waiter->lock); 76 task = rt_mutex_owner(act_waiter->lock);
@@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
139{ 84{
140 struct task_struct *task; 85 struct task_struct *task;
141 86
142 if (!waiter->deadlock_lock || !rt_trace_on) 87 if (!waiter->deadlock_lock || !debug_locks)
143 return; 88 return;
144 89
145 rcu_read_lock(); 90 rcu_read_lock();
@@ -149,7 +94,10 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
149 return; 94 return;
150 } 95 }
151 96
152 TRACE_OFF_NOLOCK(); 97 if (!debug_locks_off()) {
98 rcu_read_unlock();
99 return;
100 }
153 101
154 printk("\n============================================\n"); 102 printk("\n============================================\n");
155 printk( "[ BUG: circular locking deadlock detected! ]\n"); 103 printk( "[ BUG: circular locking deadlock detected! ]\n");
@@ -180,7 +128,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
180 128
181 printk("[ turning off deadlock detection." 129 printk("[ turning off deadlock detection."
182 "Please report this trace. ]\n\n"); 130 "Please report this trace. ]\n\n");
183 local_irq_disable();
184} 131}
185 132
186void debug_rt_mutex_lock(struct rt_mutex *lock) 133void debug_rt_mutex_lock(struct rt_mutex *lock)
@@ -189,7 +136,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock)
189 136
190void debug_rt_mutex_unlock(struct rt_mutex *lock) 137void debug_rt_mutex_unlock(struct rt_mutex *lock)
191{ 138{
192 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); 139 DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
193} 140}
194 141
195void 142void
@@ -199,7 +146,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
199 146
200void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) 147void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
201{ 148{
202 TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); 149 DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
203} 150}
204 151
205void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 152void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -213,8 +160,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
213void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 160void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
214{ 161{
215 put_pid(waiter->deadlock_task_pid); 162 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 163 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 164 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 memset(waiter, 0x22, sizeof(*waiter)); 165 memset(waiter, 0x22, sizeof(*waiter));
219} 166}
220 167
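
Note: rtmutex-debug.c drops its private rt_trace_on flag and the TRACE_*() wrappers in favor of the shared debug_locks infrastructure also used by lockdep, so the first detected problem disables further lock debugging system-wide and only one report is printed. The report-once shape is roughly:

	if (!debug_locks)		/* debugging already disabled? */
		return;

	if (!debug_locks_off())		/* non-zero only for the caller that wins */
		return;

	printk("\n[ BUG: ... ]\n");	/* exactly one report, then stay quiet */
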
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 5c9ccd38096..3d9f31cd79e 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -7,7 +7,7 @@
7 * 7 *
8 */ 8 */
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/sysdev.h> 13#include <linux/sysdev.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 255e1662acd..f9d8482dd48 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -11,7 +11,7 @@
11 * See Documentation/rt-mutex-design.txt for details. 11 * See Documentation/rt-mutex-design.txt for details.
12 */ 12 */
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/timer.h> 16#include <linux/timer.h>
17 17
@@ -579,6 +579,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
579 struct rt_mutex_waiter *waiter) 579 struct rt_mutex_waiter *waiter)
580{ 580{
581 int ret = 0; 581 int ret = 0;
582 int was_disabled;
582 583
583 for (;;) { 584 for (;;) {
584 /* Try to acquire the lock: */ 585 /* Try to acquire the lock: */
@@ -601,10 +602,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
601 602
602 raw_spin_unlock(&lock->wait_lock); 603 raw_spin_unlock(&lock->wait_lock);
603 604
605 was_disabled = irqs_disabled();
606 if (was_disabled)
607 local_irq_enable();
608
604 debug_rt_mutex_print_deadlock(waiter); 609 debug_rt_mutex_print_deadlock(waiter);
605 610
606 schedule_rt_mutex(lock); 611 schedule_rt_mutex(lock);
607 612
613 if (was_disabled)
614 local_irq_disable();
615
608 raw_spin_lock(&lock->wait_lock); 616 raw_spin_lock(&lock->wait_lock);
609 set_current_state(state); 617 set_current_state(state);
610 } 618 }
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 9f48f3d82e9..b152f74f02d 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -7,7 +7,7 @@
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h> 13#include <asm/system.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index e1290ecee3c..d6b149ccf92 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
71#include <linux/ctype.h> 71#include <linux/ctype.h>
72#include <linux/ftrace.h> 72#include <linux/ftrace.h>
73#include <linux/slab.h> 73#include <linux/slab.h>
74#include <linux/init_task.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
@@ -196,10 +197,28 @@ static inline int rt_bandwidth_enabled(void)
196 return sysctl_sched_rt_runtime >= 0; 197 return sysctl_sched_rt_runtime >= 0;
197} 198}
198 199
199static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 200static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
200{ 201{
201 ktime_t now; 202 unsigned long delta;
203 ktime_t soft, hard, now;
204
205 for (;;) {
206 if (hrtimer_active(period_timer))
207 break;
208
209 now = hrtimer_cb_get_time(period_timer);
210 hrtimer_forward(period_timer, now, period);
202 211
212 soft = hrtimer_get_softexpires(period_timer);
213 hard = hrtimer_get_expires(period_timer);
214 delta = ktime_to_ns(ktime_sub(hard, soft));
215 __hrtimer_start_range_ns(period_timer, soft, delta,
216 HRTIMER_MODE_ABS_PINNED, 0);
217 }
218}
219
220static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
221{
203 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 222 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
204 return; 223 return;
205 224
@@ -207,22 +226,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
207 return; 226 return;
208 227
209 raw_spin_lock(&rt_b->rt_runtime_lock); 228 raw_spin_lock(&rt_b->rt_runtime_lock);
210 for (;;) { 229 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
211 unsigned long delta;
212 ktime_t soft, hard;
213
214 if (hrtimer_active(&rt_b->rt_period_timer))
215 break;
216
217 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
218 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
219
220 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
221 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
222 delta = ktime_to_ns(ktime_sub(hard, soft));
223 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
224 HRTIMER_MODE_ABS_PINNED, 0);
225 }
226 raw_spin_unlock(&rt_b->rt_runtime_lock); 230 raw_spin_unlock(&rt_b->rt_runtime_lock);
227} 231}
228 232
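
Note: the hrtimer forward-and-arm loop that used to be open-coded in start_rt_bandwidth() is factored into start_bandwidth_timer() so the new CFS bandwidth code added later in this patch can reuse it. With the respective lock held, both call sites reduce to:

	/* RT bandwidth, under rt_b->rt_runtime_lock: */
	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);

	/* CFS bandwidth, under cfs_b->lock in __start_cfs_bandwidth(): */
	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
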
@@ -247,6 +251,24 @@ struct cfs_rq;
247 251
248static LIST_HEAD(task_groups); 252static LIST_HEAD(task_groups);
249 253
254struct cfs_bandwidth {
255#ifdef CONFIG_CFS_BANDWIDTH
256 raw_spinlock_t lock;
257 ktime_t period;
258 u64 quota, runtime;
259 s64 hierarchal_quota;
260 u64 runtime_expires;
261
262 int idle, timer_active;
263 struct hrtimer period_timer, slack_timer;
264 struct list_head throttled_cfs_rq;
265
266 /* statistics */
267 int nr_periods, nr_throttled;
268 u64 throttled_time;
269#endif
270};
271
250/* task group related information */ 272/* task group related information */
251struct task_group { 273struct task_group {
252 struct cgroup_subsys_state css; 274 struct cgroup_subsys_state css;
@@ -278,6 +300,8 @@ struct task_group {
278#ifdef CONFIG_SCHED_AUTOGROUP 300#ifdef CONFIG_SCHED_AUTOGROUP
279 struct autogroup *autogroup; 301 struct autogroup *autogroup;
280#endif 302#endif
303
304 struct cfs_bandwidth cfs_bandwidth;
281}; 305};
282 306
283/* task_group_lock serializes the addition/removal of task groups */ 307/* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +335,7 @@ struct task_group root_task_group;
311/* CFS-related fields in a runqueue */ 335/* CFS-related fields in a runqueue */
312struct cfs_rq { 336struct cfs_rq {
313 struct load_weight load; 337 struct load_weight load;
314 unsigned long nr_running; 338 unsigned long nr_running, h_nr_running;
315 339
316 u64 exec_clock; 340 u64 exec_clock;
317 u64 min_vruntime; 341 u64 min_vruntime;
@@ -377,9 +401,120 @@ struct cfs_rq {
377 401
378 unsigned long load_contribution; 402 unsigned long load_contribution;
379#endif 403#endif
404#ifdef CONFIG_CFS_BANDWIDTH
405 int runtime_enabled;
406 u64 runtime_expires;
407 s64 runtime_remaining;
408
409 u64 throttled_timestamp;
410 int throttled, throttle_count;
411 struct list_head throttled_list;
412#endif
380#endif 413#endif
381}; 414};
382 415
416#ifdef CONFIG_FAIR_GROUP_SCHED
417#ifdef CONFIG_CFS_BANDWIDTH
418static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
419{
420 return &tg->cfs_bandwidth;
421}
422
423static inline u64 default_cfs_period(void);
424static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
425static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
426
427static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
428{
429 struct cfs_bandwidth *cfs_b =
430 container_of(timer, struct cfs_bandwidth, slack_timer);
431 do_sched_cfs_slack_timer(cfs_b);
432
433 return HRTIMER_NORESTART;
434}
435
436static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
437{
438 struct cfs_bandwidth *cfs_b =
439 container_of(timer, struct cfs_bandwidth, period_timer);
440 ktime_t now;
441 int overrun;
442 int idle = 0;
443
444 for (;;) {
445 now = hrtimer_cb_get_time(timer);
446 overrun = hrtimer_forward(timer, now, cfs_b->period);
447
448 if (!overrun)
449 break;
450
451 idle = do_sched_cfs_period_timer(cfs_b, overrun);
452 }
453
454 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
455}
456
457static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
458{
459 raw_spin_lock_init(&cfs_b->lock);
460 cfs_b->runtime = 0;
461 cfs_b->quota = RUNTIME_INF;
462 cfs_b->period = ns_to_ktime(default_cfs_period());
463
464 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
465 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
466 cfs_b->period_timer.function = sched_cfs_period_timer;
467 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
468 cfs_b->slack_timer.function = sched_cfs_slack_timer;
469}
470
471static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
472{
473 cfs_rq->runtime_enabled = 0;
474 INIT_LIST_HEAD(&cfs_rq->throttled_list);
475}
476
477/* requires cfs_b->lock, may release to reprogram timer */
478static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
479{
480 /*
481 * The timer may be active because we're trying to set a new bandwidth
482 * period or because we're racing with the tear-down path
483 * (timer_active==0 becomes visible before the hrtimer call-back
484 * terminates). In either case we ensure that it's re-programmed
485 */
486 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
487 raw_spin_unlock(&cfs_b->lock);
488 /* ensure cfs_b->lock is available while we wait */
489 hrtimer_cancel(&cfs_b->period_timer);
490
491 raw_spin_lock(&cfs_b->lock);
492 /* if someone else restarted the timer then we're done */
493 if (cfs_b->timer_active)
494 return;
495 }
496
497 cfs_b->timer_active = 1;
498 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
499}
500
501static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
502{
503 hrtimer_cancel(&cfs_b->period_timer);
504 hrtimer_cancel(&cfs_b->slack_timer);
505}
506#else
507static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
508static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
510
511static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
512{
513 return NULL;
514}
515#endif /* CONFIG_CFS_BANDWIDTH */
516#endif /* CONFIG_FAIR_GROUP_SCHED */
517
383/* Real-Time classes' related field in a runqueue: */ 518/* Real-Time classes' related field in a runqueue: */
384struct rt_rq { 519struct rt_rq {
385 struct rt_prio_array active; 520 struct rt_prio_array active;
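
Note: this block only adds the per-task-group bandwidth bookkeeping and its two hrtimers (the period timer that replenishes quota, the slack timer that hands back unused runtime); enforcement and the cgroup interface arrive in later hunks and patches of the series. Assuming the series exposes the knobs through the cpu cgroup controller as cpu.cfs_period_us and cpu.cfs_quota_us (microseconds, quota -1 meaning unlimited), capping a group at half a CPU could look like the user-space sketch below; the mount point and file names are assumptions, not shown in this diff:

#include <stdio.h>

static int cap_group(const char *group, long quota_us, long period_us)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/fs/cgroup/cpu/%s/cpu.cfs_period_us", group);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%ld\n", period_us);
	fclose(f);

	snprintf(path, sizeof(path),
		 "/sys/fs/cgroup/cpu/%s/cpu.cfs_quota_us", group);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%ld\n", quota_us);	/* e.g. 50000 against a 100000 period */
	fclose(f);
	return 0;
}
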
@@ -510,7 +645,7 @@ struct rq {
510 645
511 unsigned long cpu_power; 646 unsigned long cpu_power;
512 647
513 unsigned char idle_at_tick; 648 unsigned char idle_balance;
514 /* For active balancing */ 649 /* For active balancing */
515 int post_schedule; 650 int post_schedule;
516 int active_balance; 651 int active_balance;
@@ -520,8 +655,6 @@ struct rq {
520 int cpu; 655 int cpu;
521 int online; 656 int online;
522 657
523 unsigned long avg_load_per_task;
524
525 u64 rt_avg; 658 u64 rt_avg;
526 u64 age_stamp; 659 u64 age_stamp;
527 u64 idle_stamp; 660 u64 idle_stamp;
@@ -570,7 +703,7 @@ struct rq {
570#endif 703#endif
571 704
572#ifdef CONFIG_SMP 705#ifdef CONFIG_SMP
573 struct task_struct *wake_list; 706 struct llist_head wake_list;
574#endif 707#endif
575}; 708};
576 709
@@ -1272,6 +1405,18 @@ void wake_up_idle_cpu(int cpu)
1272 smp_send_reschedule(cpu); 1405 smp_send_reschedule(cpu);
1273} 1406}
1274 1407
1408static inline bool got_nohz_idle_kick(void)
1409{
1410 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
1411}
1412
1413#else /* CONFIG_NO_HZ */
1414
1415static inline bool got_nohz_idle_kick(void)
1416{
1417 return false;
1418}
1419
1275#endif /* CONFIG_NO_HZ */ 1420#endif /* CONFIG_NO_HZ */
1276 1421
1277static u64 sched_avg_period(void) 1422static u64 sched_avg_period(void)
@@ -1471,24 +1616,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1471 update_load_sub(&rq->load, load); 1616 update_load_sub(&rq->load, load);
1472} 1617}
1473 1618
1474#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1619#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1620 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1475typedef int (*tg_visitor)(struct task_group *, void *); 1621typedef int (*tg_visitor)(struct task_group *, void *);
1476 1622
1477/* 1623/*
1478 * Iterate the full tree, calling @down when first entering a node and @up when 1624 * Iterate task_group tree rooted at *from, calling @down when first entering a
1479 * leaving it for the final time. 1625 * node and @up when leaving it for the final time.
1626 *
1627 * Caller must hold rcu_lock or sufficient equivalent.
1480 */ 1628 */
1481static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1629static int walk_tg_tree_from(struct task_group *from,
1630 tg_visitor down, tg_visitor up, void *data)
1482{ 1631{
1483 struct task_group *parent, *child; 1632 struct task_group *parent, *child;
1484 int ret; 1633 int ret;
1485 1634
1486 rcu_read_lock(); 1635 parent = from;
1487 parent = &root_task_group; 1636
1488down: 1637down:
1489 ret = (*down)(parent, data); 1638 ret = (*down)(parent, data);
1490 if (ret) 1639 if (ret)
1491 goto out_unlock; 1640 goto out;
1492 list_for_each_entry_rcu(child, &parent->children, siblings) { 1641 list_for_each_entry_rcu(child, &parent->children, siblings) {
1493 parent = child; 1642 parent = child;
1494 goto down; 1643 goto down;
@@ -1497,19 +1646,29 @@ up:
1497 continue; 1646 continue;
1498 } 1647 }
1499 ret = (*up)(parent, data); 1648 ret = (*up)(parent, data);
1500 if (ret) 1649 if (ret || parent == from)
1501 goto out_unlock; 1650 goto out;
1502 1651
1503 child = parent; 1652 child = parent;
1504 parent = parent->parent; 1653 parent = parent->parent;
1505 if (parent) 1654 if (parent)
1506 goto up; 1655 goto up;
1507out_unlock: 1656out:
1508 rcu_read_unlock();
1509
1510 return ret; 1657 return ret;
1511} 1658}
1512 1659
1660/*
1661 * Iterate the full tree, calling @down when first entering a node and @up when
1662 * leaving it for the final time.
1663 *
1664 * Caller must hold rcu_lock or sufficient equivalent.
1665 */
1666
1667static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1668{
1669 return walk_tg_tree_from(&root_task_group, down, up, data);
1670}
1671
1513static int tg_nop(struct task_group *tg, void *data) 1672static int tg_nop(struct task_group *tg, void *data)
1514{ 1673{
1515 return 0; 1674 return 0;
@@ -1569,11 +1728,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1569 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1728 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1570 1729
1571 if (nr_running) 1730 if (nr_running)
1572 rq->avg_load_per_task = rq->load.weight / nr_running; 1731 return rq->load.weight / nr_running;
1573 else
1574 rq->avg_load_per_task = 0;
1575 1732
1576 return rq->avg_load_per_task; 1733 return 0;
1577} 1734}
1578 1735
1579#ifdef CONFIG_PREEMPT 1736#ifdef CONFIG_PREEMPT
@@ -1739,7 +1896,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1739#ifdef CONFIG_SMP 1896#ifdef CONFIG_SMP
1740 /* 1897 /*
1741 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1898 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1742 * successfuly executed on another CPU. We must ensure that updates of 1899 * successfully executed on another CPU. We must ensure that updates of
1743 * per-task data have been completed by this moment. 1900 * per-task data have been completed by this moment.
1744 */ 1901 */
1745 smp_wmb(); 1902 smp_wmb();
@@ -1806,7 +1963,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1806 rq->nr_uninterruptible--; 1963 rq->nr_uninterruptible--;
1807 1964
1808 enqueue_task(rq, p, flags); 1965 enqueue_task(rq, p, flags);
1809 inc_nr_running(rq);
1810} 1966}
1811 1967
1812/* 1968/*
@@ -1818,7 +1974,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1818 rq->nr_uninterruptible++; 1974 rq->nr_uninterruptible++;
1819 1975
1820 dequeue_task(rq, p, flags); 1976 dequeue_task(rq, p, flags);
1821 dec_nr_running(rq);
1822} 1977}
1823 1978
1824#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1979#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2390,11 +2545,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2390 2545
2391 /* Look for allowed, online CPU in same node. */ 2546 /* Look for allowed, online CPU in same node. */
2392 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2547 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2393 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2548 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
2394 return dest_cpu; 2549 return dest_cpu;
2395 2550
2396 /* Any allowed, online CPU? */ 2551 /* Any allowed, online CPU? */
2397 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2552 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
2398 if (dest_cpu < nr_cpu_ids) 2553 if (dest_cpu < nr_cpu_ids)
2399 return dest_cpu; 2554 return dest_cpu;
2400 2555
@@ -2431,7 +2586,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2431 * [ this allows ->select_task() to simply return task_cpu(p) and 2586 * [ this allows ->select_task() to simply return task_cpu(p) and
2432 * not worry about this generic constraint ] 2587 * not worry about this generic constraint ]
2433 */ 2588 */
2434 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2589 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
2435 !cpu_online(cpu))) 2590 !cpu_online(cpu)))
2436 cpu = select_fallback_rq(task_cpu(p), p); 2591 cpu = select_fallback_rq(task_cpu(p), p);
2437 2592
@@ -2556,42 +2711,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
2556} 2711}
2557 2712
2558#ifdef CONFIG_SMP 2713#ifdef CONFIG_SMP
2559static void sched_ttwu_do_pending(struct task_struct *list) 2714static void sched_ttwu_pending(void)
2560{ 2715{
2561 struct rq *rq = this_rq(); 2716 struct rq *rq = this_rq();
2717 struct llist_node *llist = llist_del_all(&rq->wake_list);
2718 struct task_struct *p;
2562 2719
2563 raw_spin_lock(&rq->lock); 2720 raw_spin_lock(&rq->lock);
2564 2721
2565 while (list) { 2722 while (llist) {
2566 struct task_struct *p = list; 2723 p = llist_entry(llist, struct task_struct, wake_entry);
2567 list = list->wake_entry; 2724 llist = llist_next(llist);
2568 ttwu_do_activate(rq, p, 0); 2725 ttwu_do_activate(rq, p, 0);
2569 } 2726 }
2570 2727
2571 raw_spin_unlock(&rq->lock); 2728 raw_spin_unlock(&rq->lock);
2572} 2729}
2573 2730
2574#ifdef CONFIG_HOTPLUG_CPU
2575
2576static void sched_ttwu_pending(void)
2577{
2578 struct rq *rq = this_rq();
2579 struct task_struct *list = xchg(&rq->wake_list, NULL);
2580
2581 if (!list)
2582 return;
2583
2584 sched_ttwu_do_pending(list);
2585}
2586
2587#endif /* CONFIG_HOTPLUG_CPU */
2588
2589void scheduler_ipi(void) 2731void scheduler_ipi(void)
2590{ 2732{
2591 struct rq *rq = this_rq(); 2733 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2592 struct task_struct *list = xchg(&rq->wake_list, NULL);
2593
2594 if (!list)
2595 return; 2734 return;
2596 2735
2597 /* 2736 /*
@@ -2608,25 +2747,21 @@ void scheduler_ipi(void)
2608 * somewhat pessimize the simple resched case. 2747 * somewhat pessimize the simple resched case.
2609 */ 2748 */
2610 irq_enter(); 2749 irq_enter();
2611 sched_ttwu_do_pending(list); 2750 sched_ttwu_pending();
2751
2752 /*
2753 * Check if someone kicked us for doing the nohz idle load balance.
2754 */
2755 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
2756 this_rq()->idle_balance = 1;
2757 raise_softirq_irqoff(SCHED_SOFTIRQ);
2758 }
2612 irq_exit(); 2759 irq_exit();
2613} 2760}
2614 2761
2615static void ttwu_queue_remote(struct task_struct *p, int cpu) 2762static void ttwu_queue_remote(struct task_struct *p, int cpu)
2616{ 2763{
2617 struct rq *rq = cpu_rq(cpu); 2764 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
2618 struct task_struct *next = rq->wake_list;
2619
2620 for (;;) {
2621 struct task_struct *old = next;
2622
2623 p->wake_entry = next;
2624 next = cmpxchg(&rq->wake_list, old, p);
2625 if (next == old)
2626 break;
2627 }
2628
2629 if (!next)
2630 smp_send_reschedule(cpu); 2765 smp_send_reschedule(cpu);
2631} 2766}
2632 2767
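
Note: the open-coded cmpxchg list used for remote wake-ups is replaced by the generic lock-less NULL-terminated single-linked list from <linux/llist.h>: producers push entries with a single atomic operation and learn whether the list was empty (so only the first enqueue sends the rescheduling IPI), while the consumer detaches the whole list at once and walks it privately. Condensed from the hunk above:

	/* producer side, ttwu_queue_remote(): */
	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
		smp_send_reschedule(cpu);	/* list was empty: kick that CPU */

	/* consumer side, sched_ttwu_pending(), with rq->lock held: */
	struct llist_node *llist = llist_del_all(&rq->wake_list);

	while (llist) {
		p = llist_entry(llist, struct task_struct, wake_entry);
		llist = llist_next(llist);
		ttwu_do_activate(rq, p, 0);
	}

The same emptiness check lets scheduler_ipi() return early when there is neither pending wake-up work nor a nohz idle-balance kick.
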
@@ -2848,19 +2983,23 @@ void sched_fork(struct task_struct *p)
2848 p->state = TASK_RUNNING; 2983 p->state = TASK_RUNNING;
2849 2984
2850 /* 2985 /*
2986 * Make sure we do not leak PI boosting priority to the child.
2987 */
2988 p->prio = current->normal_prio;
2989
2990 /*
2851 * Revert to default priority/policy on fork if requested. 2991 * Revert to default priority/policy on fork if requested.
2852 */ 2992 */
2853 if (unlikely(p->sched_reset_on_fork)) { 2993 if (unlikely(p->sched_reset_on_fork)) {
2854 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2994 if (task_has_rt_policy(p)) {
2855 p->policy = SCHED_NORMAL; 2995 p->policy = SCHED_NORMAL;
2856 p->normal_prio = p->static_prio;
2857 }
2858
2859 if (PRIO_TO_NICE(p->static_prio) < 0) {
2860 p->static_prio = NICE_TO_PRIO(0); 2996 p->static_prio = NICE_TO_PRIO(0);
2861 p->normal_prio = p->static_prio; 2997 p->rt_priority = 0;
2862 set_load_weight(p); 2998 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2863 } 2999 p->static_prio = NICE_TO_PRIO(0);
3000
3001 p->prio = p->normal_prio = __normal_prio(p);
3002 set_load_weight(p);
2864 3003
2865 /* 3004 /*
2866 * We don't need the reset flag anymore after the fork. It has 3005 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +3008,6 @@ void sched_fork(struct task_struct *p)
2869 p->sched_reset_on_fork = 0; 3008 p->sched_reset_on_fork = 0;
2870 } 3009 }
2871 3010
2872 /*
2873 * Make sure we do not leak PI boosting priority to the child.
2874 */
2875 p->prio = current->normal_prio;
2876
2877 if (!rt_prio(p->prio)) 3011 if (!rt_prio(p->prio))
2878 p->sched_class = &fair_sched_class; 3012 p->sched_class = &fair_sched_class;
2879 3013
@@ -3065,7 +3199,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3065#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3199#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3066 local_irq_disable(); 3200 local_irq_disable();
3067#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3201#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3068 perf_event_task_sched_in(current); 3202 perf_event_task_sched_in(prev, current);
3069#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3203#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3070 local_irq_enable(); 3204 local_irq_enable();
3071#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3205#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
@@ -4116,7 +4250,7 @@ void scheduler_tick(void)
4116 perf_event_task_tick(); 4250 perf_event_task_tick();
4117 4251
4118#ifdef CONFIG_SMP 4252#ifdef CONFIG_SMP
4119 rq->idle_at_tick = idle_cpu(cpu); 4253 rq->idle_balance = idle_cpu(cpu);
4120 trigger_load_balance(rq, cpu); 4254 trigger_load_balance(rq, cpu);
4121#endif 4255#endif
4122} 4256}
@@ -4213,6 +4347,7 @@ static inline void schedule_debug(struct task_struct *prev)
4213 */ 4347 */
4214 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 4348 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4215 __schedule_bug(prev); 4349 __schedule_bug(prev);
4350 rcu_sleep_check();
4216 4351
4217 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4352 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4218 4353
@@ -4239,7 +4374,7 @@ pick_next_task(struct rq *rq)
4239 * Optimization: we know that if all tasks are in 4374 * Optimization: we know that if all tasks are in
4240 * the fair class we can call that function directly: 4375 * the fair class we can call that function directly:
4241 */ 4376 */
4242 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4377 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
4243 p = fair_sched_class.pick_next_task(rq); 4378 p = fair_sched_class.pick_next_task(rq);
4244 if (likely(p)) 4379 if (likely(p))
4245 return p; 4380 return p;
@@ -4255,9 +4390,9 @@ pick_next_task(struct rq *rq)
4255} 4390}
4256 4391
4257/* 4392/*
4258 * schedule() is the main scheduler function. 4393 * __schedule() is the main scheduler function.
4259 */ 4394 */
4260asmlinkage void __sched schedule(void) 4395static void __sched __schedule(void)
4261{ 4396{
4262 struct task_struct *prev, *next; 4397 struct task_struct *prev, *next;
4263 unsigned long *switch_count; 4398 unsigned long *switch_count;
@@ -4298,16 +4433,6 @@ need_resched:
4298 if (to_wakeup) 4433 if (to_wakeup)
4299 try_to_wake_up_local(to_wakeup); 4434 try_to_wake_up_local(to_wakeup);
4300 } 4435 }
4301
4302 /*
4303 * If we are going to sleep and we have plugged IO
4304 * queued, make sure to submit it to avoid deadlocks.
4305 */
4306 if (blk_needs_flush_plug(prev)) {
4307 raw_spin_unlock(&rq->lock);
4308 blk_schedule_flush_plug(prev);
4309 raw_spin_lock(&rq->lock);
4310 }
4311 } 4436 }
4312 switch_count = &prev->nvcsw; 4437 switch_count = &prev->nvcsw;
4313 } 4438 }
@@ -4345,6 +4470,26 @@ need_resched:
4345 if (need_resched()) 4470 if (need_resched())
4346 goto need_resched; 4471 goto need_resched;
4347} 4472}
4473
4474static inline void sched_submit_work(struct task_struct *tsk)
4475{
4476 if (!tsk->state)
4477 return;
4478 /*
4479 * If we are going to sleep and we have plugged IO queued,
4480 * make sure to submit it to avoid deadlocks.
4481 */
4482 if (blk_needs_flush_plug(tsk))
4483 blk_schedule_flush_plug(tsk);
4484}
4485
4486asmlinkage void __sched schedule(void)
4487{
4488 struct task_struct *tsk = current;
4489
4490 sched_submit_work(tsk);
4491 __schedule();
4492}
4348EXPORT_SYMBOL(schedule); 4493EXPORT_SYMBOL(schedule);
4349 4494
4350#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4495#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
@@ -4411,7 +4556,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
4411 4556
4412 do { 4557 do {
4413 add_preempt_count_notrace(PREEMPT_ACTIVE); 4558 add_preempt_count_notrace(PREEMPT_ACTIVE);
4414 schedule(); 4559 __schedule();
4415 sub_preempt_count_notrace(PREEMPT_ACTIVE); 4560 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4416 4561
4417 /* 4562 /*
@@ -4439,7 +4584,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
4439 do { 4584 do {
4440 add_preempt_count(PREEMPT_ACTIVE); 4585 add_preempt_count(PREEMPT_ACTIVE);
4441 local_irq_enable(); 4586 local_irq_enable();
4442 schedule(); 4587 __schedule();
4443 local_irq_disable(); 4588 local_irq_disable();
4444 sub_preempt_count(PREEMPT_ACTIVE); 4589 sub_preempt_count(PREEMPT_ACTIVE);
4445 4590
@@ -4666,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion);
4666 * This waits for either a completion of a specific task to be signaled or for a 4811 * This waits for either a completion of a specific task to be signaled or for a
4667 * specified timeout to expire. The timeout is in jiffies. It is not 4812 * specified timeout to expire. The timeout is in jiffies. It is not
4668 * interruptible. 4813 * interruptible.
4814 *
4815 * The return value is 0 if timed out, and positive (at least 1, or number of
4816 * jiffies left till timeout) if completed.
4669 */ 4817 */
4670unsigned long __sched 4818unsigned long __sched
4671wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4819wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4680,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
4680 * 4828 *
4681 * This waits for completion of a specific task to be signaled. It is 4829 * This waits for completion of a specific task to be signaled. It is
4682 * interruptible. 4830 * interruptible.
4831 *
4832 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4683 */ 4833 */
4684int __sched wait_for_completion_interruptible(struct completion *x) 4834int __sched wait_for_completion_interruptible(struct completion *x)
4685{ 4835{
@@ -4697,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4697 * 4847 *
4698 * This waits for either a completion of a specific task to be signaled or for a 4848 * This waits for either a completion of a specific task to be signaled or for a
4699 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4849 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4850 *
4851 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
4852 * positive (at least 1, or number of jiffies left till timeout) if completed.
4700 */ 4853 */
4701long __sched 4854long __sched
4702wait_for_completion_interruptible_timeout(struct completion *x, 4855wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4712,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4712 * 4865 *
4713 * This waits to be signaled for completion of a specific task. It can be 4866 * This waits to be signaled for completion of a specific task. It can be
4714 * interrupted by a kill signal. 4867 * interrupted by a kill signal.
4868 *
4869 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4715 */ 4870 */
4716int __sched wait_for_completion_killable(struct completion *x) 4871int __sched wait_for_completion_killable(struct completion *x)
4717{ 4872{
@@ -4730,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4730 * This waits for either a completion of a specific task to be 4885 * This waits for either a completion of a specific task to be
4731 * signaled or for a specified timeout to expire. It can be 4886 * signaled or for a specified timeout to expire. It can be
4732 * interrupted by a kill signal. The timeout is in jiffies. 4887 * interrupted by a kill signal. The timeout is in jiffies.
4888 *
4889 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
4890 * positive (at least 1, or number of jiffies left till timeout) if completed.
4733 */ 4891 */
4734long __sched 4892long __sched
4735wait_for_completion_killable_timeout(struct completion *x, 4893wait_for_completion_killable_timeout(struct completion *x,
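
Note: the kernel-doc additions above spell out the return conventions for the completion waiters. A typical caller of the combined variant therefore looks roughly like this; my_dev->done is an illustrative completion:

	long ret;

	ret = wait_for_completion_interruptible_timeout(&my_dev->done,
							msecs_to_jiffies(100));
	if (ret == 0)
		return -ETIMEDOUT;	/* timed out */
	if (ret < 0)
		return ret;		/* -ERESTARTSYS: interrupted by a signal */
	/* ret > 0: completed, with 'ret' jiffies left before the timeout */
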
@@ -5015,7 +5173,20 @@ EXPORT_SYMBOL(task_nice);
5015 */ 5173 */
5016int idle_cpu(int cpu) 5174int idle_cpu(int cpu)
5017{ 5175{
5018 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 5176 struct rq *rq = cpu_rq(cpu);
5177
5178 if (rq->curr != rq->idle)
5179 return 0;
5180
5181 if (rq->nr_running)
5182 return 0;
5183
5184#ifdef CONFIG_SMP
5185 if (!llist_empty(&rq->wake_list))
5186 return 0;
5187#endif
5188
5189 return 1;
5019} 5190}
5020 5191
5021/** 5192/**
@@ -5564,7 +5735,7 @@ static inline int should_resched(void)
5564static void __cond_resched(void) 5735static void __cond_resched(void)
5565{ 5736{
5566 add_preempt_count(PREEMPT_ACTIVE); 5737 add_preempt_count(PREEMPT_ACTIVE);
5567 schedule(); 5738 __schedule();
5568 sub_preempt_count(PREEMPT_ACTIVE); 5739 sub_preempt_count(PREEMPT_ACTIVE);
5569} 5740}
5570 5741
@@ -5865,7 +6036,7 @@ void show_state_filter(unsigned long state_filter)
5865 printk(KERN_INFO 6036 printk(KERN_INFO
5866 " task PC stack pid father\n"); 6037 " task PC stack pid father\n");
5867#endif 6038#endif
5868 read_lock(&tasklist_lock); 6039 rcu_read_lock();
5869 do_each_thread(g, p) { 6040 do_each_thread(g, p) {
5870 /* 6041 /*
5871 * reset the NMI-timeout, listing all files on a slow 6042 * reset the NMI-timeout, listing all files on a slow
@@ -5881,7 +6052,7 @@ void show_state_filter(unsigned long state_filter)
5881#ifdef CONFIG_SCHED_DEBUG 6052#ifdef CONFIG_SCHED_DEBUG
5882 sysrq_sched_debug_show(); 6053 sysrq_sched_debug_show();
5883#endif 6054#endif
5884 read_unlock(&tasklist_lock); 6055 rcu_read_unlock();
5885 /* 6056 /*
5886 * Only show locks if all tasks are dumped: 6057 * Only show locks if all tasks are dumped:
5887 */ 6058 */
@@ -5942,18 +6113,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5942 */ 6113 */
5943 idle->sched_class = &idle_sched_class; 6114 idle->sched_class = &idle_sched_class;
5944 ftrace_graph_init_idle_task(idle, cpu); 6115 ftrace_graph_init_idle_task(idle, cpu);
6116#if defined(CONFIG_SMP)
6117 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
6118#endif
5945} 6119}
5946 6120
5947/* 6121/*
5948 * In a system that switches off the HZ timer nohz_cpu_mask
5949 * indicates which cpus entered this state. This is used
5950 * in the rcu update to wait only for active cpus. For system
5951 * which do not switch off the HZ timer nohz_cpu_mask should
5952 * always be CPU_BITS_NONE.
5953 */
5954cpumask_var_t nohz_cpu_mask;
5955
5956/*
5957 * Increase the granularity value when there are more CPUs, 6122 * Increase the granularity value when there are more CPUs,
5958 * because with more CPUs the 'effective latency' as visible 6123 * because with more CPUs the 'effective latency' as visible
5959 * to users decreases. But the relationship is not linear, 6124 * to users decreases. But the relationship is not linear,
@@ -6005,10 +6170,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6005{ 6170{
6006 if (p->sched_class && p->sched_class->set_cpus_allowed) 6171 if (p->sched_class && p->sched_class->set_cpus_allowed)
6007 p->sched_class->set_cpus_allowed(p, new_mask); 6172 p->sched_class->set_cpus_allowed(p, new_mask);
6008 else { 6173
6009 cpumask_copy(&p->cpus_allowed, new_mask); 6174 cpumask_copy(&p->cpus_allowed, new_mask);
6010 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 6175 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6011 }
6012} 6176}
6013 6177
6014/* 6178/*
@@ -6106,7 +6270,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6106 if (task_cpu(p) != src_cpu) 6270 if (task_cpu(p) != src_cpu)
6107 goto done; 6271 goto done;
6108 /* Affinity changed (again). */ 6272 /* Affinity changed (again). */
6109 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 6273 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
6110 goto fail; 6274 goto fail;
6111 6275
6112 /* 6276 /*
@@ -6187,6 +6351,30 @@ static void calc_global_load_remove(struct rq *rq)
6187 rq->calc_load_active = 0; 6351 rq->calc_load_active = 0;
6188} 6352}
6189 6353
6354#ifdef CONFIG_CFS_BANDWIDTH
6355static void unthrottle_offline_cfs_rqs(struct rq *rq)
6356{
6357 struct cfs_rq *cfs_rq;
6358
6359 for_each_leaf_cfs_rq(rq, cfs_rq) {
6360 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6361
6362 if (!cfs_rq->runtime_enabled)
6363 continue;
6364
6365 /*
6366 * clock_task is not advancing so we just need to make sure
6367 * there's some valid quota amount
6368 */
6369 cfs_rq->runtime_remaining = cfs_b->quota;
6370 if (cfs_rq_throttled(cfs_rq))
6371 unthrottle_cfs_rq(cfs_rq);
6372 }
6373}
6374#else
6375static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6376#endif
6377
6190/* 6378/*
6191 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6379 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6192 * try_to_wake_up()->select_task_rq(). 6380 * try_to_wake_up()->select_task_rq().
@@ -6212,6 +6400,9 @@ static void migrate_tasks(unsigned int dead_cpu)
6212 */ 6400 */
6213 rq->stop = NULL; 6401 rq->stop = NULL;
6214 6402
6403 /* Ensure any throttled groups are reachable by pick_next_task */
6404 unthrottle_offline_cfs_rqs(rq);
6405
6215 for ( ; ; ) { 6406 for ( ; ; ) {
6216 /* 6407 /*
6217 * There's this thread running, bail when that's the only 6408 * There's this thread running, bail when that's the only
@@ -6913,8 +7104,6 @@ static int __init isolated_cpu_setup(char *str)
6913 7104
6914__setup("isolcpus=", isolated_cpu_setup); 7105__setup("isolcpus=", isolated_cpu_setup);
6915 7106
6916#define SD_NODES_PER_DOMAIN 16
6917
6918#ifdef CONFIG_NUMA 7107#ifdef CONFIG_NUMA
6919 7108
6920/** 7109/**
@@ -7419,6 +7608,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
7419 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 7608 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7420 if (sd && (sd->flags & SD_OVERLAP)) 7609 if (sd && (sd->flags & SD_OVERLAP))
7421 free_sched_groups(sd->groups, 0); 7610 free_sched_groups(sd->groups, 0);
7611 kfree(*per_cpu_ptr(sdd->sd, j));
7422 kfree(*per_cpu_ptr(sdd->sg, j)); 7612 kfree(*per_cpu_ptr(sdd->sg, j));
7423 kfree(*per_cpu_ptr(sdd->sgp, j)); 7613 kfree(*per_cpu_ptr(sdd->sgp, j));
7424 } 7614 }
@@ -7954,6 +8144,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7954 /* allow initial update_cfs_load() to truncate */ 8144 /* allow initial update_cfs_load() to truncate */
7955 cfs_rq->load_stamp = 1; 8145 cfs_rq->load_stamp = 1;
7956#endif 8146#endif
8147 init_cfs_rq_runtime(cfs_rq);
7957 8148
7958 tg->cfs_rq[cpu] = cfs_rq; 8149 tg->cfs_rq[cpu] = cfs_rq;
7959 tg->se[cpu] = se; 8150 tg->se[cpu] = se;
@@ -8093,6 +8284,7 @@ void __init sched_init(void)
8093 * We achieve this by letting root_task_group's tasks sit 8284 * We achieve this by letting root_task_group's tasks sit
8094 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 8285 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8095 */ 8286 */
8287 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8096 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 8288 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8097#endif /* CONFIG_FAIR_GROUP_SCHED */ 8289#endif /* CONFIG_FAIR_GROUP_SCHED */
8098 8290
@@ -8122,7 +8314,6 @@ void __init sched_init(void)
8122 rq_attach_root(rq, &def_root_domain); 8314 rq_attach_root(rq, &def_root_domain);
8123#ifdef CONFIG_NO_HZ 8315#ifdef CONFIG_NO_HZ
8124 rq->nohz_balance_kick = 0; 8316 rq->nohz_balance_kick = 0;
8125 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8126#endif 8317#endif
8127#endif 8318#endif
8128 init_rq_hrtick(rq); 8319 init_rq_hrtick(rq);
@@ -8164,8 +8355,6 @@ void __init sched_init(void)
8164 */ 8355 */
8165 current->sched_class = &fair_sched_class; 8356 current->sched_class = &fair_sched_class;
8166 8357
8167 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8168 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8169#ifdef CONFIG_SMP 8358#ifdef CONFIG_SMP
8170 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 8359 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8171#ifdef CONFIG_NO_HZ 8360#ifdef CONFIG_NO_HZ
@@ -8195,6 +8384,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
8195{ 8384{
8196 static unsigned long prev_jiffy; /* ratelimiting */ 8385 static unsigned long prev_jiffy; /* ratelimiting */
8197 8386
8387 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
8198 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8388 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8199 system_state != SYSTEM_RUNNING || oops_in_progress) 8389 system_state != SYSTEM_RUNNING || oops_in_progress)
8200 return; 8390 return;
@@ -8334,6 +8524,8 @@ static void free_fair_sched_group(struct task_group *tg)
8334{ 8524{
8335 int i; 8525 int i;
8336 8526
8527 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8528
8337 for_each_possible_cpu(i) { 8529 for_each_possible_cpu(i) {
8338 if (tg->cfs_rq) 8530 if (tg->cfs_rq)
8339 kfree(tg->cfs_rq[i]); 8531 kfree(tg->cfs_rq[i]);
@@ -8361,6 +8553,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8361 8553
8362 tg->shares = NICE_0_LOAD; 8554 tg->shares = NICE_0_LOAD;
8363 8555
8556 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8557
8364 for_each_possible_cpu(i) { 8558 for_each_possible_cpu(i) {
8365 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8559 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8366 GFP_KERNEL, cpu_to_node(i)); 8560 GFP_KERNEL, cpu_to_node(i));
@@ -8636,12 +8830,7 @@ unsigned long sched_group_shares(struct task_group *tg)
8636} 8830}
8637#endif 8831#endif
8638 8832
8639#ifdef CONFIG_RT_GROUP_SCHED 8833#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8640/*
8641 * Ensure that the real time constraints are schedulable.
8642 */
8643static DEFINE_MUTEX(rt_constraints_mutex);
8644
8645static unsigned long to_ratio(u64 period, u64 runtime) 8834static unsigned long to_ratio(u64 period, u64 runtime)
8646{ 8835{
8647 if (runtime == RUNTIME_INF) 8836 if (runtime == RUNTIME_INF)
@@ -8649,6 +8838,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8649 8838
8650 return div64_u64(runtime << 20, period); 8839 return div64_u64(runtime << 20, period);
8651} 8840}
8841#endif
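to_ratio() is now shared by the RT and CFS feasibility checks; it expresses runtime/period as a fixed-point fraction with 20 fractional bits. A small userspace check of that arithmetic (plain division stands in for div64_u64, values chosen only for illustration):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t period  = 1000000;	/* 1s, expressed in microseconds */
	uint64_t runtime =  950000;	/* 950ms of allowed runtime      */

	/* same math as to_ratio(): the ratio scaled by 2^20 */
	uint64_t ratio = (runtime << 20) / period;

	printf("ratio = %llu (~%.3f of one CPU)\n",
	       (unsigned long long)ratio, ratio / (double)(1 << 20));
	return 0;
}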
8842
8843#ifdef CONFIG_RT_GROUP_SCHED
8844/*
8845 * Ensure that the real time constraints are schedulable.
8846 */
8847static DEFINE_MUTEX(rt_constraints_mutex);
8652 8848
8653/* Must be called with tasklist_lock held */ 8849/* Must be called with tasklist_lock held */
8654static inline int tg_has_rt_tasks(struct task_group *tg) 8850static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8669,7 +8865,7 @@ struct rt_schedulable_data {
8669 u64 rt_runtime; 8865 u64 rt_runtime;
8670}; 8866};
8671 8867
8672static int tg_schedulable(struct task_group *tg, void *data) 8868static int tg_rt_schedulable(struct task_group *tg, void *data)
8673{ 8869{
8674 struct rt_schedulable_data *d = data; 8870 struct rt_schedulable_data *d = data;
8675 struct task_group *child; 8871 struct task_group *child;
@@ -8727,16 +8923,22 @@ static int tg_schedulable(struct task_group *tg, void *data)
8727 8923
8728static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8924static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8729{ 8925{
8926 int ret;
8927
8730 struct rt_schedulable_data data = { 8928 struct rt_schedulable_data data = {
8731 .tg = tg, 8929 .tg = tg,
8732 .rt_period = period, 8930 .rt_period = period,
8733 .rt_runtime = runtime, 8931 .rt_runtime = runtime,
8734 }; 8932 };
8735 8933
8736 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8934 rcu_read_lock();
8935 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8936 rcu_read_unlock();
8937
8938 return ret;
8737} 8939}
8738 8940
8739static int tg_set_bandwidth(struct task_group *tg, 8941static int tg_set_rt_bandwidth(struct task_group *tg,
8740 u64 rt_period, u64 rt_runtime) 8942 u64 rt_period, u64 rt_runtime)
8741{ 8943{
8742 int i, err = 0; 8944 int i, err = 0;
@@ -8775,7 +8977,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8775 if (rt_runtime_us < 0) 8977 if (rt_runtime_us < 0)
8776 rt_runtime = RUNTIME_INF; 8978 rt_runtime = RUNTIME_INF;
8777 8979
8778 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8980 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8779} 8981}
8780 8982
8781long sched_group_rt_runtime(struct task_group *tg) 8983long sched_group_rt_runtime(struct task_group *tg)
@@ -8800,7 +9002,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8800 if (rt_period == 0) 9002 if (rt_period == 0)
8801 return -EINVAL; 9003 return -EINVAL;
8802 9004
8803 return tg_set_bandwidth(tg, rt_period, rt_runtime); 9005 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8804} 9006}
8805 9007
8806long sched_group_rt_period(struct task_group *tg) 9008long sched_group_rt_period(struct task_group *tg)
@@ -8990,6 +9192,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8990 9192
8991 return (u64) scale_load_down(tg->shares); 9193 return (u64) scale_load_down(tg->shares);
8992} 9194}
9195
9196#ifdef CONFIG_CFS_BANDWIDTH
9197static DEFINE_MUTEX(cfs_constraints_mutex);
9198
9199const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
9200const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9201
9202static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9203
9204static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9205{
9206 int i, ret = 0, runtime_enabled;
9207 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9208
9209 if (tg == &root_task_group)
9210 return -EINVAL;
9211
9212 /*
 9213 * Ensure we have some amount of bandwidth every period. This is
9214 * to prevent reaching a state of large arrears when throttled via
9215 * entity_tick() resulting in prolonged exit starvation.
9216 */
9217 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9218 return -EINVAL;
9219
9220 /*
 9221 * Likewise, bound things on the other side by preventing insane quota
9222 * periods. This also allows us to normalize in computing quota
9223 * feasibility.
9224 */
9225 if (period > max_cfs_quota_period)
9226 return -EINVAL;
9227
9228 mutex_lock(&cfs_constraints_mutex);
9229 ret = __cfs_schedulable(tg, period, quota);
9230 if (ret)
9231 goto out_unlock;
9232
9233 runtime_enabled = quota != RUNTIME_INF;
9234 raw_spin_lock_irq(&cfs_b->lock);
9235 cfs_b->period = ns_to_ktime(period);
9236 cfs_b->quota = quota;
9237
9238 __refill_cfs_bandwidth_runtime(cfs_b);
9239 /* restart the period timer (if active) to handle new period expiry */
9240 if (runtime_enabled && cfs_b->timer_active) {
9241 /* force a reprogram */
9242 cfs_b->timer_active = 0;
9243 __start_cfs_bandwidth(cfs_b);
9244 }
9245 raw_spin_unlock_irq(&cfs_b->lock);
9246
9247 for_each_possible_cpu(i) {
9248 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9249 struct rq *rq = rq_of(cfs_rq);
9250
9251 raw_spin_lock_irq(&rq->lock);
9252 cfs_rq->runtime_enabled = runtime_enabled;
9253 cfs_rq->runtime_remaining = 0;
9254
9255 if (cfs_rq_throttled(cfs_rq))
9256 unthrottle_cfs_rq(cfs_rq);
9257 raw_spin_unlock_irq(&rq->lock);
9258 }
9259out_unlock:
9260 mutex_unlock(&cfs_constraints_mutex);
9261
9262 return ret;
9263}
9264
9265int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9266{
9267 u64 quota, period;
9268
9269 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9270 if (cfs_quota_us < 0)
9271 quota = RUNTIME_INF;
9272 else
9273 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9274
9275 return tg_set_cfs_bandwidth(tg, period, quota);
9276}
9277
9278long tg_get_cfs_quota(struct task_group *tg)
9279{
9280 u64 quota_us;
9281
9282 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
9283 return -1;
9284
9285 quota_us = tg_cfs_bandwidth(tg)->quota;
9286 do_div(quota_us, NSEC_PER_USEC);
9287
9288 return quota_us;
9289}
9290
9291int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9292{
9293 u64 quota, period;
9294
9295 period = (u64)cfs_period_us * NSEC_PER_USEC;
9296 quota = tg_cfs_bandwidth(tg)->quota;
9297
9298 if (period <= 0)
9299 return -EINVAL;
9300
9301 return tg_set_cfs_bandwidth(tg, period, quota);
9302}
9303
9304long tg_get_cfs_period(struct task_group *tg)
9305{
9306 u64 cfs_period_us;
9307
9308 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9309 do_div(cfs_period_us, NSEC_PER_USEC);
9310
9311 return cfs_period_us;
9312}
9313
9314static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
9315{
9316 return tg_get_cfs_quota(cgroup_tg(cgrp));
9317}
9318
9319static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
9320 s64 cfs_quota_us)
9321{
9322 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
9323}
9324
9325static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
9326{
9327 return tg_get_cfs_period(cgroup_tg(cgrp));
9328}
9329
9330static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9331 u64 cfs_period_us)
9332{
9333 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
9334}
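tg_set_cfs_quota() and tg_set_cfs_period() take microseconds from the cgroup interface and convert them to nanoseconds before tg_set_cfs_bandwidth() applies the 1ms..1s bounds above; a negative quota means RUNTIME_INF (no limit). A userspace sketch of the same validation, with the bounds copied from this patch and example values chosen only for illustration:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC		1000ULL
#define MIN_CFS_QUOTA_PERIOD	1000000ULL	/* 1ms in ns */
#define MAX_CFS_QUOTA_PERIOD	1000000000ULL	/* 1s in ns  */

/* returns 0 if (quota_us, period_us) would be accepted, -1 otherwise */
static int check_cfs_limits(long quota_us, long period_us)
{
	uint64_t period = (uint64_t)period_us * NSEC_PER_USEC;

	/* period bounds always apply */
	if (period < MIN_CFS_QUOTA_PERIOD || period > MAX_CFS_QUOTA_PERIOD)
		return -1;

	if (quota_us < 0)	/* RUNTIME_INF: no lower bound on quota */
		return 0;

	if ((uint64_t)quota_us * NSEC_PER_USEC < MIN_CFS_QUOTA_PERIOD)
		return -1;

	return 0;
}

int main(void)
{
	printf("%d\n", check_cfs_limits(50000, 100000));	/* accepted        */
	printf("%d\n", check_cfs_limits(500, 100000));		/* quota below 1ms */
	return 0;
}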
9335
9336struct cfs_schedulable_data {
9337 struct task_group *tg;
9338 u64 period, quota;
9339};
9340
9341/*
9342 * normalize group quota/period to be quota/max_period
9343 * note: units are usecs
9344 */
9345static u64 normalize_cfs_quota(struct task_group *tg,
9346 struct cfs_schedulable_data *d)
9347{
9348 u64 quota, period;
9349
9350 if (tg == d->tg) {
9351 period = d->period;
9352 quota = d->quota;
9353 } else {
9354 period = tg_get_cfs_period(tg);
9355 quota = tg_get_cfs_quota(tg);
9356 }
9357
9358 /* note: these should typically be equivalent */
9359 if (quota == RUNTIME_INF || quota == -1)
9360 return RUNTIME_INF;
9361
9362 return to_ratio(period, quota);
9363}
9364
9365static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9366{
9367 struct cfs_schedulable_data *d = data;
9368 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9369 s64 quota = 0, parent_quota = -1;
9370
9371 if (!tg->parent) {
9372 quota = RUNTIME_INF;
9373 } else {
9374 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
9375
9376 quota = normalize_cfs_quota(tg, d);
9377 parent_quota = parent_b->hierarchal_quota;
9378
9379 /*
9380 * ensure max(child_quota) <= parent_quota, inherit when no
9381 * limit is set
9382 */
9383 if (quota == RUNTIME_INF)
9384 quota = parent_quota;
9385 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9386 return -EINVAL;
9387 }
9388 cfs_b->hierarchal_quota = quota;
9389
9390 return 0;
9391}
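tg_cfs_schedulable_down() walks the group tree from the top and rejects any child whose normalized quota exceeds its parent's, while an unlimited child simply inherits the parent's value. Worked numbers (assumed for illustration, in the microsecond units used by __cfs_schedulable()): a parent limited to 50ms per 100ms period has to_ratio(100000, 50000) = 524288 (0.5 << 20); a child asking for 75ms per 100ms normalizes to 786432, exceeds the parent, and the write fails with -EINVAL, whereas 40ms per 100ms (419430) would be accepted.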
9392
9393static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9394{
9395 int ret;
9396 struct cfs_schedulable_data data = {
9397 .tg = tg,
9398 .period = period,
9399 .quota = quota,
9400 };
9401
9402 if (quota != RUNTIME_INF) {
9403 do_div(data.period, NSEC_PER_USEC);
9404 do_div(data.quota, NSEC_PER_USEC);
9405 }
9406
9407 rcu_read_lock();
9408 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9409 rcu_read_unlock();
9410
9411 return ret;
9412}
9413
9414static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9415 struct cgroup_map_cb *cb)
9416{
9417 struct task_group *tg = cgroup_tg(cgrp);
9418 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9419
9420 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9421 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
9422 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
9423
9424 return 0;
9425}
9426#endif /* CONFIG_CFS_BANDWIDTH */
8993#endif /* CONFIG_FAIR_GROUP_SCHED */ 9427#endif /* CONFIG_FAIR_GROUP_SCHED */
8994 9428
8995#ifdef CONFIG_RT_GROUP_SCHED 9429#ifdef CONFIG_RT_GROUP_SCHED
@@ -9024,6 +9458,22 @@ static struct cftype cpu_files[] = {
9024 .write_u64 = cpu_shares_write_u64, 9458 .write_u64 = cpu_shares_write_u64,
9025 }, 9459 },
9026#endif 9460#endif
9461#ifdef CONFIG_CFS_BANDWIDTH
9462 {
9463 .name = "cfs_quota_us",
9464 .read_s64 = cpu_cfs_quota_read_s64,
9465 .write_s64 = cpu_cfs_quota_write_s64,
9466 },
9467 {
9468 .name = "cfs_period_us",
9469 .read_u64 = cpu_cfs_period_read_u64,
9470 .write_u64 = cpu_cfs_period_write_u64,
9471 },
9472 {
9473 .name = "stat",
9474 .read_map = cpu_stats_show,
9475 },
9476#endif
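These cftype entries expose the limits to userspace as cpu.cfs_quota_us, cpu.cfs_period_us and cpu.stat inside each cpu-controller cgroup directory. A minimal userspace sketch that caps a group to half a CPU and reads back the throttling statistics; the mount point /sys/fs/cgroup/cpu and the group name "mygroup" are assumptions, not part of this patch:

#include <stdio.h>

#define GRP "/sys/fs/cgroup/cpu/mygroup/"	/* assumed mount point and group */

static int write_val(const char *file, const char *val)
{
	FILE *f = fopen(file, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	char line[128];
	FILE *f;

	/* 50ms of runtime every 100ms period => at most half a CPU */
	write_val(GRP "cpu.cfs_period_us", "100000");
	write_val(GRP "cpu.cfs_quota_us", "50000");

	/* nr_periods / nr_throttled / throttled_time, as filled in by cpu_stats_show() */
	f = fopen(GRP "cpu.stat", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}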
9027#ifdef CONFIG_RT_GROUP_SCHED 9477#ifdef CONFIG_RT_GROUP_SCHED
9028 { 9478 {
9029 .name = "rt_runtime_us", 9479 .name = "rt_runtime_us",
@@ -9333,4 +9783,3 @@ struct cgroup_subsys cpuacct_subsys = {
9333 .subsys_id = cpuacct_subsys_id, 9783 .subsys_id = cpuacct_subsys_id,
9334}; 9784};
9335#endif /* CONFIG_CGROUP_CPUACCT */ 9785#endif /* CONFIG_CGROUP_CPUACCT */
9336
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 9d8af0b3fb6..c685e31492d 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -62,7 +62,7 @@
62 */ 62 */
63#include <linux/spinlock.h> 63#include <linux/spinlock.h>
64#include <linux/hardirq.h> 64#include <linux/hardirq.h>
65#include <linux/module.h> 65#include <linux/export.h>
66#include <linux/percpu.h> 66#include <linux/percpu.h>
67#include <linux/ktime.h> 67#include <linux/ktime.h>
68#include <linux/sched.h> 68#include <linux/sched.h>
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 2722dc1b413..a86cf9d9eb1 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
47 return cpupri; 47 return cpupri;
48} 48}
49 49
50#define for_each_cpupri_active(array, idx) \
51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
52
53/** 50/**
54 * cpupri_find - find the best (lowest-pri) CPU in the system 51 * cpupri_find - find the best (lowest-pri) CPU in the system
55 * @cp: The cpupri context 52 * @cp: The cpupri context
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
71 int idx = 0; 68 int idx = 0;
72 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
73 70
74 for_each_cpupri_active(cp->pri_active, idx) { 71 if (task_pri >= MAX_RT_PRIO)
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 72 return 0;
76 73
77 if (idx >= task_pri) 74 for (idx = 0; idx < task_pri; idx++) {
78 break; 75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
 84 * Note: This is still all racy, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
79 103
80 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
81 continue; 105 continue;
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
115{ 139{
116 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
117 int oldpri = *currpri; 141 int oldpri = *currpri;
118 unsigned long flags; 142 int do_mb = 0;
119 143
120 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
121 145
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
128 * If the cpu was currently mapped to a different value, we 152 * If the cpu was currently mapped to a different value, we
129 * need to map it to the new value then remove the old value. 153 * need to map it to the new value then remove the old value.
130 * Note, we must add the new value first, otherwise we risk the 154 * Note, we must add the new value first, otherwise we risk the
131 * cpu being cleared from pri_active, and this cpu could be 155 * cpu being missed by the priority loop in cpupri_find.
132 * missed for a push or pull.
133 */ 156 */
134 if (likely(newpri != CPUPRI_INVALID)) { 157 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136 159
137 raw_spin_lock_irqsave(&vec->lock, flags);
138
139 cpumask_set_cpu(cpu, vec->mask); 160 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 161 /*
141 if (vec->count == 1) 162 * When adding a new vector, we update the mask first,
142 set_bit(newpri, cp->pri_active); 163 * do a write memory barrier, and then update the count, to
143 164 * make sure the vector is visible when count is set.
144 raw_spin_unlock_irqrestore(&vec->lock, flags); 165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
145 } 169 }
146 if (likely(oldpri != CPUPRI_INVALID)) { 170 if (likely(oldpri != CPUPRI_INVALID)) {
147 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
148 172
149 raw_spin_lock_irqsave(&vec->lock, flags); 173 /*
150 174 * Because the order of modification of the vec->count
151 vec->count--; 175 * is important, we must make sure that the update
152 if (!vec->count) 176 * of the new prio is seen before we decrement the
153 clear_bit(oldpri, cp->pri_active); 177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189 * When removing from the vector, we decrement the counter first
190 * do a memory barrier and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
154 cpumask_clear_cpu(cpu, vec->mask); 194 cpumask_clear_cpu(cpu, vec->mask);
155
156 raw_spin_unlock_irqrestore(&vec->lock, flags);
157 } 195 }
158 196
159 *currpri = newpri; 197 *currpri = newpri;
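The lockless scheme above relies on ordering alone: adders publish the mask bit before incrementing the count, removers decrement the count before clearing the bit, and cpupri_find() reads the count, issues a read barrier, then reads the mask. A non-zero count therefore guarantees the corresponding mask update is visible; a transiently zero count only makes the search skip that vector, which the RT pull path tolerates. A userspace C11 analogue of that ordering, with one atomic flag standing in for a single bit of vec->mask (illustration only, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int count;	/* how many CPUs sit at this priority */
static atomic_bool mask_bit;	/* stand-in for one bit of vec->mask  */

static void vec_add(void)
{
	atomic_store(&mask_bit, true);			/* mask first  */
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_mb()  */
	atomic_fetch_add(&count, 1);			/* then count  */
}

static void vec_remove(void)
{
	atomic_fetch_sub(&count, 1);			/* count first */
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_mb()  */
	atomic_store(&mask_bit, false);			/* then mask   */
}

static bool vec_lookup(void)
{
	if (!atomic_load(&count))	/* zero count: skip, tolerated race */
		return false;
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_rmb() */
	return atomic_load(&mask_bit);
}

int main(void)
{
	vec_add();
	printf("after add:    visible=%d\n", vec_lookup());
	vec_remove();
	printf("after remove: visible=%d\n", vec_lookup());
	return 0;
}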
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp)
175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 213 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
176 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 214 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
177 215
178 raw_spin_lock_init(&vec->lock); 216 atomic_set(&vec->count, 0);
179 vec->count = 0;
180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) 217 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
181 goto cleanup; 218 goto cleanup;
182 } 219 }
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9fc7d386fea..f6d75617349 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -4,7 +4,6 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5 5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8 7
9#define CPUPRI_INVALID -1 8#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 9#define CPUPRI_IDLE 0
@@ -12,14 +11,12 @@
12/* values 2-101 are RT priorities 0-99 */ 11/* values 2-101 are RT priorities 0-99 */
13 12
14struct cpupri_vec { 13struct cpupri_vec {
15 raw_spinlock_t lock; 14 atomic_t count;
16 int count; 15 cpumask_var_t mask;
17 cpumask_var_t mask;
18}; 16};
19 17
20struct cpupri { 18struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS]; 20 int cpu_to_pri[NR_CPUS];
24}; 21};
25 22
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee999381..a78ed2736ba 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
89 */ 89 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; 90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91 91
92#ifdef CONFIG_CFS_BANDWIDTH
93/*
94 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
95 * each time a cfs_rq requests quota.
96 *
97 * Note: in the case that the slice exceeds the runtime remaining (either due
98 * to consumption or the quota being specified to be smaller than the slice)
 99 * we only issue the remaining available time.
100 *
101 * default: 5 msec, units: microseconds
102 */
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif
105
92static const struct sched_class fair_sched_class; 106static const struct sched_class fair_sched_class;
93 107
94/************************************************************** 108/**************************************************************
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
292 306
293#endif /* CONFIG_FAIR_GROUP_SCHED */ 307#endif /* CONFIG_FAIR_GROUP_SCHED */
294 308
309static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
310 unsigned long delta_exec);
295 311
296/************************************************************** 312/**************************************************************
297 * Scheduling class tree data structure manipulation methods: 313 * Scheduling class tree data structure manipulation methods:
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
583 cpuacct_charge(curtask, delta_exec); 599 cpuacct_charge(curtask, delta_exec);
584 account_group_exec_runtime(curtask, delta_exec); 600 account_group_exec_runtime(curtask, delta_exec);
585 } 601 }
602
603 account_cfs_rq_runtime(cfs_rq, delta_exec);
586} 604}
587 605
588static inline void 606static inline void
@@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
688} 706}
689 707
690#ifdef CONFIG_FAIR_GROUP_SCHED 708#ifdef CONFIG_FAIR_GROUP_SCHED
709/* we need this in update_cfs_load and load-balance functions below */
710static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
691# ifdef CONFIG_SMP 711# ifdef CONFIG_SMP
692static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 712static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
693 int global_update) 713 int global_update)
@@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
710 u64 now, delta; 730 u64 now, delta;
711 unsigned long load = cfs_rq->load.weight; 731 unsigned long load = cfs_rq->load.weight;
712 732
713 if (cfs_rq->tg == &root_task_group) 733 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
714 return; 734 return;
715 735
716 now = rq_of(cfs_rq)->clock_task; 736 now = rq_of(cfs_rq)->clock_task;
@@ -752,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
752 list_del_leaf_cfs_rq(cfs_rq); 772 list_del_leaf_cfs_rq(cfs_rq);
753} 773}
754 774
775static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
776{
777 long tg_weight;
778
779 /*
780 * Use this CPU's actual weight instead of the last load_contribution
781 * to gain a more accurate current total weight. See
782 * update_cfs_rq_load_contribution().
783 */
784 tg_weight = atomic_read(&tg->load_weight);
785 tg_weight -= cfs_rq->load_contribution;
786 tg_weight += cfs_rq->load.weight;
787
788 return tg_weight;
789}
790
755static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 791static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
756{ 792{
757 long load_weight, load, shares; 793 long tg_weight, load, shares;
758 794
795 tg_weight = calc_tg_weight(tg, cfs_rq);
759 load = cfs_rq->load.weight; 796 load = cfs_rq->load.weight;
760 797
761 load_weight = atomic_read(&tg->load_weight);
762 load_weight += load;
763 load_weight -= cfs_rq->load_contribution;
764
765 shares = (tg->shares * load); 798 shares = (tg->shares * load);
766 if (load_weight) 799 if (tg_weight)
767 shares /= load_weight; 800 shares /= tg_weight;
768 801
769 if (shares < MIN_SHARES) 802 if (shares < MIN_SHARES)
770 shares = MIN_SHARES; 803 shares = MIN_SHARES;
@@ -819,7 +852,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
819 852
820 tg = cfs_rq->tg; 853 tg = cfs_rq->tg;
821 se = tg->se[cpu_of(rq_of(cfs_rq))]; 854 se = tg->se[cpu_of(rq_of(cfs_rq))];
822 if (!se) 855 if (!se || throttled_hierarchy(cfs_rq))
823 return; 856 return;
824#ifndef CONFIG_SMP 857#ifndef CONFIG_SMP
825 if (likely(se->load.weight == tg->shares)) 858 if (likely(se->load.weight == tg->shares))
@@ -950,6 +983,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
950 se->vruntime = vruntime; 983 se->vruntime = vruntime;
951} 984}
952 985
986static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
987
953static void 988static void
954enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 989enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
955{ 990{
@@ -979,8 +1014,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
979 __enqueue_entity(cfs_rq, se); 1014 __enqueue_entity(cfs_rq, se);
980 se->on_rq = 1; 1015 se->on_rq = 1;
981 1016
982 if (cfs_rq->nr_running == 1) 1017 if (cfs_rq->nr_running == 1) {
983 list_add_leaf_cfs_rq(cfs_rq); 1018 list_add_leaf_cfs_rq(cfs_rq);
1019 check_enqueue_throttle(cfs_rq);
1020 }
984} 1021}
985 1022
986static void __clear_buddies_last(struct sched_entity *se) 1023static void __clear_buddies_last(struct sched_entity *se)
@@ -1028,6 +1065,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1028 __clear_buddies_skip(se); 1065 __clear_buddies_skip(se);
1029} 1066}
1030 1067
1068static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1069
1031static void 1070static void
1032dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1071dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1033{ 1072{
@@ -1066,6 +1105,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1066 if (!(flags & DEQUEUE_SLEEP)) 1105 if (!(flags & DEQUEUE_SLEEP))
1067 se->vruntime -= cfs_rq->min_vruntime; 1106 se->vruntime -= cfs_rq->min_vruntime;
1068 1107
1108 /* return excess runtime on last dequeue */
1109 return_cfs_rq_runtime(cfs_rq);
1110
1069 update_min_vruntime(cfs_rq); 1111 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq); 1112 update_cfs_shares(cfs_rq);
1071} 1113}
@@ -1077,6 +1119,8 @@ static void
1077check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1119check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1078{ 1120{
1079 unsigned long ideal_runtime, delta_exec; 1121 unsigned long ideal_runtime, delta_exec;
1122 struct sched_entity *se;
1123 s64 delta;
1080 1124
1081 ideal_runtime = sched_slice(cfs_rq, curr); 1125 ideal_runtime = sched_slice(cfs_rq, curr);
1082 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 1126 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1095,22 +1139,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1095 * narrow margin doesn't have to wait for a full slice. 1139 * narrow margin doesn't have to wait for a full slice.
1096 * This also mitigates buddy induced latencies under load. 1140 * This also mitigates buddy induced latencies under load.
1097 */ 1141 */
1098 if (!sched_feat(WAKEUP_PREEMPT))
1099 return;
1100
1101 if (delta_exec < sysctl_sched_min_granularity) 1142 if (delta_exec < sysctl_sched_min_granularity)
1102 return; 1143 return;
1103 1144
1104 if (cfs_rq->nr_running > 1) { 1145 se = __pick_first_entity(cfs_rq);
1105 struct sched_entity *se = __pick_first_entity(cfs_rq); 1146 delta = curr->vruntime - se->vruntime;
1106 s64 delta = curr->vruntime - se->vruntime;
1107 1147
1108 if (delta < 0) 1148 if (delta < 0)
1109 return; 1149 return;
1110 1150
1111 if (delta > ideal_runtime) 1151 if (delta > ideal_runtime)
1112 resched_task(rq_of(cfs_rq)->curr); 1152 resched_task(rq_of(cfs_rq)->curr);
1113 }
1114} 1153}
1115 1154
1116static void 1155static void
@@ -1185,6 +1224,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1185 return se; 1224 return se;
1186} 1225}
1187 1226
1227static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1228
1188static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1229static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1189{ 1230{
1190 /* 1231 /*
@@ -1194,6 +1235,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1194 if (prev->on_rq) 1235 if (prev->on_rq)
1195 update_curr(cfs_rq); 1236 update_curr(cfs_rq);
1196 1237
1238 /* throttle cfs_rqs exceeding runtime */
1239 check_cfs_rq_runtime(cfs_rq);
1240
1197 check_spread(cfs_rq, prev); 1241 check_spread(cfs_rq, prev);
1198 if (prev->on_rq) { 1242 if (prev->on_rq) {
1199 update_stats_wait_start(cfs_rq, prev); 1243 update_stats_wait_start(cfs_rq, prev);
@@ -1233,10 +1277,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1233 return; 1277 return;
1234#endif 1278#endif
1235 1279
1236 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 1280 if (cfs_rq->nr_running > 1)
1237 check_preempt_tick(cfs_rq, curr); 1281 check_preempt_tick(cfs_rq, curr);
1238} 1282}
1239 1283
1284
1285/**************************************************
1286 * CFS bandwidth control machinery
1287 */
1288
1289#ifdef CONFIG_CFS_BANDWIDTH
1290/*
1291 * default period for cfs group bandwidth.
1292 * default: 0.1s, units: nanoseconds
1293 */
1294static inline u64 default_cfs_period(void)
1295{
1296 return 100000000ULL;
1297}
1298
1299static inline u64 sched_cfs_bandwidth_slice(void)
1300{
1301 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1302}
1303
1304/*
1305 * Replenish runtime according to assigned quota and update expiration time.
1306 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1307 * additional synchronization around rq->lock.
1308 *
1309 * requires cfs_b->lock
1310 */
1311static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1312{
1313 u64 now;
1314
1315 if (cfs_b->quota == RUNTIME_INF)
1316 return;
1317
1318 now = sched_clock_cpu(smp_processor_id());
1319 cfs_b->runtime = cfs_b->quota;
1320 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1321}
1322
1323/* returns 0 on failure to allocate runtime */
1324static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1325{
1326 struct task_group *tg = cfs_rq->tg;
1327 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1328 u64 amount = 0, min_amount, expires;
1329
1330 /* note: this is a positive sum as runtime_remaining <= 0 */
1331 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1332
1333 raw_spin_lock(&cfs_b->lock);
1334 if (cfs_b->quota == RUNTIME_INF)
1335 amount = min_amount;
1336 else {
1337 /*
1338 * If the bandwidth pool has become inactive, then at least one
1339 * period must have elapsed since the last consumption.
1340 * Refresh the global state and ensure bandwidth timer becomes
1341 * active.
1342 */
1343 if (!cfs_b->timer_active) {
1344 __refill_cfs_bandwidth_runtime(cfs_b);
1345 __start_cfs_bandwidth(cfs_b);
1346 }
1347
1348 if (cfs_b->runtime > 0) {
1349 amount = min(cfs_b->runtime, min_amount);
1350 cfs_b->runtime -= amount;
1351 cfs_b->idle = 0;
1352 }
1353 }
1354 expires = cfs_b->runtime_expires;
1355 raw_spin_unlock(&cfs_b->lock);
1356
1357 cfs_rq->runtime_remaining += amount;
1358 /*
1359 * we may have advanced our local expiration to account for allowed
1360 * spread between our sched_clock and the one on which runtime was
1361 * issued.
1362 */
1363 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1364 cfs_rq->runtime_expires = expires;
1365
1366 return cfs_rq->runtime_remaining > 0;
1367}
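Each refill request asks the global pool for one slice plus the local deficit (runtime_remaining is never positive at this point), so a queue that overran slightly recovers in a single request. A quick check of that arithmetic with the 5ms default slice (numbers assumed for illustration):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000LL

int main(void)
{
	int64_t slice = 5000 * NSEC_PER_USEC;	/* sysctl default: 5ms   */
	int64_t runtime_remaining = -2000000;	/* 2ms already overspent */

	/* same formula as assign_cfs_rq_runtime() */
	int64_t min_amount = slice - runtime_remaining;

	printf("request %lld ns from the global pool\n",
	       (long long)min_amount);		/* 7,000,000: slice + deficit */
	return 0;
}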
1368
1369/*
1370 * Note: This depends on the synchronization provided by sched_clock and the
1371 * fact that rq->clock snapshots this value.
1372 */
1373static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1374{
1375 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1376 struct rq *rq = rq_of(cfs_rq);
1377
1378 /* if the deadline is ahead of our clock, nothing to do */
1379 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1380 return;
1381
1382 if (cfs_rq->runtime_remaining < 0)
1383 return;
1384
1385 /*
1386 * If the local deadline has passed we have to consider the
1387 * possibility that our sched_clock is 'fast' and the global deadline
1388 * has not truly expired.
1389 *
 1390 * Fortunately we can determine whether this is the case by checking
1391 * whether the global deadline has advanced.
1392 */
1393
1394 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1395 /* extend local deadline, drift is bounded above by 2 ticks */
1396 cfs_rq->runtime_expires += TICK_NSEC;
1397 } else {
1398 /* global deadline is ahead, expiration has passed */
1399 cfs_rq->runtime_remaining = 0;
1400 }
1401}
1402
1403static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1404 unsigned long delta_exec)
1405{
1406 /* dock delta_exec before expiring quota (as it could span periods) */
1407 cfs_rq->runtime_remaining -= delta_exec;
1408 expire_cfs_rq_runtime(cfs_rq);
1409
1410 if (likely(cfs_rq->runtime_remaining > 0))
1411 return;
1412
1413 /*
1414 * if we're unable to extend our runtime we resched so that the active
1415 * hierarchy can be throttled
1416 */
1417 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1418 resched_task(rq_of(cfs_rq)->curr);
1419}
1420
1421static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1422 unsigned long delta_exec)
1423{
1424 if (!cfs_rq->runtime_enabled)
1425 return;
1426
1427 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1428}
1429
1430static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1431{
1432 return cfs_rq->throttled;
1433}
1434
1435/* check whether cfs_rq, or any parent, is throttled */
1436static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1437{
1438 return cfs_rq->throttle_count;
1439}
1440
1441/*
1442 * Ensure that neither of the group entities corresponding to src_cpu or
1443 * dest_cpu are members of a throttled hierarchy when performing group
1444 * load-balance operations.
1445 */
1446static inline int throttled_lb_pair(struct task_group *tg,
1447 int src_cpu, int dest_cpu)
1448{
1449 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1450
1451 src_cfs_rq = tg->cfs_rq[src_cpu];
1452 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1453
1454 return throttled_hierarchy(src_cfs_rq) ||
1455 throttled_hierarchy(dest_cfs_rq);
1456}
1457
1458/* updated child weight may affect parent so we have to do this bottom up */
1459static int tg_unthrottle_up(struct task_group *tg, void *data)
1460{
1461 struct rq *rq = data;
1462 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1463
1464 cfs_rq->throttle_count--;
1465#ifdef CONFIG_SMP
1466 if (!cfs_rq->throttle_count) {
1467 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1468
1469 /* leaving throttled state, advance shares averaging windows */
1470 cfs_rq->load_stamp += delta;
1471 cfs_rq->load_last += delta;
1472
1473 /* update entity weight now that we are on_rq again */
1474 update_cfs_shares(cfs_rq);
1475 }
1476#endif
1477
1478 return 0;
1479}
1480
1481static int tg_throttle_down(struct task_group *tg, void *data)
1482{
1483 struct rq *rq = data;
1484 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1485
1486 /* group is entering throttled state, record last load */
1487 if (!cfs_rq->throttle_count)
1488 update_cfs_load(cfs_rq, 0);
1489 cfs_rq->throttle_count++;
1490
1491 return 0;
1492}
1493
1494static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1495{
1496 struct rq *rq = rq_of(cfs_rq);
1497 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1498 struct sched_entity *se;
1499 long task_delta, dequeue = 1;
1500
1501 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1502
1503 /* account load preceding throttle */
1504 rcu_read_lock();
1505 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1506 rcu_read_unlock();
1507
1508 task_delta = cfs_rq->h_nr_running;
1509 for_each_sched_entity(se) {
1510 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1511 /* throttled entity or throttle-on-deactivate */
1512 if (!se->on_rq)
1513 break;
1514
1515 if (dequeue)
1516 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1517 qcfs_rq->h_nr_running -= task_delta;
1518
1519 if (qcfs_rq->load.weight)
1520 dequeue = 0;
1521 }
1522
1523 if (!se)
1524 rq->nr_running -= task_delta;
1525
1526 cfs_rq->throttled = 1;
1527 cfs_rq->throttled_timestamp = rq->clock;
1528 raw_spin_lock(&cfs_b->lock);
1529 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1530 raw_spin_unlock(&cfs_b->lock);
1531}
1532
1533static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1534{
1535 struct rq *rq = rq_of(cfs_rq);
1536 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1537 struct sched_entity *se;
1538 int enqueue = 1;
1539 long task_delta;
1540
1541 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1542
1543 cfs_rq->throttled = 0;
1544 raw_spin_lock(&cfs_b->lock);
1545 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1546 list_del_rcu(&cfs_rq->throttled_list);
1547 raw_spin_unlock(&cfs_b->lock);
1548 cfs_rq->throttled_timestamp = 0;
1549
1550 update_rq_clock(rq);
1551 /* update hierarchical throttle state */
1552 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1553
1554 if (!cfs_rq->load.weight)
1555 return;
1556
1557 task_delta = cfs_rq->h_nr_running;
1558 for_each_sched_entity(se) {
1559 if (se->on_rq)
1560 enqueue = 0;
1561
1562 cfs_rq = cfs_rq_of(se);
1563 if (enqueue)
1564 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1565 cfs_rq->h_nr_running += task_delta;
1566
1567 if (cfs_rq_throttled(cfs_rq))
1568 break;
1569 }
1570
1571 if (!se)
1572 rq->nr_running += task_delta;
1573
1574 /* determine whether we need to wake up potentially idle cpu */
1575 if (rq->curr == rq->idle && rq->cfs.nr_running)
1576 resched_task(rq->curr);
1577}
1578
1579static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1580 u64 remaining, u64 expires)
1581{
1582 struct cfs_rq *cfs_rq;
1583 u64 runtime = remaining;
1584
1585 rcu_read_lock();
1586 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1587 throttled_list) {
1588 struct rq *rq = rq_of(cfs_rq);
1589
1590 raw_spin_lock(&rq->lock);
1591 if (!cfs_rq_throttled(cfs_rq))
1592 goto next;
1593
1594 runtime = -cfs_rq->runtime_remaining + 1;
1595 if (runtime > remaining)
1596 runtime = remaining;
1597 remaining -= runtime;
1598
1599 cfs_rq->runtime_remaining += runtime;
1600 cfs_rq->runtime_expires = expires;
1601
1602 /* we check whether we're throttled above */
1603 if (cfs_rq->runtime_remaining > 0)
1604 unthrottle_cfs_rq(cfs_rq);
1605
1606next:
1607 raw_spin_unlock(&rq->lock);
1608
1609 if (!remaining)
1610 break;
1611 }
1612 rcu_read_unlock();
1613
1614 return remaining;
1615}
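distribute_cfs_runtime() gives each throttled cfs_rq just enough to lift runtime_remaining to +1ns so it can be unthrottled, keeping the rest for later queues in the list. Worked numbers (assumed for illustration): with 10ms to hand out and throttled queues at -3ms, -6ms and -4ms, the first two receive 3ms+1ns and 6ms+1ns and are unthrottled; roughly 1ms is left for the third, which absorbs it but stays throttled (its balance rises only to about -3ms) until the next period refill.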
1616
1617/*
1618 * Responsible for refilling a task_group's bandwidth and unthrottling its
1619 * cfs_rqs as appropriate. If there has been no activity within the last
1620 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1621 * used to track this state.
1622 */
1623static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1624{
1625 u64 runtime, runtime_expires;
1626 int idle = 1, throttled;
1627
1628 raw_spin_lock(&cfs_b->lock);
1629 /* no need to continue the timer with no bandwidth constraint */
1630 if (cfs_b->quota == RUNTIME_INF)
1631 goto out_unlock;
1632
1633 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1634 /* idle depends on !throttled (for the case of a large deficit) */
1635 idle = cfs_b->idle && !throttled;
1636 cfs_b->nr_periods += overrun;
1637
1638 /* if we're going inactive then everything else can be deferred */
1639 if (idle)
1640 goto out_unlock;
1641
1642 __refill_cfs_bandwidth_runtime(cfs_b);
1643
1644 if (!throttled) {
1645 /* mark as potentially idle for the upcoming period */
1646 cfs_b->idle = 1;
1647 goto out_unlock;
1648 }
1649
1650 /* account preceding periods in which throttling occurred */
1651 cfs_b->nr_throttled += overrun;
1652
1653 /*
1654 * There are throttled entities so we must first use the new bandwidth
1655 * to unthrottle them before making it generally available. This
1656 * ensures that all existing debts will be paid before a new cfs_rq is
1657 * allowed to run.
1658 */
1659 runtime = cfs_b->runtime;
1660 runtime_expires = cfs_b->runtime_expires;
1661 cfs_b->runtime = 0;
1662
1663 /*
1664 * This check is repeated as we are holding onto the new bandwidth
1665 * while we unthrottle. This can potentially race with an unthrottled
1666 * group trying to acquire new bandwidth from the global pool.
1667 */
1668 while (throttled && runtime > 0) {
1669 raw_spin_unlock(&cfs_b->lock);
1670 /* we can't nest cfs_b->lock while distributing bandwidth */
1671 runtime = distribute_cfs_runtime(cfs_b, runtime,
1672 runtime_expires);
1673 raw_spin_lock(&cfs_b->lock);
1674
1675 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1676 }
1677
1678 /* return (any) remaining runtime */
1679 cfs_b->runtime = runtime;
1680 /*
1681 * While we are ensured activity in the period following an
1682 * unthrottle, this also covers the case in which the new bandwidth is
1683 * insufficient to cover the existing bandwidth deficit. (Forcing the
1684 * timer to remain active while there are any throttled entities.)
1685 */
1686 cfs_b->idle = 0;
1687out_unlock:
1688 if (idle)
1689 cfs_b->timer_active = 0;
1690 raw_spin_unlock(&cfs_b->lock);
1691
1692 return idle;
1693}
1694
1695/* a cfs_rq won't donate quota below this amount */
1696static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1697/* minimum remaining period time to redistribute slack quota */
1698static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1699/* how long we wait to gather additional slack before distributing */
1700static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1701
1702/* are we near the end of the current quota period? */
1703static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1704{
1705 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1706 u64 remaining;
1707
1708 /* if the call-back is running a quota refresh is already occurring */
1709 if (hrtimer_callback_running(refresh_timer))
1710 return 1;
1711
1712 /* is a quota refresh about to occur? */
1713 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1714 if (remaining < min_expire)
1715 return 1;
1716
1717 return 0;
1718}
1719
1720static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1721{
1722 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1723
1724 /* if there's a quota refresh soon don't bother with slack */
1725 if (runtime_refresh_within(cfs_b, min_left))
1726 return;
1727
1728 start_bandwidth_timer(&cfs_b->slack_timer,
1729 ns_to_ktime(cfs_bandwidth_slack_period));
1730}
1731
1732/* we know any runtime found here is valid as update_curr() precedes return */
1733static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1734{
1735 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1736 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1737
1738 if (slack_runtime <= 0)
1739 return;
1740
1741 raw_spin_lock(&cfs_b->lock);
1742 if (cfs_b->quota != RUNTIME_INF &&
1743 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1744 cfs_b->runtime += slack_runtime;
1745
1746 /* we are under rq->lock, defer unthrottling using a timer */
1747 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1748 !list_empty(&cfs_b->throttled_cfs_rq))
1749 start_cfs_slack_bandwidth(cfs_b);
1750 }
1751 raw_spin_unlock(&cfs_b->lock);
1752
1753 /* even if it's not valid for return we don't want to try again */
1754 cfs_rq->runtime_remaining -= slack_runtime;
1755}
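On the last dequeue a cfs_rq donates everything above min_cfs_rq_runtime (1ms) back to the global pool, provided the global period has not rolled over in the meantime (the runtime_expires comparison). With an assumed 4ms left locally, 3ms is returned and 1ms retained; once the pool exceeds one bandwidth slice and throttled queues exist, the 5ms slack timer is armed to redistribute it instead of doing so under rq->lock.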
1756
1757static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1758{
1759 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1760 return;
1761
1762 __return_cfs_rq_runtime(cfs_rq);
1763}
1764
1765/*
1766 * This is done with a timer (instead of inline with bandwidth return) since
1767 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1768 */
1769static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1770{
1771 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1772 u64 expires;
1773
1774 /* confirm we're still not at a refresh boundary */
1775 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1776 return;
1777
1778 raw_spin_lock(&cfs_b->lock);
1779 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1780 runtime = cfs_b->runtime;
1781 cfs_b->runtime = 0;
1782 }
1783 expires = cfs_b->runtime_expires;
1784 raw_spin_unlock(&cfs_b->lock);
1785
1786 if (!runtime)
1787 return;
1788
1789 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1790
1791 raw_spin_lock(&cfs_b->lock);
1792 if (expires == cfs_b->runtime_expires)
1793 cfs_b->runtime = runtime;
1794 raw_spin_unlock(&cfs_b->lock);
1795}
1796
1797/*
1798 * When a group wakes up we want to make sure that its quota is not already
1799 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 1800 * runtime as update_curr() throttling can not trigger until it's on-rq.
1801 */
1802static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1803{
1804 /* an active group must be handled by the update_curr()->put() path */
1805 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1806 return;
1807
1808 /* ensure the group is not already throttled */
1809 if (cfs_rq_throttled(cfs_rq))
1810 return;
1811
1812 /* update runtime allocation */
1813 account_cfs_rq_runtime(cfs_rq, 0);
1814 if (cfs_rq->runtime_remaining <= 0)
1815 throttle_cfs_rq(cfs_rq);
1816}
1817
1818/* conditionally throttle active cfs_rq's from put_prev_entity() */
1819static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1820{
1821 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1822 return;
1823
1824 /*
1825 * it's possible for a throttled entity to be forced into a running
 1826 * state (e.g. set_curr_task); in this case we're finished.
1827 */
1828 if (cfs_rq_throttled(cfs_rq))
1829 return;
1830
1831 throttle_cfs_rq(cfs_rq);
1832}
1833#else
1834static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1835 unsigned long delta_exec) {}
1836static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1837static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
1838static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1839
1840static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1841{
1842 return 0;
1843}
1844
1845static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1846{
1847 return 0;
1848}
1849
1850static inline int throttled_lb_pair(struct task_group *tg,
1851 int src_cpu, int dest_cpu)
1852{
1853 return 0;
1854}
1855#endif
1856
1240/************************************************** 1857/**************************************************
1241 * CFS operations on tasks: 1858 * CFS operations on tasks:
1242 */ 1859 */
@@ -1313,16 +1930,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1313 break; 1930 break;
1314 cfs_rq = cfs_rq_of(se); 1931 cfs_rq = cfs_rq_of(se);
1315 enqueue_entity(cfs_rq, se, flags); 1932 enqueue_entity(cfs_rq, se, flags);
1933
1934 /*
1935 * end evaluation on encountering a throttled cfs_rq
1936 *
1937 * note: in the case of encountering a throttled cfs_rq we will
1938 * post the final h_nr_running increment below.
1939 */
1940 if (cfs_rq_throttled(cfs_rq))
1941 break;
1942 cfs_rq->h_nr_running++;
1943
1316 flags = ENQUEUE_WAKEUP; 1944 flags = ENQUEUE_WAKEUP;
1317 } 1945 }
1318 1946
1319 for_each_sched_entity(se) { 1947 for_each_sched_entity(se) {
1320 cfs_rq = cfs_rq_of(se); 1948 cfs_rq = cfs_rq_of(se);
1949 cfs_rq->h_nr_running++;
1950
1951 if (cfs_rq_throttled(cfs_rq))
1952 break;
1321 1953
1322 update_cfs_load(cfs_rq, 0); 1954 update_cfs_load(cfs_rq, 0);
1323 update_cfs_shares(cfs_rq); 1955 update_cfs_shares(cfs_rq);
1324 } 1956 }
1325 1957
1958 if (!se)
1959 inc_nr_running(rq);
1326 hrtick_update(rq); 1960 hrtick_update(rq);
1327} 1961}
1328 1962
@@ -1343,6 +1977,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1343 cfs_rq = cfs_rq_of(se); 1977 cfs_rq = cfs_rq_of(se);
1344 dequeue_entity(cfs_rq, se, flags); 1978 dequeue_entity(cfs_rq, se, flags);
1345 1979
1980 /*
1981 * end evaluation on encountering a throttled cfs_rq
1982 *
1983 * note: in the case of encountering a throttled cfs_rq we will
1984 * post the final h_nr_running decrement below.
1985 */
1986 if (cfs_rq_throttled(cfs_rq))
1987 break;
1988 cfs_rq->h_nr_running--;
1989
1346 /* Don't dequeue parent if it has other entities besides us */ 1990 /* Don't dequeue parent if it has other entities besides us */
1347 if (cfs_rq->load.weight) { 1991 if (cfs_rq->load.weight) {
1348 /* 1992 /*
@@ -1361,11 +2005,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1361 2005
1362 for_each_sched_entity(se) { 2006 for_each_sched_entity(se) {
1363 cfs_rq = cfs_rq_of(se); 2007 cfs_rq = cfs_rq_of(se);
2008 cfs_rq->h_nr_running--;
2009
2010 if (cfs_rq_throttled(cfs_rq))
2011 break;
1364 2012
1365 update_cfs_load(cfs_rq, 0); 2013 update_cfs_load(cfs_rq, 0);
1366 update_cfs_shares(cfs_rq); 2014 update_cfs_shares(cfs_rq);
1367 } 2015 }
1368 2016
2017 if (!se)
2018 dec_nr_running(rq);
1369 hrtick_update(rq); 2019 hrtick_update(rq);
1370} 2020}
1371 2021
@@ -1399,42 +2049,105 @@ static void task_waking_fair(struct task_struct *p)
1399 * Adding load to a group doesn't make a group heavier, but can cause movement 2049 * Adding load to a group doesn't make a group heavier, but can cause movement
1400 * of group shares between cpus. Assuming the shares were perfectly aligned one 2050 * of group shares between cpus. Assuming the shares were perfectly aligned one
1401 * can calculate the shift in shares. 2051 * can calculate the shift in shares.
2052 *
2053 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2054 * on this @cpu and results in a total addition (subtraction) of @wg to the
2055 * total group weight.
2056 *
2057 * Given a runqueue weight distribution (rw_i) we can compute a shares
2058 * distribution (s_i) using:
2059 *
2060 * s_i = rw_i / \Sum rw_j (1)
2061 *
2062 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2063 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2064 * shares distribution (s_i):
2065 *
2066 * rw_i = { 2, 4, 1, 0 }
2067 * s_i = { 2/7, 4/7, 1/7, 0 }
2068 *
2069 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
2070 * task used to run on and the CPU the waker is running on), we need to
2071 * compute the effect of waking a task on either CPU and, in case of a sync
2072 * wakeup, compute the effect of the current task going to sleep.
2073 *
2074 * So for a change of @wl to the local @cpu with an overall group weight change
2075 * of @wl we can compute the new shares distribution (s'_i) using:
2076 *
2077 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
2078 *
2079 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
2080 * differences in waking a task to CPU 0. The additional task changes the
2081 * weight and shares distributions like:
2082 *
2083 * rw'_i = { 3, 4, 1, 0 }
2084 * s'_i = { 3/8, 4/8, 1/8, 0 }
2085 *
2086 * We can then compute the difference in effective weight by using:
2087 *
2088 * dw_i = S * (s'_i - s_i) (3)
2089 *
2090 * Where 'S' is the group weight as seen by its parent.
2091 *
2092 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
2093 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
2094 * 4/7) times the weight of the group.
1402 */ 2095 */
1403static long effective_load(struct task_group *tg, int cpu, long wl, long wg) 2096static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1404{ 2097{
1405 struct sched_entity *se = tg->se[cpu]; 2098 struct sched_entity *se = tg->se[cpu];
1406 2099
1407 if (!tg->parent) 2100 if (!tg->parent) /* the trivial, non-cgroup case */
1408 return wl; 2101 return wl;
1409 2102
1410 for_each_sched_entity(se) { 2103 for_each_sched_entity(se) {
1411 long lw, w; 2104 long w, W;
1412 2105
1413 tg = se->my_q->tg; 2106 tg = se->my_q->tg;
1414 w = se->my_q->load.weight;
1415 2107
1416 /* use this cpu's instantaneous contribution */ 2108 /*
1417 lw = atomic_read(&tg->load_weight); 2109 * W = @wg + \Sum rw_j
1418 lw -= se->my_q->load_contribution; 2110 */
1419 lw += w + wg; 2111 W = wg + calc_tg_weight(tg, se->my_q);
1420 2112
1421 wl += w; 2113 /*
2114 * w = rw_i + @wl
2115 */
2116 w = se->my_q->load.weight + wl;
1422 2117
1423 if (lw > 0 && wl < lw) 2118 /*
1424 wl = (wl * tg->shares) / lw; 2119 * wl = S * s'_i; see (2)
2120 */
2121 if (W > 0 && w < W)
2122 wl = (w * tg->shares) / W;
1425 else 2123 else
1426 wl = tg->shares; 2124 wl = tg->shares;
1427 2125
1428 /* zero point is MIN_SHARES */ 2126 /*
2127 * Per the above, wl is the new se->load.weight value; since
2128 * those are clipped to [MIN_SHARES, ...) do so now. See
2129 * calc_cfs_shares().
2130 */
1429 if (wl < MIN_SHARES) 2131 if (wl < MIN_SHARES)
1430 wl = MIN_SHARES; 2132 wl = MIN_SHARES;
2133
2134 /*
2135 * wl = dw_i = S * (s'_i - s_i); see (3)
2136 */
1431 wl -= se->load.weight; 2137 wl -= se->load.weight;
2138
2139 /*
2140 * Recursively apply this logic to all parent groups to compute
2141 * the final effective load change on the root group. Since
2142 * only the @tg group gets extra weight, all parent groups can
2143 * only redistribute existing shares. @wl is the shift in shares
2144 * resulting from this level per the above.
2145 */
1432 wg = 0; 2146 wg = 0;
1433 } 2147 }
1434 2148
1435 return wl; 2149 return wl;
1436} 2150}
1437
1438#else 2151#else
1439 2152
1440static inline unsigned long effective_load(struct task_group *tg, int cpu, 2153static inline unsigned long effective_load(struct task_group *tg, int cpu,
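The shares arithmetic documented in the effective_load() comment above can be checked in isolation. The following user-space sketch is not kernel code; it reuses the comment's illustrative values (rw_i = {2, 4, 1, 0}, a group weight S of 1024, and a waking task of weight 1) and reproduces one hierarchy level of the same integer math:

/*
 * Standalone check of the effective_load() example above.
 * All inputs are the illustrative values from that comment.
 */
#include <stdio.h>

int main(void)
{
	long rw[4] = { 2, 4, 1, 0 };	/* per-cpu runqueue weight, in task units */
	long S = 1024;			/* group weight as seen by its parent     */
	long wl = 1, wg = 1;		/* one extra task wakes up on CPU 0       */

	long sum = rw[0] + rw[1] + rw[2] + rw[3];	/* \Sum rw_j = 7      */
	long old_share = rw[0] * S / sum;		/* S * s_0  = S * 2/7 */

	long W = wg + sum;				/* new total weight = 8 */
	long w = rw[0] + wl;				/* new CPU 0 weight = 3 */
	long new_share = (W > 0 && w < W) ? w * S / W : S; /* S * s'_0 = S * 3/8 */

	/* dw_0 = S * (s'_0 - s_0); per the comment this is roughly 5/56 of S */
	printf("old=%ld new=%ld dw=%ld (5/56 of S is about %ld)\n",
	       old_share, new_share, new_share - old_share, 5 * S / 56);
	return 0;
}

With the values above it prints old=292, new=384, dw=92, which matches the 5/56 figure given in the comment once the integer truncation is accounted for.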
@@ -1547,7 +2260,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1547 2260
1548 /* Skip over this group if it has no CPUs allowed */ 2261 /* Skip over this group if it has no CPUs allowed */
1549 if (!cpumask_intersects(sched_group_cpus(group), 2262 if (!cpumask_intersects(sched_group_cpus(group),
1550 &p->cpus_allowed)) 2263 tsk_cpus_allowed(p)))
1551 continue; 2264 continue;
1552 2265
1553 local_group = cpumask_test_cpu(this_cpu, 2266 local_group = cpumask_test_cpu(this_cpu,
@@ -1593,7 +2306,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1593 int i; 2306 int i;
1594 2307
1595 /* Traverse only the allowed CPUs */ 2308 /* Traverse only the allowed CPUs */
1596 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { 2309 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
1597 load = weighted_cpuload(i); 2310 load = weighted_cpuload(i);
1598 2311
1599 if (load < min_load || (load == min_load && i == this_cpu)) { 2312 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1613,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
1613 int cpu = smp_processor_id(); 2326 int cpu = smp_processor_id();
1614 int prev_cpu = task_cpu(p); 2327 int prev_cpu = task_cpu(p);
1615 struct sched_domain *sd; 2328 struct sched_domain *sd;
1616 int i; 2329 struct sched_group *sg;
2330 int i, smt = 0;
1617 2331
1618 /* 2332 /*
1619 * If the task is going to be woken-up on this cpu and if it is 2333 * If the task is going to be woken-up on this cpu and if it is
@@ -1633,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
1633 * Otherwise, iterate the domains and find an eligible idle cpu. 2347 * Otherwise, iterate the domains and find an eligible idle cpu.
1634 */ 2348 */
1635 rcu_read_lock(); 2349 rcu_read_lock();
2350again:
1636 for_each_domain(target, sd) { 2351 for_each_domain(target, sd) {
1637 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 2352 if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
1638 break; 2353 continue;
1639 2354
1640 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 2355 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
1641 if (idle_cpu(i)) { 2356 if (!smt) {
1642 target = i; 2357 smt = 1;
1643 break; 2358 goto again;
1644 } 2359 }
2360 break;
1645 } 2361 }
1646 2362
1647 /* 2363 sg = sd->groups;
1648 * Lets stop looking for an idle sibling when we reached 2364 do {
1649 * the domain that spans the current cpu and prev_cpu. 2365 if (!cpumask_intersects(sched_group_cpus(sg),
1650 */ 2366 tsk_cpus_allowed(p)))
1651 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && 2367 goto next;
1652 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 2368
1653 break; 2369 for_each_cpu(i, sched_group_cpus(sg)) {
2370 if (!idle_cpu(i))
2371 goto next;
2372 }
2373
2374 target = cpumask_first_and(sched_group_cpus(sg),
2375 tsk_cpus_allowed(p));
2376 goto done;
2377next:
2378 sg = sg->next;
2379 } while (sg != sd->groups);
1654 } 2380 }
2381done:
1655 rcu_read_unlock(); 2382 rcu_read_unlock();
1656 2383
1657 return target; 2384 return target;
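The group scan introduced in select_idle_sibling() above only accepts a scheduling group when every CPU in it is idle and the group intersects the task's allowed mask. The sketch below is an illustrative user-space restatement, not kernel code; group membership, idleness and the allowed mask are modelled with plain arrays:

/*
 * Pick the first allowed CPU of the first fully idle group, or -1 if no
 * group qualifies (the caller would then keep its previous target).
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

static int pick_idle_group_cpu(const int group_of[NR_CPUS], int nr_groups,
			       const bool idle[NR_CPUS],
			       const bool allowed[NR_CPUS])
{
	for (int g = 0; g < nr_groups; g++) {
		bool intersects = false, all_idle = true;
		int first_allowed = -1;

		for (int cpu = 0; cpu < NR_CPUS; cpu++) {
			if (group_of[cpu] != g)
				continue;
			if (allowed[cpu]) {
				intersects = true;
				if (first_allowed < 0)
					first_allowed = cpu;
			}
			if (!idle[cpu])
				all_idle = false;
		}
		if (intersects && all_idle)
			return first_allowed;
	}
	return -1;
}

int main(void)
{
	/* two groups of four CPUs; only group 1 is entirely idle */
	int group_of[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };
	bool idle[NR_CPUS]    = { 0, 1, 1, 1, 1, 1, 1, 1 };
	bool allowed[NR_CPUS] = { 1, 1, 1, 1, 1, 1, 1, 1 };

	printf("target = %d\n", pick_idle_group_cpu(group_of, 2, idle, allowed));
	return 0;
}

Here CPU 0 being busy disqualifies the whole first group, so the scan settles on CPU 4, mirroring the "whole group must be idle" criterion of the new kernel loop.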
@@ -1680,7 +2407,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1680 int sync = wake_flags & WF_SYNC; 2407 int sync = wake_flags & WF_SYNC;
1681 2408
1682 if (sd_flag & SD_BALANCE_WAKE) { 2409 if (sd_flag & SD_BALANCE_WAKE) {
1683 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) 2410 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1684 want_affine = 1; 2411 want_affine = 1;
1685 new_cpu = prev_cpu; 2412 new_cpu = prev_cpu;
1686 } 2413 }
@@ -1875,6 +2602,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1875 if (unlikely(se == pse)) 2602 if (unlikely(se == pse))
1876 return; 2603 return;
1877 2604
2605 /*
2606 * This is possible from callers such as pull_task(), in which we
2607 * unconditionally check_preempt_curr() after an enqueue (which may have
2608 * led to a throttle). This both saves work and prevents false
2609 * next-buddy nomination below.
2610 */
2611 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2612 return;
2613
1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 2614 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1879 set_next_buddy(pse); 2615 set_next_buddy(pse);
1880 next_buddy_marked = 1; 2616 next_buddy_marked = 1;
@@ -1883,6 +2619,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1883 /* 2619 /*
1884 * We can come here with TIF_NEED_RESCHED already set from new task 2620 * We can come here with TIF_NEED_RESCHED already set from new task
1885 * wake up path. 2621 * wake up path.
2622 *
2623 * Note: this also catches the edge-case of curr being in a throttled
2624 * group (e.g. via set_curr_task), since update_curr() (in the
2625 * enqueue of curr) will have resulted in resched being set. This
2626 * prevents us from potentially nominating it as a false LAST_BUDDY
2627 * below.
1886 */ 2628 */
1887 if (test_tsk_need_resched(curr)) 2629 if (test_tsk_need_resched(curr))
1888 return; 2630 return;
@@ -1899,10 +2641,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1899 if (unlikely(p->policy != SCHED_NORMAL)) 2641 if (unlikely(p->policy != SCHED_NORMAL))
1900 return; 2642 return;
1901 2643
1902
1903 if (!sched_feat(WAKEUP_PREEMPT))
1904 return;
1905
1906 find_matching_se(&se, &pse); 2644 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se)); 2645 update_curr(cfs_rq_of(se));
1908 BUG_ON(!pse); 2646 BUG_ON(!pse);
@@ -2005,7 +2743,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
2005{ 2743{
2006 struct sched_entity *se = &p->se; 2744 struct sched_entity *se = &p->se;
2007 2745
2008 if (!se->on_rq) 2746 /* throttled hierarchies are not runnable */
2747 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
2009 return false; 2748 return false;
2010 2749
2011 /* Tell the scheduler that we'd really like pse to run next. */ 2750 /* Tell the scheduler that we'd really like pse to run next. */
@@ -2049,7 +2788,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2049 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2788 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2050 * 3) are cache-hot on their current CPU. 2789 * 3) are cache-hot on their current CPU.
2051 */ 2790 */
2052 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 2791 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
2053 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 2792 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2054 return 0; 2793 return 0;
2055 } 2794 }
@@ -2102,6 +2841,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2102 2841
2103 for_each_leaf_cfs_rq(busiest, cfs_rq) { 2842 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2104 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 2843 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
2844 if (throttled_lb_pair(task_group(p),
2845 busiest->cpu, this_cpu))
2846 break;
2105 2847
2106 if (!can_migrate_task(p, busiest, this_cpu, 2848 if (!can_migrate_task(p, busiest, this_cpu,
2107 sd, idle, &pinned)) 2849 sd, idle, &pinned))
@@ -2217,8 +2959,13 @@ static void update_shares(int cpu)
2217 * Iterates the task_group tree in a bottom up fashion, see 2959 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details. 2960 * list_add_leaf_cfs_rq() for details.
2219 */ 2961 */
2220 for_each_leaf_cfs_rq(rq, cfs_rq) 2962 for_each_leaf_cfs_rq(rq, cfs_rq) {
2963 /* throttled entities do not contribute to load */
2964 if (throttled_hierarchy(cfs_rq))
2965 continue;
2966
2221 update_shares_cpu(cfs_rq->tg, cpu); 2967 update_shares_cpu(cfs_rq->tg, cpu);
2968 }
2222 rcu_read_unlock(); 2969 rcu_read_unlock();
2223} 2970}
2224 2971
@@ -2268,9 +3015,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2268 u64 rem_load, moved_load; 3015 u64 rem_load, moved_load;
2269 3016
2270 /* 3017 /*
2271 * empty group 3018 * empty group or part of a throttled hierarchy
2272 */ 3019 */
2273 if (!busiest_cfs_rq->task_weight) 3020 if (!busiest_cfs_rq->task_weight ||
3021 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
2274 continue; 3022 continue;
2275 3023
2276 rem_load = (u64)rem_load_move * busiest_weight; 3024 rem_load = (u64)rem_load_move * busiest_weight;
@@ -2854,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2854} 3602}
2855 3603
2856/** 3604/**
2857 * update_sd_lb_stats - Update sched_group's statistics for load balancing. 3605 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
2858 * @sd: sched_domain whose statistics are to be updated. 3606 * @sd: sched_domain whose statistics are to be updated.
2859 * @this_cpu: Cpu for which load balance is currently performed. 3607 * @this_cpu: Cpu for which load balance is currently performed.
2860 * @idle: Idle status of this_cpu 3608 * @idle: Idle status of this_cpu
@@ -3430,7 +4178,7 @@ redo:
3430 * moved to this_cpu 4178 * moved to this_cpu
3431 */ 4179 */
3432 if (!cpumask_test_cpu(this_cpu, 4180 if (!cpumask_test_cpu(this_cpu,
3433 &busiest->curr->cpus_allowed)) { 4181 tsk_cpus_allowed(busiest->curr))) {
3434 raw_spin_unlock_irqrestore(&busiest->lock, 4182 raw_spin_unlock_irqrestore(&busiest->lock,
3435 flags); 4183 flags);
3436 all_pinned = 1; 4184 all_pinned = 1;
@@ -3612,22 +4360,6 @@ out_unlock:
3612} 4360}
3613 4361
3614#ifdef CONFIG_NO_HZ 4362#ifdef CONFIG_NO_HZ
3615
3616static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3617
3618static void trigger_sched_softirq(void *data)
3619{
3620 raise_softirq_irqoff(SCHED_SOFTIRQ);
3621}
3622
3623static inline void init_sched_softirq_csd(struct call_single_data *csd)
3624{
3625 csd->func = trigger_sched_softirq;
3626 csd->info = NULL;
3627 csd->flags = 0;
3628 csd->priv = 0;
3629}
3630
3631/* 4363/*
3632 * idle load balancing details 4364 * idle load balancing details
3633 * - One of the idle CPUs nominates itself as idle load_balancer, while 4365 * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -3667,7 +4399,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3667 struct sched_domain *sd; 4399 struct sched_domain *sd;
3668 4400
3669 for_each_domain(cpu, sd) 4401 for_each_domain(cpu, sd)
3670 if (sd && (sd->flags & flag)) 4402 if (sd->flags & flag)
3671 break; 4403 break;
3672 4404
3673 return sd; 4405 return sd;
@@ -3793,11 +4525,16 @@ static void nohz_balancer_kick(int cpu)
3793 } 4525 }
3794 4526
3795 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4527 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3796 struct call_single_data *cp;
3797
3798 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4528 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3799 cp = &per_cpu(remote_sched_softirq_cb, cpu); 4529
3800 __smp_call_function_single(ilb_cpu, cp, 0); 4530 smp_mb();
4531 /*
4532 * Use smp_send_reschedule() instead of resched_cpu().
4533 * This way we generate a sched IPI on the target cpu which
4534 * is idle. And the softirq performing nohz idle load balance
4535 * will be run before returning from the IPI.
4536 */
4537 smp_send_reschedule(ilb_cpu);
3801 } 4538 }
3802 return; 4539 return;
3803} 4540}
@@ -4030,7 +4767,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
4030 if (time_before(now, nohz.next_balance)) 4767 if (time_before(now, nohz.next_balance))
4031 return 0; 4768 return 0;
4032 4769
4033 if (rq->idle_at_tick) 4770 if (idle_cpu(cpu))
4034 return 0; 4771 return 0;
4035 4772
4036 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 4773 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -4066,7 +4803,7 @@ static void run_rebalance_domains(struct softirq_action *h)
4066{ 4803{
4067 int this_cpu = smp_processor_id(); 4804 int this_cpu = smp_processor_id();
4068 struct rq *this_rq = cpu_rq(this_cpu); 4805 struct rq *this_rq = cpu_rq(this_cpu);
4069 enum cpu_idle_type idle = this_rq->idle_at_tick ? 4806 enum cpu_idle_type idle = this_rq->idle_balance ?
4070 CPU_IDLE : CPU_NOT_IDLE; 4807 CPU_IDLE : CPU_NOT_IDLE;
4071 4808
4072 rebalance_domains(this_cpu, idle); 4809 rebalance_domains(this_cpu, idle);
@@ -4251,8 +4988,13 @@ static void set_curr_task_fair(struct rq *rq)
4251{ 4988{
4252 struct sched_entity *se = &rq->curr->se; 4989 struct sched_entity *se = &rq->curr->se;
4253 4990
4254 for_each_sched_entity(se) 4991 for_each_sched_entity(se) {
4255 set_next_entity(cfs_rq_of(se), se); 4992 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4993
4994 set_next_entity(cfs_rq, se);
4995 /* ensure bandwidth has been allocated on our new cfs_rq */
4996 account_cfs_rq_runtime(cfs_rq, 0);
4997 }
4256} 4998}
4257 4999
4258#ifdef CONFIG_FAIR_GROUP_SCHED 5000#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677cb04..84802245abd 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, 1)
13 13
14/* 14/*
15 * Should wakeups try to preempt running tasks.
16 */
17SCHED_FEAT(WAKEUP_PREEMPT, 1)
18
19/*
20 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
21 * a newly woken task on the same cpu as the task that woke it -- 16 * a newly woken task on the same cpu as the task that woke it --
22 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
@@ -72,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1)
72SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, 1)
73 68
74SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, 0)
70SCHED_FEAT(RT_RUNTIME_SHARE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 97540f0c9e4..583a1368afe 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
124 update_rt_migration(rt_rq); 124 update_rt_migration(rt_rq);
125} 125}
126 126
127static inline int has_pushable_tasks(struct rq *rq)
128{
129 return !plist_head_empty(&rq->rt.pushable_tasks);
130}
131
127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 132static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
128{ 133{
129 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 134 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
130 plist_node_init(&p->pushable_tasks, p->prio); 135 plist_node_init(&p->pushable_tasks, p->prio);
131 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
137
138 /* Update the highest prio pushable task */
139 if (p->prio < rq->rt.highest_prio.next)
140 rq->rt.highest_prio.next = p->prio;
132} 141}
133 142
134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 143static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
135{ 144{
136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 145 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
137}
138 146
139static inline int has_pushable_tasks(struct rq *rq) 147 /* Update the new highest prio pushable task */
140{ 148 if (has_pushable_tasks(rq)) {
141 return !plist_head_empty(&rq->rt.pushable_tasks); 149 p = plist_first_entry(&rq->rt.pushable_tasks,
150 struct task_struct, pushable_tasks);
151 rq->rt.highest_prio.next = p->prio;
152 } else
153 rq->rt.highest_prio.next = MAX_RT_PRIO;
142} 154}
143 155
144#else 156#else
@@ -548,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
548{ 560{
549 int more = 0; 561 int more = 0;
550 562
563 if (!sched_feat(RT_RUNTIME_SHARE))
564 return more;
565
551 if (rt_rq->rt_time > rt_rq->rt_runtime) { 566 if (rt_rq->rt_time > rt_rq->rt_runtime) {
552 raw_spin_unlock(&rt_rq->rt_runtime_lock); 567 raw_spin_unlock(&rt_rq->rt_runtime_lock);
553 more = do_balance_runtime(rt_rq); 568 more = do_balance_runtime(rt_rq);
@@ -643,6 +658,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
643 658
644 if (rt_rq->rt_time > runtime) { 659 if (rt_rq->rt_time > runtime) {
645 rt_rq->rt_throttled = 1; 660 rt_rq->rt_throttled = 1;
661 printk_once(KERN_WARNING "sched: RT throttling activated\n");
646 if (rt_rq_throttled(rt_rq)) { 662 if (rt_rq_throttled(rt_rq)) {
647 sched_rt_rq_dequeue(rt_rq); 663 sched_rt_rq_dequeue(rt_rq);
648 return 1; 664 return 1;
@@ -698,47 +714,13 @@ static void update_curr_rt(struct rq *rq)
698 714
699#if defined CONFIG_SMP 715#if defined CONFIG_SMP
700 716
701static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
702
703static inline int next_prio(struct rq *rq)
704{
705 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
706
707 if (next && rt_prio(next->prio))
708 return next->prio;
709 else
710 return MAX_RT_PRIO;
711}
712
713static void 717static void
714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 718inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
715{ 719{
716 struct rq *rq = rq_of_rt_rq(rt_rq); 720 struct rq *rq = rq_of_rt_rq(rt_rq);
717 721
718 if (prio < prev_prio) { 722 if (rq->online && prio < prev_prio)
719 723 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
720 /*
721 * If the new task is higher in priority than anything on the
722 * run-queue, we know that the previous high becomes our
723 * next-highest.
724 */
725 rt_rq->highest_prio.next = prev_prio;
726
727 if (rq->online)
728 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
729
730 } else if (prio == rt_rq->highest_prio.curr)
731 /*
732 * If the next task is equal in priority to the highest on
733 * the run-queue, then we implicitly know that the next highest
734 * task cannot be any lower than current
735 */
736 rt_rq->highest_prio.next = prio;
737 else if (prio < rt_rq->highest_prio.next)
738 /*
739 * Otherwise, we need to recompute next-highest
740 */
741 rt_rq->highest_prio.next = next_prio(rq);
742} 724}
743 725
744static void 726static void
@@ -746,9 +728,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
746{ 728{
747 struct rq *rq = rq_of_rt_rq(rt_rq); 729 struct rq *rq = rq_of_rt_rq(rt_rq);
748 730
749 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
750 rt_rq->highest_prio.next = next_prio(rq);
751
752 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 731 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
753 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 732 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
754} 733}
@@ -961,6 +940,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
961 940
962 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 941 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
963 enqueue_pushable_task(rq, p); 942 enqueue_pushable_task(rq, p);
943
944 inc_nr_running(rq);
964} 945}
965 946
966static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 947static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -971,6 +952,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
971 dequeue_rt_entity(rt_se); 952 dequeue_rt_entity(rt_se);
972 953
973 dequeue_pushable_task(rq, p); 954 dequeue_pushable_task(rq, p);
955
956 dec_nr_running(rq);
974} 957}
975 958
976/* 959/*
@@ -1017,10 +1000,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1017 struct rq *rq; 1000 struct rq *rq;
1018 int cpu; 1001 int cpu;
1019 1002
1020 if (sd_flag != SD_BALANCE_WAKE)
1021 return smp_processor_id();
1022
1023 cpu = task_cpu(p); 1003 cpu = task_cpu(p);
1004
1005 /* For anything but wake ups, just return the task_cpu */
1006 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1007 goto out;
1008
1024 rq = cpu_rq(cpu); 1009 rq = cpu_rq(cpu);
1025 1010
1026 rcu_read_lock(); 1011 rcu_read_lock();
@@ -1050,7 +1035,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1050 */ 1035 */
1051 if (curr && unlikely(rt_task(curr)) && 1036 if (curr && unlikely(rt_task(curr)) &&
1052 (curr->rt.nr_cpus_allowed < 2 || 1037 (curr->rt.nr_cpus_allowed < 2 ||
1053 curr->prio < p->prio) && 1038 curr->prio <= p->prio) &&
1054 (p->rt.nr_cpus_allowed > 1)) { 1039 (p->rt.nr_cpus_allowed > 1)) {
1055 int target = find_lowest_rq(p); 1040 int target = find_lowest_rq(p);
1056 1041
@@ -1059,6 +1044,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1059 } 1044 }
1060 rcu_read_unlock(); 1045 rcu_read_unlock();
1061 1046
1047out:
1062 return cpu; 1048 return cpu;
1063} 1049}
1064 1050
@@ -1178,7 +1164,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1178static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1164static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1179{ 1165{
1180 update_curr_rt(rq); 1166 update_curr_rt(rq);
1181 p->se.exec_start = 0;
1182 1167
1183 /* 1168 /*
1184 * The previous task needs to be made eligible for pushing 1169 * The previous task needs to be made eligible for pushing
@@ -1198,7 +1183,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1198static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1183static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1199{ 1184{
1200 if (!task_running(rq, p) && 1185 if (!task_running(rq, p) &&
1201 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1186 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1202 (p->rt.nr_cpus_allowed > 1)) 1187 (p->rt.nr_cpus_allowed > 1))
1203 return 1; 1188 return 1;
1204 return 0; 1189 return 0;
@@ -1343,7 +1328,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1343 */ 1328 */
1344 if (unlikely(task_rq(task) != rq || 1329 if (unlikely(task_rq(task) != rq ||
1345 !cpumask_test_cpu(lowest_rq->cpu, 1330 !cpumask_test_cpu(lowest_rq->cpu,
1346 &task->cpus_allowed) || 1331 tsk_cpus_allowed(task)) ||
1347 task_running(rq, task) || 1332 task_running(rq, task) ||
1348 !task->on_rq)) { 1333 !task->on_rq)) {
1349 1334
@@ -1394,6 +1379,7 @@ static int push_rt_task(struct rq *rq)
1394{ 1379{
1395 struct task_struct *next_task; 1380 struct task_struct *next_task;
1396 struct rq *lowest_rq; 1381 struct rq *lowest_rq;
1382 int ret = 0;
1397 1383
1398 if (!rq->rt.overloaded) 1384 if (!rq->rt.overloaded)
1399 return 0; 1385 return 0;
@@ -1426,7 +1412,7 @@ retry:
1426 if (!lowest_rq) { 1412 if (!lowest_rq) {
1427 struct task_struct *task; 1413 struct task_struct *task;
1428 /* 1414 /*
1429 * find lock_lowest_rq releases rq->lock 1415 * find_lock_lowest_rq releases rq->lock
1430 * so it is possible that next_task has migrated. 1416 * so it is possible that next_task has migrated.
1431 * 1417 *
1432 * We need to make sure that the task is still on the same 1418 * We need to make sure that the task is still on the same
@@ -1436,12 +1422,11 @@ retry:
1436 task = pick_next_pushable_task(rq); 1422 task = pick_next_pushable_task(rq);
1437 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1423 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1438 /* 1424 /*
1439 * If we get here, the task hasn't moved at all, but 1425 * The task hasn't migrated, and is still the next
1440 * it has failed to push. We will not try again, 1426 * eligible task, but we failed to find a run-queue
1441 * since the other cpus will pull from us when they 1427 * to push it to. Do not retry in this case, since
1442 * are ready. 1428 * other cpus will pull from us when ready.
1443 */ 1429 */
1444 dequeue_pushable_task(rq, next_task);
1445 goto out; 1430 goto out;
1446 } 1431 }
1447 1432
@@ -1460,6 +1445,7 @@ retry:
1460 deactivate_task(rq, next_task, 0); 1445 deactivate_task(rq, next_task, 0);
1461 set_task_cpu(next_task, lowest_rq->cpu); 1446 set_task_cpu(next_task, lowest_rq->cpu);
1462 activate_task(lowest_rq, next_task, 0); 1447 activate_task(lowest_rq, next_task, 0);
1448 ret = 1;
1463 1449
1464 resched_task(lowest_rq->curr); 1450 resched_task(lowest_rq->curr);
1465 1451
@@ -1468,7 +1454,7 @@ retry:
1468out: 1454out:
1469 put_task_struct(next_task); 1455 put_task_struct(next_task);
1470 1456
1471 return 1; 1457 return ret;
1472} 1458}
1473 1459
1474static void push_rt_tasks(struct rq *rq) 1460static void push_rt_tasks(struct rq *rq)
@@ -1581,7 +1567,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1581 p->rt.nr_cpus_allowed > 1 && 1567 p->rt.nr_cpus_allowed > 1 &&
1582 rt_task(rq->curr) && 1568 rt_task(rq->curr) &&
1583 (rq->curr->rt.nr_cpus_allowed < 2 || 1569 (rq->curr->rt.nr_cpus_allowed < 2 ||
1584 rq->curr->prio < p->prio)) 1570 rq->curr->prio <= p->prio))
1585 push_rt_tasks(rq); 1571 push_rt_tasks(rq);
1586} 1572}
1587 1573
@@ -1626,9 +1612,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1626 1612
1627 update_rt_migration(&rq->rt); 1613 update_rt_migration(&rq->rt);
1628 } 1614 }
1629
1630 cpumask_copy(&p->cpus_allowed, new_mask);
1631 p->rt.nr_cpus_allowed = weight;
1632} 1615}
1633 1616
1634/* Assumes rq->lock is held */ 1617/* Assumes rq->lock is held */
@@ -1863,4 +1846,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1863 rcu_read_unlock(); 1846 rcu_read_unlock();
1864} 1847}
1865#endif /* CONFIG_SCHED_DEBUG */ 1848#endif /* CONFIG_SCHED_DEBUG */
1866
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 331e01bcd02..87f9e36ea56 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -282,10 +282,10 @@ static inline void account_group_user_time(struct task_struct *tsk,
282 if (!cputimer->running) 282 if (!cputimer->running)
283 return; 283 return;
284 284
285 spin_lock(&cputimer->lock); 285 raw_spin_lock(&cputimer->lock);
286 cputimer->cputime.utime = 286 cputimer->cputime.utime =
287 cputime_add(cputimer->cputime.utime, cputime); 287 cputime_add(cputimer->cputime.utime, cputime);
288 spin_unlock(&cputimer->lock); 288 raw_spin_unlock(&cputimer->lock);
289} 289}
290 290
291/** 291/**
@@ -306,10 +306,10 @@ static inline void account_group_system_time(struct task_struct *tsk,
306 if (!cputimer->running) 306 if (!cputimer->running)
307 return; 307 return;
308 308
309 spin_lock(&cputimer->lock); 309 raw_spin_lock(&cputimer->lock);
310 cputimer->cputime.stime = 310 cputimer->cputime.stime =
311 cputime_add(cputimer->cputime.stime, cputime); 311 cputime_add(cputimer->cputime.stime, cputime);
312 spin_unlock(&cputimer->lock); 312 raw_spin_unlock(&cputimer->lock);
313} 313}
314 314
315/** 315/**
@@ -330,7 +330,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
330 if (!cputimer->running) 330 if (!cputimer->running)
331 return; 331 return;
332 332
333 spin_lock(&cputimer->lock); 333 raw_spin_lock(&cputimer->lock);
334 cputimer->cputime.sum_exec_runtime += ns; 334 cputimer->cputime.sum_exec_runtime += ns;
335 spin_unlock(&cputimer->lock); 335 raw_spin_unlock(&cputimer->lock);
336} 336}
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 6f437632afa..8b44e7fa7fb 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
34static void 34static void
35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 inc_nr_running(rq);
37} 38}
38 39
39static void 40static void
40dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
41{ 42{
43 dec_nr_running(rq);
42} 44}
43 45
44static void yield_task_stop(struct rq *rq) 46static void yield_task_stop(struct rq *rq)
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 94a62c0d4ad..60636a4e25c 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -27,7 +27,7 @@
27 27
28#include <linux/compiler.h> 28#include <linux/compiler.h>
29#include <linux/kernel.h> 29#include <linux/kernel.h>
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/semaphore.h> 32#include <linux/semaphore.h>
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
@@ -54,12 +54,12 @@ void down(struct semaphore *sem)
54{ 54{
55 unsigned long flags; 55 unsigned long flags;
56 56
57 spin_lock_irqsave(&sem->lock, flags); 57 raw_spin_lock_irqsave(&sem->lock, flags);
58 if (likely(sem->count > 0)) 58 if (likely(sem->count > 0))
59 sem->count--; 59 sem->count--;
60 else 60 else
61 __down(sem); 61 __down(sem);
62 spin_unlock_irqrestore(&sem->lock, flags); 62 raw_spin_unlock_irqrestore(&sem->lock, flags);
63} 63}
64EXPORT_SYMBOL(down); 64EXPORT_SYMBOL(down);
65 65
@@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem)
77 unsigned long flags; 77 unsigned long flags;
78 int result = 0; 78 int result = 0;
79 79
80 spin_lock_irqsave(&sem->lock, flags); 80 raw_spin_lock_irqsave(&sem->lock, flags);
81 if (likely(sem->count > 0)) 81 if (likely(sem->count > 0))
82 sem->count--; 82 sem->count--;
83 else 83 else
84 result = __down_interruptible(sem); 84 result = __down_interruptible(sem);
85 spin_unlock_irqrestore(&sem->lock, flags); 85 raw_spin_unlock_irqrestore(&sem->lock, flags);
86 86
87 return result; 87 return result;
88} 88}
@@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem)
103 unsigned long flags; 103 unsigned long flags;
104 int result = 0; 104 int result = 0;
105 105
106 spin_lock_irqsave(&sem->lock, flags); 106 raw_spin_lock_irqsave(&sem->lock, flags);
107 if (likely(sem->count > 0)) 107 if (likely(sem->count > 0))
108 sem->count--; 108 sem->count--;
109 else 109 else
110 result = __down_killable(sem); 110 result = __down_killable(sem);
111 spin_unlock_irqrestore(&sem->lock, flags); 111 raw_spin_unlock_irqrestore(&sem->lock, flags);
112 112
113 return result; 113 return result;
114} 114}
@@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem)
132 unsigned long flags; 132 unsigned long flags;
133 int count; 133 int count;
134 134
135 spin_lock_irqsave(&sem->lock, flags); 135 raw_spin_lock_irqsave(&sem->lock, flags);
136 count = sem->count - 1; 136 count = sem->count - 1;
137 if (likely(count >= 0)) 137 if (likely(count >= 0))
138 sem->count = count; 138 sem->count = count;
139 spin_unlock_irqrestore(&sem->lock, flags); 139 raw_spin_unlock_irqrestore(&sem->lock, flags);
140 140
141 return (count < 0); 141 return (count < 0);
142} 142}
@@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies)
157 unsigned long flags; 157 unsigned long flags;
158 int result = 0; 158 int result = 0;
159 159
160 spin_lock_irqsave(&sem->lock, flags); 160 raw_spin_lock_irqsave(&sem->lock, flags);
161 if (likely(sem->count > 0)) 161 if (likely(sem->count > 0))
162 sem->count--; 162 sem->count--;
163 else 163 else
164 result = __down_timeout(sem, jiffies); 164 result = __down_timeout(sem, jiffies);
165 spin_unlock_irqrestore(&sem->lock, flags); 165 raw_spin_unlock_irqrestore(&sem->lock, flags);
166 166
167 return result; 167 return result;
168} 168}
@@ -179,12 +179,12 @@ void up(struct semaphore *sem)
179{ 179{
180 unsigned long flags; 180 unsigned long flags;
181 181
182 spin_lock_irqsave(&sem->lock, flags); 182 raw_spin_lock_irqsave(&sem->lock, flags);
183 if (likely(list_empty(&sem->wait_list))) 183 if (likely(list_empty(&sem->wait_list)))
184 sem->count++; 184 sem->count++;
185 else 185 else
186 __up(sem); 186 __up(sem);
187 spin_unlock_irqrestore(&sem->lock, flags); 187 raw_spin_unlock_irqrestore(&sem->lock, flags);
188} 188}
189EXPORT_SYMBOL(up); 189EXPORT_SYMBOL(up);
190 190
@@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
217 if (timeout <= 0) 217 if (timeout <= 0)
218 goto timed_out; 218 goto timed_out;
219 __set_task_state(task, state); 219 __set_task_state(task, state);
220 spin_unlock_irq(&sem->lock); 220 raw_spin_unlock_irq(&sem->lock);
221 timeout = schedule_timeout(timeout); 221 timeout = schedule_timeout(timeout);
222 spin_lock_irq(&sem->lock); 222 raw_spin_lock_irq(&sem->lock);
223 if (waiter.up) 223 if (waiter.up)
224 return 0; 224 return 0;
225 } 225 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 291c9700be7..b3f78d09a10 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -11,7 +11,7 @@
11 */ 11 */
12 12
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
@@ -1344,13 +1344,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1344 return error; 1344 return error;
1345} 1345}
1346 1346
1347static int kill_as_cred_perm(const struct cred *cred,
1348 struct task_struct *target)
1349{
1350 const struct cred *pcred = __task_cred(target);
1351 if (cred->user_ns != pcred->user_ns)
1352 return 0;
1353 if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
1354 cred->uid != pcred->suid && cred->uid != pcred->uid)
1355 return 0;
1356 return 1;
1357}
1358
1347/* like kill_pid_info(), but doesn't use uid/euid of "current" */ 1359/* like kill_pid_info(), but doesn't use uid/euid of "current" */
1348int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, 1360int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
1349 uid_t uid, uid_t euid, u32 secid) 1361 const struct cred *cred, u32 secid)
1350{ 1362{
1351 int ret = -EINVAL; 1363 int ret = -EINVAL;
1352 struct task_struct *p; 1364 struct task_struct *p;
1353 const struct cred *pcred;
1354 unsigned long flags; 1365 unsigned long flags;
1355 1366
1356 if (!valid_signal(sig)) 1367 if (!valid_signal(sig))
@@ -1362,10 +1373,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1362 ret = -ESRCH; 1373 ret = -ESRCH;
1363 goto out_unlock; 1374 goto out_unlock;
1364 } 1375 }
1365 pcred = __task_cred(p); 1376 if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
1366 if (si_fromuser(info) &&
1367 euid != pcred->suid && euid != pcred->uid &&
1368 uid != pcred->suid && uid != pcred->uid) {
1369 ret = -EPERM; 1377 ret = -EPERM;
1370 goto out_unlock; 1378 goto out_unlock;
1371 } 1379 }
@@ -1384,7 +1392,7 @@ out_unlock:
1384 rcu_read_unlock(); 1392 rcu_read_unlock();
1385 return ret; 1393 return ret;
1386} 1394}
1387EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1395EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
1388 1396
1389/* 1397/*
1390 * kill_something_info() interprets pid in interesting ways just like kill(2). 1398 * kill_something_info() interprets pid in interesting ways just like kill(2).
diff --git a/kernel/smp.c b/kernel/smp.c
index fb67dfa8394..db197d60489 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -6,7 +6,7 @@
6#include <linux/rcupdate.h> 6#include <linux/rcupdate.h>
7#include <linux/rculist.h> 7#include <linux/rculist.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fca82c32042..2c71d91efff 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -10,7 +10,7 @@
10 * Remote softirq infrastructure is by Jens Axboe. 10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 11 */
12 12
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/init.h> 16#include <linux/init.h>
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index be6517fb9c1..84c7d96918b 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -19,7 +19,7 @@
19#include <linux/spinlock.h> 19#include <linux/spinlock.h>
20#include <linux/interrupt.h> 20#include <linux/interrupt.h>
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/export.h>
23 23
24/* 24/*
25 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 73ce23feaea..0febf61e1aa 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -24,7 +24,7 @@
24 * 24 *
25 */ 25 */
26 26
27#include <linux/module.h> 27#include <linux/export.h>
28#include <linux/mutex.h> 28#include <linux/mutex.h>
29#include <linux/percpu.h> 29#include <linux/percpu.h>
30#include <linux/preempt.h> 30#include <linux/preempt.h>
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index d20c6983aad..00fe55cc5a8 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -7,7 +7,7 @@
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
13 13
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ba5070ce576..2f194e96571 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -12,7 +12,7 @@
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
@@ -41,6 +41,7 @@ struct cpu_stopper {
41}; 41};
42 42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44static bool stop_machine_initialized = false;
44 45
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{ 47{
@@ -386,6 +387,8 @@ static int __init cpu_stop_init(void)
386 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 387 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
387 register_cpu_notifier(&cpu_stop_cpu_notifier); 388 register_cpu_notifier(&cpu_stop_cpu_notifier);
388 389
390 stop_machine_initialized = true;
391
389 return 0; 392 return 0;
390} 393}
391early_initcall(cpu_stop_init); 394early_initcall(cpu_stop_init);
@@ -485,6 +488,25 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
485 .num_threads = num_online_cpus(), 488 .num_threads = num_online_cpus(),
486 .active_cpus = cpus }; 489 .active_cpus = cpus };
487 490
491 if (!stop_machine_initialized) {
492 /*
493 * Handle the case where stop_machine() is called
494 * early in boot before stop_machine() has been
495 * initialized.
496 */
497 unsigned long flags;
498 int ret;
499
500 WARN_ON_ONCE(smdata.num_threads != 1);
501
502 local_irq_save(flags);
503 hard_irq_disable();
504 ret = (*fn)(data);
505 local_irq_restore(flags);
506
507 return ret;
508 }
509
488 /* Set the initial state and stop all online cpus. */ 510 /* Set the initial state and stop all online cpus. */
489 set_state(&smdata, STOPMACHINE_PREPARE); 511 set_state(&smdata, STOPMACHINE_PREPARE);
490 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 512 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
diff --git a/kernel/sys.c b/kernel/sys.c
index a101ba36c44..481611fbd07 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
@@ -12,6 +12,7 @@
12#include <linux/prctl.h> 12#include <linux/prctl.h>
13#include <linux/highuid.h> 13#include <linux/highuid.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/kmod.h>
15#include <linux/perf_event.h> 16#include <linux/perf_event.h>
16#include <linux/resource.h> 17#include <linux/resource.h>
17#include <linux/kernel.h> 18#include <linux/kernel.h>
@@ -37,6 +38,8 @@
37#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
38#include <linux/gfp.h> 39#include <linux/gfp.h>
39#include <linux/syscore_ops.h> 40#include <linux/syscore_ops.h>
41#include <linux/version.h>
42#include <linux/ctype.h>
40 43
41#include <linux/compat.h> 44#include <linux/compat.h>
42#include <linux/syscalls.h> 45#include <linux/syscalls.h>
@@ -44,6 +47,8 @@
44#include <linux/user_namespace.h> 47#include <linux/user_namespace.h>
45 48
46#include <linux/kmsg_dump.h> 49#include <linux/kmsg_dump.h>
50/* Move somewhere else to avoid recompiling? */
51#include <generated/utsrelease.h>
47 52
48#include <asm/uaccess.h> 53#include <asm/uaccess.h>
49#include <asm/io.h> 54#include <asm/io.h>
@@ -621,11 +626,18 @@ static int set_user(struct cred *new)
621 if (!new_user) 626 if (!new_user)
622 return -EAGAIN; 627 return -EAGAIN;
623 628
629 /*
630 * We don't fail in case of NPROC limit excess here because too many
631 * poorly written programs don't check set*uid() return code, assuming
632 * it never fails if called by root. We may still enforce NPROC limit
633 * for programs doing set*uid()+execve() by harmlessly deferring the
634 * failure to the execve() stage.
635 */
624 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && 636 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
625 new_user != INIT_USER) { 637 new_user != INIT_USER)
626 free_uid(new_user); 638 current->flags |= PF_NPROC_EXCEEDED;
627 return -EAGAIN; 639 else
628 } 640 current->flags &= ~PF_NPROC_EXCEEDED;
629 641
630 free_uid(new->user); 642 free_uid(new->user);
631 new->user = new_user; 643 new->user = new_user;
@@ -1154,6 +1166,34 @@ DECLARE_RWSEM(uts_sem);
1154#define override_architecture(name) 0 1166#define override_architecture(name) 0
1155#endif 1167#endif
1156 1168
1169/*
1170 * Work around broken programs that cannot handle "Linux 3.0".
1171 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1172 */
1173static int override_release(char __user *release, int len)
1174{
1175 int ret = 0;
1176 char buf[65];
1177
1178 if (current->personality & UNAME26) {
1179 char *rest = UTS_RELEASE;
1180 int ndots = 0;
1181 unsigned v;
1182
1183 while (*rest) {
1184 if (*rest == '.' && ++ndots >= 3)
1185 break;
1186 if (!isdigit(*rest) && *rest != '.')
1187 break;
1188 rest++;
1189 }
1190 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
1191 snprintf(buf, len, "2.6.%u%s", v, rest);
1192 ret = copy_to_user(release, buf, len);
1193 }
1194 return ret;
1195}
1196
1157SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1197SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1158{ 1198{
1159 int errno = 0; 1199 int errno = 0;
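The UNAME26 mapping added above can be exercised outside the kernel. This user-space sketch copies the scan loop from override_release(); the sample release strings and version codes are made up for illustration:

/* Map "3.x[extra]" to "2.6.(40+x)[extra]", mirroring override_release(). */
#include <ctype.h>
#include <stdio.h>

static void map_release(const char *release, unsigned version_code,
			char *buf, int len)
{
	const char *rest = release;
	int ndots = 0;
	unsigned v;

	while (*rest) {
		if (*rest == '.' && ++ndots >= 3)
			break;
		if (!isdigit((unsigned char)*rest) && *rest != '.')
			break;
		rest++;
	}
	v = ((version_code >> 8) & 0xff) + 40;
	snprintf(buf, len, "2.6.%u%s", v, rest);
}

int main(void)
{
	char buf[65];

	/* 3.0 (version code 0x030000): the comment's own example */
	map_release("3.0", (3 << 16) | (0 << 8), buf, sizeof(buf));
	printf("3.0       -> %s\n", buf);	/* 2.6.40 */

	/* a -rc suffix survives because the scan stops at '-' */
	map_release("3.1.0-rc4", (3 << 16) | (1 << 8), buf, sizeof(buf));
	printf("3.1.0-rc4 -> %s\n", buf);	/* 2.6.41-rc4 */
	return 0;
}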
@@ -1163,6 +1203,8 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1163 errno = -EFAULT; 1203 errno = -EFAULT;
1164 up_read(&uts_sem); 1204 up_read(&uts_sem);
1165 1205
1206 if (!errno && override_release(name->release, sizeof(name->release)))
1207 errno = -EFAULT;
1166 if (!errno && override_architecture(name)) 1208 if (!errno && override_architecture(name))
1167 errno = -EFAULT; 1209 errno = -EFAULT;
1168 return errno; 1210 return errno;
@@ -1184,6 +1226,8 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1184 error = -EFAULT; 1226 error = -EFAULT;
1185 up_read(&uts_sem); 1227 up_read(&uts_sem);
1186 1228
1229 if (!error && override_release(name->release, sizeof(name->release)))
1230 error = -EFAULT;
1187 if (!error && override_architecture(name)) 1231 if (!error && override_architecture(name))
1188 error = -EFAULT; 1232 error = -EFAULT;
1189 return error; 1233 return error;
@@ -1218,6 +1262,8 @@ SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1218 1262
1219 if (!error && override_architecture(name)) 1263 if (!error && override_architecture(name))
1220 error = -EFAULT; 1264 error = -EFAULT;
1265 if (!error && override_release(name->release, sizeof(name->release)))
1266 error = -EFAULT;
1221 return error ? -EFAULT : 0; 1267 return error ? -EFAULT : 0;
1222} 1268}
1223#endif 1269#endif
@@ -1241,6 +1287,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1241 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1287 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1242 errno = 0; 1288 errno = 0;
1243 } 1289 }
1290 uts_proc_notify(UTS_PROC_HOSTNAME);
1244 up_write(&uts_sem); 1291 up_write(&uts_sem);
1245 return errno; 1292 return errno;
1246} 1293}
@@ -1291,6 +1338,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1291 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1338 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1292 errno = 0; 1339 errno = 0;
1293 } 1340 }
1341 uts_proc_notify(UTS_PROC_DOMAINNAME);
1294 up_write(&uts_sem); 1342 up_write(&uts_sem);
1295 return errno; 1343 return errno;
1296} 1344}
@@ -1714,6 +1762,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1714 sizeof(me->comm) - 1) < 0) 1762 sizeof(me->comm) - 1) < 0)
1715 return -EFAULT; 1763 return -EFAULT;
1716 set_task_comm(me, comm); 1764 set_task_comm(me, comm);
1765 proc_comm_connector(me);
1717 return 0; 1766 return 0;
1718 case PR_GET_NAME: 1767 case PR_GET_NAME:
1719 get_task_comm(comm, me); 1768 get_task_comm(comm, me);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 62cbc8877fe..47bfa16430d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -16,7 +16,6 @@ asmlinkage long sys_ni_syscall(void)
16 return -ENOSYS; 16 return -ENOSYS;
17} 17}
18 18
19cond_syscall(sys_nfsservctl);
20cond_syscall(sys_quotactl); 19cond_syscall(sys_quotactl);
21cond_syscall(sys32_quotactl); 20cond_syscall(sys32_quotactl);
22cond_syscall(sys_acct); 21cond_syscall(sys_acct);
@@ -146,6 +145,10 @@ cond_syscall(sys_io_submit);
146cond_syscall(sys_io_cancel); 145cond_syscall(sys_io_cancel);
147cond_syscall(sys_io_getevents); 146cond_syscall(sys_io_getevents);
148cond_syscall(sys_syslog); 147cond_syscall(sys_syslog);
148cond_syscall(sys_process_vm_readv);
149cond_syscall(sys_process_vm_writev);
150cond_syscall(compat_sys_process_vm_readv);
151cond_syscall(compat_sys_process_vm_writev);
149 152
150/* arch-specific weak syscall entries */ 153/* arch-specific weak syscall entries */
151cond_syscall(sys_pciconfig_read); 154cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b531e5..ae271964385 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -57,6 +57,7 @@
57#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
58#include <linux/oom.h> 58#include <linux/oom.h>
59#include <linux/kmod.h> 59#include <linux/kmod.h>
60#include <linux/capability.h>
60 61
61#include <asm/uaccess.h> 62#include <asm/uaccess.h>
62#include <asm/processor.h> 63#include <asm/processor.h>
@@ -134,6 +135,7 @@ static int minolduid;
134static int min_percpu_pagelist_fract = 8; 135static int min_percpu_pagelist_fract = 8;
135 136
136static int ngroups_max = NGROUPS_MAX; 137static int ngroups_max = NGROUPS_MAX;
138static const int cap_last_cap = CAP_LAST_CAP;
137 139
138#ifdef CONFIG_INOTIFY_USER 140#ifdef CONFIG_INOTIFY_USER
139#include <linux/inotify.h> 141#include <linux/inotify.h>
@@ -151,14 +153,6 @@ extern int pwrsw_enabled;
151extern int unaligned_enabled; 153extern int unaligned_enabled;
152#endif 154#endif
153 155
154#ifdef CONFIG_S390
155#ifdef CONFIG_MATHEMU
156extern int sysctl_ieee_emulation_warnings;
157#endif
158extern int sysctl_userprocess_debug;
159extern int spin_retry;
160#endif
161
162#ifdef CONFIG_IA64 156#ifdef CONFIG_IA64
163extern int no_unaligned_warning; 157extern int no_unaligned_warning;
164extern int unaligned_dump_stack; 158extern int unaligned_dump_stack;
@@ -379,6 +373,16 @@ static struct ctl_table kern_table[] = {
379 .extra2 = &one, 373 .extra2 = &one,
380 }, 374 },
381#endif 375#endif
376#ifdef CONFIG_CFS_BANDWIDTH
377 {
378 .procname = "sched_cfs_bandwidth_slice_us",
379 .data = &sysctl_sched_cfs_bandwidth_slice,
380 .maxlen = sizeof(unsigned int),
381 .mode = 0644,
382 .proc_handler = proc_dointvec_minmax,
383 .extra1 = &one,
384 },
385#endif
382#ifdef CONFIG_PROVE_LOCKING 386#ifdef CONFIG_PROVE_LOCKING
383 { 387 {
384 .procname = "prove_locking", 388 .procname = "prove_locking",
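Since the sched_cfs_bandwidth_slice_us entry above lives in kern_table, it should surface under the kernel sysctl directory on CONFIG_CFS_BANDWIDTH kernels. A minimal sketch for reading it from user space, assuming the conventional /proc/sys/kernel path:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/sched_cfs_bandwidth_slice_us", "r");
	unsigned int slice_us;

	if (!f) {
		perror("sched_cfs_bandwidth_slice_us");
		return 1;
	}
	if (fscanf(f, "%u", &slice_us) == 1)
		printf("CFS bandwidth slice: %u us\n", slice_us);
	fclose(f);
	return 0;
}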
@@ -730,6 +734,13 @@ static struct ctl_table kern_table[] = {
730 .mode = 0444, 734 .mode = 0444,
731 .proc_handler = proc_dointvec, 735 .proc_handler = proc_dointvec,
732 }, 736 },
737 {
738 .procname = "cap_last_cap",
739 .data = (void *)&cap_last_cap,
740 .maxlen = sizeof(int),
741 .mode = 0444,
742 .proc_handler = proc_dointvec,
743 },
733#if defined(CONFIG_LOCKUP_DETECTOR) 744#if defined(CONFIG_LOCKUP_DETECTOR)
734 { 745 {
735 .procname = "watchdog", 746 .procname = "watchdog",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 3b8e028b960..6318b511afa 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1,6 +1,6 @@
1#include <linux/stat.h> 1#include <linux/stat.h>
2#include <linux/sysctl.h> 2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
@@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, 214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, 215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, 216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
217 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, 217 /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, 218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, 219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, 220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 4e4932a7b36..362da653813 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1,6 +1,6 @@
1#include <linux/stat.h> 1#include <linux/stat.h>
2#include <linux/sysctl.h> 2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e19ce1454ee..e66046456f4 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -655,6 +655,7 @@ static struct genl_ops taskstats_ops = {
655 .cmd = TASKSTATS_CMD_GET, 655 .cmd = TASKSTATS_CMD_GET,
656 .doit = taskstats_user_cmd, 656 .doit = taskstats_user_cmd,
657 .policy = taskstats_cmd_get_policy, 657 .policy = taskstats_cmd_get_policy,
658 .flags = GENL_ADMIN_PERM,
658}; 659};
659 660
660static struct genl_ops cgroupstats_ops = { 661static struct genl_ops cgroupstats_ops = {
diff --git a/kernel/time.c b/kernel/time.c
index d7760621452..73e416db0a1 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -27,7 +27,7 @@
27 * with nanosecond accuracy 27 * with nanosecond accuracy
28 */ 28 */
29 29
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/clocksource.h> 33#include <linux/clocksource.h>
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index c340ca658f3..ce033c7aa2e 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -18,6 +18,7 @@
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/export.h>
21#include <linux/file.h> 22#include <linux/file.h>
22#include <linux/posix-clock.h> 23#include <linux/posix-clock.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7e2e0817cbf..40420644d0b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now)
139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
140 unsigned long flags; 140 unsigned long flags;
141 141
142 cpumask_clear_cpu(cpu, nohz_cpu_mask);
143 ts->idle_waketime = now; 142 ts->idle_waketime = now;
144 143
145 local_irq_save(flags); 144 local_irq_save(flags);
@@ -418,9 +417,6 @@ void tick_nohz_stop_sched_tick(int inidle)
418 else 417 else
419 expires.tv64 = KTIME_MAX; 418 expires.tv64 = KTIME_MAX;
420 419
421 if (delta_jiffies > 1)
422 cpumask_set_cpu(cpu, nohz_cpu_mask);
423
424 /* Skip reprogram of event if it's not changed */ 420 /* Skip reprogram of event if it's not changed */

425 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 421 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
426 goto out; 422 goto out;
@@ -470,7 +466,6 @@ void tick_nohz_stop_sched_tick(int inidle)
470 * softirq. 466 * softirq.
471 */ 467 */
472 tick_do_update_jiffies64(ktime_get()); 468 tick_do_update_jiffies64(ktime_get());
473 cpumask_clear_cpu(cpu, nohz_cpu_mask);
474 } 469 }
475 raise_softirq_irqoff(TIMER_SOFTIRQ); 470 raise_softirq_irqoff(TIMER_SOFTIRQ);
476out: 471out:
@@ -553,7 +548,6 @@ void tick_nohz_restart_sched_tick(void)
553 /* Update jiffies first */ 548 /* Update jiffies first */
554 select_nohz_load_balancer(0); 549 select_nohz_load_balancer(0);
555 tick_do_update_jiffies64(now); 550 tick_do_update_jiffies64(now);
556 cpumask_clear_cpu(cpu, nohz_cpu_mask);
557 551
558#ifndef CONFIG_VIRT_CPU_ACCOUNTING 552#ifndef CONFIG_VIRT_CPU_ACCOUNTING
559 /* 553 /*
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index a5d0a3a85dd..0b537f27b55 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -81,7 +81,7 @@ struct entry {
81/* 81/*
82 * Spinlock protecting the tables - not taken during lookup: 82 * Spinlock protecting the tables - not taken during lookup:
83 */ 83 */
84static DEFINE_SPINLOCK(table_lock); 84static DEFINE_RAW_SPINLOCK(table_lock);
85 85
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
@@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
188 prev = NULL; 188 prev = NULL;
189 curr = *head; 189 curr = *head;
190 190
191 spin_lock(&table_lock); 191 raw_spin_lock(&table_lock);
192 /* 192 /*
193 * Make sure we have not raced with another CPU: 193 * Make sure we have not raced with another CPU:
194 */ 194 */
@@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
215 *head = curr; 215 *head = curr;
216 } 216 }
217 out_unlock: 217 out_unlock:
218 spin_unlock(&table_lock); 218 raw_spin_unlock(&table_lock);
219 219
220 return curr; 220 return curr;
221} 221}
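The timer_stats change is the usual mechanical raw-spinlock conversion: DEFINE_SPINLOCK becomes DEFINE_RAW_SPINLOCK and every lock/unlock on table_lock switches to the raw_ variants, so the lock keeps spinning even on configurations where ordinary spinlocks may sleep. A minimal sketch of the pattern on hypothetical data:

/* Sketch of the raw-spinlock conversion pattern (hypothetical data). */
struct example_entry { unsigned long count; };
static DEFINE_RAW_SPINLOCK(example_lock);

static void example_update(struct example_entry *e)
{
	raw_spin_lock(&example_lock);	/* was: spin_lock(&example_lock) */
	e->count++;
	raw_spin_unlock(&example_lock);	/* was: spin_unlock(&example_lock) */
}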
diff --git a/kernel/timer.c b/kernel/timer.c
index 8cff36119e4..dbaa62422b1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel_stat.h> 22#include <linux/kernel_stat.h>
23#include <linux/module.h> 23#include <linux/export.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/init.h> 26#include <linux/init.h>
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2ad39e556cb..cd3134510f3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -82,7 +82,7 @@ config EVENT_POWER_TRACING_DEPRECATED
82 power:power_frequency 82 power:power_frequency
83 This is for userspace compatibility 83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations, 84 and will vanish after 5 kernel iterations,
85 namely 2.6.41. 85 namely 3.1.
86 86
87config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
88 bool 88 bool
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c..5f39a07fe5e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif 16endif
17 17
18CFLAGS_trace_events_filter.o := -I$(src)
19
18# 20#
19# Make the trace clocks available generally: it's infrastructure 21# Make the trace clocks available generally: it's infrastructure
20# relied on by ptrace for example: 22# relied on by ptrace for example:
@@ -53,6 +55,9 @@ endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_TRACEPOINTS) += power-traces.o 57obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM_RUNTIME),y)
59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
60endif
56ifeq ($(CONFIG_TRACING),y) 61ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 62obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 63endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 6957aa298df..16fc34a0806 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,6 +23,7 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/export.h>
26#include <linux/time.h> 27#include <linux/time.h>
27#include <linux/uaccess.h> 28#include <linux/uaccess.h>
28 29
@@ -206,6 +207,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
206 what |= MASK_TC_BIT(rw, RAHEAD); 207 what |= MASK_TC_BIT(rw, RAHEAD);
207 what |= MASK_TC_BIT(rw, META); 208 what |= MASK_TC_BIT(rw, META);
208 what |= MASK_TC_BIT(rw, DISCARD); 209 what |= MASK_TC_BIT(rw, DISCARD);
210 what |= MASK_TC_BIT(rw, FLUSH);
211 what |= MASK_TC_BIT(rw, FUA);
209 212
210 pid = tsk->pid; 213 pid = tsk->pid;
211 if (act_log_check(bt, what, sector, pid)) 214 if (act_log_check(bt, what, sector, pid))
@@ -1054,6 +1057,9 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
1054 goto out; 1057 goto out;
1055 } 1058 }
1056 1059
1060 if (tc & BLK_TC_FLUSH)
1061 rwbs[i++] = 'F';
1062
1057 if (tc & BLK_TC_DISCARD) 1063 if (tc & BLK_TC_DISCARD)
1058 rwbs[i++] = 'D'; 1064 rwbs[i++] = 'D';
1059 else if (tc & BLK_TC_WRITE) 1065 else if (tc & BLK_TC_WRITE)
@@ -1063,10 +1069,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
1063 else 1069 else
1064 rwbs[i++] = 'N'; 1070 rwbs[i++] = 'N';
1065 1071
1072 if (tc & BLK_TC_FUA)
1073 rwbs[i++] = 'F';
1066 if (tc & BLK_TC_AHEAD) 1074 if (tc & BLK_TC_AHEAD)
1067 rwbs[i++] = 'A'; 1075 rwbs[i++] = 'A';
1068 if (tc & BLK_TC_BARRIER)
1069 rwbs[i++] = 'B';
1070 if (tc & BLK_TC_SYNC) 1076 if (tc & BLK_TC_SYNC)
1071 rwbs[i++] = 'S'; 1077 rwbs[i++] = 'S';
1072 if (tc & BLK_TC_META) 1078 if (tc & BLK_TC_META)
@@ -1132,7 +1138,7 @@ typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1132 1138
1133static int blk_log_action_classic(struct trace_iterator *iter, const char *act) 1139static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1134{ 1140{
1135 char rwbs[6]; 1141 char rwbs[RWBS_LEN];
1136 unsigned long long ts = iter->ts; 1142 unsigned long long ts = iter->ts;
1137 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); 1143 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1138 unsigned secs = (unsigned long)ts; 1144 unsigned secs = (unsigned long)ts;
@@ -1148,7 +1154,7 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1148 1154
1149static int blk_log_action(struct trace_iterator *iter, const char *act) 1155static int blk_log_action(struct trace_iterator *iter, const char *act)
1150{ 1156{
1151 char rwbs[6]; 1157 char rwbs[RWBS_LEN];
1152 const struct blk_io_trace *t = te_blk_io_trace(iter->ent); 1158 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1153 1159
1154 fill_rwbs(rwbs, t); 1160 fill_rwbs(rwbs, t);
@@ -1561,7 +1567,7 @@ static const struct {
1561} mask_maps[] = { 1567} mask_maps[] = {
1562 { BLK_TC_READ, "read" }, 1568 { BLK_TC_READ, "read" },
1563 { BLK_TC_WRITE, "write" }, 1569 { BLK_TC_WRITE, "write" },
1564 { BLK_TC_BARRIER, "barrier" }, 1570 { BLK_TC_FLUSH, "flush" },
1565 { BLK_TC_SYNC, "sync" }, 1571 { BLK_TC_SYNC, "sync" },
1566 { BLK_TC_QUEUE, "queue" }, 1572 { BLK_TC_QUEUE, "queue" },
1567 { BLK_TC_REQUEUE, "requeue" }, 1573 { BLK_TC_REQUEUE, "requeue" },
@@ -1573,6 +1579,7 @@ static const struct {
1573 { BLK_TC_META, "meta" }, 1579 { BLK_TC_META, "meta" },
1574 { BLK_TC_DISCARD, "discard" }, 1580 { BLK_TC_DISCARD, "discard" },
1575 { BLK_TC_DRV_DATA, "drv_data" }, 1581 { BLK_TC_DRV_DATA, "drv_data" },
1582 { BLK_TC_FUA, "fua" },
1576}; 1583};
1577 1584
1578static int blk_trace_str2mask(const char *str) 1585static int blk_trace_str2mask(const char *str)
@@ -1788,6 +1795,9 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1788{ 1795{
1789 int i = 0; 1796 int i = 0;
1790 1797
1798 if (rw & REQ_FLUSH)
1799 rwbs[i++] = 'F';
1800
1791 if (rw & WRITE) 1801 if (rw & WRITE)
1792 rwbs[i++] = 'W'; 1802 rwbs[i++] = 'W';
1793 else if (rw & REQ_DISCARD) 1803 else if (rw & REQ_DISCARD)
@@ -1797,6 +1807,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1797 else 1807 else
1798 rwbs[i++] = 'N'; 1808 rwbs[i++] = 'N';
1799 1809
1810 if (rw & REQ_FUA)
1811 rwbs[i++] = 'F';
1800 if (rw & REQ_RAHEAD) 1812 if (rw & REQ_RAHEAD)
1801 rwbs[i++] = 'A'; 1813 rwbs[i++] = 'A';
1802 if (rw & REQ_SYNC) 1814 if (rw & REQ_SYNC)
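After this change the rwbs string can carry 'F' in two positions: a leading 'F' for a flush and another 'F' right after the direction letter for FUA, while the old barrier 'B' flag disappears; the buffers in blk_log_action*() grow from 6 bytes to RWBS_LEN to make room. A standalone sketch of the resulting decode order (the X_* flag bits are stand-ins for illustration, not the kernel's REQ_* values):

#include <stdio.h>

/* Stand-in flag bits for illustration only; not the kernel's REQ_* values. */
#define X_FLUSH   (1u << 0)
#define X_WRITE   (1u << 1)
#define X_DISCARD (1u << 2)
#define X_FUA     (1u << 3)
#define X_RAHEAD  (1u << 4)
#define X_SYNC    (1u << 5)

static void fill_rwbs(char *rwbs, unsigned int rw, int bytes)
{
	int i = 0;

	if (rw & X_FLUSH)
		rwbs[i++] = 'F';	/* new: flush comes first */
	if (rw & X_WRITE)
		rwbs[i++] = 'W';
	else if (rw & X_DISCARD)
		rwbs[i++] = 'D';
	else if (bytes)
		rwbs[i++] = 'R';
	else
		rwbs[i++] = 'N';
	if (rw & X_FUA)
		rwbs[i++] = 'F';	/* new: FUA after the direction */
	if (rw & X_RAHEAD)
		rwbs[i++] = 'A';
	if (rw & X_SYNC)
		rwbs[i++] = 'S';
	rwbs[i] = '\0';
}

int main(void)
{
	char rwbs[8];

	fill_rwbs(rwbs, X_FLUSH | X_WRITE | X_FUA | X_SYNC, 4096);
	printf("%s\n", rwbs);	/* prints "FWFS" */
	return 0;
}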
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c3e4575e782..900b409543d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,6 +22,7 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/module.h>
25#include <linux/ftrace.h> 26#include <linux/ftrace.h>
26#include <linux/sysctl.h> 27#include <linux/sysctl.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
@@ -3863,6 +3864,14 @@ void ftrace_kill(void)
3863} 3864}
3864 3865
3865/** 3866/**
3867 * Test if ftrace is dead or not.
3868 */
3869int ftrace_is_dead(void)
3870{
3871 return ftrace_disabled;
3872}
3873
3874/**
3866 * register_ftrace_function - register a function for profiling 3875 * register_ftrace_function - register a function for profiling
3867 * @ops - ops structure that holds the function for profiling. 3876 * @ops - ops structure that holds the function for profiling.
3868 * 3877 *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 731201bf4ac..f5b7b5c1195 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -478,7 +478,7 @@ struct ring_buffer_per_cpu {
478 int cpu; 478 int cpu;
479 atomic_t record_disabled; 479 atomic_t record_disabled;
480 struct ring_buffer *buffer; 480 struct ring_buffer *buffer;
481 spinlock_t reader_lock; /* serialize readers */ 481 raw_spinlock_t reader_lock; /* serialize readers */
482 arch_spinlock_t lock; 482 arch_spinlock_t lock;
483 struct lock_class_key lock_key; 483 struct lock_class_key lock_key;
484 struct list_head *pages; 484 struct list_head *pages;
@@ -488,12 +488,14 @@ struct ring_buffer_per_cpu {
488 struct buffer_page *reader_page; 488 struct buffer_page *reader_page;
489 unsigned long lost_events; 489 unsigned long lost_events;
490 unsigned long last_overrun; 490 unsigned long last_overrun;
491 local_t entries_bytes;
491 local_t commit_overrun; 492 local_t commit_overrun;
492 local_t overrun; 493 local_t overrun;
493 local_t entries; 494 local_t entries;
494 local_t committing; 495 local_t committing;
495 local_t commits; 496 local_t commits;
496 unsigned long read; 497 unsigned long read;
498 unsigned long read_bytes;
497 u64 write_stamp; 499 u64 write_stamp;
498 u64 read_stamp; 500 u64 read_stamp;
499}; 501};
@@ -1062,7 +1064,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1062 1064
1063 cpu_buffer->cpu = cpu; 1065 cpu_buffer->cpu = cpu;
1064 cpu_buffer->buffer = buffer; 1066 cpu_buffer->buffer = buffer;
1065 spin_lock_init(&cpu_buffer->reader_lock); 1067 raw_spin_lock_init(&cpu_buffer->reader_lock);
1066 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1068 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1067 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1069 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1068 1070
@@ -1259,7 +1261,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1259 struct list_head *p; 1261 struct list_head *p;
1260 unsigned i; 1262 unsigned i;
1261 1263
1262 spin_lock_irq(&cpu_buffer->reader_lock); 1264 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1263 rb_head_page_deactivate(cpu_buffer); 1265 rb_head_page_deactivate(cpu_buffer);
1264 1266
1265 for (i = 0; i < nr_pages; i++) { 1267 for (i = 0; i < nr_pages; i++) {
@@ -1277,7 +1279,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1277 rb_check_pages(cpu_buffer); 1279 rb_check_pages(cpu_buffer);
1278 1280
1279out: 1281out:
1280 spin_unlock_irq(&cpu_buffer->reader_lock); 1282 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1281} 1283}
1282 1284
1283static void 1285static void
@@ -1288,7 +1290,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1288 struct list_head *p; 1290 struct list_head *p;
1289 unsigned i; 1291 unsigned i;
1290 1292
1291 spin_lock_irq(&cpu_buffer->reader_lock); 1293 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1292 rb_head_page_deactivate(cpu_buffer); 1294 rb_head_page_deactivate(cpu_buffer);
1293 1295
1294 for (i = 0; i < nr_pages; i++) { 1296 for (i = 0; i < nr_pages; i++) {
@@ -1303,7 +1305,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1303 rb_check_pages(cpu_buffer); 1305 rb_check_pages(cpu_buffer);
1304 1306
1305out: 1307out:
1306 spin_unlock_irq(&cpu_buffer->reader_lock); 1308 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1307} 1309}
1308 1310
1309/** 1311/**
@@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1708 * the counters. 1710 * the counters.
1709 */ 1711 */
1710 local_add(entries, &cpu_buffer->overrun); 1712 local_add(entries, &cpu_buffer->overrun);
1713 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1711 1714
1712 /* 1715 /*
1713 * The entries will be zeroed out when we move the 1716 * The entries will be zeroed out when we move the
@@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1863 event = __rb_page_index(tail_page, tail); 1866 event = __rb_page_index(tail_page, tail);
1864 kmemcheck_annotate_bitfield(event, bitfield); 1867 kmemcheck_annotate_bitfield(event, bitfield);
1865 1868
1869 /* account for padding bytes */
1870 local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
1871
1866 /* 1872 /*
1867 * Save the original length to the meta data. 1873 * Save the original length to the meta data.
1868 * This will be used by the reader to add lost event 1874 * This will be used by the reader to add lost event
@@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2054 if (!tail) 2060 if (!tail)
2055 tail_page->page->time_stamp = ts; 2061 tail_page->page->time_stamp = ts;
2056 2062
2063 /* account for these added bytes */
2064 local_add(length, &cpu_buffer->entries_bytes);
2065
2057 return event; 2066 return event;
2058} 2067}
2059 2068
@@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2076 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 2085 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
2077 unsigned long write_mask = 2086 unsigned long write_mask =
2078 local_read(&bpage->write) & ~RB_WRITE_MASK; 2087 local_read(&bpage->write) & ~RB_WRITE_MASK;
2088 unsigned long event_length = rb_event_length(event);
2079 /* 2089 /*
2080 * This is on the tail page. It is possible that 2090 * This is on the tail page. It is possible that
2081 * a write could come in and move the tail page 2091 * a write could come in and move the tail page
@@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2085 old_index += write_mask; 2095 old_index += write_mask;
2086 new_index += write_mask; 2096 new_index += write_mask;
2087 index = local_cmpxchg(&bpage->write, old_index, new_index); 2097 index = local_cmpxchg(&bpage->write, old_index, new_index);
2088 if (index == old_index) 2098 if (index == old_index) {
2099 /* update counters */
2100 local_sub(event_length, &cpu_buffer->entries_bytes);
2089 return 1; 2101 return 1;
2102 }
2090 } 2103 }
2091 2104
2092 /* could not discard */ 2105 /* could not discard */
@@ -2661,6 +2674,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2661} 2674}
2662 2675
2663/** 2676/**
2677 * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
2678 * @buffer: The ring buffer
2679 * @cpu: The per CPU buffer to read from.
2680 */
2681unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2682{
2683 unsigned long flags;
2684 struct ring_buffer_per_cpu *cpu_buffer;
2685 struct buffer_page *bpage;
2686 unsigned long ret;
2687
2688 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2689 return 0;
2690
2691 cpu_buffer = buffer->buffers[cpu];
2692 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2693 /*
2694 * if the tail is on reader_page, oldest time stamp is on the reader
2695 * page
2696 */
2697 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
2698 bpage = cpu_buffer->reader_page;
2699 else
2700 bpage = rb_set_head_page(cpu_buffer);
2701 ret = bpage->page->time_stamp;
2702 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2703
2704 return ret;
2705}
2706EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
2707
2708/**
2709 * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
2710 * @buffer: The ring buffer
2711 * @cpu: The per CPU buffer to read from.
2712 */
2713unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
2714{
2715 struct ring_buffer_per_cpu *cpu_buffer;
2716 unsigned long ret;
2717
2718 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2719 return 0;
2720
2721 cpu_buffer = buffer->buffers[cpu];
2722 ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
2723
2724 return ret;
2725}
2726EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
2727
2728/**
2664 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2729 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2665 * @buffer: The ring buffer 2730 * @buffer: The ring buffer
2666 * @cpu: The per CPU buffer to get the entries from. 2731 * @cpu: The per CPU buffer to get the entries from.
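ring_buffer_oldest_event_ts() reads the head (or reader) page's time stamp under the reader lock, and ring_buffer_bytes_cpu() reports entries_bytes minus read_bytes, i.e. the bytes still sitting in that CPU's buffer. A hedged sketch of a caller summing the per-CPU byte counts (not from this patch, just an illustration of the new API):

/* Sketch: total unread bytes across all CPUs of a ring buffer. */
static unsigned long example_total_bytes(struct ring_buffer *buffer)
{
	unsigned long bytes = 0;
	int cpu;

	for_each_online_cpu(cpu)
		bytes += ring_buffer_bytes_cpu(buffer, cpu);

	return bytes;
}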
@@ -2804,9 +2869,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
2804 2869
2805 cpu_buffer = iter->cpu_buffer; 2870 cpu_buffer = iter->cpu_buffer;
2806 2871
2807 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2872 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2808 rb_iter_reset(iter); 2873 rb_iter_reset(iter);
2809 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2874 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2810} 2875}
2811EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); 2876EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
2812 2877
@@ -3265,12 +3330,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3265 again: 3330 again:
3266 local_irq_save(flags); 3331 local_irq_save(flags);
3267 if (dolock) 3332 if (dolock)
3268 spin_lock(&cpu_buffer->reader_lock); 3333 raw_spin_lock(&cpu_buffer->reader_lock);
3269 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3334 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3270 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3335 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3271 rb_advance_reader(cpu_buffer); 3336 rb_advance_reader(cpu_buffer);
3272 if (dolock) 3337 if (dolock)
3273 spin_unlock(&cpu_buffer->reader_lock); 3338 raw_spin_unlock(&cpu_buffer->reader_lock);
3274 local_irq_restore(flags); 3339 local_irq_restore(flags);
3275 3340
3276 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3341 if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3295,9 +3360,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3295 unsigned long flags; 3360 unsigned long flags;
3296 3361
3297 again: 3362 again:
3298 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3363 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3299 event = rb_iter_peek(iter, ts); 3364 event = rb_iter_peek(iter, ts);
3300 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3365 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3301 3366
3302 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3367 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3303 goto again; 3368 goto again;
@@ -3337,7 +3402,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3337 cpu_buffer = buffer->buffers[cpu]; 3402 cpu_buffer = buffer->buffers[cpu];
3338 local_irq_save(flags); 3403 local_irq_save(flags);
3339 if (dolock) 3404 if (dolock)
3340 spin_lock(&cpu_buffer->reader_lock); 3405 raw_spin_lock(&cpu_buffer->reader_lock);
3341 3406
3342 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3407 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3343 if (event) { 3408 if (event) {
@@ -3346,7 +3411,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3346 } 3411 }
3347 3412
3348 if (dolock) 3413 if (dolock)
3349 spin_unlock(&cpu_buffer->reader_lock); 3414 raw_spin_unlock(&cpu_buffer->reader_lock);
3350 local_irq_restore(flags); 3415 local_irq_restore(flags);
3351 3416
3352 out: 3417 out:
@@ -3438,11 +3503,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
3438 3503
3439 cpu_buffer = iter->cpu_buffer; 3504 cpu_buffer = iter->cpu_buffer;
3440 3505
3441 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3506 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3442 arch_spin_lock(&cpu_buffer->lock); 3507 arch_spin_lock(&cpu_buffer->lock);
3443 rb_iter_reset(iter); 3508 rb_iter_reset(iter);
3444 arch_spin_unlock(&cpu_buffer->lock); 3509 arch_spin_unlock(&cpu_buffer->lock);
3445 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3510 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3446} 3511}
3447EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3512EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3448 3513
@@ -3477,7 +3542,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3477 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3542 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3478 unsigned long flags; 3543 unsigned long flags;
3479 3544
3480 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3545 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3481 again: 3546 again:
3482 event = rb_iter_peek(iter, ts); 3547 event = rb_iter_peek(iter, ts);
3483 if (!event) 3548 if (!event)
@@ -3488,7 +3553,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3488 3553
3489 rb_advance_iter(iter); 3554 rb_advance_iter(iter);
3490 out: 3555 out:
3491 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3556 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3492 3557
3493 return event; 3558 return event;
3494} 3559}
@@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3527 cpu_buffer->reader_page->read = 0; 3592 cpu_buffer->reader_page->read = 0;
3528 3593
3529 local_set(&cpu_buffer->commit_overrun, 0); 3594 local_set(&cpu_buffer->commit_overrun, 0);
3595 local_set(&cpu_buffer->entries_bytes, 0);
3530 local_set(&cpu_buffer->overrun, 0); 3596 local_set(&cpu_buffer->overrun, 0);
3531 local_set(&cpu_buffer->entries, 0); 3597 local_set(&cpu_buffer->entries, 0);
3532 local_set(&cpu_buffer->committing, 0); 3598 local_set(&cpu_buffer->committing, 0);
3533 local_set(&cpu_buffer->commits, 0); 3599 local_set(&cpu_buffer->commits, 0);
3534 cpu_buffer->read = 0; 3600 cpu_buffer->read = 0;
3601 cpu_buffer->read_bytes = 0;
3535 3602
3536 cpu_buffer->write_stamp = 0; 3603 cpu_buffer->write_stamp = 0;
3537 cpu_buffer->read_stamp = 0; 3604 cpu_buffer->read_stamp = 0;
@@ -3557,7 +3624,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3557 3624
3558 atomic_inc(&cpu_buffer->record_disabled); 3625 atomic_inc(&cpu_buffer->record_disabled);
3559 3626
3560 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3627 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3561 3628
3562 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3629 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3563 goto out; 3630 goto out;
@@ -3569,7 +3636,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3569 arch_spin_unlock(&cpu_buffer->lock); 3636 arch_spin_unlock(&cpu_buffer->lock);
3570 3637
3571 out: 3638 out:
3572 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3639 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3573 3640
3574 atomic_dec(&cpu_buffer->record_disabled); 3641 atomic_dec(&cpu_buffer->record_disabled);
3575} 3642}
@@ -3607,10 +3674,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)
3607 cpu_buffer = buffer->buffers[cpu]; 3674 cpu_buffer = buffer->buffers[cpu];
3608 local_irq_save(flags); 3675 local_irq_save(flags);
3609 if (dolock) 3676 if (dolock)
3610 spin_lock(&cpu_buffer->reader_lock); 3677 raw_spin_lock(&cpu_buffer->reader_lock);
3611 ret = rb_per_cpu_empty(cpu_buffer); 3678 ret = rb_per_cpu_empty(cpu_buffer);
3612 if (dolock) 3679 if (dolock)
3613 spin_unlock(&cpu_buffer->reader_lock); 3680 raw_spin_unlock(&cpu_buffer->reader_lock);
3614 local_irq_restore(flags); 3681 local_irq_restore(flags);
3615 3682
3616 if (!ret) 3683 if (!ret)
@@ -3641,10 +3708,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
3641 cpu_buffer = buffer->buffers[cpu]; 3708 cpu_buffer = buffer->buffers[cpu];
3642 local_irq_save(flags); 3709 local_irq_save(flags);
3643 if (dolock) 3710 if (dolock)
3644 spin_lock(&cpu_buffer->reader_lock); 3711 raw_spin_lock(&cpu_buffer->reader_lock);
3645 ret = rb_per_cpu_empty(cpu_buffer); 3712 ret = rb_per_cpu_empty(cpu_buffer);
3646 if (dolock) 3713 if (dolock)
3647 spin_unlock(&cpu_buffer->reader_lock); 3714 raw_spin_unlock(&cpu_buffer->reader_lock);
3648 local_irq_restore(flags); 3715 local_irq_restore(flags);
3649 3716
3650 return ret; 3717 return ret;
@@ -3841,7 +3908,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3841 if (!bpage) 3908 if (!bpage)
3842 goto out; 3909 goto out;
3843 3910
3844 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3911 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3845 3912
3846 reader = rb_get_reader_page(cpu_buffer); 3913 reader = rb_get_reader_page(cpu_buffer);
3847 if (!reader) 3914 if (!reader)
@@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3918 } else { 3985 } else {
3919 /* update the entry counter */ 3986 /* update the entry counter */
3920 cpu_buffer->read += rb_page_entries(reader); 3987 cpu_buffer->read += rb_page_entries(reader);
3988 cpu_buffer->read_bytes += BUF_PAGE_SIZE;
3921 3989
3922 /* swap the pages */ 3990 /* swap the pages */
3923 rb_init_page(bpage); 3991 rb_init_page(bpage);
@@ -3964,7 +4032,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3964 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); 4032 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
3965 4033
3966 out_unlock: 4034 out_unlock:
3967 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 4035 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3968 4036
3969 out: 4037 out:
3970 return ret; 4038 return ret;
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
new file mode 100644
index 00000000000..4b3b5eaf94d
--- /dev/null
+++ b/kernel/trace/rpm-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/usb.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/rpm.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int);
18EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle);
19EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend);
20EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
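This new file only instantiates the runtime-PM tracepoints declared in <trace/events/rpm.h> (CREATE_TRACE_POINTS must be defined in exactly one translation unit) and exports them so modular PM code can emit them. A hedged sketch of an emitter, assuming the (struct device *, int) prototype that the rpm_suspend event appears to declare in that header:

/* Sketch only: firing one of the exported tracepoints from runtime-PM code.
 * The (dev, rpmflags) argument list is an assumption taken from
 * <trace/events/rpm.h>, not something defined by this file.
 */
static int example_rpm_suspend(struct device *dev, int rpmflags)
{
	trace_rpm_suspend(dev, rpmflags);	/* no-op unless the event is enabled */
	/* ... the real suspend work would follow ... */
	return 0;
}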
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e5df02c69b1..f2bd275bb60 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -341,7 +341,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
342 342
343static int trace_stop_count; 343static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_RAW_SPINLOCK(tracing_start_lock);
345 345
346static void wakeup_work_handler(struct work_struct *work) 346static void wakeup_work_handler(struct work_struct *work)
347{ 347{
@@ -435,6 +435,7 @@ static struct {
435} trace_clocks[] = { 435} trace_clocks[] = {
436 { trace_clock_local, "local" }, 436 { trace_clock_local, "local" },
437 { trace_clock_global, "global" }, 437 { trace_clock_global, "global" },
438 { trace_clock_counter, "counter" },
438}; 439};
439 440
440int trace_clock_id; 441int trace_clock_id;
@@ -960,7 +961,7 @@ void tracing_start(void)
960 if (tracing_disabled) 961 if (tracing_disabled)
961 return; 962 return;
962 963
963 spin_lock_irqsave(&tracing_start_lock, flags); 964 raw_spin_lock_irqsave(&tracing_start_lock, flags);
964 if (--trace_stop_count) { 965 if (--trace_stop_count) {
965 if (trace_stop_count < 0) { 966 if (trace_stop_count < 0) {
966 /* Someone screwed up their debugging */ 967 /* Someone screwed up their debugging */
@@ -985,7 +986,7 @@ void tracing_start(void)
985 986
986 ftrace_start(); 987 ftrace_start();
987 out: 988 out:
988 spin_unlock_irqrestore(&tracing_start_lock, flags); 989 raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
989} 990}
990 991
991/** 992/**
@@ -1000,7 +1001,7 @@ void tracing_stop(void)
1000 unsigned long flags; 1001 unsigned long flags;
1001 1002
1002 ftrace_stop(); 1003 ftrace_stop();
1003 spin_lock_irqsave(&tracing_start_lock, flags); 1004 raw_spin_lock_irqsave(&tracing_start_lock, flags);
1004 if (trace_stop_count++) 1005 if (trace_stop_count++)
1005 goto out; 1006 goto out;
1006 1007
@@ -1018,7 +1019,7 @@ void tracing_stop(void)
1018 arch_spin_unlock(&ftrace_max_lock); 1019 arch_spin_unlock(&ftrace_max_lock);
1019 1020
1020 out: 1021 out:
1021 spin_unlock_irqrestore(&tracing_start_lock, flags); 1022 raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
1022} 1023}
1023 1024
1024void trace_stop_cmdline_recording(void); 1025void trace_stop_cmdline_recording(void);
@@ -2159,6 +2160,14 @@ void trace_default_header(struct seq_file *m)
2159 } 2160 }
2160} 2161}
2161 2162
2163static void test_ftrace_alive(struct seq_file *m)
2164{
2165 if (!ftrace_is_dead())
2166 return;
2167 seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
2168 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
2169}
2170
2162static int s_show(struct seq_file *m, void *v) 2171static int s_show(struct seq_file *m, void *v)
2163{ 2172{
2164 struct trace_iterator *iter = v; 2173 struct trace_iterator *iter = v;
@@ -2168,6 +2177,7 @@ static int s_show(struct seq_file *m, void *v)
2168 if (iter->tr) { 2177 if (iter->tr) {
2169 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2178 seq_printf(m, "# tracer: %s\n", iter->trace->name);
2170 seq_puts(m, "#\n"); 2179 seq_puts(m, "#\n");
2180 test_ftrace_alive(m);
2171 } 2181 }
2172 if (iter->trace && iter->trace->print_header) 2182 if (iter->trace && iter->trace->print_header)
2173 iter->trace->print_header(m); 2183 iter->trace->print_header(m);
@@ -2710,9 +2720,9 @@ static const char readme_msg[] =
2710 "# cat /sys/kernel/debug/tracing/trace_options\n" 2720 "# cat /sys/kernel/debug/tracing/trace_options\n"
2711 "noprint-parent nosym-offset nosym-addr noverbose\n" 2721 "noprint-parent nosym-offset nosym-addr noverbose\n"
2712 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 2722 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2713 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" 2723 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n"
2714 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 2724 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2715 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" 2725 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n"
2716; 2726;
2717 2727
2718static ssize_t 2728static ssize_t
@@ -3569,6 +3579,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3569} 3579}
3570 3580
3571static ssize_t 3581static ssize_t
3582tracing_total_entries_read(struct file *filp, char __user *ubuf,
3583 size_t cnt, loff_t *ppos)
3584{
3585 struct trace_array *tr = filp->private_data;
3586 char buf[64];
3587 int r, cpu;
3588 unsigned long size = 0, expanded_size = 0;
3589
3590 mutex_lock(&trace_types_lock);
3591 for_each_tracing_cpu(cpu) {
3592 size += tr->entries >> 10;
3593 if (!ring_buffer_expanded)
3594 expanded_size += trace_buf_size >> 10;
3595 }
3596 if (ring_buffer_expanded)
3597 r = sprintf(buf, "%lu\n", size);
3598 else
3599 r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
3600 mutex_unlock(&trace_types_lock);
3601
3602 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3603}
3604
3605static ssize_t
3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf, 3606tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos) 3607 size_t cnt, loff_t *ppos)
3574{ 3608{
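tracing_total_entries_read() sums the per-CPU buffer sizes in KiB and, while the ring buffer is still at its small boot-time size, also prints what the total will be once it is expanded. It backs the buffer_total_size_kb file created further down; a small userspace sketch of reading it (assuming debugfs is mounted at /sys/kernel/debug):

#include <stdio.h>

int main(void)
{
	/* Created below as "buffer_total_size_kb" in the tracing directory. */
	FILE *f = fopen("/sys/kernel/debug/tracing/buffer_total_size_kb", "r");
	char line[128];

	if (!f) {
		perror("buffer_total_size_kb");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		/* e.g. "1408" once expanded, or "7 (expanded: 1408)" before */
		printf("total tracing buffer size: %s", line);
	fclose(f);
	return 0;
}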
@@ -3594,22 +3628,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
3594 return 0; 3628 return 0;
3595} 3629}
3596 3630
3597static int mark_printk(const char *fmt, ...)
3598{
3599 int ret;
3600 va_list args;
3601 va_start(args, fmt);
3602 ret = trace_vprintk(0, fmt, args);
3603 va_end(args);
3604 return ret;
3605}
3606
3607static ssize_t 3631static ssize_t
3608tracing_mark_write(struct file *filp, const char __user *ubuf, 3632tracing_mark_write(struct file *filp, const char __user *ubuf,
3609 size_t cnt, loff_t *fpos) 3633 size_t cnt, loff_t *fpos)
3610{ 3634{
3611 char *buf; 3635 unsigned long addr = (unsigned long)ubuf;
3612 size_t written; 3636 struct ring_buffer_event *event;
3637 struct ring_buffer *buffer;
3638 struct print_entry *entry;
3639 unsigned long irq_flags;
3640 struct page *pages[2];
3641 int nr_pages = 1;
3642 ssize_t written;
3643 void *page1;
3644 void *page2;
3645 int offset;
3646 int size;
3647 int len;
3648 int ret;
3613 3649
3614 if (tracing_disabled) 3650 if (tracing_disabled)
3615 return -EINVAL; 3651 return -EINVAL;
@@ -3617,28 +3653,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3617 if (cnt > TRACE_BUF_SIZE) 3653 if (cnt > TRACE_BUF_SIZE)
3618 cnt = TRACE_BUF_SIZE; 3654 cnt = TRACE_BUF_SIZE;
3619 3655
3620 buf = kmalloc(cnt + 2, GFP_KERNEL); 3656 /*
3621 if (buf == NULL) 3657 * Userspace is injecting traces into the kernel trace buffer.
3622 return -ENOMEM; 3658 * We want to be as non intrusive as possible.
3659 * To do so, we do not want to allocate any special buffers
3660 * or take any locks, but instead write the userspace data
3661 * straight into the ring buffer.
3662 *
3663 * First we need to pin the userspace buffer into memory,
3664 * which, most likely it is, because it just referenced it.
3665 * But there's no guarantee that it is. By using get_user_pages_fast()
3666 * and kmap_atomic/kunmap_atomic() we can get access to the
3667 * pages directly. We then write the data directly into the
3668 * ring buffer.
3669 */
3670 BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
3623 3671
3624 if (copy_from_user(buf, ubuf, cnt)) { 3672 /* check if we cross pages */
3625 kfree(buf); 3673 if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
3626 return -EFAULT; 3674 nr_pages = 2;
3675
3676 offset = addr & (PAGE_SIZE - 1);
3677 addr &= PAGE_MASK;
3678
3679 ret = get_user_pages_fast(addr, nr_pages, 0, pages);
3680 if (ret < nr_pages) {
3681 while (--ret >= 0)
3682 put_page(pages[ret]);
3683 written = -EFAULT;
3684 goto out;
3685 }
3686
3687 page1 = kmap_atomic(pages[0]);
3688 if (nr_pages == 2)
3689 page2 = kmap_atomic(pages[1]);
3690
3691 local_save_flags(irq_flags);
3692 size = sizeof(*entry) + cnt + 2; /* possible \n added */
3693 buffer = global_trace.buffer;
3694 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
3695 irq_flags, preempt_count());
3696 if (!event) {
3697 /* Ring buffer disabled, return as if not open for write */
3698 written = -EBADF;
3699 goto out_unlock;
3627 } 3700 }
3628 if (buf[cnt-1] != '\n') { 3701
3629 buf[cnt] = '\n'; 3702 entry = ring_buffer_event_data(event);
3630 buf[cnt+1] = '\0'; 3703 entry->ip = _THIS_IP_;
3704
3705 if (nr_pages == 2) {
3706 len = PAGE_SIZE - offset;
3707 memcpy(&entry->buf, page1 + offset, len);
3708 memcpy(&entry->buf[len], page2, cnt - len);
3631 } else 3709 } else
3632 buf[cnt] = '\0'; 3710 memcpy(&entry->buf, page1 + offset, cnt);
3633 3711
3634 written = mark_printk("%s", buf); 3712 if (entry->buf[cnt - 1] != '\n') {
3635 kfree(buf); 3713 entry->buf[cnt] = '\n';
3636 *fpos += written; 3714 entry->buf[cnt + 1] = '\0';
3715 } else
3716 entry->buf[cnt] = '\0';
3717
3718 ring_buffer_unlock_commit(buffer, event);
3637 3719
3638 /* don't tell userspace we wrote more - it might confuse them */ 3720 written = cnt;
3639 if (written > cnt)
3640 written = cnt;
3641 3721
3722 *fpos += written;
3723
3724 out_unlock:
3725 if (nr_pages == 2)
3726 kunmap_atomic(page2);
3727 kunmap_atomic(page1);
3728 while (nr_pages > 0)
3729 put_page(pages[--nr_pages]);
3730 out:
3642 return written; 3731 return written;
3643} 3732}
3644 3733
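The rewritten tracing_mark_write() pins at most two user pages with get_user_pages_fast(), maps them with kmap_atomic(), and copies the message straight into a reserved TRACE_PRINT event, so the old kmalloc()-and-copy path is gone. From userspace the interface is unchanged; a minimal producer sketch writing through the trace_marker file this handler serves:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* trace_marker is the debugfs file backed by tracing_mark_write(). */
	int fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);
	const char msg[] = "hello from userspace\n";

	if (fd < 0)
		return 1;
	/* One write() becomes one TRACE_PRINT entry in the ring buffer. */
	if (write(fd, msg, strlen(msg)) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}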
@@ -3739,6 +3828,12 @@ static const struct file_operations tracing_entries_fops = {
3739 .llseek = generic_file_llseek, 3828 .llseek = generic_file_llseek,
3740}; 3829};
3741 3830
3831static const struct file_operations tracing_total_entries_fops = {
3832 .open = tracing_open_generic,
3833 .read = tracing_total_entries_read,
3834 .llseek = generic_file_llseek,
3835};
3836
3742static const struct file_operations tracing_free_buffer_fops = { 3837static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write, 3838 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release, 3839 .release = tracing_free_buffer_release,
@@ -3808,8 +3903,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3808 if (info->read < PAGE_SIZE) 3903 if (info->read < PAGE_SIZE)
3809 goto read; 3904 goto read;
3810 3905
3811 info->read = 0;
3812
3813 trace_access_lock(info->cpu); 3906 trace_access_lock(info->cpu);
3814 ret = ring_buffer_read_page(info->tr->buffer, 3907 ret = ring_buffer_read_page(info->tr->buffer,
3815 &info->spare, 3908 &info->spare,
@@ -3819,6 +3912,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3819 if (ret < 0) 3912 if (ret < 0)
3820 return 0; 3913 return 0;
3821 3914
3915 info->read = 0;
3916
3822read: 3917read:
3823 size = PAGE_SIZE - info->read; 3918 size = PAGE_SIZE - info->read;
3824 if (size > count) 3919 if (size > count)
@@ -4026,6 +4121,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4026 struct trace_array *tr = &global_trace; 4121 struct trace_array *tr = &global_trace;
4027 struct trace_seq *s; 4122 struct trace_seq *s;
4028 unsigned long cnt; 4123 unsigned long cnt;
4124 unsigned long long t;
4125 unsigned long usec_rem;
4029 4126
4030 s = kmalloc(sizeof(*s), GFP_KERNEL); 4127 s = kmalloc(sizeof(*s), GFP_KERNEL);
4031 if (!s) 4128 if (!s)
@@ -4042,6 +4139,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4042 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 4139 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
4043 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 4140 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4044 4141
4142 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4143 trace_seq_printf(s, "bytes: %ld\n", cnt);
4144
4145 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4146 usec_rem = do_div(t, USEC_PER_SEC);
4147 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem);
4148
4149 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4150 usec_rem = do_div(t, USEC_PER_SEC);
4151 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4152
4045 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4153 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4046 4154
4047 kfree(s); 4155 kfree(s);
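The stats file now also reports the byte count plus the oldest-event and current timestamps, each split into seconds and microseconds with do_div(), which divides in place and hands back the remainder. A standalone illustration of that arithmetic in plain C (do_div() itself is kernel-only, so the equivalent operations are spelled out):

#include <stdio.h>

int main(void)
{
	/* Illustrative timestamp already converted to microseconds. */
	unsigned long long t = 12345678901ULL;	/* 12345.678901 s */
	unsigned long usec_rem;

	/* do_div(t, USEC_PER_SEC) divides t in place and returns the
	 * remainder; the plain-C equivalent is: */
	usec_rem = (unsigned long)(t % 1000000ULL);
	t /= 1000000ULL;

	printf("oldest event ts: %5llu.%06lu\n", t, usec_rem);	/* 12345.678901 */
	return 0;
}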
@@ -4450,6 +4558,9 @@ static __init int tracer_init_debugfs(void)
4450 trace_create_file("buffer_size_kb", 0644, d_tracer, 4558 trace_create_file("buffer_size_kb", 0644, d_tracer,
4451 &global_trace, &tracing_entries_fops); 4559 &global_trace, &tracing_entries_fops);
4452 4560
4561 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
4562 &global_trace, &tracing_total_entries_fops);
4563
4453 trace_create_file("free_buffer", 0644, d_tracer, 4564 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops); 4565 &global_trace, &tracing_free_buffer_fops);
4455 4566
@@ -4566,6 +4677,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4566 4677
4567 tracing_off(); 4678 tracing_off();
4568 4679
4680 /* Did function tracer already get disabled? */
4681 if (ftrace_is_dead()) {
4682 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
4683 printk("# MAY BE MISSING FUNCTION EVENTS\n");
4684 }
4685
4569 if (disable_tracing) 4686 if (disable_tracing)
4570 ftrace_kill(); 4687 ftrace_kill();
4571 4688
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 616846bcfee..092e1f8d18d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -579,11 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task)
579 579
580 return test_tsk_trace_trace(task); 580 return test_tsk_trace_trace(task);
581} 581}
582extern int ftrace_is_dead(void);
582#else 583#else
583static inline int ftrace_trace_task(struct task_struct *task) 584static inline int ftrace_trace_task(struct task_struct *task)
584{ 585{
585 return 1; 586 return 1;
586} 587}
588static inline int ftrace_is_dead(void) { return 0; }
587#endif 589#endif
588 590
589/* 591/*
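The prototype sits in the CONFIG_FUNCTION_TRACER half of the header and the #else branch supplies an inline stub returning 0, so callers such as test_ftrace_alive() compile down to nothing on kernels without the function tracer. The same declare-plus-stub idiom, sketched with hypothetical names:

/* Sketch of the conditional-helper idiom used here (hypothetical names). */
#ifdef CONFIG_EXAMPLE_FEATURE
extern int example_is_broken(void);
#else
static inline int example_is_broken(void) { return 0; }
#endif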
@@ -761,16 +763,10 @@ struct filter_pred {
761 filter_pred_fn_t fn; 763 filter_pred_fn_t fn;
762 u64 val; 764 u64 val;
763 struct regex regex; 765 struct regex regex;
764 /* 766 unsigned short *ops;
765 * Leaf nodes use field_name, ops is used by AND and OR 767#ifdef CONFIG_FTRACE_STARTUP_TEST
766 * nodes. The field_name is always freed when freeing a pred. 768 struct ftrace_event_field *field;
767 * We can overload field_name for ops and have it freed 769#endif
768 * as well.
769 */
770 union {
771 char *field_name;
772 unsigned short *ops;
773 };
774 int offset; 770 int offset;
775 int not; 771 int not;
776 int op; 772 int op;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 6302747a139..394783531cb 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void)
113 113
114 return now; 114 return now;
115} 115}
116
117static atomic64_t trace_counter;
118
119/*
120 * trace_clock_counter(): simply an atomic counter.
121 * Use the trace_counter "counter" for cases where you do not care
122 * about timings, but are interested in strict ordering.
123 */
124u64 notrace trace_clock_counter(void)
125{
126 return atomic64_add_return(1, &trace_counter);
127}
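Because atomic64_add_return() hands out strictly increasing values, selecting this clock gives every event a unique sequence number, trading timing information for unambiguous ordering; it shows up as "counter" thanks to the trace_clocks[] entry added above. A small userspace sketch of switching to it through the trace_clock file:

#include <stdio.h>

int main(void)
{
	/* trace_clock lists the clocks from trace_clocks[]; writing a name
	 * selects it, e.g. "local global [counter]" after this succeeds. */
	FILE *f = fopen("/sys/kernel/debug/tracing/trace_clock", "w");

	if (!f) {
		perror("trace_clock");
		return 1;
	}
	if (fputs("counter\n", f) == EOF) {
		fclose(f);
		return 1;
	}
	fclose(f);
	return 0;
}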
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 256764ecccd..816d3d07497 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -381,6 +381,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
381 return pred; 381 return pred;
382} 382}
383 383
384enum walk_return {
385 WALK_PRED_ABORT,
386 WALK_PRED_PARENT,
387 WALK_PRED_DEFAULT,
388};
389
390typedef int (*filter_pred_walkcb_t) (enum move_type move,
391 struct filter_pred *pred,
392 int *err, void *data);
393
394static int walk_pred_tree(struct filter_pred *preds,
395 struct filter_pred *root,
396 filter_pred_walkcb_t cb, void *data)
397{
398 struct filter_pred *pred = root;
399 enum move_type move = MOVE_DOWN;
400 int done = 0;
401
402 if (!preds)
403 return -EINVAL;
404
405 do {
406 int err = 0, ret;
407
408 ret = cb(move, pred, &err, data);
409 if (ret == WALK_PRED_ABORT)
410 return err;
411 if (ret == WALK_PRED_PARENT)
412 goto get_parent;
413
414 switch (move) {
415 case MOVE_DOWN:
416 if (pred->left != FILTER_PRED_INVALID) {
417 pred = &preds[pred->left];
418 continue;
419 }
420 goto get_parent;
421 case MOVE_UP_FROM_LEFT:
422 pred = &preds[pred->right];
423 move = MOVE_DOWN;
424 continue;
425 case MOVE_UP_FROM_RIGHT:
426 get_parent:
427 if (pred == root)
428 break;
429 pred = get_pred_parent(pred, preds,
430 pred->parent,
431 &move);
432 continue;
433 }
434 done = 1;
435 } while (!done);
436
437 /* We are fine. */
438 return 0;
439}
440
384/* 441/*
385 * A series of AND or ORs were found together. Instead of 442 * A series of AND or ORs were found together. Instead of
386 * climbing up and down the tree branches, an array of the 443 * climbing up and down the tree branches, an array of the
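walk_pred_tree() factors the repeated down/up-left/up-right traversal into one place: the callback sees each node together with the current move and returns WALK_PRED_DEFAULT to continue normally, WALK_PRED_PARENT to pop back to the parent early, or WALK_PRED_ABORT to stop with whatever it stored in *err. A trivial callback sketch in the same style as the ones added below (it merely counts node visits):

/* Sketch: counting how many times the walk touches a node. */
static int count_visits_cb(enum move_type move, struct filter_pred *pred,
			   int *err, void *data)
{
	int *visits = data;

	(*visits)++;
	return WALK_PRED_DEFAULT;
}

/* Usage: int visits = 0;
 *        walk_pred_tree(filter->preds, root, count_visits_cb, &visits);
 */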
@@ -410,99 +467,91 @@ static int process_ops(struct filter_pred *preds,
410 467
411 for (i = 0; i < op->val; i++) { 468 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]]; 469 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec); 470 if (!WARN_ON_ONCE(!pred->fn))
471 match = pred->fn(pred, rec);
414 if (!!match == type) 472 if (!!match == type)
415 return match; 473 return match;
416 } 474 }
417 return match; 475 return match;
418} 476}
419 477
478struct filter_match_preds_data {
479 struct filter_pred *preds;
480 int match;
481 void *rec;
482};
483
484static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
485 int *err, void *data)
486{
487 struct filter_match_preds_data *d = data;
488
489 *err = 0;
490 switch (move) {
491 case MOVE_DOWN:
492 /* only AND and OR have children */
493 if (pred->left != FILTER_PRED_INVALID) {
494 /* If ops is set, then it was folded. */
495 if (!pred->ops)
496 return WALK_PRED_DEFAULT;
497 /* We can treat folded ops as a leaf node */
498 d->match = process_ops(d->preds, pred, d->rec);
499 } else {
500 if (!WARN_ON_ONCE(!pred->fn))
501 d->match = pred->fn(pred, d->rec);
502 }
503
504 return WALK_PRED_PARENT;
505 case MOVE_UP_FROM_LEFT:
506 /*
507 * Check for short circuits.
508 *
509 * Optimization: !!match == (pred->op == OP_OR)
510 * is the same as:
511 * if ((match && pred->op == OP_OR) ||
512 * (!match && pred->op == OP_AND))
513 */
514 if (!!d->match == (pred->op == OP_OR))
515 return WALK_PRED_PARENT;
516 break;
517 case MOVE_UP_FROM_RIGHT:
518 break;
519 }
520
521 return WALK_PRED_DEFAULT;
522}
523
420/* return 1 if event matches, 0 otherwise (discard) */ 524/* return 1 if event matches, 0 otherwise (discard) */
421int filter_match_preds(struct event_filter *filter, void *rec) 525int filter_match_preds(struct event_filter *filter, void *rec)
422{ 526{
423 int match = -1;
424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds; 527 struct filter_pred *preds;
426 struct filter_pred *pred;
427 struct filter_pred *root; 528 struct filter_pred *root;
428 int n_preds; 529 struct filter_match_preds_data data = {
429 int done = 0; 530 /* match is currently meaningless */
531 .match = -1,
532 .rec = rec,
533 };
534 int n_preds, ret;
430 535
431 /* no filter is considered a match */ 536 /* no filter is considered a match */
432 if (!filter) 537 if (!filter)
433 return 1; 538 return 1;
434 539
435 n_preds = filter->n_preds; 540 n_preds = filter->n_preds;
436
437 if (!n_preds) 541 if (!n_preds)
438 return 1; 542 return 1;
439 543
440 /* 544 /*
441 * n_preds, root and filter->preds are protected with preemption disabled. 545 * n_preds, root and filter->preds are protected with preemption disabled.
442 */ 546 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root); 547 root = rcu_dereference_sched(filter->root);
445 if (!root) 548 if (!root)
446 return 1; 549 return 1;
447 550
448 pred = root; 551 data.preds = preds = rcu_dereference_sched(filter->preds);
449 552 ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
450 /* match is currently meaningless */ 553 WARN_ON(ret);
451 match = -1; 554 return data.match;
452
453 do {
454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
500 continue;
501 }
502 done = 1;
503 } while (!done);
504
505 return match;
506} 555}
507EXPORT_SYMBOL_GPL(filter_match_preds); 556EXPORT_SYMBOL_GPL(filter_match_preds);
508 557
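The MOVE_UP_FROM_LEFT short circuit leans on a small boolean identity: since inner nodes are only OP_AND or OP_OR, the result is already decided when the left subtree matched under OR or failed under AND, which is exactly what !!match == (pred->op == OP_OR) tests. A throwaway check of the equivalence quoted in the comment, with is_or standing in for (pred->op == OP_OR):

#include <stdio.h>

int main(void)
{
	/* Verify: (!!match == is_or) <=> ((match && is_or) || (!match && !is_or)),
	 * i.e. the comment's (match && OP_OR) || (!match && OP_AND) form. */
	for (int match = 0; match <= 1; match++)
		for (int is_or = 0; is_or <= 1; is_or++) {
			int lhs = (!!match == is_or);
			int rhs = (match && is_or) || (!match && !is_or);

			/* every row prints lhs == rhs */
			printf("match=%d is_or=%d  lhs=%d rhs=%d\n",
			       match, is_or, lhs, rhs);
		}
	return 0;
}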
@@ -628,22 +677,6 @@ find_event_field(struct ftrace_event_call *call, char *name)
628 return __find_event_field(head, name); 677 return __find_event_field(head, name);
629} 678}
630 679
631static void filter_free_pred(struct filter_pred *pred)
632{
633 if (!pred)
634 return;
635
636 kfree(pred->field_name);
637 kfree(pred);
638}
639
640static void filter_clear_pred(struct filter_pred *pred)
641{
642 kfree(pred->field_name);
643 pred->field_name = NULL;
644 pred->regex.len = 0;
645}
646
647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 680static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{ 681{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); 682 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
@@ -689,20 +722,13 @@ __pop_pred_stack(struct pred_stack *stack)
689static int filter_set_pred(struct event_filter *filter, 722static int filter_set_pred(struct event_filter *filter,
690 int idx, 723 int idx,
691 struct pred_stack *stack, 724 struct pred_stack *stack,
692 struct filter_pred *src, 725 struct filter_pred *src)
693 filter_pred_fn_t fn)
694{ 726{
695 struct filter_pred *dest = &filter->preds[idx]; 727 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left; 728 struct filter_pred *left;
697 struct filter_pred *right; 729 struct filter_pred *right;
698 730
699 *dest = *src; 731 *dest = *src;
700 if (src->field_name) {
701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
702 if (!dest->field_name)
703 return -ENOMEM;
704 }
705 dest->fn = fn;
706 dest->index = idx; 732 dest->index = idx;
707 733
708 if (dest->op == OP_OR || dest->op == OP_AND) { 734 if (dest->op == OP_OR || dest->op == OP_AND) {
@@ -743,11 +769,7 @@ static int filter_set_pred(struct event_filter *filter,
743 769
744static void __free_preds(struct event_filter *filter) 770static void __free_preds(struct event_filter *filter)
745{ 771{
746 int i;
747
748 if (filter->preds) { 772 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds); 773 kfree(filter->preds);
752 filter->preds = NULL; 774 filter->preds = NULL;
753 } 775 }
@@ -840,23 +862,19 @@ static void filter_free_subsystem_filters(struct event_subsystem *system)
840 } 862 }
841} 863}
842 864
843static int filter_add_pred_fn(struct filter_parse_state *ps, 865static int filter_add_pred(struct filter_parse_state *ps,
844 struct ftrace_event_call *call, 866 struct event_filter *filter,
845 struct event_filter *filter, 867 struct filter_pred *pred,
846 struct filter_pred *pred, 868 struct pred_stack *stack)
847 struct pred_stack *stack,
848 filter_pred_fn_t fn)
849{ 869{
850 int idx, err; 870 int err;
851 871
852 if (WARN_ON(filter->n_preds == filter->a_preds)) { 872 if (WARN_ON(filter->n_preds == filter->a_preds)) {
853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 873 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
854 return -ENOSPC; 874 return -ENOSPC;
855 } 875 }
856 876
857 idx = filter->n_preds; 877 err = filter_set_pred(filter, filter->n_preds, stack, pred);
858 filter_clear_pred(&filter->preds[idx]);
859 err = filter_set_pred(filter, idx, stack, pred, fn);
860 if (err) 878 if (err)
861 return err; 879 return err;
862 880
@@ -937,31 +955,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
937 return fn; 955 return fn;
938} 956}
939 957
940static int filter_add_pred(struct filter_parse_state *ps, 958static int init_pred(struct filter_parse_state *ps,
941 struct ftrace_event_call *call, 959 struct ftrace_event_field *field,
942 struct event_filter *filter, 960 struct filter_pred *pred)
943 struct filter_pred *pred, 961
944 struct pred_stack *stack,
945 bool dry_run)
946{ 962{
947 struct ftrace_event_field *field; 963 filter_pred_fn_t fn = filter_pred_none;
948 filter_pred_fn_t fn;
949 unsigned long long val; 964 unsigned long long val;
950 int ret; 965 int ret;
951 966
952 fn = pred->fn = filter_pred_none;
953
954 if (pred->op == OP_AND)
955 goto add_pred_fn;
956 else if (pred->op == OP_OR)
957 goto add_pred_fn;
958
959 field = find_event_field(call, pred->field_name);
960 if (!field) {
961 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
962 return -EINVAL;
963 }
964
965 pred->offset = field->offset; 967 pred->offset = field->offset;
966 968
967 if (!is_legal_op(field, pred->op)) { 969 if (!is_legal_op(field, pred->op)) {
@@ -1001,9 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
1001 if (pred->op == OP_NE) 1003 if (pred->op == OP_NE)
1002 pred->not = 1; 1004 pred->not = 1;
1003 1005
1004add_pred_fn: 1006 pred->fn = fn;
1005 if (!dry_run)
1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
1007 return 0; 1007 return 0;
1008} 1008}
1009 1009
@@ -1302,39 +1302,37 @@ parse_operand:
1302 return 0; 1302 return 0;
1303} 1303}
1304 1304
1305static struct filter_pred *create_pred(int op, char *operand1, char *operand2) 1305static struct filter_pred *create_pred(struct filter_parse_state *ps,
1306 struct ftrace_event_call *call,
1307 int op, char *operand1, char *operand2)
1306{ 1308{
1307 struct filter_pred *pred; 1309 struct ftrace_event_field *field;
1310 static struct filter_pred pred;
1308 1311
1309 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 1312 memset(&pred, 0, sizeof(pred));
1310 if (!pred) 1313 pred.op = op;
1311 return NULL;
1312 1314
1313 pred->field_name = kstrdup(operand1, GFP_KERNEL); 1315 if (op == OP_AND || op == OP_OR)
1314 if (!pred->field_name) { 1316 return &pred;
1315 kfree(pred); 1317
1318 if (!operand1 || !operand2) {
1319 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1316 return NULL; 1320 return NULL;
1317 } 1321 }
1318 1322
1319 strcpy(pred->regex.pattern, operand2); 1323 field = find_event_field(call, operand1);
1320 pred->regex.len = strlen(pred->regex.pattern); 1324 if (!field) {
1321 1325 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
1322 pred->op = op;
1323
1324 return pred;
1325}
1326
1327static struct filter_pred *create_logical_pred(int op)
1328{
1329 struct filter_pred *pred;
1330
1331 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
1332 if (!pred)
1333 return NULL; 1326 return NULL;
1327 }
1334 1328
1335 pred->op = op; 1329 strcpy(pred.regex.pattern, operand2);
1330 pred.regex.len = strlen(pred.regex.pattern);
1336 1331
1337 return pred; 1332#ifdef CONFIG_FTRACE_STARTUP_TEST
1333 pred.field = field;
1334#endif
1335 return init_pred(ps, field, &pred) ? NULL : &pred;
1338} 1336}
1339 1337
1340static int check_preds(struct filter_parse_state *ps) 1338static int check_preds(struct filter_parse_state *ps)
@@ -1375,6 +1373,23 @@ static int count_preds(struct filter_parse_state *ps)
1375 return n_preds; 1373 return n_preds;
1376} 1374}
1377 1375
1376struct check_pred_data {
1377 int count;
1378 int max;
1379};
1380
1381static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1382 int *err, void *data)
1383{
1384 struct check_pred_data *d = data;
1385
1386 if (WARN_ON(d->count++ > d->max)) {
1387 *err = -EINVAL;
1388 return WALK_PRED_ABORT;
1389 }
1390 return WALK_PRED_DEFAULT;
1391}
1392
1378/* 1393/*
1379 * The tree is walked at filtering of an event. If the tree is not correctly 1394 * The tree is walked at filtering of an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does 1395 * built, it may cause an infinite loop. Check here that the tree does
@@ -1383,107 +1398,76 @@ static int count_preds(struct filter_parse_state *ps)
1383static int check_pred_tree(struct event_filter *filter, 1398static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root) 1399 struct filter_pred *root)
1385{ 1400{
1386 struct filter_pred *preds; 1401 struct check_pred_data data = {
1387 struct filter_pred *pred; 1402 /*
1388 enum move_type move = MOVE_DOWN; 1403 * The max that we can hit a node is three times.
1389 int count = 0; 1404 * Once going down, once coming up from left, and
1390 int done = 0; 1405 * once coming up from right. This is more than enough
1391 int max; 1406 * since leafs are only hit a single time.
1392 1407 */
1393 /* 1408 .max = 3 * filter->n_preds,
1394 * The max that we can hit a node is three times. 1409 .count = 0,
1395 * Once going down, once coming up from left, and 1410 };
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400 1411
1401 preds = filter->preds; 1412 return walk_pred_tree(filter->preds, root,
1402 if (!preds) 1413 check_pred_tree_cb, &data);
1403 return -EINVAL; 1414}
1404 pred = root;
1405 1415
1406 do { 1416static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
1407 if (WARN_ON(count++ > max)) 1417 int *err, void *data)
1408 return -EINVAL; 1418{
1419 int *count = data;
1409 1420
1410 switch (move) { 1421 if ((move == MOVE_DOWN) &&
1411 case MOVE_DOWN: 1422 (pred->left == FILTER_PRED_INVALID))
1412 if (pred->left != FILTER_PRED_INVALID) { 1423 (*count)++;
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435 1424
1436 /* We are fine. */ 1425 return WALK_PRED_DEFAULT;
1437 return 0;
1438} 1426}
1439 1427
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root) 1428static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{ 1429{
1442 struct filter_pred *pred; 1430 int count = 0, ret;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446 1431
1447 pred = root; 1432 ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
1433 WARN_ON(ret);
1434 return count;
1435}
1448 1436
1449 do { 1437struct fold_pred_data {
1450 switch (move) { 1438 struct filter_pred *root;
1451 case MOVE_DOWN: 1439 int count;
1452 if (pred->left != FILTER_PRED_INVALID) { 1440 int children;
1453 pred = &preds[pred->left]; 1441};
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476 1442
1477 return count; 1443static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
1444 int *err, void *data)
1445{
1446 struct fold_pred_data *d = data;
1447 struct filter_pred *root = d->root;
1448
1449 if (move != MOVE_DOWN)
1450 return WALK_PRED_DEFAULT;
1451 if (pred->left != FILTER_PRED_INVALID)
1452 return WALK_PRED_DEFAULT;
1453
1454 if (WARN_ON(d->count == d->children)) {
1455 *err = -EINVAL;
1456 return WALK_PRED_ABORT;
1457 }
1458
1459 pred->index &= ~FILTER_PRED_FOLD;
1460 root->ops[d->count++] = pred->index;
1461 return WALK_PRED_DEFAULT;
1478} 1462}
1479 1463
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root) 1464static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{ 1465{
1482 struct filter_pred *pred; 1466 struct fold_pred_data data = {
1483 enum move_type move = MOVE_DOWN; 1467 .root = root,
1484 int count = 0; 1468 .count = 0,
1469 };
1485 int children; 1470 int children;
1486 int done = 0;
1487 1471
1488 /* No need to keep the fold flag */ 1472 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD; 1473 root->index &= ~FILTER_PRED_FOLD;
@@ -1501,37 +1485,26 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1501 return -ENOMEM; 1485 return -ENOMEM;
1502 1486
1503 root->val = children; 1487 root->val = children;
1488 data.children = children;
1489 return walk_pred_tree(preds, root, fold_pred_cb, &data);
1490}
1504 1491
1505 pred = root; 1492static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1506 do { 1493 int *err, void *data)
1507 switch (move) { 1494{
1508 case MOVE_DOWN: 1495 struct filter_pred *preds = data;
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533 1496
1534 return 0; 1497 if (move != MOVE_DOWN)
1498 return WALK_PRED_DEFAULT;
1499 if (!(pred->index & FILTER_PRED_FOLD))
1500 return WALK_PRED_DEFAULT;
1501
1502 *err = fold_pred(preds, pred);
1503 if (*err)
1504 return WALK_PRED_ABORT;
1505
1506 /* everything below is folded, continue with parent */
1507 return WALK_PRED_PARENT;
1535} 1508}
1536 1509
1537/* 1510/*
@@ -1542,51 +1515,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1542static int fold_pred_tree(struct event_filter *filter, 1515static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root) 1516 struct filter_pred *root)
1544{ 1517{
1545 struct filter_pred *preds; 1518 return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
1546 struct filter_pred *pred; 1519 filter->preds);
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590} 1520}
1591 1521
1592static int replace_preds(struct ftrace_event_call *call, 1522static int replace_preds(struct ftrace_event_call *call,
@@ -1643,27 +1573,17 @@ static int replace_preds(struct ftrace_event_call *call,
1643 goto fail; 1573 goto fail;
1644 } 1574 }
1645 1575
1646 if (elt->op == OP_AND || elt->op == OP_OR) { 1576 pred = create_pred(ps, call, elt->op, operand1, operand2);
1647 pred = create_logical_pred(elt->op); 1577 if (!pred) {
1648 goto add_pred;
1649 }
1650
1651 if (!operand1 || !operand2) {
1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1653 err = -EINVAL; 1578 err = -EINVAL;
1654 goto fail; 1579 goto fail;
1655 } 1580 }
1656 1581
1657 pred = create_pred(elt->op, operand1, operand2); 1582 if (!dry_run) {
1658add_pred: 1583 err = filter_add_pred(ps, filter, pred, &stack);
1659 if (!pred) { 1584 if (err)
1660 err = -ENOMEM; 1585 goto fail;
1661 goto fail;
1662 } 1586 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1664 filter_free_pred(pred);
1665 if (err)
1666 goto fail;
1667 1587
1668 operand1 = operand2 = NULL; 1588 operand1 = operand2 = NULL;
1669 } 1589 }
@@ -1958,17 +1878,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1958 int err; 1878 int err;
1959 struct event_filter *filter; 1879 struct event_filter *filter;
1960 struct filter_parse_state *ps; 1880 struct filter_parse_state *ps;
1961 struct ftrace_event_call *call = NULL; 1881 struct ftrace_event_call *call;
1962 1882
1963 mutex_lock(&event_mutex); 1883 mutex_lock(&event_mutex);
1964 1884
1965 list_for_each_entry(call, &ftrace_events, list) { 1885 call = event->tp_event;
1966 if (call->event.type == event_id)
1967 break;
1968 }
1969 1886
1970 err = -EINVAL; 1887 err = -EINVAL;
1971 if (&call->list == &ftrace_events) 1888 if (!call)
1972 goto out_unlock; 1889 goto out_unlock;
1973 1890
1974 err = -EEXIST; 1891 err = -EEXIST;
@@ -2012,3 +1929,215 @@ out_unlock:
2012 1929
2013#endif /* CONFIG_PERF_EVENTS */ 1930#endif /* CONFIG_PERF_EVENTS */
2014 1931
1932#ifdef CONFIG_FTRACE_STARTUP_TEST
1933
1934#include <linux/types.h>
1935#include <linux/tracepoint.h>
1936
1937#define CREATE_TRACE_POINTS
1938#include "trace_events_filter_test.h"
1939
1940static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
1941 struct event_filter **pfilter)
1942{
1943 struct event_filter *filter;
1944 struct filter_parse_state *ps;
1945 int err = -ENOMEM;
1946
1947 filter = __alloc_filter();
1948 if (!filter)
1949 goto out;
1950
1951 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1952 if (!ps)
1953 goto free_filter;
1954
1955 parse_init(ps, filter_ops, filter_str);
1956 err = filter_parse(ps);
1957 if (err)
1958 goto free_ps;
1959
1960 err = replace_preds(call, filter, ps, filter_str, false);
1961 if (!err)
1962 *pfilter = filter;
1963
1964 free_ps:
1965 filter_opstack_clear(ps);
1966 postfix_clear(ps);
1967 kfree(ps);
1968
1969 free_filter:
1970 if (err)
1971 __free_filter(filter);
1972
1973 out:
1974 return err;
1975}
1976
1977#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
1978{ \
1979 .filter = FILTER, \
1980 .rec = { .a = va, .b = vb, .c = vc, .d = vd, \
1981 .e = ve, .f = vf, .g = vg, .h = vh }, \
1982 .match = m, \
1983 .not_visited = nvisit, \
1984}
1985#define YES 1
1986#define NO 0
1987
1988static struct test_filter_data_t {
1989 char *filter;
1990 struct ftrace_raw_ftrace_test_filter rec;
1991 int match;
1992 char *not_visited;
1993} test_filter_data[] = {
1994#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \
1995 "e == 1 && f == 1 && g == 1 && h == 1"
1996 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""),
1997 DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"),
1998 DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""),
1999#undef FILTER
2000#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \
2001 "e == 1 || f == 1 || g == 1 || h == 1"
2002 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2003 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2004 DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"),
2005#undef FILTER
2006#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \
2007 "(e == 1 || f == 1) && (g == 1 || h == 1)"
2008 DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"),
2009 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2010 DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"),
2011 DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"),
2012#undef FILTER
2013#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \
2014 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2015 DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"),
2016 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""),
2017 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2018#undef FILTER
2019#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \
2020 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2021 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"),
2022 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2023 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""),
2024#undef FILTER
2025#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \
2026 "(e == 1 || f == 1)) && (g == 1 || h == 1)"
2027 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"),
2028 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2029 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"),
2030#undef FILTER
2031#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \
2032 "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))"
2033 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"),
2034 DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2035 DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""),
2036#undef FILTER
2037#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \
2038 "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))"
2039 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"),
2040 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2041 DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"),
2042};
2043
2044#undef DATA_REC
2045#undef FILTER
2046#undef YES
2047#undef NO
2048
2049#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
2050
2051static int test_pred_visited;
2052
2053static int test_pred_visited_fn(struct filter_pred *pred, void *event)
2054{
2055 struct ftrace_event_field *field = pred->field;
2056
2057 test_pred_visited = 1;
2058 printk(KERN_INFO "\npred visited %s\n", field->name);
2059 return 1;
2060}
2061
2062static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
2063 int *err, void *data)
2064{
2065 char *fields = data;
2066
2067 if ((move == MOVE_DOWN) &&
2068 (pred->left == FILTER_PRED_INVALID)) {
2069 struct ftrace_event_field *field = pred->field;
2070
2071 if (!field) {
2072 WARN(1, "all leafs should have field defined");
2073 return WALK_PRED_DEFAULT;
2074 }
2075 if (!strchr(fields, *field->name))
2076 return WALK_PRED_DEFAULT;
2077
2078 WARN_ON(!pred->fn);
2079 pred->fn = test_pred_visited_fn;
2080 }
2081 return WALK_PRED_DEFAULT;
2082}
2083
2084static __init int ftrace_test_event_filter(void)
2085{
2086 int i;
2087
2088 printk(KERN_INFO "Testing ftrace filter: ");
2089
2090 for (i = 0; i < DATA_CNT; i++) {
2091 struct event_filter *filter = NULL;
2092 struct test_filter_data_t *d = &test_filter_data[i];
2093 int err;
2094
2095 err = test_get_filter(d->filter, &event_ftrace_test_filter,
2096 &filter);
2097 if (err) {
2098 printk(KERN_INFO
2099 "Failed to get filter for '%s', err %d\n",
2100 d->filter, err);
2101 break;
2102 }
2103
2104 /*
2105 * The preemption disabling is not really needed for self
2106 * tests, but the rcu dereference will complain without it.
2107 */
2108 preempt_disable();
2109 if (*d->not_visited)
2110 walk_pred_tree(filter->preds, filter->root,
2111 test_walk_pred_cb,
2112 d->not_visited);
2113
2114 test_pred_visited = 0;
2115 err = filter_match_preds(filter, &d->rec);
2116 preempt_enable();
2117
2118 __free_filter(filter);
2119
2120 if (test_pred_visited) {
2121 printk(KERN_INFO
2122 "Failed, unwanted pred visited for filter %s\n",
2123 d->filter);
2124 break;
2125 }
2126
2127 if (err != d->match) {
2128 printk(KERN_INFO
2129 "Failed to match filter '%s', expected %d\n",
2130 d->filter, d->match);
2131 break;
2132 }
2133 }
2134
2135 if (i == DATA_CNT)
2136 printk(KERN_CONT "OK\n");
2137
2138 return 0;
2139}
2140
2141late_initcall(ftrace_test_event_filter);
2142
2143#endif /* CONFIG_FTRACE_STARTUP_TEST */
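
The trace_events_filter.c changes above replace four open-coded MOVE_DOWN / MOVE_UP_FROM_LEFT / MOVE_UP_FROM_RIGHT loops (check_pred_tree, count_leafs, fold_pred, fold_pred_tree) with small callbacks driven by one generic walk_pred_tree() walker. Below is a minimal userspace sketch of the same pattern, assuming an index-linked binary tree like the filter's preds array; the node/walk_tree/WALK_* names are illustrative stand-ins, not the kernel API.

#include <stdio.h>

#define INVALID -1

enum walk_ret { WALK_DEFAULT, WALK_ABORT };
enum move_type { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };

struct node {
	int left, right, parent;	/* indices into the node array */
	int val;
};

typedef enum walk_ret (*walk_cb)(enum move_type move, struct node *n,
				 int *err, void *data);

/*
 * Visit every node: internal nodes are seen up to three times (down,
 * up-from-left, up-from-right), leaves exactly once. As in the filter
 * tree, a node has either two children or none.
 */
static int walk_tree(struct node *nodes, int root, walk_cb cb, void *data)
{
	enum move_type move = MOVE_DOWN;
	int idx = root, err = 0;

	for (;;) {
		if (cb(move, &nodes[idx], &err, data) == WALK_ABORT)
			return err;

		if (move == MOVE_UP_FROM_LEFT) {
			/* left subtree done: descend into the right one */
			idx = nodes[idx].right;
			move = MOVE_DOWN;
			continue;
		}
		if (move == MOVE_DOWN && nodes[idx].left != INVALID) {
			idx = nodes[idx].left;
			continue;
		}
		/* leaf reached or both subtrees finished: climb one level */
		if (idx == root)
			return 0;
		move = (nodes[nodes[idx].parent].left == idx) ?
			MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT;
		idx = nodes[idx].parent;
	}
}

/* one small callback per use replaces a whole duplicated loop */
static enum walk_ret count_leaves_cb(enum move_type move, struct node *n,
				     int *err, void *data)
{
	int *count = data;

	(void)err;
	if (move == MOVE_DOWN && n->left == INVALID)
		(*count)++;
	return WALK_DEFAULT;
}

int main(void)
{
	/* "(a || b)": node 0 is the OR, nodes 1 and 2 are leaves */
	struct node nodes[] = {
		{ .left = 1, .right = 2, .parent = INVALID, .val = 0 },
		{ .left = INVALID, .right = INVALID, .parent = 0, .val = 1 },
		{ .left = INVALID, .right = INVALID, .parent = 0, .val = 2 },
	};
	int leaves = 0;

	walk_tree(nodes, 0, count_leaves_cb, &leaves);
	printf("leaves: %d\n", leaves);	/* prints "leaves: 2" */
	return 0;
}

Internal nodes being hit at most three times and leaves once is exactly where the 3 * filter->n_preds bound in check_pred_tree() above comes from.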
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
new file mode 100644
index 00000000000..bfd4dba0d60
--- /dev/null
+++ b/kernel/trace/trace_events_filter_test.h
@@ -0,0 +1,50 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM test
3
4#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_TEST_H
6
7#include <linux/tracepoint.h>
8
9TRACE_EVENT(ftrace_test_filter,
10
11 TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h),
12
13 TP_ARGS(a, b, c, d, e, f, g, h),
14
15 TP_STRUCT__entry(
16 __field(int, a)
17 __field(int, b)
18 __field(int, c)
19 __field(int, d)
20 __field(int, e)
21 __field(int, f)
22 __field(int, g)
23 __field(int, h)
24 ),
25
26 TP_fast_assign(
27 __entry->a = a;
28 __entry->b = b;
29 __entry->c = c;
30 __entry->d = d;
31 __entry->e = e;
32 __entry->f = f;
33 __entry->g = g;
34 __entry->h = h;
35 ),
36
37 TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d",
38 __entry->a, __entry->b, __entry->c, __entry->d,
39 __entry->e, __entry->f, __entry->g, __entry->h)
40);
41
42#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */
43
44#undef TRACE_INCLUDE_PATH
45#undef TRACE_INCLUDE_FILE
46#define TRACE_INCLUDE_PATH .
47#define TRACE_INCLUDE_FILE trace_events_filter_test
48
49/* This part must be outside protection */
50#include <trace/define_trace.h>
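
For context on the new header: TRACE_EVENT(ftrace_test_filter, ...) gives every includer a trace_ftrace_test_filter() call, and the CREATE_TRACE_POINTS definition in trace_events_filter.c above instantiates it. A kernel-context sketch of how such an event would normally be fired is below; example_fire_test_event() is a hypothetical caller, not part of the patch, and the self-test itself never fires the event, it only borrows the generated record layout for filter matching.

/* kernel-context sketch, not compilable on its own */
#include "trace_events_filter_test.h"

static void example_fire_test_event(void)
{
	/* one argument per __field() declared in TP_STRUCT__entry() */
	trace_ftrace_test_filter(1, 0, 1, 0, 1, 0, 1, 0);
}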
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 667aa8cc0cf..20dad0d7a16 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly;
23 23
24static DEFINE_PER_CPU(int, tracing_cpu); 24static DEFINE_PER_CPU(int, tracing_cpu);
25 25
26static DEFINE_SPINLOCK(max_trace_lock); 26static DEFINE_RAW_SPINLOCK(max_trace_lock);
27 27
28enum { 28enum {
29 TRACER_IRQS_OFF = (1 << 1), 29 TRACER_IRQS_OFF = (1 << 1),
@@ -321,7 +321,7 @@ check_critical_timing(struct trace_array *tr,
321 if (!report_latency(delta)) 321 if (!report_latency(delta))
322 goto out; 322 goto out;
323 323
324 spin_lock_irqsave(&max_trace_lock, flags); 324 raw_spin_lock_irqsave(&max_trace_lock, flags);
325 325
326 /* check if we are still the max latency */ 326 /* check if we are still the max latency */
327 if (!report_latency(delta)) 327 if (!report_latency(delta))
@@ -344,7 +344,7 @@ check_critical_timing(struct trace_array *tr,
344 max_sequence++; 344 max_sequence++;
345 345
346out_unlock: 346out_unlock:
347 spin_unlock_irqrestore(&max_trace_lock, flags); 347 raw_spin_unlock_irqrestore(&max_trace_lock, flags);
348 348
349out: 349out:
350 data->critical_sequence = max_sequence; 350 data->critical_sequence = max_sequence;
@@ -505,13 +505,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
505#ifdef CONFIG_PREEMPT_TRACER 505#ifdef CONFIG_PREEMPT_TRACER
506void trace_preempt_on(unsigned long a0, unsigned long a1) 506void trace_preempt_on(unsigned long a0, unsigned long a1)
507{ 507{
508 if (preempt_trace()) 508 if (preempt_trace() && !irq_trace())
509 stop_critical_timing(a0, a1); 509 stop_critical_timing(a0, a1);
510} 510}
511 511
512void trace_preempt_off(unsigned long a0, unsigned long a1) 512void trace_preempt_off(unsigned long a0, unsigned long a1)
513{ 513{
514 if (preempt_trace()) 514 if (preempt_trace() && !irq_trace())
515 start_critical_timing(a0, a1); 515 start_critical_timing(a0, a1);
516} 516}
517#endif /* CONFIG_PREEMPT_TRACER */ 517#endif /* CONFIG_PREEMPT_TRACER */
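
Two independent fixes above: max_trace_lock becomes a raw spinlock, and preempt on/off events are now ignored while irqs-off tracing is in progress (preempt_trace() && !irq_trace()). A minimal sketch of the raw spinlock pattern follows; example_lock and example_critical_section() are illustrative names. On mainline a raw spinlock behaves like an ordinary one; the distinction matters mainly for -rt trees, where non-raw spinlocks can sleep and so cannot be taken in a latency-measurement path like this (the usual reading of such conversions, not stated in the hunk itself).

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);

static void example_critical_section(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	/* ... work that must not be preempted or interrupted ... */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}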
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5fb3697bf0e..00d527c945a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp)
836} 836}
837 837
838/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
839static void unregister_trace_probe(struct trace_probe *tp) 839static int unregister_trace_probe(struct trace_probe *tp)
840{ 840{
841 /* Enabled event can not be unregistered */
842 if (trace_probe_is_enabled(tp))
843 return -EBUSY;
844
841 __unregister_trace_probe(tp); 845 __unregister_trace_probe(tp);
842 list_del(&tp->list); 846 list_del(&tp->list);
843 unregister_probe_event(tp); 847 unregister_probe_event(tp);
848
849 return 0;
844} 850}
845 851
846/* Register a trace_probe and probe_event */ 852/* Register a trace_probe and probe_event */
@@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp)
854 /* Delete old (same name) event if exist */ 860 /* Delete old (same name) event if exist */
855 old_tp = find_trace_probe(tp->call.name, tp->call.class->system); 861 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
856 if (old_tp) { 862 if (old_tp) {
857 unregister_trace_probe(old_tp); 863 ret = unregister_trace_probe(old_tp);
864 if (ret < 0)
865 goto end;
858 free_trace_probe(old_tp); 866 free_trace_probe(old_tp);
859 } 867 }
860 868
@@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb,
892 mutex_lock(&probe_lock); 900 mutex_lock(&probe_lock);
893 list_for_each_entry(tp, &probe_list, list) { 901 list_for_each_entry(tp, &probe_list, list) {
894 if (trace_probe_within_module(tp, mod)) { 902 if (trace_probe_within_module(tp, mod)) {
903 /* Don't need to check busy - this should have gone. */
895 __unregister_trace_probe(tp); 904 __unregister_trace_probe(tp);
896 ret = __register_trace_probe(tp); 905 ret = __register_trace_probe(tp);
897 if (ret) 906 if (ret)
@@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv)
1205 return -ENOENT; 1214 return -ENOENT;
1206 } 1215 }
1207 /* delete an event */ 1216 /* delete an event */
1208 unregister_trace_probe(tp); 1217 ret = unregister_trace_probe(tp);
1209 free_trace_probe(tp); 1218 if (ret == 0)
1219 free_trace_probe(tp);
1210 mutex_unlock(&probe_lock); 1220 mutex_unlock(&probe_lock);
1211 return 0; 1221 return ret;
1212 } 1222 }
1213 1223
1214 if (argc < 2) { 1224 if (argc < 2) {
@@ -1317,18 +1327,29 @@ error:
1317 return ret; 1327 return ret;
1318} 1328}
1319 1329
1320static void release_all_trace_probes(void) 1330static int release_all_trace_probes(void)
1321{ 1331{
1322 struct trace_probe *tp; 1332 struct trace_probe *tp;
1333 int ret = 0;
1323 1334
1324 mutex_lock(&probe_lock); 1335 mutex_lock(&probe_lock);
1336 /* Ensure no probe is in use. */
1337 list_for_each_entry(tp, &probe_list, list)
1338 if (trace_probe_is_enabled(tp)) {
1339 ret = -EBUSY;
1340 goto end;
1341 }
1325 /* TODO: Use batch unregistration */ 1342 /* TODO: Use batch unregistration */
1326 while (!list_empty(&probe_list)) { 1343 while (!list_empty(&probe_list)) {
1327 tp = list_entry(probe_list.next, struct trace_probe, list); 1344 tp = list_entry(probe_list.next, struct trace_probe, list);
1328 unregister_trace_probe(tp); 1345 unregister_trace_probe(tp);
1329 free_trace_probe(tp); 1346 free_trace_probe(tp);
1330 } 1347 }
1348
1349end:
1331 mutex_unlock(&probe_lock); 1350 mutex_unlock(&probe_lock);
1351
1352 return ret;
1332} 1353}
1333 1354
1334/* Probes listing interfaces */ 1355/* Probes listing interfaces */
@@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = {
1380 1401
1381static int probes_open(struct inode *inode, struct file *file) 1402static int probes_open(struct inode *inode, struct file *file)
1382{ 1403{
1383 if ((file->f_mode & FMODE_WRITE) && 1404 int ret;
1384 (file->f_flags & O_TRUNC)) 1405
1385 release_all_trace_probes(); 1406 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
1407 ret = release_all_trace_probes();
1408 if (ret < 0)
1409 return ret;
1410 }
1386 1411
1387 return seq_open(file, &probes_seq_op); 1412 return seq_open(file, &probes_seq_op);
1388} 1413}
@@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void)
2055 2080
2056 ret = target(1, 2, 3, 4, 5, 6); 2081 ret = target(1, 2, 3, 4, 5, 6);
2057 2082
2083 /* Disable trace points before removing it */
2084 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
2085 if (WARN_ON_ONCE(tp == NULL)) {
2086 pr_warning("error on getting test probe.\n");
2087 warn++;
2088 } else
2089 disable_trace_probe(tp, TP_FLAG_TRACE);
2090
2091 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
2092 if (WARN_ON_ONCE(tp == NULL)) {
2093 pr_warning("error on getting 2nd test probe.\n");
2094 warn++;
2095 } else
2096 disable_trace_probe(tp, TP_FLAG_TRACE);
2097
2058 ret = command_trace_probe("-:testprobe"); 2098 ret = command_trace_probe("-:testprobe");
2059 if (WARN_ON_ONCE(ret)) { 2099 if (WARN_ON_ONCE(ret)) {
2060 pr_warning("error on deleting a probe.\n"); 2100 pr_warning("error on deleting a probe.\n");
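
The trace_kprobe.c changes make unregister_trace_probe() fail with -EBUSY while the probe's event is still enabled, so callers may no longer unconditionally free the probe. A fragment in the spirit of the updated delete path is below; example_delete_probe() is a hypothetical caller for illustration, the real call sites are create_trace_probe(), release_all_trace_probes() and probes_open() shown above.

/* fragment in the context of trace_kprobe.c above, not standalone */
static int example_delete_probe(struct trace_probe *tp)
{
	int ret;

	mutex_lock(&probe_lock);
	ret = unregister_trace_probe(tp);	/* -EBUSY while still enabled */
	if (!ret)
		free_trace_probe(tp);		/* only free on success */
	mutex_unlock(&probe_lock);
	return ret;
}

The practical effect is that deleting or truncating kprobe_events now refuses with EBUSY until the events are disabled, which is why the self-test above gained explicit disable_trace_probe() calls before removing its test probes.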
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 1f06468a10d..6fd4ffd042f 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
59 continue; 59 continue;
60 } 60 }
61 61
62 fmt = NULL;
62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); 63 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
63 if (tb_fmt) 64 if (tb_fmt) {
64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); 65 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) { 66 if (fmt) {
66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 67 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
67 strcpy(fmt, *iter); 68 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt; 69 tb_fmt->fmt = fmt;
69 *iter = tb_fmt->fmt; 70 } else
70 } else { 71 kfree(tb_fmt);
71 kfree(tb_fmt);
72 *iter = NULL;
73 } 72 }
73 *iter = fmt;
74
74 } 75 }
75 mutex_unlock(&btrace_mutex); 76 mutex_unlock(&btrace_mutex);
76} 77}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ee7b5a0bb9f..cb654542c1a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,6 +2,7 @@
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
5#include <linux/ftrace.h> 6#include <linux/ftrace.h>
6#include <linux/perf_event.h> 7#include <linux/perf_event.h>
7#include <asm/syscall.h> 8#include <asm/syscall.h>
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index b219f1449c5..db110b8ae03 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
34static const int tracepoint_debug; 34static const int tracepoint_debug;
35 35
36/* 36/*
37 * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the 37 * Tracepoints mutex protects the builtin and module tracepoints and the hash
38 * builtin and module tracepoints and the hash table. 38 * table, as well as the local module list.
39 */ 39 */
40static DEFINE_MUTEX(tracepoints_mutex); 40static DEFINE_MUTEX(tracepoints_mutex);
41 41
42#ifdef CONFIG_MODULES
43/* Local list of struct module */
44static LIST_HEAD(tracepoint_module_list);
45#endif /* CONFIG_MODULES */
46
42/* 47/*
43 * Tracepoint hash table, containing the active tracepoints. 48 * Tracepoint hash table, containing the active tracepoints.
44 * Protected by tracepoints_mutex. 49 * Protected by tracepoints_mutex.
@@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem)
292 * @end: end of the range 297 * @end: end of the range
293 * 298 *
294 * Updates the probe callback corresponding to a range of tracepoints. 299 * Updates the probe callback corresponding to a range of tracepoints.
300 * Called with tracepoints_mutex held.
295 */ 301 */
296void tracepoint_update_probe_range(struct tracepoint * const *begin, 302static void tracepoint_update_probe_range(struct tracepoint * const *begin,
297 struct tracepoint * const *end) 303 struct tracepoint * const *end)
298{ 304{
299 struct tracepoint * const *iter; 305 struct tracepoint * const *iter;
300 struct tracepoint_entry *mark_entry; 306 struct tracepoint_entry *mark_entry;
@@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
302 if (!begin) 308 if (!begin)
303 return; 309 return;
304 310
305 mutex_lock(&tracepoints_mutex);
306 for (iter = begin; iter < end; iter++) { 311 for (iter = begin; iter < end; iter++) {
307 mark_entry = get_tracepoint((*iter)->name); 312 mark_entry = get_tracepoint((*iter)->name);
308 if (mark_entry) { 313 if (mark_entry) {
@@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
312 disable_tracepoint(*iter); 317 disable_tracepoint(*iter);
313 } 318 }
314 } 319 }
315 mutex_unlock(&tracepoints_mutex);
316} 320}
317 321
322#ifdef CONFIG_MODULES
323void module_update_tracepoints(void)
324{
325 struct tp_module *tp_mod;
326
327 list_for_each_entry(tp_mod, &tracepoint_module_list, list)
328 tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
329 tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
330}
331#else /* CONFIG_MODULES */
332void module_update_tracepoints(void)
333{
334}
335#endif /* CONFIG_MODULES */
336
337
318/* 338/*
319 * Update probes, removing the faulty probes. 339 * Update probes, removing the faulty probes.
340 * Called with tracepoints_mutex held.
320 */ 341 */
321static void tracepoint_update_probes(void) 342static void tracepoint_update_probes(void)
322{ 343{
@@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data)
359 380
360 mutex_lock(&tracepoints_mutex); 381 mutex_lock(&tracepoints_mutex);
361 old = tracepoint_add_probe(name, probe, data); 382 old = tracepoint_add_probe(name, probe, data);
362 mutex_unlock(&tracepoints_mutex); 383 if (IS_ERR(old)) {
363 if (IS_ERR(old)) 384 mutex_unlock(&tracepoints_mutex);
364 return PTR_ERR(old); 385 return PTR_ERR(old);
365 386 }
366 tracepoint_update_probes(); /* may update entry */ 387 tracepoint_update_probes(); /* may update entry */
388 mutex_unlock(&tracepoints_mutex);
367 release_probes(old); 389 release_probes(old);
368 return 0; 390 return 0;
369} 391}
@@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data)
402 424
403 mutex_lock(&tracepoints_mutex); 425 mutex_lock(&tracepoints_mutex);
404 old = tracepoint_remove_probe(name, probe, data); 426 old = tracepoint_remove_probe(name, probe, data);
405 mutex_unlock(&tracepoints_mutex); 427 if (IS_ERR(old)) {
406 if (IS_ERR(old)) 428 mutex_unlock(&tracepoints_mutex);
407 return PTR_ERR(old); 429 return PTR_ERR(old);
408 430 }
409 tracepoint_update_probes(); /* may update entry */ 431 tracepoint_update_probes(); /* may update entry */
432 mutex_unlock(&tracepoints_mutex);
410 release_probes(old); 433 release_probes(old);
411 return 0; 434 return 0;
412} 435}
@@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void)
489 if (!list_empty(&old_probes)) 512 if (!list_empty(&old_probes))
490 list_replace_init(&old_probes, &release_probes); 513 list_replace_init(&old_probes, &release_probes);
491 need_update = 0; 514 need_update = 0;
492 mutex_unlock(&tracepoints_mutex);
493
494 tracepoint_update_probes(); 515 tracepoint_update_probes();
516 mutex_unlock(&tracepoints_mutex);
495 list_for_each_entry_safe(pos, next, &release_probes, u.list) { 517 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
496 list_del(&pos->u.list); 518 list_del(&pos->u.list);
497 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); 519 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
@@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
509 * Will return the first tracepoint in the range if the input tracepoint is 531 * Will return the first tracepoint in the range if the input tracepoint is
510 * NULL. 532 * NULL.
511 */ 533 */
512int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, 534static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
513 struct tracepoint * const *begin, struct tracepoint * const *end) 535 struct tracepoint * const *begin, struct tracepoint * const *end)
514{ 536{
515 if (!*tracepoint && begin != end) { 537 if (!*tracepoint && begin != end) {
@@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
520 return 1; 542 return 1;
521 return 0; 543 return 0;
522} 544}
523EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
524 545
546#ifdef CONFIG_MODULES
525static void tracepoint_get_iter(struct tracepoint_iter *iter) 547static void tracepoint_get_iter(struct tracepoint_iter *iter)
526{ 548{
527 int found = 0; 549 int found = 0;
550 struct tp_module *iter_mod;
528 551
529 /* Core kernel tracepoints */ 552 /* Core kernel tracepoints */
530 if (!iter->module) { 553 if (!iter->module) {
@@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
534 if (found) 557 if (found)
535 goto end; 558 goto end;
536 } 559 }
537 /* tracepoints in modules. */ 560 /* Tracepoints in modules */
538 found = module_get_iter_tracepoints(iter); 561 mutex_lock(&tracepoints_mutex);
562 list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
563 /*
564 * Sorted module list
565 */
566 if (iter_mod < iter->module)
567 continue;
568 else if (iter_mod > iter->module)
569 iter->tracepoint = NULL;
570 found = tracepoint_get_iter_range(&iter->tracepoint,
571 iter_mod->tracepoints_ptrs,
572 iter_mod->tracepoints_ptrs
573 + iter_mod->num_tracepoints);
574 if (found) {
575 iter->module = iter_mod;
576 break;
577 }
578 }
579 mutex_unlock(&tracepoints_mutex);
539end: 580end:
540 if (!found) 581 if (!found)
541 tracepoint_iter_reset(iter); 582 tracepoint_iter_reset(iter);
542} 583}
584#else /* CONFIG_MODULES */
585static void tracepoint_get_iter(struct tracepoint_iter *iter)
586{
587 int found = 0;
588
589 /* Core kernel tracepoints */
590 found = tracepoint_get_iter_range(&iter->tracepoint,
591 __start___tracepoints_ptrs,
592 __stop___tracepoints_ptrs);
593 if (!found)
594 tracepoint_iter_reset(iter);
595}
596#endif /* CONFIG_MODULES */
543 597
544void tracepoint_iter_start(struct tracepoint_iter *iter) 598void tracepoint_iter_start(struct tracepoint_iter *iter)
545{ 599{
@@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
566 620
567void tracepoint_iter_reset(struct tracepoint_iter *iter) 621void tracepoint_iter_reset(struct tracepoint_iter *iter)
568{ 622{
623#ifdef CONFIG_MODULES
569 iter->module = NULL; 624 iter->module = NULL;
625#endif /* CONFIG_MODULES */
570 iter->tracepoint = NULL; 626 iter->tracepoint = NULL;
571} 627}
572EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 628EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
573 629
574#ifdef CONFIG_MODULES 630#ifdef CONFIG_MODULES
631static int tracepoint_module_coming(struct module *mod)
632{
633 struct tp_module *tp_mod, *iter;
634 int ret = 0;
635
636 /*
 636 * We skip modules that taint the kernel, especially those with a different
 637 * module header (for forced load), to make sure we don't cause a crash.
639 */
640 if (mod->taints)
641 return 0;
642 mutex_lock(&tracepoints_mutex);
643 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
644 if (!tp_mod) {
645 ret = -ENOMEM;
646 goto end;
647 }
648 tp_mod->num_tracepoints = mod->num_tracepoints;
649 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
650
651 /*
652 * tracepoint_module_list is kept sorted by struct module pointer
653 * address for iteration on tracepoints from a seq_file that can release
654 * the mutex between calls.
655 */
656 list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
657 BUG_ON(iter == tp_mod); /* Should never be in the list twice */
658 if (iter < tp_mod) {
659 /* We belong to the location right after iter. */
660 list_add(&tp_mod->list, &iter->list);
661 goto module_added;
662 }
663 }
664 /* We belong to the beginning of the list */
665 list_add(&tp_mod->list, &tracepoint_module_list);
666module_added:
667 tracepoint_update_probe_range(mod->tracepoints_ptrs,
668 mod->tracepoints_ptrs + mod->num_tracepoints);
669end:
670 mutex_unlock(&tracepoints_mutex);
671 return ret;
672}
673
674static int tracepoint_module_going(struct module *mod)
675{
676 struct tp_module *pos;
677
678 mutex_lock(&tracepoints_mutex);
679 tracepoint_update_probe_range(mod->tracepoints_ptrs,
680 mod->tracepoints_ptrs + mod->num_tracepoints);
681 list_for_each_entry(pos, &tracepoint_module_list, list) {
682 if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
683 list_del(&pos->list);
684 kfree(pos);
685 break;
686 }
687 }
688 /*
689 * In the case of modules that were tainted at "coming", we'll simply
690 * walk through the list without finding it. We cannot use the "tainted"
691 * flag on "going", in case a module taints the kernel only after being
692 * loaded.
693 */
694 mutex_unlock(&tracepoints_mutex);
695 return 0;
696}
575 697
576int tracepoint_module_notify(struct notifier_block *self, 698int tracepoint_module_notify(struct notifier_block *self,
577 unsigned long val, void *data) 699 unsigned long val, void *data)
578{ 700{
579 struct module *mod = data; 701 struct module *mod = data;
702 int ret = 0;
580 703
581 switch (val) { 704 switch (val) {
582 case MODULE_STATE_COMING: 705 case MODULE_STATE_COMING:
706 ret = tracepoint_module_coming(mod);
707 break;
708 case MODULE_STATE_LIVE:
709 break;
583 case MODULE_STATE_GOING: 710 case MODULE_STATE_GOING:
584 tracepoint_update_probe_range(mod->tracepoints_ptrs, 711 ret = tracepoint_module_going(mod);
585 mod->tracepoints_ptrs + mod->num_tracepoints);
586 break; 712 break;
587 } 713 }
588 return 0; 714 return ret;
589} 715}
590 716
591struct notifier_block tracepoint_module_nb = { 717struct notifier_block tracepoint_module_nb = {
@@ -598,7 +724,6 @@ static int init_tracepoints(void)
598 return register_module_notifier(&tracepoint_module_nb); 724 return register_module_notifier(&tracepoint_module_nb);
599} 725}
600__initcall(init_tracepoints); 726__initcall(init_tracepoints);
601
602#endif /* CONFIG_MODULES */ 727#endif /* CONFIG_MODULES */
603 728
604#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS 729#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
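
The tracepoint.c rework gives the tracepoint code its own list of modules (tp_module), inserted in order of struct module pointer address at MODULE_STATE_COMING and removed at MODULE_STATE_GOING, so the seq_file iterator can drop tracepoints_mutex between calls and still resume at the right module. A userspace sketch of just that ordering trick is below; mod_entry, insert_sorted and resume_after are illustrative names, not the kernel code.

struct mod_entry {
	struct mod_entry *next;
	const void *mod;	/* stands in for the struct module pointer */
};

/* insert keeping the list sorted by ->mod address, lowest first */
static void insert_sorted(struct mod_entry **head, struct mod_entry *new)
{
	struct mod_entry **pos = head;

	while (*pos && (*pos)->mod < new->mod)
		pos = &(*pos)->next;
	new->next = *pos;
	*pos = new;
}

/* resume iteration at the first entry whose module is >= the last one seen */
static struct mod_entry *resume_after(struct mod_entry *head,
				      const void *last_mod)
{
	while (head && head->mod < last_mod)
		head = head->next;
	return head;
}

The kernel variant expresses the same idea with the list API, walking tracepoint_module_list in reverse with list_for_each_entry_reverse() to find the insertion point.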
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 24dc60d9fa1..5bbfac85866 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -78,6 +78,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
78 78
79#define KB 1024 79#define KB 1024
80#define MB (1024*KB) 80#define MB (1024*KB)
81#define KB_MASK (~(KB-1))
81/* 82/*
82 * fill in extended accounting fields 83 * fill in extended accounting fields
83 */ 84 */
@@ -95,14 +96,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
95 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; 96 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
96 mmput(mm); 97 mmput(mm);
97 } 98 }
98 stats->read_char = p->ioac.rchar; 99 stats->read_char = p->ioac.rchar & KB_MASK;
99 stats->write_char = p->ioac.wchar; 100 stats->write_char = p->ioac.wchar & KB_MASK;
100 stats->read_syscalls = p->ioac.syscr; 101 stats->read_syscalls = p->ioac.syscr & KB_MASK;
101 stats->write_syscalls = p->ioac.syscw; 102 stats->write_syscalls = p->ioac.syscw & KB_MASK;
102#ifdef CONFIG_TASK_IO_ACCOUNTING 103#ifdef CONFIG_TASK_IO_ACCOUNTING
103 stats->read_bytes = p->ioac.read_bytes; 104 stats->read_bytes = p->ioac.read_bytes & KB_MASK;
104 stats->write_bytes = p->ioac.write_bytes; 105 stats->write_bytes = p->ioac.write_bytes & KB_MASK;
105 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; 106 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK;
106#else 107#else
107 stats->read_bytes = 0; 108 stats->read_bytes = 0;
108 stats->write_bytes = 0; 109 stats->write_bytes = 0;
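
KB_MASK is ~(KB - 1), i.e. ~1023, so ANDing a counter with it clears the low ten bits and rounds the value down to a whole multiple of 1024 before it is reported through taskstats. A tiny standalone example of the arithmetic:

#include <stdio.h>

#define KB 1024
#define KB_MASK (~(KB - 1))

int main(void)
{
	unsigned long long rchar = 1234567;

	/* 1234567 & ~1023 = 1233920, i.e. 1205 full KB */
	printf("%llu -> %llu\n", rchar, rchar & KB_MASK);
	return 0;
}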
diff --git a/kernel/up.c b/kernel/up.c
index 1ff27a28bb7..c54c75e9faf 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -4,7 +4,7 @@
4 4
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h> 6#include <linux/kernel.h>
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9 9
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 10int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
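
The module.h to export.h swaps here and in the files below follow the tree-wide rule that code which only exports symbols, and is not itself built as a module, needs nothing heavier than <linux/export.h>. A minimal illustrative file (not part of the patch):

#include <linux/export.h>

int example_core_helper(int x)
{
	return x * 2;
}
EXPORT_SYMBOL_GPL(example_core_helper);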
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 92cb706c7fc..1744bb80f1f 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -2,7 +2,7 @@
2#include <linux/user-return-notifier.h> 2#include <linux/user-return-notifier.h>
3#include <linux/percpu.h> 3#include <linux/percpu.h>
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/module.h> 5#include <linux/export.h>
6 6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); 7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8 8
diff --git a/kernel/user.c b/kernel/user.c
index 9e03e9c1df8..71dd2363ab0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,7 +14,7 @@
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/key.h> 15#include <linux/key.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/* 20/*
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9da289c34f2..3b906e98b1d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -5,7 +5,7 @@
5 * License. 5 * License.
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/export.h>
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index bff131b9510..405caf91aad 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -9,7 +9,7 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index a2cd77e70d4..63da38c2d82 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -9,10 +9,11 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/wait.h>
16 17
17static void *get_uts(ctl_table *table, int write) 18static void *get_uts(ctl_table *table, int write)
18{ 19{
@@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write,
51 uts_table.data = get_uts(table, write); 52 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 53 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 54 put_uts(table, write, uts_table.data);
55
56 if (write)
57 proc_sys_poll_notify(table->poll);
58
54 return r; 59 return r;
55} 60}
56#else 61#else
57#define proc_do_uts_string NULL 62#define proc_do_uts_string NULL
58#endif 63#endif
59 64
65static DEFINE_CTL_TABLE_POLL(hostname_poll);
66static DEFINE_CTL_TABLE_POLL(domainname_poll);
67
60static struct ctl_table uts_kern_table[] = { 68static struct ctl_table uts_kern_table[] = {
61 { 69 {
62 .procname = "ostype", 70 .procname = "ostype",
@@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = {
85 .maxlen = sizeof(init_uts_ns.name.nodename), 93 .maxlen = sizeof(init_uts_ns.name.nodename),
86 .mode = 0644, 94 .mode = 0644,
87 .proc_handler = proc_do_uts_string, 95 .proc_handler = proc_do_uts_string,
96 .poll = &hostname_poll,
88 }, 97 },
89 { 98 {
90 .procname = "domainname", 99 .procname = "domainname",
@@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = {
92 .maxlen = sizeof(init_uts_ns.name.domainname), 101 .maxlen = sizeof(init_uts_ns.name.domainname),
93 .mode = 0644, 102 .mode = 0644,
94 .proc_handler = proc_do_uts_string, 103 .proc_handler = proc_do_uts_string,
104 .poll = &domainname_poll,
95 }, 105 },
96 {} 106 {}
97}; 107};
@@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = {
105 {} 115 {}
106}; 116};
107 117
118#ifdef CONFIG_PROC_SYSCTL
119/*
120 * Notify userspace about a change in a certain entry of uts_kern_table,
121 * identified by the parameter proc.
122 */
123void uts_proc_notify(enum uts_proc proc)
124{
125 struct ctl_table *table = &uts_kern_table[proc];
126
127 proc_sys_poll_notify(table->poll);
128}
129#endif
130
108static int __init utsname_sysctl_init(void) 131static int __init utsname_sysctl_init(void)
109{ 132{
110 register_sysctl_table(uts_root_table); 133 register_sysctl_table(uts_root_table);
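
With the ctl_table poll handles and proc_sys_poll_notify() call wired up above, userspace can block until the hostname or domainname sysctl changes rather than re-reading it. A sketch of the expected usage is below, assuming the proc_sysctl poll semantics of this era: read the file once to latch its current state, then poll(); a change is reported as POLLERR|POLLPRI on the descriptor.

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	struct pollfd pfd;
	ssize_t n;

	pfd.fd = open("/proc/sys/kernel/hostname", O_RDONLY);
	if (pfd.fd < 0)
		return 1;
	pfd.events = POLLPRI;

	for (;;) {
		/* (re)read to latch the current value before sleeping */
		lseek(pfd.fd, 0, SEEK_SET);
		n = read(pfd.fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("hostname: %s", buf);
		}
		if (poll(&pfd, 1, -1) < 0)	/* blocks until notified */
			break;
	}
	close(pfd.fd);
	return 0;
}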
diff --git a/kernel/wait.c b/kernel/wait.c
index f45ea8d2a1c..26fa7797f90 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -4,7 +4,7 @@
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 William Irwin, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 36491cd5b7d..1d7bca7f4f5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -321,7 +321,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
321 */ 321 */
322static int watchdog(void *unused) 322static int watchdog(void *unused)
323{ 323{
324 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 324 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
326 326
327 sched_setscheduler(current, SCHED_FIFO, &param); 327 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -350,7 +350,8 @@ static int watchdog(void *unused)
350 set_current_state(TASK_INTERRUPTIBLE); 350 set_current_state(TASK_INTERRUPTIBLE);
351 } 351 }
352 __set_current_state(TASK_RUNNING); 352 __set_current_state(TASK_RUNNING);
353 353 param.sched_priority = 0;
354 sched_setscheduler(current, SCHED_NORMAL, &param);
354 return 0; 355 return 0;
355} 356}
356 357
@@ -438,7 +439,7 @@ static int watchdog_enable(int cpu)
438 439
439 /* create the watchdog thread */ 440 /* create the watchdog thread */
440 if (!p) { 441 if (!p) {
441 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 442 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
442 if (IS_ERR(p)) { 443 if (IS_ERR(p)) {
443 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 444 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
444 if (!err) { 445 if (!err) {
@@ -480,6 +481,8 @@ static void watchdog_disable(int cpu)
480 } 481 }
481} 482}
482 483
484/* sysctl functions */
485#ifdef CONFIG_SYSCTL
483static void watchdog_enable_all_cpus(void) 486static void watchdog_enable_all_cpus(void)
484{ 487{
485 int cpu; 488 int cpu;
@@ -509,8 +512,6 @@ static void watchdog_disable_all_cpus(void)
509} 512}
510 513
511 514
512/* sysctl functions */
513#ifdef CONFIG_SYSCTL
514/* 515/*
515 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh 516 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
516 */ 517 */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 25fb1b0e53f..42fa9ad0a81 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -23,7 +23,7 @@
23 * Please read Documentation/workqueue.txt for details. 23 * Please read Documentation/workqueue.txt for details.
24 */ 24 */
25 25
26#include <linux/module.h> 26#include <linux/export.h>
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/init.h> 29#include <linux/init.h>
@@ -2412,8 +2412,13 @@ reflush:
2412 2412
2413 for_each_cwq_cpu(cpu, wq) { 2413 for_each_cwq_cpu(cpu, wq) {
2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2415 bool drained;
2415 2416
2416 if (!cwq->nr_active && list_empty(&cwq->delayed_works)) 2417 spin_lock_irq(&cwq->gcwq->lock);
2418 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2419 spin_unlock_irq(&cwq->gcwq->lock);
2420
2421 if (drained)
2417 continue; 2422 continue;
2418 2423
2419 if (++flush_cnt == 10 || 2424 if (++flush_cnt == 10 ||