about | summary | refs | log | tree | commit | diff | stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile 2
-rw-r--r-- kernel/async.c 4
-rw-r--r-- kernel/cgroup.c 18
-rw-r--r-- kernel/cred.c 18
-rw-r--r-- kernel/events/core.c 97
-rw-r--r-- kernel/freezer.c 2
-rw-r--r-- kernel/futex.c 10
-rw-r--r-- kernel/irq/irqdomain.c 6
-rw-r--r-- kernel/irq/pm.c 48
-rw-r--r-- kernel/irq_work.c 91
-rw-r--r-- kernel/kmod.c 4
-rw-r--r-- kernel/kprobes.c 34
-rw-r--r-- kernel/latencytop.c 14
-rw-r--r-- kernel/lockdep.c 240
-rw-r--r-- kernel/module.c 47
-rw-r--r-- kernel/params.c 21
-rw-r--r-- kernel/pid.c 4
-rw-r--r-- kernel/posix-cpu-timers.c 22
-rw-r--r-- kernel/power/Kconfig 4
-rw-r--r-- kernel/power/Makefile 4
-rw-r--r-- kernel/power/console.c 4
-rw-r--r-- kernel/power/hibernate.c 53
-rw-r--r-- kernel/power/main.c 102
-rw-r--r-- kernel/power/power.h 4
-rw-r--r-- kernel/power/process.c 30
-rw-r--r-- kernel/power/qos.c (renamed from kernel/pm_qos_params.c) 273
-rw-r--r-- kernel/power/snapshot.c 18
-rw-r--r-- kernel/power/suspend.c 17
-rw-r--r-- kernel/power/swap.c 818
-rw-r--r-- kernel/printk.c 46
-rw-r--r-- kernel/ptrace.c 23
-rw-r--r-- kernel/rcu.h 85
-rw-r--r-- kernel/rcupdate.c 26
-rw-r--r-- kernel/rcutiny.c 117
-rw-r--r-- kernel/rcutiny_plugin.h 134
-rw-r--r-- kernel/rcutorture.c 77
-rw-r--r-- kernel/rcutree.c 290
-rw-r--r-- kernel/rcutree.h 17
-rw-r--r-- kernel/rcutree_plugin.h 150
-rw-r--r-- kernel/rcutree_trace.c 13
-rw-r--r-- kernel/resource.c 7
-rw-r--r-- kernel/rtmutex-debug.c 77
-rw-r--r-- kernel/rtmutex.c 8
-rw-r--r-- kernel/sched.c 707
-rw-r--r-- kernel/sched_cpupri.c 89
-rw-r--r-- kernel/sched_cpupri.h 7
-rw-r--r-- kernel/sched_fair.c 761
-rw-r--r-- kernel/sched_features.h 5
-rw-r--r-- kernel/sched_rt.c 103
-rw-r--r-- kernel/sched_stats.h 12
-rw-r--r-- kernel/sched_stoptask.c 2
-rw-r--r-- kernel/semaphore.c 28
-rw-r--r-- kernel/signal.c 24
-rw-r--r-- kernel/sys.c 3
-rw-r--r-- kernel/sysctl.c 10
-rw-r--r-- kernel/sysctl_binary.c 2
-rw-r--r-- kernel/taskstats.c 1
-rw-r--r-- kernel/time.c 2
-rw-r--r-- kernel/time/Kconfig 2
-rw-r--r-- kernel/time/alarmtimer.c 266
-rw-r--r-- kernel/time/clockevents.c 129
-rw-r--r-- kernel/time/clocksource.c 38
-rw-r--r-- kernel/time/tick-broadcast.c 4
-rw-r--r-- kernel/time/tick-common.c 4
-rw-r--r-- kernel/time/tick-internal.h 2
-rw-r--r-- kernel/time/tick-oneshot.c 77
-rw-r--r-- kernel/time/tick-sched.c 61
-rw-r--r-- kernel/time/timer_stats.c 6
-rw-r--r-- kernel/trace/Makefile 5
-rw-r--r-- kernel/trace/ftrace.c 8
-rw-r--r-- kernel/trace/ring_buffer.c 122
-rw-r--r-- kernel/trace/rpm-traces.c 20
-rw-r--r-- kernel/trace/trace.c 191
-rw-r--r-- kernel/trace/trace.h 16
-rw-r--r-- kernel/trace/trace_clock.c 12
-rw-r--r-- kernel/trace/trace_events_filter.c 795
-rw-r--r-- kernel/trace/trace_events_filter_test.h 50
-rw-r--r-- kernel/trace/trace_irqsoff.c 10
-rw-r--r-- kernel/trace/trace_kprobe.c 58
-rw-r--r-- kernel/trace/trace_printk.c 19
-rw-r--r-- kernel/tracepoint.c 169
-rw-r--r-- kernel/tsacct.c 15
-rw-r--r-- kernel/watchdog.c 7
-rw-r--r-- kernel/workqueue.c 7
84 files changed, 4872 insertions, 2056 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 988cb3da7031..e898c5b9d02c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o sched_clock.o cred.o \
13 async.o range.o 13 async.o range.o
14obj-y += groups.o 14obj-y += groups.o
15 15
diff --git a/kernel/async.c b/kernel/async.c
index d5fe7af0de2e..4c2843c0043e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -120,7 +120,7 @@ static void async_run_entry_fn(struct work_struct *work)
120 struct async_entry *entry = 120 struct async_entry *entry =
121 container_of(work, struct async_entry, work); 121 container_of(work, struct async_entry, work);
122 unsigned long flags; 122 unsigned long flags;
123 ktime_t calltime, delta, rettime; 123 ktime_t uninitialized_var(calltime), delta, rettime;
124 124
125 /* 1) move self to the running queue */ 125 /* 1) move self to the running queue */
126 spin_lock_irqsave(&async_lock, flags); 126 spin_lock_irqsave(&async_lock, flags);
@@ -269,7 +269,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
269void async_synchronize_cookie_domain(async_cookie_t cookie, 269void async_synchronize_cookie_domain(async_cookie_t cookie,
270 struct list_head *running) 270 struct list_head *running)
271{ 271{
272 ktime_t starttime, delta, endtime; 272 ktime_t uninitialized_var(starttime), delta, endtime;
273 273
274 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d2b6ceea95d..453100a4159d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -265,7 +265,7 @@ list_for_each_entry(_root, &roots, root_list)
265/* the list of cgroups eligible for automatic release. Protected by 265/* the list of cgroups eligible for automatic release. Protected by
266 * release_list_lock */ 266 * release_list_lock */
267static LIST_HEAD(release_list); 267static LIST_HEAD(release_list);
268static DEFINE_SPINLOCK(release_list_lock); 268static DEFINE_RAW_SPINLOCK(release_list_lock);
269static void cgroup_release_agent(struct work_struct *work); 269static void cgroup_release_agent(struct work_struct *work);
270static DECLARE_WORK(release_agent_work, cgroup_release_agent); 270static DECLARE_WORK(release_agent_work, cgroup_release_agent);
271static void check_for_release(struct cgroup *cgrp); 271static void check_for_release(struct cgroup *cgrp);
@@ -4014,11 +4014,11 @@ again:
4014 finish_wait(&cgroup_rmdir_waitq, &wait); 4014 finish_wait(&cgroup_rmdir_waitq, &wait);
4015 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4015 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4016 4016
4017 spin_lock(&release_list_lock); 4017 raw_spin_lock(&release_list_lock);
4018 set_bit(CGRP_REMOVED, &cgrp->flags); 4018 set_bit(CGRP_REMOVED, &cgrp->flags);
4019 if (!list_empty(&cgrp->release_list)) 4019 if (!list_empty(&cgrp->release_list))
4020 list_del_init(&cgrp->release_list); 4020 list_del_init(&cgrp->release_list);
4021 spin_unlock(&release_list_lock); 4021 raw_spin_unlock(&release_list_lock);
4022 4022
4023 cgroup_lock_hierarchy(cgrp->root); 4023 cgroup_lock_hierarchy(cgrp->root);
4024 /* delete this cgroup from parent->children */ 4024 /* delete this cgroup from parent->children */
@@ -4671,13 +4671,13 @@ static void check_for_release(struct cgroup *cgrp)
4671 * already queued for a userspace notification, queue 4671 * already queued for a userspace notification, queue
4672 * it now */ 4672 * it now */
4673 int need_schedule_work = 0; 4673 int need_schedule_work = 0;
4674 spin_lock(&release_list_lock); 4674 raw_spin_lock(&release_list_lock);
4675 if (!cgroup_is_removed(cgrp) && 4675 if (!cgroup_is_removed(cgrp) &&
4676 list_empty(&cgrp->release_list)) { 4676 list_empty(&cgrp->release_list)) {
4677 list_add(&cgrp->release_list, &release_list); 4677 list_add(&cgrp->release_list, &release_list);
4678 need_schedule_work = 1; 4678 need_schedule_work = 1;
4679 } 4679 }
4680 spin_unlock(&release_list_lock); 4680 raw_spin_unlock(&release_list_lock);
4681 if (need_schedule_work) 4681 if (need_schedule_work)
4682 schedule_work(&release_agent_work); 4682 schedule_work(&release_agent_work);
4683 } 4683 }
@@ -4729,7 +4729,7 @@ static void cgroup_release_agent(struct work_struct *work)
4729{ 4729{
4730 BUG_ON(work != &release_agent_work); 4730 BUG_ON(work != &release_agent_work);
4731 mutex_lock(&cgroup_mutex); 4731 mutex_lock(&cgroup_mutex);
4732 spin_lock(&release_list_lock); 4732 raw_spin_lock(&release_list_lock);
4733 while (!list_empty(&release_list)) { 4733 while (!list_empty(&release_list)) {
4734 char *argv[3], *envp[3]; 4734 char *argv[3], *envp[3];
4735 int i; 4735 int i;
@@ -4738,7 +4738,7 @@ static void cgroup_release_agent(struct work_struct *work)
4738 struct cgroup, 4738 struct cgroup,
4739 release_list); 4739 release_list);
4740 list_del_init(&cgrp->release_list); 4740 list_del_init(&cgrp->release_list);
4741 spin_unlock(&release_list_lock); 4741 raw_spin_unlock(&release_list_lock);
4742 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4742 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4743 if (!pathbuf) 4743 if (!pathbuf)
4744 goto continue_free; 4744 goto continue_free;
@@ -4768,9 +4768,9 @@ static void cgroup_release_agent(struct work_struct *work)
4768 continue_free: 4768 continue_free:
4769 kfree(pathbuf); 4769 kfree(pathbuf);
4770 kfree(agentbuf); 4770 kfree(agentbuf);
4771 spin_lock(&release_list_lock); 4771 raw_spin_lock(&release_list_lock);
4772 } 4772 }
4773 spin_unlock(&release_list_lock); 4773 raw_spin_unlock(&release_list_lock);
4774 mutex_unlock(&cgroup_mutex); 4774 mutex_unlock(&cgroup_mutex);
4775} 4775}
4776 4776
diff --git a/kernel/cred.c b/kernel/cred.c
index 8ef31f53c44c..bb55d052d858 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -644,6 +644,9 @@ void __init cred_init(void)
644 */ 644 */
645struct cred *prepare_kernel_cred(struct task_struct *daemon) 645struct cred *prepare_kernel_cred(struct task_struct *daemon)
646{ 646{
647#ifdef CONFIG_KEYS
648 struct thread_group_cred *tgcred;
649#endif
647 const struct cred *old; 650 const struct cred *old;
648 struct cred *new; 651 struct cred *new;
649 652
@@ -651,6 +654,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
651 if (!new) 654 if (!new)
652 return NULL; 655 return NULL;
653 656
657#ifdef CONFIG_KEYS
658 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
659 if (!tgcred) {
660 kmem_cache_free(cred_jar, new);
661 return NULL;
662 }
663#endif
664
654 kdebug("prepare_kernel_cred() alloc %p", new); 665 kdebug("prepare_kernel_cred() alloc %p", new);
655 666
656 if (daemon) 667 if (daemon)
@@ -667,8 +678,11 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
667 get_group_info(new->group_info); 678 get_group_info(new->group_info);
668 679
669#ifdef CONFIG_KEYS 680#ifdef CONFIG_KEYS
670 atomic_inc(&init_tgcred.usage); 681 atomic_set(&tgcred->usage, 1);
671 new->tgcred = &init_tgcred; 682 spin_lock_init(&tgcred->lock);
683 tgcred->process_keyring = NULL;
684 tgcred->session_keyring = NULL;
685 new->tgcred = tgcred;
672 new->request_key_auth = NULL; 686 new->request_key_auth = NULL;
673 new->thread_keyring = NULL; 687 new->thread_keyring = NULL;
674 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 688 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fbe38f2e8edb..d1a1bee35228 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -29,6 +29,7 @@
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <linux/rculist.h> 30#include <linux/rculist.h>
31#include <linux/uaccess.h> 31#include <linux/uaccess.h>
32#include <linux/suspend.h>
32#include <linux/syscalls.h> 33#include <linux/syscalls.h>
33#include <linux/anon_inodes.h> 34#include <linux/anon_inodes.h>
34#include <linux/kernel_stat.h> 35#include <linux/kernel_stat.h>
@@ -6852,7 +6853,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
6852 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 6853 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6853 6854
6854 mutex_lock(&swhash->hlist_mutex); 6855 mutex_lock(&swhash->hlist_mutex);
6855 if (swhash->hlist_refcount > 0) { 6856 if (swhash->hlist_refcount > 0 && !swhash->swevent_hlist) {
6856 struct swevent_hlist *hlist; 6857 struct swevent_hlist *hlist;
6857 6858
6858 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); 6859 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
@@ -6941,7 +6942,14 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6941{ 6942{
6942 unsigned int cpu = (long)hcpu; 6943 unsigned int cpu = (long)hcpu;
6943 6944
6944 switch (action & ~CPU_TASKS_FROZEN) { 6945 /*
6946 * Ignore suspend/resume action, the perf_pm_notifier will
6947 * take care of that.
6948 */
6949 if (action & CPU_TASKS_FROZEN)
6950 return NOTIFY_OK;
6951
6952 switch (action) {
6945 6953
6946 case CPU_UP_PREPARE: 6954 case CPU_UP_PREPARE:
6947 case CPU_DOWN_FAILED: 6955 case CPU_DOWN_FAILED:
@@ -6960,6 +6968,90 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6960 return NOTIFY_OK; 6968 return NOTIFY_OK;
6961} 6969}
6962 6970
6971static void perf_pm_resume_cpu(void *unused)
6972{
6973 struct perf_cpu_context *cpuctx;
6974 struct perf_event_context *ctx;
6975 struct pmu *pmu;
6976 int idx;
6977
6978 idx = srcu_read_lock(&pmus_srcu);
6979 list_for_each_entry_rcu(pmu, &pmus, entry) {
6980 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6981 ctx = cpuctx->task_ctx;
6982
6983 perf_ctx_lock(cpuctx, ctx);
6984 perf_pmu_disable(cpuctx->ctx.pmu);
6985
6986 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
6987 if (ctx)
6988 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
6989
6990 perf_pmu_enable(cpuctx->ctx.pmu);
6991 perf_ctx_unlock(cpuctx, ctx);
6992 }
6993 srcu_read_unlock(&pmus_srcu, idx);
6994}
6995
6996static void perf_pm_suspend_cpu(void *unused)
6997{
6998 struct perf_cpu_context *cpuctx;
6999 struct perf_event_context *ctx;
7000 struct pmu *pmu;
7001 int idx;
7002
7003 idx = srcu_read_lock(&pmus_srcu);
7004 list_for_each_entry_rcu(pmu, &pmus, entry) {
7005 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7006 ctx = cpuctx->task_ctx;
7007
7008 perf_ctx_lock(cpuctx, ctx);
7009 perf_pmu_disable(cpuctx->ctx.pmu);
7010
7011 perf_event_sched_in(cpuctx, ctx, current);
7012
7013 perf_pmu_enable(cpuctx->ctx.pmu);
7014 perf_ctx_unlock(cpuctx, ctx);
7015 }
7016 srcu_read_unlock(&pmus_srcu, idx);
7017}
7018
7019static int perf_resume(void)
7020{
7021 get_online_cpus();
7022 smp_call_function(perf_pm_resume_cpu, NULL, 1);
7023 put_online_cpus();
7024
7025 return NOTIFY_OK;
7026}
7027
7028static int perf_suspend(void)
7029{
7030 get_online_cpus();
7031 smp_call_function(perf_pm_suspend_cpu, NULL, 1);
7032 put_online_cpus();
7033
7034 return NOTIFY_OK;
7035}
7036
7037static int perf_pm(struct notifier_block *self, unsigned long action, void *ptr)
7038{
7039 switch (action) {
7040 case PM_POST_HIBERNATION:
7041 case PM_POST_SUSPEND:
7042 return perf_resume();
7043 case PM_HIBERNATION_PREPARE:
7044 case PM_SUSPEND_PREPARE:
7045 return perf_suspend();
7046 default:
7047 return NOTIFY_DONE;
7048 }
7049}
7050
7051static struct notifier_block perf_pm_notifier = {
7052 .notifier_call = perf_pm,
7053};
7054
6963void __init perf_event_init(void) 7055void __init perf_event_init(void)
6964{ 7056{
6965 int ret; 7057 int ret;
@@ -6974,6 +7066,7 @@ void __init perf_event_init(void)
6974 perf_tp_register(); 7066 perf_tp_register();
6975 perf_cpu_notifier(perf_cpu_notify); 7067 perf_cpu_notifier(perf_cpu_notify);
6976 register_reboot_notifier(&perf_reboot_notifier); 7068 register_reboot_notifier(&perf_reboot_notifier);
7069 register_pm_notifier(&perf_pm_notifier);
6977 7070
6978 ret = init_hw_breakpoint(); 7071 ret = init_hw_breakpoint();
6979 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 7072 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7b01de98bb6a..66a594e8ad2f 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -67,7 +67,7 @@ static void fake_signal_wake_up(struct task_struct *p)
67 unsigned long flags; 67 unsigned long flags;
68 68
69 spin_lock_irqsave(&p->sighand->siglock, flags); 69 spin_lock_irqsave(&p->sighand->siglock, flags);
70 signal_wake_up(p, 0); 70 signal_wake_up(p, 1);
71 spin_unlock_irqrestore(&p->sighand->siglock, flags); 71 spin_unlock_irqrestore(&p->sighand->siglock, flags);
72} 72}
73 73
diff --git a/kernel/futex.c b/kernel/futex.c
index 11cbe052b2e8..1511dff0cfd6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -854,7 +854,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
854{ 854{
855 struct task_struct *new_owner; 855 struct task_struct *new_owner;
856 struct futex_pi_state *pi_state = this->pi_state; 856 struct futex_pi_state *pi_state = this->pi_state;
857 u32 curval, newval; 857 u32 uninitialized_var(curval), newval;
858 858
859 if (!pi_state) 859 if (!pi_state)
860 return -EINVAL; 860 return -EINVAL;
@@ -916,7 +916,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
916 916
917static int unlock_futex_pi(u32 __user *uaddr, u32 uval) 917static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
918{ 918{
919 u32 oldval; 919 u32 uninitialized_var(oldval);
920 920
921 /* 921 /*
922 * There is no waiter, so we unlock the futex. The owner died 922 * There is no waiter, so we unlock the futex. The owner died
@@ -1576,7 +1576,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1576 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1576 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1577 struct futex_pi_state *pi_state = q->pi_state; 1577 struct futex_pi_state *pi_state = q->pi_state;
1578 struct task_struct *oldowner = pi_state->owner; 1578 struct task_struct *oldowner = pi_state->owner;
1579 u32 uval, curval, newval; 1579 u32 uval, uninitialized_var(curval), newval;
1580 int ret; 1580 int ret;
1581 1581
1582 /* Owner died? */ 1582 /* Owner died? */
@@ -1793,7 +1793,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1793 * 1793 *
1794 * Returns: 1794 * Returns:
1795 * 0 - uaddr contains val and hb has been locked 1795 * 0 - uaddr contains val and hb has been locked
1796 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked 1796 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1797 */ 1797 */
1798static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1798static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1799 struct futex_q *q, struct futex_hash_bucket **hb) 1799 struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2481,7 +2481,7 @@ err_unlock:
2481 */ 2481 */
2482int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 2482int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2483{ 2483{
2484 u32 uval, nval, mval; 2484 u32 uval, uninitialized_var(nval), mval;
2485 2485
2486retry: 2486retry:
2487 if (get_user(uval, uaddr)) 2487 if (get_user(uval, uaddr))
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d5828da3fd38..b57a3776de44 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -29,7 +29,11 @@ void irq_domain_add(struct irq_domain *domain)
29 */ 29 */
30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { 30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); 31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
32 if (d || d->domain) { 32 if (!d) {
33 WARN(1, "error: assigning domain to non existant irq_desc");
34 return;
35 }
36 if (d->domain) {
33 /* things are broken; just report, don't clean up */ 37 /* things are broken; just report, don't clean up */
34 WARN(1, "error: irq_desc already assigned to a domain"); 38 WARN(1, "error: irq_desc already assigned to a domain");
35 return; 39 return;
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f76fc00c9877..15e53b1766a6 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,6 +9,7 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/syscore_ops.h>
12 13
13#include "internals.h" 14#include "internals.h"
14 15
@@ -39,25 +40,58 @@ void suspend_device_irqs(void)
39} 40}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 41EXPORT_SYMBOL_GPL(suspend_device_irqs);
41 42
42/** 43static void resume_irqs(bool want_early)
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQS_SUSPENDED flag set.
47 */
48void resume_device_irqs(void)
49{ 44{
50 struct irq_desc *desc; 45 struct irq_desc *desc;
51 int irq; 46 int irq;
52 47
53 for_each_irq_desc(irq, desc) { 48 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 49 unsigned long flags;
50 bool is_early = desc->action &&
51 desc->action->flags & IRQF_EARLY_RESUME;
52
53 if (is_early != want_early)
54 continue;
55 55
56 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
57 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
58 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
59 } 59 }
60} 60}
61
62/**
63 * irq_pm_syscore_resume - enable interrupt lines early
64 *
65 * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
66 */
67static void irq_pm_syscore_resume(void)
68{
69 resume_irqs(true);
70}
71
72static struct syscore_ops irq_pm_syscore_ops = {
73 .resume = irq_pm_syscore_resume,
74};
75
76static int __init irq_pm_init_ops(void)
77{
78 register_syscore_ops(&irq_pm_syscore_ops);
79 return 0;
80}
81
82device_initcall(irq_pm_init_ops);
83
84/**
85 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
86 *
87 * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
88 * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
89 * set as well as those with %IRQF_FORCE_RESUME.
90 */
91void resume_device_irqs(void)
92{
93 resume_irqs(false);
94}
61EXPORT_SYMBOL_GPL(resume_device_irqs); 95EXPORT_SYMBOL_GPL(resume_device_irqs);
62 96
63/** 97/**
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c58fa7da8aef..0e2cde4f380b 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -17,54 +17,34 @@
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued 17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback 18 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */ 20 */
24 21
25#define IRQ_WORK_PENDING 1UL 22#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL 23#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL 24#define IRQ_WORK_FLAGS 3UL
28 25
29static inline bool irq_work_is_set(struct irq_work *entry, int flags) 26static DEFINE_PER_CPU(struct llist_head, irq_work_list);
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49 27
50/* 28/*
51 * Claim the entry so that no one else will poke at it. 29 * Claim the entry so that no one else will poke at it.
52 */ 30 */
53static bool irq_work_claim(struct irq_work *entry) 31static bool irq_work_claim(struct irq_work *work)
54{ 32{
55 struct irq_work *next, *nflags; 33 unsigned long flags, nflags;
56 34
57 do { 35 for (;;) {
58 next = entry->next; 36 flags = work->flags;
59 if ((unsigned long)next & IRQ_WORK_PENDING) 37 if (flags & IRQ_WORK_PENDING)
60 return false; 38 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS); 39 nflags = flags | IRQ_WORK_FLAGS;
62 } while (cmpxchg(&entry->next, next, nflags) != next); 40 if (cmpxchg(&work->flags, flags, nflags) == flags)
41 break;
42 cpu_relax();
43 }
63 44
64 return true; 45 return true;
65} 46}
66 47
67
68void __weak arch_irq_work_raise(void) 48void __weak arch_irq_work_raise(void)
69{ 49{
70 /* 50 /*
@@ -75,20 +55,15 @@ void __weak arch_irq_work_raise(void)
75/* 55/*
76 * Queue the entry and raise the IPI if needed. 56 * Queue the entry and raise the IPI if needed.
77 */ 57 */
78static void __irq_work_queue(struct irq_work *entry) 58static void __irq_work_queue(struct irq_work *work)
79{ 59{
80 struct irq_work *next; 60 bool empty;
81 61
82 preempt_disable(); 62 preempt_disable();
83 63
84 do { 64 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */ 65 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 66 if (empty)
92 arch_irq_work_raise(); 67 arch_irq_work_raise();
93 68
94 preempt_enable(); 69 preempt_enable();
@@ -100,16 +75,16 @@ static void __irq_work_queue(struct irq_work *entry)
100 * 75 *
101 * Can be re-enqueued while the callback is still in progress. 76 * Can be re-enqueued while the callback is still in progress.
102 */ 77 */
103bool irq_work_queue(struct irq_work *entry) 78bool irq_work_queue(struct irq_work *work)
104{ 79{
105 if (!irq_work_claim(entry)) { 80 if (!irq_work_claim(work)) {
106 /* 81 /*
107 * Already enqueued, can't do! 82 * Already enqueued, can't do!
108 */ 83 */
109 return false; 84 return false;
110 } 85 }
111 86
112 __irq_work_queue(entry); 87 __irq_work_queue(work);
113 return true; 88 return true;
114} 89}
115EXPORT_SYMBOL_GPL(irq_work_queue); 90EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +95,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 95 */
121void irq_work_run(void) 96void irq_work_run(void)
122{ 97{
123 struct irq_work *list; 98 struct irq_work *work;
99 struct llist_head *this_list;
100 struct llist_node *llnode;
124 101
125 if (this_cpu_read(irq_work_list) == NULL) 102 this_list = &__get_cpu_var(irq_work_list);
103 if (llist_empty(this_list))
126 return; 104 return;
127 105
128 BUG_ON(!in_irq()); 106 BUG_ON(!in_irq());
129 BUG_ON(!irqs_disabled()); 107 BUG_ON(!irqs_disabled());
130 108
131 list = this_cpu_xchg(irq_work_list, NULL); 109 llnode = llist_del_all(this_list);
132 110 while (llnode != NULL) {
133 while (list != NULL) { 111 work = llist_entry(llnode, struct irq_work, llnode);
134 struct irq_work *entry = list;
135 112
136 list = irq_work_next(list); 113 llnode = llist_next(llnode);
137 114
138 /* 115 /*
139 * Clear the PENDING bit, after this point the @entry 116 * Clear the PENDING bit, after this point the @work
140 * can be re-used. 117 * can be re-used.
141 */ 118 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY); 119 work->flags = IRQ_WORK_BUSY;
143 entry->func(entry); 120 work->func(work);
144 /* 121 /*
145 * Clear the BUSY bit and return to the free state if 122 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 123 * no-one else claimed it meanwhile.
147 */ 124 */
148 (void)cmpxchg(&entry->next, 125 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
151 } 126 }
152} 127}
153EXPORT_SYMBOL_GPL(irq_work_run); 128EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +131,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
156 * Synchronize against the irq_work @entry, ensures the entry is not 131 * Synchronize against the irq_work @entry, ensures the entry is not
157 * currently in use. 132 * currently in use.
158 */ 133 */
159void irq_work_sync(struct irq_work *entry) 134void irq_work_sync(struct irq_work *work)
160{ 135{
161 WARN_ON_ONCE(irqs_disabled()); 136 WARN_ON_ONCE(irqs_disabled());
162 137
163 while (irq_work_is_set(entry, IRQ_WORK_BUSY)) 138 while (work->flags & IRQ_WORK_BUSY)
164 cpu_relax(); 139 cpu_relax();
165} 140}
166EXPORT_SYMBOL_GPL(irq_work_sync); 141EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ddc7644c1305..a4bea97c75b6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...)
114 atomic_inc(&kmod_concurrent); 114 atomic_inc(&kmod_concurrent);
115 if (atomic_read(&kmod_concurrent) > max_modprobes) { 115 if (atomic_read(&kmod_concurrent) > max_modprobes) {
116 /* We may be blaming an innocent here, but unlikely */ 116 /* We may be blaming an innocent here, but unlikely */
117 if (kmod_loop_msg++ < 5) 117 if (kmod_loop_msg < 5) {
118 printk(KERN_ERR 118 printk(KERN_ERR
119 "request_module: runaway loop modprobe %s\n", 119 "request_module: runaway loop modprobe %s\n",
120 module_name); 120 module_name);
121 kmod_loop_msg++;
122 }
121 atomic_dec(&kmod_concurrent); 123 atomic_dec(&kmod_concurrent);
122 return -ENOMEM; 124 return -ENOMEM;
123 } 125 }
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b30fd54eb985..2f193d0ba7f2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -78,10 +78,10 @@ static bool kprobes_all_disarmed;
78static DEFINE_MUTEX(kprobe_mutex); 78static DEFINE_MUTEX(kprobe_mutex);
79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
80static struct { 80static struct {
81 spinlock_t lock ____cacheline_aligned_in_smp; 81 raw_spinlock_t lock ____cacheline_aligned_in_smp;
82} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 82} kretprobe_table_locks[KPROBE_TABLE_SIZE];
83 83
84static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 84static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
85{ 85{
86 return &(kretprobe_table_locks[hash].lock); 86 return &(kretprobe_table_locks[hash].lock);
87} 87}
@@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
1013 hlist_del(&ri->hlist); 1013 hlist_del(&ri->hlist);
1014 INIT_HLIST_NODE(&ri->hlist); 1014 INIT_HLIST_NODE(&ri->hlist);
1015 if (likely(rp)) { 1015 if (likely(rp)) {
1016 spin_lock(&rp->lock); 1016 raw_spin_lock(&rp->lock);
1017 hlist_add_head(&ri->hlist, &rp->free_instances); 1017 hlist_add_head(&ri->hlist, &rp->free_instances);
1018 spin_unlock(&rp->lock); 1018 raw_spin_unlock(&rp->lock);
1019 } else 1019 } else
1020 /* Unregistering */ 1020 /* Unregistering */
1021 hlist_add_head(&ri->hlist, head); 1021 hlist_add_head(&ri->hlist, head);
@@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
1026__acquires(hlist_lock) 1026__acquires(hlist_lock)
1027{ 1027{
1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1029 spinlock_t *hlist_lock; 1029 raw_spinlock_t *hlist_lock;
1030 1030
1031 *head = &kretprobe_inst_table[hash]; 1031 *head = &kretprobe_inst_table[hash];
1032 hlist_lock = kretprobe_table_lock_ptr(hash); 1032 hlist_lock = kretprobe_table_lock_ptr(hash);
1033 spin_lock_irqsave(hlist_lock, *flags); 1033 raw_spin_lock_irqsave(hlist_lock, *flags);
1034} 1034}
1035 1035
1036static void __kprobes kretprobe_table_lock(unsigned long hash, 1036static void __kprobes kretprobe_table_lock(unsigned long hash,
1037 unsigned long *flags) 1037 unsigned long *flags)
1038__acquires(hlist_lock) 1038__acquires(hlist_lock)
1039{ 1039{
1040 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1040 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1041 spin_lock_irqsave(hlist_lock, *flags); 1041 raw_spin_lock_irqsave(hlist_lock, *flags);
1042} 1042}
1043 1043
1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
@@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
1046__releases(hlist_lock) 1046__releases(hlist_lock)
1047{ 1047{
1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1049 spinlock_t *hlist_lock; 1049 raw_spinlock_t *hlist_lock;
1050 1050
1051 hlist_lock = kretprobe_table_lock_ptr(hash); 1051 hlist_lock = kretprobe_table_lock_ptr(hash);
1052 spin_unlock_irqrestore(hlist_lock, *flags); 1052 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1053} 1053}
1054 1054
1055static void __kprobes kretprobe_table_unlock(unsigned long hash, 1055static void __kprobes kretprobe_table_unlock(unsigned long hash,
1056 unsigned long *flags) 1056 unsigned long *flags)
1057__releases(hlist_lock) 1057__releases(hlist_lock)
1058{ 1058{
1059 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1059 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1060 spin_unlock_irqrestore(hlist_lock, *flags); 1060 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1061} 1061}
1062 1062
1063/* 1063/*
@@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1663 1663
1664 /*TODO: consider to only swap the RA after the last pre_handler fired */ 1664 /*TODO: consider to only swap the RA after the last pre_handler fired */
1665 hash = hash_ptr(current, KPROBE_HASH_BITS); 1665 hash = hash_ptr(current, KPROBE_HASH_BITS);
1666 spin_lock_irqsave(&rp->lock, flags); 1666 raw_spin_lock_irqsave(&rp->lock, flags);
1667 if (!hlist_empty(&rp->free_instances)) { 1667 if (!hlist_empty(&rp->free_instances)) {
1668 ri = hlist_entry(rp->free_instances.first, 1668 ri = hlist_entry(rp->free_instances.first,
1669 struct kretprobe_instance, hlist); 1669 struct kretprobe_instance, hlist);
1670 hlist_del(&ri->hlist); 1670 hlist_del(&ri->hlist);
1671 spin_unlock_irqrestore(&rp->lock, flags); 1671 raw_spin_unlock_irqrestore(&rp->lock, flags);
1672 1672
1673 ri->rp = rp; 1673 ri->rp = rp;
1674 ri->task = current; 1674 ri->task = current;
@@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1685 kretprobe_table_unlock(hash, &flags); 1685 kretprobe_table_unlock(hash, &flags);
1686 } else { 1686 } else {
1687 rp->nmissed++; 1687 rp->nmissed++;
1688 spin_unlock_irqrestore(&rp->lock, flags); 1688 raw_spin_unlock_irqrestore(&rp->lock, flags);
1689 } 1689 }
1690 return 0; 1690 return 0;
1691} 1691}
@@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1721 rp->maxactive = num_possible_cpus(); 1721 rp->maxactive = num_possible_cpus();
1722#endif 1722#endif
1723 } 1723 }
1724 spin_lock_init(&rp->lock); 1724 raw_spin_lock_init(&rp->lock);
1725 INIT_HLIST_HEAD(&rp->free_instances); 1725 INIT_HLIST_HEAD(&rp->free_instances);
1726 for (i = 0; i < rp->maxactive; i++) { 1726 for (i = 0; i < rp->maxactive; i++) {
1727 inst = kmalloc(sizeof(struct kretprobe_instance) + 1727 inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -1959,7 +1959,7 @@ static int __init init_kprobes(void)
1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1960 INIT_HLIST_HEAD(&kprobe_table[i]); 1960 INIT_HLIST_HEAD(&kprobe_table[i]);
1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
1962 spin_lock_init(&(kretprobe_table_locks[i].lock)); 1962 raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
1963 } 1963 }
1964 1964
1965 /* 1965 /*
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 376066e10413..4ac8ebfcab59 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -58,7 +58,7 @@
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
60 60
61static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_RAW_SPINLOCK(latency_lock);
62 62
63#define MAXLR 128 63#define MAXLR 128
64static struct latency_record latency_record[MAXLR]; 64static struct latency_record latency_record[MAXLR];
@@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p)
72 if (!latencytop_enabled) 72 if (!latencytop_enabled)
73 return; 73 return;
74 74
75 spin_lock_irqsave(&latency_lock, flags); 75 raw_spin_lock_irqsave(&latency_lock, flags);
76 memset(&p->latency_record, 0, sizeof(p->latency_record)); 76 memset(&p->latency_record, 0, sizeof(p->latency_record));
77 p->latency_record_count = 0; 77 p->latency_record_count = 0;
78 spin_unlock_irqrestore(&latency_lock, flags); 78 raw_spin_unlock_irqrestore(&latency_lock, flags);
79} 79}
80 80
81static void clear_global_latency_tracing(void) 81static void clear_global_latency_tracing(void)
82{ 82{
83 unsigned long flags; 83 unsigned long flags;
84 84
85 spin_lock_irqsave(&latency_lock, flags); 85 raw_spin_lock_irqsave(&latency_lock, flags);
86 memset(&latency_record, 0, sizeof(latency_record)); 86 memset(&latency_record, 0, sizeof(latency_record));
87 spin_unlock_irqrestore(&latency_lock, flags); 87 raw_spin_unlock_irqrestore(&latency_lock, flags);
88} 88}
89 89
90static void __sched 90static void __sched
@@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
190 lat.max = usecs; 190 lat.max = usecs;
191 store_stacktrace(tsk, &lat); 191 store_stacktrace(tsk, &lat);
192 192
193 spin_lock_irqsave(&latency_lock, flags); 193 raw_spin_lock_irqsave(&latency_lock, flags);
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
@@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
232 232
233out_unlock: 233out_unlock:
234 spin_unlock_irqrestore(&latency_lock, flags); 234 raw_spin_unlock_irqrestore(&latency_lock, flags);
235} 235}
236 236
237static int lstats_show(struct seq_file *m, void *v) 237static int lstats_show(struct seq_file *m, void *v)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 91d67ce3a8d5..e69434b070da 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -96,8 +96,13 @@ static int graph_lock(void)
96 96
97static inline int graph_unlock(void) 97static inline int graph_unlock(void)
98{ 98{
99 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) 99 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
100 /*
101 * The lockdep graph lock isn't locked while we expect it to
102 * be, we're confused now, bye!
103 */
100 return DEBUG_LOCKS_WARN_ON(1); 104 return DEBUG_LOCKS_WARN_ON(1);
105 }
101 106
102 current->lockdep_recursion--; 107 current->lockdep_recursion--;
103 arch_spin_unlock(&lockdep_lock); 108 arch_spin_unlock(&lockdep_lock);
@@ -134,6 +139,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
134static inline struct lock_class *hlock_class(struct held_lock *hlock) 139static inline struct lock_class *hlock_class(struct held_lock *hlock)
135{ 140{
136 if (!hlock->class_idx) { 141 if (!hlock->class_idx) {
142 /*
143 * Someone passed in garbage, we give up.
144 */
137 DEBUG_LOCKS_WARN_ON(1); 145 DEBUG_LOCKS_WARN_ON(1);
138 return NULL; 146 return NULL;
139 } 147 }
@@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
687 */ 695 */
688 list_for_each_entry(class, hash_head, hash_entry) { 696 list_for_each_entry(class, hash_head, hash_entry) {
689 if (class->key == key) { 697 if (class->key == key) {
698 /*
699 * Huh! same key, different name? Did someone trample
700 * on some memory? We're most confused.
701 */
690 WARN_ON_ONCE(class->name != lock->name); 702 WARN_ON_ONCE(class->name != lock->name);
691 return class; 703 return class;
692 } 704 }
@@ -800,6 +812,10 @@ out_unlock_set:
800 else if (subclass < NR_LOCKDEP_CACHING_CLASSES) 812 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
801 lock->class_cache[subclass] = class; 813 lock->class_cache[subclass] = class;
802 814
815 /*
816 * Hash collision, did we smoke some? We found a class with a matching
817 * hash but the subclass -- which is hashed in -- didn't match.
818 */
803 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 819 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
804 return NULL; 820 return NULL;
805 821
@@ -926,7 +942,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
926 unsigned long nr; 942 unsigned long nr;
927 943
928 nr = lock - list_entries; 944 nr = lock - list_entries;
929 WARN_ON(nr >= nr_list_entries); 945 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
930 lock->parent = parent; 946 lock->parent = parent;
931 lock->class->dep_gen_id = lockdep_dependency_gen_id; 947 lock->class->dep_gen_id = lockdep_dependency_gen_id;
932} 948}
@@ -936,7 +952,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
936 unsigned long nr; 952 unsigned long nr;
937 953
938 nr = lock - list_entries; 954 nr = lock - list_entries;
939 WARN_ON(nr >= nr_list_entries); 955 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
940 return lock->class->dep_gen_id == lockdep_dependency_gen_id; 956 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
941} 957}
942 958
@@ -1129,10 +1145,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1129 if (debug_locks_silent) 1145 if (debug_locks_silent)
1130 return 0; 1146 return 0;
1131 1147
1132 printk("\n=======================================================\n"); 1148 printk("\n");
1133 printk( "[ INFO: possible circular locking dependency detected ]\n"); 1149 printk("======================================================\n");
1150 printk("[ INFO: possible circular locking dependency detected ]\n");
1134 print_kernel_version(); 1151 print_kernel_version();
1135 printk( "-------------------------------------------------------\n"); 1152 printk("-------------------------------------------------------\n");
1136 printk("%s/%d is trying to acquire lock:\n", 1153 printk("%s/%d is trying to acquire lock:\n",
1137 curr->comm, task_pid_nr(curr)); 1154 curr->comm, task_pid_nr(curr));
1138 print_lock(check_src); 1155 print_lock(check_src);
@@ -1196,6 +1213,9 @@ static noinline int print_bfs_bug(int ret)
1196 if (!debug_locks_off_graph_unlock()) 1213 if (!debug_locks_off_graph_unlock())
1197 return 0; 1214 return 0;
1198 1215
1216 /*
1217 * Breadth-first-search failed, graph got corrupted?
1218 */
1199 WARN(1, "lockdep bfs error:%d\n", ret); 1219 WARN(1, "lockdep bfs error:%d\n", ret);
1200 1220
1201 return 0; 1221 return 0;
@@ -1463,11 +1483,12 @@ print_bad_irq_dependency(struct task_struct *curr,
1463 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1483 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1464 return 0; 1484 return 0;
1465 1485
1466 printk("\n======================================================\n"); 1486 printk("\n");
1467 printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1487 printk("======================================================\n");
1488 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1468 irqclass, irqclass); 1489 irqclass, irqclass);
1469 print_kernel_version(); 1490 print_kernel_version();
1470 printk( "------------------------------------------------------\n"); 1491 printk("------------------------------------------------------\n");
1471 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1492 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1472 curr->comm, task_pid_nr(curr), 1493 curr->comm, task_pid_nr(curr),
1473 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1494 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1692,10 +1713,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1692 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1713 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1693 return 0; 1714 return 0;
1694 1715
1695 printk("\n=============================================\n"); 1716 printk("\n");
1696 printk( "[ INFO: possible recursive locking detected ]\n"); 1717 printk("=============================================\n");
1718 printk("[ INFO: possible recursive locking detected ]\n");
1697 print_kernel_version(); 1719 print_kernel_version();
1698 printk( "---------------------------------------------\n"); 1720 printk("---------------------------------------------\n");
1699 printk("%s/%d is trying to acquire lock:\n", 1721 printk("%s/%d is trying to acquire lock:\n",
1700 curr->comm, task_pid_nr(curr)); 1722 curr->comm, task_pid_nr(curr));
1701 print_lock(next); 1723 print_lock(next);
@@ -1944,6 +1966,11 @@ out_bug:
1944 if (!debug_locks_off_graph_unlock()) 1966 if (!debug_locks_off_graph_unlock())
1945 return 0; 1967 return 0;
1946 1968
1969 /*
1970 * Clearly we all shouldn't be here, but since we made it we
1971 * can reliable say we messed up our state. See the above two
1972 * gotos for reasons why we could possibly end up here.
1973 */
1947 WARN_ON(1); 1974 WARN_ON(1);
1948 1975
1949 return 0; 1976 return 0;
@@ -1975,6 +2002,11 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1975 struct held_lock *hlock_curr, *hlock_next; 2002 struct held_lock *hlock_curr, *hlock_next;
1976 int i, j; 2003 int i, j;
1977 2004
2005 /*
2006 * We might need to take the graph lock, ensure we've got IRQs
2007 * disabled to make this an IRQ-safe lock.. for recursion reasons
2008 * lockdep won't complain about its own locking errors.
2009 */
1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2010 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1979 return 0; 2011 return 0;
1980 /* 2012 /*
@@ -2126,6 +2158,10 @@ static void check_chain_key(struct task_struct *curr)
2126 hlock = curr->held_locks + i; 2158 hlock = curr->held_locks + i;
2127 if (chain_key != hlock->prev_chain_key) { 2159 if (chain_key != hlock->prev_chain_key) {
2128 debug_locks_off(); 2160 debug_locks_off();
2161 /*
2162 * We got mighty confused, our chain keys don't match
2163 * with what we expect, someone trample on our task state?
2164 */
2129 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", 2165 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
2130 curr->lockdep_depth, i, 2166 curr->lockdep_depth, i,
2131 (unsigned long long)chain_key, 2167 (unsigned long long)chain_key,
@@ -2133,6 +2169,9 @@ static void check_chain_key(struct task_struct *curr)
2133 return; 2169 return;
2134 } 2170 }
2135 id = hlock->class_idx - 1; 2171 id = hlock->class_idx - 1;
2172 /*
2173 * Whoops ran out of static storage again?
2174 */
2136 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 2175 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
2137 return; 2176 return;
2138 2177
@@ -2144,6 +2183,10 @@ static void check_chain_key(struct task_struct *curr)
2144 } 2183 }
2145 if (chain_key != curr->curr_chain_key) { 2184 if (chain_key != curr->curr_chain_key) {
2146 debug_locks_off(); 2185 debug_locks_off();
2186 /*
2187 * More smoking hash instead of calculating it, damn see these
2188 * numbers float.. I bet that a pink elephant stepped on my memory.
2189 */
2147 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", 2190 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
2148 curr->lockdep_depth, i, 2191 curr->lockdep_depth, i,
2149 (unsigned long long)chain_key, 2192 (unsigned long long)chain_key,
@@ -2177,10 +2220,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2177 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2220 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2178 return 0; 2221 return 0;
2179 2222
2180 printk("\n=================================\n"); 2223 printk("\n");
2181 printk( "[ INFO: inconsistent lock state ]\n"); 2224 printk("=================================\n");
2225 printk("[ INFO: inconsistent lock state ]\n");
2182 print_kernel_version(); 2226 print_kernel_version();
2183 printk( "---------------------------------\n"); 2227 printk("---------------------------------\n");
2184 2228
2185 printk("inconsistent {%s} -> {%s} usage.\n", 2229 printk("inconsistent {%s} -> {%s} usage.\n",
2186 usage_str[prev_bit], usage_str[new_bit]); 2230 usage_str[prev_bit], usage_str[new_bit]);
@@ -2241,10 +2285,11 @@ print_irq_inversion_bug(struct task_struct *curr,
2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2285 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2242 return 0; 2286 return 0;
2243 2287
2244 printk("\n=========================================================\n"); 2288 printk("\n");
2245 printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); 2289 printk("=========================================================\n");
2290 printk("[ INFO: possible irq lock inversion dependency detected ]\n");
2246 print_kernel_version(); 2291 print_kernel_version();
2247 printk( "---------------------------------------------------------\n"); 2292 printk("---------------------------------------------------------\n");
2248 printk("%s/%d just changed the state of lock:\n", 2293 printk("%s/%d just changed the state of lock:\n",
2249 curr->comm, task_pid_nr(curr)); 2294 curr->comm, task_pid_nr(curr));
2250 print_lock(this); 2295 print_lock(this);
@@ -2525,12 +2570,24 @@ void trace_hardirqs_on_caller(unsigned long ip)
2525 return; 2570 return;
2526 } 2571 }
2527 2572
2573 /*
2574 * We're enabling irqs and according to our state above irqs weren't
2575 * already enabled, yet we find the hardware thinks they are in fact
2576 * enabled.. someone messed up their IRQ state tracing.
2577 */
2528 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2578 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2529 return; 2579 return;
2530 2580
2581 /*
2582 * See the fine text that goes along with this variable definition.
2583 */
2531 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 2584 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2532 return; 2585 return;
2533 2586
2587 /*
2588 * Can't allow enabling interrupts while in an interrupt handler,
2589 * that's general bad form and such. Recursion, limited stack etc..
2590 */
2534 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) 2591 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2535 return; 2592 return;
2536 2593
@@ -2558,6 +2615,10 @@ void trace_hardirqs_off_caller(unsigned long ip)
2558 if (unlikely(!debug_locks || current->lockdep_recursion)) 2615 if (unlikely(!debug_locks || current->lockdep_recursion))
2559 return; 2616 return;
2560 2617
2618 /*
2619 * So we're supposed to get called after you mask local IRQs, but for
2620 * some reason the hardware doesn't quite think you did a proper job.
2621 */
2561 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2622 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2562 return; 2623 return;
2563 2624
@@ -2590,6 +2651,10 @@ void trace_softirqs_on(unsigned long ip)
2590 if (unlikely(!debug_locks || current->lockdep_recursion)) 2651 if (unlikely(!debug_locks || current->lockdep_recursion))
2591 return; 2652 return;
2592 2653
2654 /*
2655 * We fancy IRQs being disabled here, see softirq.c, avoids
2656 * funny state and nesting things.
2657 */
2593 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2658 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2594 return; 2659 return;
2595 2660
@@ -2626,6 +2691,9 @@ void trace_softirqs_off(unsigned long ip)
2626 if (unlikely(!debug_locks || current->lockdep_recursion)) 2691 if (unlikely(!debug_locks || current->lockdep_recursion))
2627 return; 2692 return;
2628 2693
2694 /*
2695 * We fancy IRQs being disabled here, see softirq.c
2696 */
2629 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2697 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2630 return; 2698 return;
2631 2699
@@ -2637,6 +2705,9 @@ void trace_softirqs_off(unsigned long ip)
2637 curr->softirq_disable_ip = ip; 2705 curr->softirq_disable_ip = ip;
2638 curr->softirq_disable_event = ++curr->irq_events; 2706 curr->softirq_disable_event = ++curr->irq_events;
2639 debug_atomic_inc(softirqs_off_events); 2707 debug_atomic_inc(softirqs_off_events);
2708 /*
2709 * Whoops, we wanted softirqs off, so why aren't they?
2710 */
2640 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2711 DEBUG_LOCKS_WARN_ON(!softirq_count());
2641 } else 2712 } else
2642 debug_atomic_inc(redundant_softirqs_off); 2713 debug_atomic_inc(redundant_softirqs_off);
@@ -2661,6 +2732,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2661 if (!(gfp_mask & __GFP_FS)) 2732 if (!(gfp_mask & __GFP_FS))
2662 return; 2733 return;
2663 2734
2735 /*
2736 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
2737 */
2664 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) 2738 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2665 return; 2739 return;
2666 2740
@@ -2773,13 +2847,13 @@ static int separate_irq_context(struct task_struct *curr,
2773 return 0; 2847 return 0;
2774} 2848}
2775 2849
2776#else 2850#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
2777 2851
2778static inline 2852static inline
2779int mark_lock_irq(struct task_struct *curr, struct held_lock *this, 2853int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
2780 enum lock_usage_bit new_bit) 2854 enum lock_usage_bit new_bit)
2781{ 2855{
2782 WARN_ON(1); 2856 WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
2783 return 1; 2857 return 1;
2784} 2858}
2785 2859
@@ -2799,7 +2873,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
2799{ 2873{
2800} 2874}
2801 2875
2802#endif 2876#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
2803 2877
2804/* 2878/*
2805 * Mark a lock with a usage bit, and validate the state transition: 2879 * Mark a lock with a usage bit, and validate the state transition:
@@ -2880,6 +2954,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2880 lock->cpu = raw_smp_processor_id(); 2954 lock->cpu = raw_smp_processor_id();
2881#endif 2955#endif
2882 2956
2957 /*
2958 * Can't be having no nameless bastards around this place!
2959 */
2883 if (DEBUG_LOCKS_WARN_ON(!name)) { 2960 if (DEBUG_LOCKS_WARN_ON(!name)) {
2884 lock->name = "NULL"; 2961 lock->name = "NULL";
2885 return; 2962 return;
@@ -2887,6 +2964,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2887 2964
2888 lock->name = name; 2965 lock->name = name;
2889 2966
2967 /*
2968 * No key, no joy, we need to hash something.
2969 */
2890 if (DEBUG_LOCKS_WARN_ON(!key)) 2970 if (DEBUG_LOCKS_WARN_ON(!key))
2891 return; 2971 return;
2892 /* 2972 /*
@@ -2894,6 +2974,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2894 */ 2974 */
2895 if (!static_obj(key)) { 2975 if (!static_obj(key)) {
2896 printk("BUG: key %p not in .data!\n", key); 2976 printk("BUG: key %p not in .data!\n", key);
2977 /*
2978 * What it says above ^^^^^, I suggest you read it.
2979 */
2897 DEBUG_LOCKS_WARN_ON(1); 2980 DEBUG_LOCKS_WARN_ON(1);
2898 return; 2981 return;
2899 } 2982 }
@@ -2932,6 +3015,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2932 if (unlikely(!debug_locks)) 3015 if (unlikely(!debug_locks))
2933 return 0; 3016 return 0;
2934 3017
3018 /*
3019 * Lockdep should run with IRQs disabled, otherwise we could
3020 * get an interrupt which would want to take locks, which would
3021 * end up in lockdep and have you got a head-ache already?
3022 */
2935 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3023 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2936 return 0; 3024 return 0;
2937 3025
@@ -2963,6 +3051,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2963 * dependency checks are done) 3051 * dependency checks are done)
2964 */ 3052 */
2965 depth = curr->lockdep_depth; 3053 depth = curr->lockdep_depth;
3054 /*
3055 * Ran out of static storage for our per-task lock stack again have we?
3056 */
2966 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 3057 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2967 return 0; 3058 return 0;
2968 3059
@@ -2981,6 +3072,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2981 } 3072 }
2982 3073
2983 hlock = curr->held_locks + depth; 3074 hlock = curr->held_locks + depth;
3075 /*
3076 * Plain impossible, we just registered it and checked it weren't no
3077 * NULL like.. I bet this mushroom I ate was good!
3078 */
2984 if (DEBUG_LOCKS_WARN_ON(!class)) 3079 if (DEBUG_LOCKS_WARN_ON(!class))
2985 return 0; 3080 return 0;
2986 hlock->class_idx = class_idx; 3081 hlock->class_idx = class_idx;
@@ -3015,11 +3110,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3015 * the hash, not class->key. 3110 * the hash, not class->key.
3016 */ 3111 */
3017 id = class - lock_classes; 3112 id = class - lock_classes;
3113 /*
3114 * Whoops, we did it again.. ran straight out of our static allocation.
3115 */
3018 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 3116 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
3019 return 0; 3117 return 0;
3020 3118
3021 chain_key = curr->curr_chain_key; 3119 chain_key = curr->curr_chain_key;
3022 if (!depth) { 3120 if (!depth) {
3121 /*
3122 * How can we have a chain hash when we ain't got no keys?!
3123 */
3023 if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) 3124 if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
3024 return 0; 3125 return 0;
3025 chain_head = 1; 3126 chain_head = 1;
@@ -3065,9 +3166,10 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3065 if (debug_locks_silent) 3166 if (debug_locks_silent)
3066 return 0; 3167 return 0;
3067 3168
3068 printk("\n=====================================\n"); 3169 printk("\n");
3069 printk( "[ BUG: bad unlock balance detected! ]\n"); 3170 printk("=====================================\n");
3070 printk( "-------------------------------------\n"); 3171 printk("[ BUG: bad unlock balance detected! ]\n");
3172 printk("-------------------------------------\n");
3071 printk("%s/%d is trying to release lock (", 3173 printk("%s/%d is trying to release lock (",
3072 curr->comm, task_pid_nr(curr)); 3174 curr->comm, task_pid_nr(curr));
3073 print_lockdep_cache(lock); 3175 print_lockdep_cache(lock);
@@ -3091,6 +3193,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3091{ 3193{
3092 if (unlikely(!debug_locks)) 3194 if (unlikely(!debug_locks))
3093 return 0; 3195 return 0;
3196 /*
3197 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
3198 */
3094 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3199 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3095 return 0; 3200 return 0;
3096 3201
@@ -3120,6 +3225,11 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3120 if (!class) 3225 if (!class)
3121 return 0; 3226 return 0;
3122 3227
3228 /*
3229 * References, but not a lock we're actually ref-counting?
3230 * State got messed up, follow the sites that change ->references
3231 * and try to make sense of it.
3232 */
3123 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) 3233 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
3124 return 0; 3234 return 0;
3125 3235
@@ -3142,6 +3252,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3142 int i; 3252 int i;
3143 3253
3144 depth = curr->lockdep_depth; 3254 depth = curr->lockdep_depth;
3255 /*
3256 * This function is about (re)setting the class of a held lock,
3257 * yet we're not actually holding any locks. Naughty user!
3258 */
3145 if (DEBUG_LOCKS_WARN_ON(!depth)) 3259 if (DEBUG_LOCKS_WARN_ON(!depth))
3146 return 0; 3260 return 0;
3147 3261
@@ -3177,6 +3291,10 @@ found_it:
3177 return 0; 3291 return 0;
3178 } 3292 }
3179 3293
3294 /*
3295 * I took it apart and put it back together again, except now I have
3296 * these 'spare' parts.. where shall I put them.
3297 */
3180 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) 3298 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
3181 return 0; 3299 return 0;
3182 return 1; 3300 return 1;
@@ -3201,6 +3319,10 @@ lock_release_non_nested(struct task_struct *curr,
3201 * of held locks: 3319 * of held locks:
3202 */ 3320 */
3203 depth = curr->lockdep_depth; 3321 depth = curr->lockdep_depth;
3322 /*
3323 * So we're all set to release this lock.. wait what lock? We don't
3324 * own any locks, you've been drinking again?
3325 */
3204 if (DEBUG_LOCKS_WARN_ON(!depth)) 3326 if (DEBUG_LOCKS_WARN_ON(!depth))
3205 return 0; 3327 return 0;
3206 3328
@@ -3253,6 +3375,10 @@ found_it:
3253 return 0; 3375 return 0;
3254 } 3376 }
3255 3377
3378 /*
3379 * We had N bottles of beer on the wall, we drank one, but now
3380 * there's not N-1 bottles of beer left on the wall...
3381 */
3256 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) 3382 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
3257 return 0; 3383 return 0;
3258 return 1; 3384 return 1;
@@ -3283,6 +3409,9 @@ static int lock_release_nested(struct task_struct *curr,
3283 return lock_release_non_nested(curr, lock, ip); 3409 return lock_release_non_nested(curr, lock, ip);
3284 curr->lockdep_depth--; 3410 curr->lockdep_depth--;
3285 3411
3412 /*
3413 * No more locks, but somehow we've got hash left over, who left it?
3414 */
3286 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) 3415 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
3287 return 0; 3416 return 0;
3288 3417
@@ -3365,10 +3494,13 @@ static void check_flags(unsigned long flags)
3365 * check if not in hardirq contexts: 3494 * check if not in hardirq contexts:
3366 */ 3495 */
3367 if (!hardirq_count()) { 3496 if (!hardirq_count()) {
3368 if (softirq_count()) 3497 if (softirq_count()) {
3498 /* like the above, but with softirqs */
3369 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); 3499 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
3370 else 3500 } else {
3501 /* lick the above, does it taste good? */
3371 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); 3502 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
3503 }
3372 } 3504 }
3373 3505
3374 if (!debug_locks) 3506 if (!debug_locks)
@@ -3478,9 +3610,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3478 if (debug_locks_silent) 3610 if (debug_locks_silent)
3479 return 0; 3611 return 0;
3480 3612
3481 printk("\n=================================\n"); 3613 printk("\n");
3482 printk( "[ BUG: bad contention detected! ]\n"); 3614 printk("=================================\n");
3483 printk( "---------------------------------\n"); 3615 printk("[ BUG: bad contention detected! ]\n");
3616 printk("---------------------------------\n");
3484 printk("%s/%d is trying to contend lock (", 3617 printk("%s/%d is trying to contend lock (",
3485 curr->comm, task_pid_nr(curr)); 3618 curr->comm, task_pid_nr(curr));
3486 print_lockdep_cache(lock); 3619 print_lockdep_cache(lock);
@@ -3506,6 +3639,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3506 int i, contention_point, contending_point; 3639 int i, contention_point, contending_point;
3507 3640
3508 depth = curr->lockdep_depth; 3641 depth = curr->lockdep_depth;
3642 /*
3643 * Whee, we contended on this lock, except it seems we're not
3644 * actually trying to acquire anything much at all..
3645 */
3509 if (DEBUG_LOCKS_WARN_ON(!depth)) 3646 if (DEBUG_LOCKS_WARN_ON(!depth))
3510 return; 3647 return;
3511 3648
@@ -3555,6 +3692,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3555 int i, cpu; 3692 int i, cpu;
3556 3693
3557 depth = curr->lockdep_depth; 3694 depth = curr->lockdep_depth;
3695 /*
3696 * Yay, we acquired ownership of this lock we didn't try to
3697 * acquire, how the heck did that happen?
3698 */
3558 if (DEBUG_LOCKS_WARN_ON(!depth)) 3699 if (DEBUG_LOCKS_WARN_ON(!depth))
3559 return; 3700 return;
3560 3701
@@ -3759,8 +3900,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3759 match |= class == lock->class_cache[j]; 3900 match |= class == lock->class_cache[j];
3760 3901
3761 if (unlikely(match)) { 3902 if (unlikely(match)) {
3762 if (debug_locks_off_graph_unlock()) 3903 if (debug_locks_off_graph_unlock()) {
3904 /*
3905 * We all just reset everything, how did it match?
3906 */
3763 WARN_ON(1); 3907 WARN_ON(1);
3908 }
3764 goto out_restore; 3909 goto out_restore;
3765 } 3910 }
3766 } 3911 }
@@ -3839,9 +3984,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3839 if (debug_locks_silent) 3984 if (debug_locks_silent)
3840 return; 3985 return;
3841 3986
3842 printk("\n=========================\n"); 3987 printk("\n");
3843 printk( "[ BUG: held lock freed! ]\n"); 3988 printk("=========================\n");
3844 printk( "-------------------------\n"); 3989 printk("[ BUG: held lock freed! ]\n");
3990 printk("-------------------------\n");
3845 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 3991 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3846 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 3992 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
3847 print_lock(hlock); 3993 print_lock(hlock);
@@ -3895,9 +4041,10 @@ static void print_held_locks_bug(struct task_struct *curr)
3895 if (debug_locks_silent) 4041 if (debug_locks_silent)
3896 return; 4042 return;
3897 4043
3898 printk("\n=====================================\n"); 4044 printk("\n");
3899 printk( "[ BUG: lock held at task exit time! ]\n"); 4045 printk("=====================================\n");
3900 printk( "-------------------------------------\n"); 4046 printk("[ BUG: lock held at task exit time! ]\n");
4047 printk("-------------------------------------\n");
3901 printk("%s/%d is exiting with locks still held!\n", 4048 printk("%s/%d is exiting with locks still held!\n",
3902 curr->comm, task_pid_nr(curr)); 4049 curr->comm, task_pid_nr(curr));
3903 lockdep_print_held_locks(curr); 4050 lockdep_print_held_locks(curr);
@@ -3991,16 +4138,17 @@ void lockdep_sys_exit(void)
3991 if (unlikely(curr->lockdep_depth)) { 4138 if (unlikely(curr->lockdep_depth)) {
3992 if (!debug_locks_off()) 4139 if (!debug_locks_off())
3993 return; 4140 return;
3994 printk("\n================================================\n"); 4141 printk("\n");
3995 printk( "[ BUG: lock held when returning to user space! ]\n"); 4142 printk("================================================\n");
3996 printk( "------------------------------------------------\n"); 4143 printk("[ BUG: lock held when returning to user space! ]\n");
4144 printk("------------------------------------------------\n");
3997 printk("%s/%d is leaving the kernel with locks still held!\n", 4145 printk("%s/%d is leaving the kernel with locks still held!\n",
3998 curr->comm, curr->pid); 4146 curr->comm, curr->pid);
3999 lockdep_print_held_locks(curr); 4147 lockdep_print_held_locks(curr);
4000 } 4148 }
4001} 4149}
4002 4150
4003void lockdep_rcu_dereference(const char *file, const int line) 4151void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4004{ 4152{
4005 struct task_struct *curr = current; 4153 struct task_struct *curr = current;
4006 4154
@@ -4009,15 +4157,15 @@ void lockdep_rcu_dereference(const char *file, const int line)
4009 return; 4157 return;
4010#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ 4158#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4011 /* Note: the following can be executed concurrently, so be careful. */ 4159 /* Note: the following can be executed concurrently, so be careful. */
4012 printk("\n===================================================\n"); 4160 printk("\n");
4013 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 4161 printk("===============================\n");
4014 printk( "---------------------------------------------------\n"); 4162 printk("[ INFO: suspicious RCU usage. ]\n");
4015 printk("%s:%d invoked rcu_dereference_check() without protection!\n", 4163 printk("-------------------------------\n");
4016 file, line); 4164 printk("%s:%d %s!\n", file, line, s);
4017 printk("\nother info that might help us debug this:\n\n"); 4165 printk("\nother info that might help us debug this:\n\n");
4018 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4166 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4019 lockdep_print_held_locks(curr); 4167 lockdep_print_held_locks(curr);
4020 printk("\nstack backtrace:\n"); 4168 printk("\nstack backtrace:\n");
4021 dump_stack(); 4169 dump_stack();
4022} 4170}
4023EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); 4171EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/module.c b/kernel/module.c
index 04379f92f843..93342d992f34 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3487,50 +3487,3 @@ void module_layout(struct module *mod,
3487} 3487}
3488EXPORT_SYMBOL(module_layout); 3488EXPORT_SYMBOL(module_layout);
3489#endif 3489#endif
3490
3491#ifdef CONFIG_TRACEPOINTS
3492void module_update_tracepoints(void)
3493{
3494 struct module *mod;
3495
3496 mutex_lock(&module_mutex);
3497 list_for_each_entry(mod, &modules, list)
3498 if (!mod->taints)
3499 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3500 mod->tracepoints_ptrs + mod->num_tracepoints);
3501 mutex_unlock(&module_mutex);
3502}
3503
3504/*
3505 * Returns 0 if current not found.
3506 * Returns 1 if current found.
3507 */
3508int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3509{
3510 struct module *iter_mod;
3511 int found = 0;
3512
3513 mutex_lock(&module_mutex);
3514 list_for_each_entry(iter_mod, &modules, list) {
3515 if (!iter_mod->taints) {
3516 /*
3517 * Sorted module list
3518 */
3519 if (iter_mod < iter->module)
3520 continue;
3521 else if (iter_mod > iter->module)
3522 iter->tracepoint = NULL;
3523 found = tracepoint_get_iter_range(&iter->tracepoint,
3524 iter_mod->tracepoints_ptrs,
3525 iter_mod->tracepoints_ptrs
3526 + iter_mod->num_tracepoints);
3527 if (found) {
3528 iter->module = iter_mod;
3529 break;
3530 }
3531 }
3532 }
3533 mutex_unlock(&module_mutex);
3534 return found;
3535}
3536#endif
diff --git a/kernel/params.c b/kernel/params.c
index 22df3e0d142a..821788947e40 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -67,20 +67,27 @@ static void maybe_kfree_parameter(void *param)
67 } 67 }
68} 68}
69 69
70static inline char dash2underscore(char c) 70static char dash2underscore(char c)
71{ 71{
72 if (c == '-') 72 if (c == '-')
73 return '_'; 73 return '_';
74 return c; 74 return c;
75} 75}
76 76
77static inline int parameq(const char *input, const char *paramname) 77bool parameqn(const char *a, const char *b, size_t n)
78{ 78{
79 unsigned int i; 79 size_t i;
80 for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) 80
81 if (input[i] == '\0') 81 for (i = 0; i < n; i++) {
82 return 1; 82 if (dash2underscore(a[i]) != dash2underscore(b[i]))
83 return 0; 83 return false;
84 }
85 return true;
86}
87
88bool parameq(const char *a, const char *b)
89{
90 return parameqn(a, b, strlen(a)+1);
84} 91}
85 92
86static int parse_one(char *param, 93static int parse_one(char *param,
diff --git a/kernel/pid.c b/kernel/pid.c
index e432057f3b21..8cafe7e72ad2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -418,7 +418,9 @@ EXPORT_SYMBOL(pid_task);
418 */ 418 */
419struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 419struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
420{ 420{
421 rcu_lockdep_assert(rcu_read_lock_held()); 421 rcu_lockdep_assert(rcu_read_lock_held(),
422 "find_task_by_pid_ns() needs rcu_read_lock()"
423 " protection");
422 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 424 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
423} 425}
424 426
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 58f405b581e7..e7cb76dc18f5 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
250 do { 250 do {
251 times->utime = cputime_add(times->utime, t->utime); 251 times->utime = cputime_add(times->utime, t->utime);
252 times->stime = cputime_add(times->stime, t->stime); 252 times->stime = cputime_add(times->stime, t->stime);
253 times->sum_exec_runtime += t->se.sum_exec_runtime; 253 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t); 254 } while_each_thread(tsk, t);
255out: 255out:
256 rcu_read_unlock(); 256 rcu_read_unlock();
@@ -274,9 +274,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
274 struct task_cputime sum; 274 struct task_cputime sum;
275 unsigned long flags; 275 unsigned long flags;
276 276
277 spin_lock_irqsave(&cputimer->lock, flags);
278 if (!cputimer->running) { 277 if (!cputimer->running) {
279 cputimer->running = 1;
280 /* 278 /*
281 * The POSIX timer interface allows for absolute time expiry 279 * The POSIX timer interface allows for absolute time expiry
282 * values through the TIMER_ABSTIME flag, therefore we have 280 * values through the TIMER_ABSTIME flag, therefore we have
@@ -284,10 +282,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
284 * it. 282 * it.
285 */ 283 */
286 thread_group_cputime(tsk, &sum); 284 thread_group_cputime(tsk, &sum);
285 raw_spin_lock_irqsave(&cputimer->lock, flags);
286 cputimer->running = 1;
287 update_gt_cputime(&cputimer->cputime, &sum); 287 update_gt_cputime(&cputimer->cputime, &sum);
288 } 288 } else
289 raw_spin_lock_irqsave(&cputimer->lock, flags);
289 *times = cputimer->cputime; 290 *times = cputimer->cputime;
290 spin_unlock_irqrestore(&cputimer->lock, flags); 291 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
291} 292}
292 293
293/* 294/*
@@ -312,7 +313,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
312 cpu->cpu = cputime.utime; 313 cpu->cpu = cputime.utime;
313 break; 314 break;
314 case CPUCLOCK_SCHED: 315 case CPUCLOCK_SCHED:
315 cpu->sched = thread_group_sched_runtime(p); 316 thread_group_cputime(p, &cputime);
317 cpu->sched = cputime.sum_exec_runtime;
316 break; 318 break;
317 } 319 }
318 return 0; 320 return 0;
@@ -997,9 +999,9 @@ static void stop_process_timers(struct signal_struct *sig)
997 struct thread_group_cputimer *cputimer = &sig->cputimer; 999 struct thread_group_cputimer *cputimer = &sig->cputimer;
998 unsigned long flags; 1000 unsigned long flags;
999 1001
1000 spin_lock_irqsave(&cputimer->lock, flags); 1002 raw_spin_lock_irqsave(&cputimer->lock, flags);
1001 cputimer->running = 0; 1003 cputimer->running = 0;
1002 spin_unlock_irqrestore(&cputimer->lock, flags); 1004 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
1003} 1005}
1004 1006
1005static u32 onecputick; 1007static u32 onecputick;
@@ -1289,9 +1291,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1289 if (sig->cputimer.running) { 1291 if (sig->cputimer.running) {
1290 struct task_cputime group_sample; 1292 struct task_cputime group_sample;
1291 1293
1292 spin_lock(&sig->cputimer.lock); 1294 raw_spin_lock(&sig->cputimer.lock);
1293 group_sample = sig->cputimer.cputime; 1295 group_sample = sig->cputimer.cputime;
1294 spin_unlock(&sig->cputimer.lock); 1296 raw_spin_unlock(&sig->cputimer.lock);
1295 1297
1296 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1298 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1297 return 1; 1299 return 1;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 80a85971cf64..deb5461e3216 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,7 @@ config HIBERNATION
27 select HIBERNATE_CALLBACKS 27 select HIBERNATE_CALLBACKS
28 select LZO_COMPRESS 28 select LZO_COMPRESS
29 select LZO_DECOMPRESS 29 select LZO_DECOMPRESS
30 select CRC32
30 ---help--- 31 ---help---
31 Enable the suspend to disk (STD) functionality, which is usually 32 Enable the suspend to disk (STD) functionality, which is usually
32 called "hibernation" in user interfaces. STD checkpoints the 33 called "hibernation" in user interfaces. STD checkpoints the
@@ -65,6 +66,9 @@ config HIBERNATION
65 66
66 For more information take a look at <file:Documentation/power/swsusp.txt>. 67 For more information take a look at <file:Documentation/power/swsusp.txt>.
67 68
69config ARCH_SAVE_PAGE_KEYS
70 bool
71
68config PM_STD_PARTITION 72config PM_STD_PARTITION
69 string "Default resume partition" 73 string "Default resume partition"
70 depends on HIBERNATION 74 depends on HIBERNATION
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c5ebc6a90643..07e0e28ffba7 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,8 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o qos.o
5obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
6obj-$(CONFIG_FREEZER) += process.o 6obj-$(CONFIG_FREEZER) += process.o
7obj-$(CONFIG_SUSPEND) += suspend.o 7obj-$(CONFIG_SUSPEND) += suspend.o
8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 218e5af90156..b1dc456474b5 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * drivers/power/process.c - Functions for saving/restoring console. 2 * Functions for saving/restoring console.
3 * 3 *
4 * Originally from swsusp. 4 * Originally from swsusp.
5 */ 5 */
@@ -10,7 +10,6 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
13#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 13#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 14
16static int orig_fgconsole, orig_kmsg; 15static int orig_fgconsole, orig_kmsg;
@@ -32,4 +31,3 @@ void pm_restore_console(void)
32 vt_kmsg_redirect(orig_kmsg); 31 vt_kmsg_redirect(orig_kmsg);
33 } 32 }
34} 33}
35#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8f7b1db1ece1..1c53f7fad5f7 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -14,6 +14,7 @@
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/device.h> 16#include <linux/device.h>
17#include <linux/async.h>
17#include <linux/kmod.h> 18#include <linux/kmod.h>
18#include <linux/delay.h> 19#include <linux/delay.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
@@ -29,12 +30,14 @@
29#include "power.h" 30#include "power.h"
30 31
31 32
32static int nocompress = 0; 33static int nocompress;
33static int noresume = 0; 34static int noresume;
35static int resume_wait;
36static int resume_delay;
34static char resume_file[256] = CONFIG_PM_STD_PARTITION; 37static char resume_file[256] = CONFIG_PM_STD_PARTITION;
35dev_t swsusp_resume_device; 38dev_t swsusp_resume_device;
36sector_t swsusp_resume_block; 39sector_t swsusp_resume_block;
37int in_suspend __nosavedata = 0; 40int in_suspend __nosavedata;
38 41
39enum { 42enum {
40 HIBERNATION_INVALID, 43 HIBERNATION_INVALID,
@@ -334,13 +337,17 @@ int hibernation_snapshot(int platform_mode)
334 if (error) 337 if (error)
335 goto Close; 338 goto Close;
336 339
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
341 /* Preallocate image memory before shutting down devices. */ 340 /* Preallocate image memory before shutting down devices. */
342 error = hibernate_preallocate_memory(); 341 error = hibernate_preallocate_memory();
343 if (error) 342 if (error)
343 goto Close;
344
345 error = freeze_kernel_threads();
346 if (error)
347 goto Close;
348
349 error = dpm_prepare(PMSG_FREEZE);
350 if (error)
344 goto Complete_devices; 351 goto Complete_devices;
345 352
346 suspend_console(); 353 suspend_console();
@@ -463,7 +470,7 @@ static int resume_target_kernel(bool platform_mode)
463 * @platform_mode: If set, use platform driver to prepare for the transition. 470 * @platform_mode: If set, use platform driver to prepare for the transition.
464 * 471 *
465 * This routine must be called with pm_mutex held. If it is successful, control 472 * This routine must be called with pm_mutex held. If it is successful, control
466 * reappears in the restored target kernel in hibernation_snaphot(). 473 * reappears in the restored target kernel in hibernation_snapshot().
467 */ 474 */
468int hibernation_restore(int platform_mode) 475int hibernation_restore(int platform_mode)
469{ 476{
@@ -650,6 +657,9 @@ int hibernate(void)
650 flags |= SF_PLATFORM_MODE; 657 flags |= SF_PLATFORM_MODE;
651 if (nocompress) 658 if (nocompress)
652 flags |= SF_NOCOMPRESS_MODE; 659 flags |= SF_NOCOMPRESS_MODE;
660 else
661 flags |= SF_CRC32_MODE;
662
653 pr_debug("PM: writing image.\n"); 663 pr_debug("PM: writing image.\n");
654 error = swsusp_write(flags); 664 error = swsusp_write(flags);
655 swsusp_free(); 665 swsusp_free();
@@ -724,6 +734,12 @@ static int software_resume(void)
724 734
725 pr_debug("PM: Checking hibernation image partition %s\n", resume_file); 735 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
726 736
737 if (resume_delay) {
738 printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
739 resume_delay);
740 ssleep(resume_delay);
741 }
742
727 /* Check if the device is there */ 743 /* Check if the device is there */
728 swsusp_resume_device = name_to_dev_t(resume_file); 744 swsusp_resume_device = name_to_dev_t(resume_file);
729 if (!swsusp_resume_device) { 745 if (!swsusp_resume_device) {
@@ -732,6 +748,13 @@ static int software_resume(void)
732 * to wait for this to finish. 748 * to wait for this to finish.
733 */ 749 */
734 wait_for_device_probe(); 750 wait_for_device_probe();
751
752 if (resume_wait) {
753 while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
754 msleep(10);
755 async_synchronize_full();
756 }
757
735 /* 758 /*
736 * We can't depend on SCSI devices being available after loading 759 * We can't depend on SCSI devices being available after loading
737 * one of their modules until scsi_complete_async_scans() is 760 * one of their modules until scsi_complete_async_scans() is
@@ -1060,7 +1083,21 @@ static int __init noresume_setup(char *str)
1060 return 1; 1083 return 1;
1061} 1084}
1062 1085
1086static int __init resumewait_setup(char *str)
1087{
1088 resume_wait = 1;
1089 return 1;
1090}
1091
1092static int __init resumedelay_setup(char *str)
1093{
1094 resume_delay = simple_strtoul(str, NULL, 0);
1095 return 1;
1096}
1097
1063__setup("noresume", noresume_setup); 1098__setup("noresume", noresume_setup);
1064__setup("resume_offset=", resume_offset_setup); 1099__setup("resume_offset=", resume_offset_setup);
1065__setup("resume=", resume_setup); 1100__setup("resume=", resume_setup);
1066__setup("hibernate=", hibernate_setup); 1101__setup("hibernate=", hibernate_setup);
1102__setup("resumewait", resumewait_setup);
1103__setup("resumedelay=", resumedelay_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6c601f871964..a52e88425a31 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -12,6 +12,8 @@
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
14#include <linux/workqueue.h> 14#include <linux/workqueue.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
15 17
16#include "power.h" 18#include "power.h"
17 19
@@ -131,6 +133,101 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
131power_attr(pm_test); 133power_attr(pm_test);
132#endif /* CONFIG_PM_DEBUG */ 134#endif /* CONFIG_PM_DEBUG */
133 135
136#ifdef CONFIG_DEBUG_FS
137static char *suspend_step_name(enum suspend_stat_step step)
138{
139 switch (step) {
140 case SUSPEND_FREEZE:
141 return "freeze";
142 case SUSPEND_PREPARE:
143 return "prepare";
144 case SUSPEND_SUSPEND:
145 return "suspend";
146 case SUSPEND_SUSPEND_NOIRQ:
147 return "suspend_noirq";
148 case SUSPEND_RESUME_NOIRQ:
149 return "resume_noirq";
150 case SUSPEND_RESUME:
151 return "resume";
152 default:
153 return "";
154 }
155}
156
157static int suspend_stats_show(struct seq_file *s, void *unused)
158{
159 int i, index, last_dev, last_errno, last_step;
160
161 last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
162 last_dev %= REC_FAILED_NUM;
163 last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
164 last_errno %= REC_FAILED_NUM;
165 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
166 last_step %= REC_FAILED_NUM;
167 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
168 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
169 "success", suspend_stats.success,
170 "fail", suspend_stats.fail,
171 "failed_freeze", suspend_stats.failed_freeze,
172 "failed_prepare", suspend_stats.failed_prepare,
173 "failed_suspend", suspend_stats.failed_suspend,
174 "failed_suspend_noirq",
175 suspend_stats.failed_suspend_noirq,
176 "failed_resume", suspend_stats.failed_resume,
177 "failed_resume_noirq",
178 suspend_stats.failed_resume_noirq);
179 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
180 suspend_stats.failed_devs[last_dev]);
181 for (i = 1; i < REC_FAILED_NUM; i++) {
182 index = last_dev + REC_FAILED_NUM - i;
183 index %= REC_FAILED_NUM;
184 seq_printf(s, "\t\t\t%-s\n",
185 suspend_stats.failed_devs[index]);
186 }
187 seq_printf(s, " last_failed_errno:\t%-d\n",
188 suspend_stats.errno[last_errno]);
189 for (i = 1; i < REC_FAILED_NUM; i++) {
190 index = last_errno + REC_FAILED_NUM - i;
191 index %= REC_FAILED_NUM;
192 seq_printf(s, "\t\t\t%-d\n",
193 suspend_stats.errno[index]);
194 }
195 seq_printf(s, " last_failed_step:\t%-s\n",
196 suspend_step_name(
197 suspend_stats.failed_steps[last_step]));
198 for (i = 1; i < REC_FAILED_NUM; i++) {
199 index = last_step + REC_FAILED_NUM - i;
200 index %= REC_FAILED_NUM;
201 seq_printf(s, "\t\t\t%-s\n",
202 suspend_step_name(
203 suspend_stats.failed_steps[index]));
204 }
205
206 return 0;
207}
208
209static int suspend_stats_open(struct inode *inode, struct file *file)
210{
211 return single_open(file, suspend_stats_show, NULL);
212}
213
214static const struct file_operations suspend_stats_operations = {
215 .open = suspend_stats_open,
216 .read = seq_read,
217 .llseek = seq_lseek,
218 .release = single_release,
219};
220
221static int __init pm_debugfs_init(void)
222{
223 debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
224 NULL, NULL, &suspend_stats_operations);
225 return 0;
226}
227
228late_initcall(pm_debugfs_init);
229#endif /* CONFIG_DEBUG_FS */
230
134#endif /* CONFIG_PM_SLEEP */ 231#endif /* CONFIG_PM_SLEEP */
135 232
136struct kobject *power_kobj; 233struct kobject *power_kobj;
@@ -194,6 +291,11 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
194 } 291 }
195 if (state < PM_SUSPEND_MAX && *s) 292 if (state < PM_SUSPEND_MAX && *s)
196 error = enter_state(state); 293 error = enter_state(state);
294 if (error) {
295 suspend_stats.fail++;
296 dpm_save_failed_errno(error);
297 } else
298 suspend_stats.success++;
197#endif 299#endif
198 300
199 Exit: 301 Exit:
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9a00a0a26280..23a2db1ec442 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -146,6 +146,7 @@ extern int swsusp_swap_in_use(void);
146 */ 146 */
147#define SF_PLATFORM_MODE 1 147#define SF_PLATFORM_MODE 1
148#define SF_NOCOMPRESS_MODE 2 148#define SF_NOCOMPRESS_MODE 2
149#define SF_CRC32_MODE 4
149 150
150/* kernel/power/hibernate.c */ 151/* kernel/power/hibernate.c */
151extern int swsusp_check(void); 152extern int swsusp_check(void);
@@ -228,7 +229,8 @@ extern int pm_test_level;
228#ifdef CONFIG_SUSPEND_FREEZER 229#ifdef CONFIG_SUSPEND_FREEZER
229static inline int suspend_freeze_processes(void) 230static inline int suspend_freeze_processes(void)
230{ 231{
231 return freeze_processes(); 232 int error = freeze_processes();
233 return error ? : freeze_kernel_threads();
232} 234}
233 235
234static inline void suspend_thaw_processes(void) 236static inline void suspend_thaw_processes(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0cf3a27a6c9d..addbbe5531bc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -135,7 +135,7 @@ static int try_to_freeze_tasks(bool sig_only)
135} 135}
136 136
137/** 137/**
138 * freeze_processes - tell processes to enter the refrigerator 138 * freeze_processes - Signal user space processes to enter the refrigerator.
139 */ 139 */
140int freeze_processes(void) 140int freeze_processes(void)
141{ 141{
@@ -143,20 +143,30 @@ int freeze_processes(void)
143 143
144 printk("Freezing user space processes ... "); 144 printk("Freezing user space processes ... ");
145 error = try_to_freeze_tasks(true); 145 error = try_to_freeze_tasks(true);
146 if (error) 146 if (!error) {
147 goto Exit; 147 printk("done.");
148 printk("done.\n"); 148 oom_killer_disable();
149 }
150 printk("\n");
151 BUG_ON(in_atomic());
152
153 return error;
154}
155
156/**
157 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
158 */
159int freeze_kernel_threads(void)
160{
161 int error;
149 162
150 printk("Freezing remaining freezable tasks ... "); 163 printk("Freezing remaining freezable tasks ... ");
151 error = try_to_freeze_tasks(false); 164 error = try_to_freeze_tasks(false);
152 if (error) 165 if (!error)
153 goto Exit; 166 printk("done.");
154 printk("done.");
155 167
156 oom_killer_disable();
157 Exit:
158 BUG_ON(in_atomic());
159 printk("\n"); 168 printk("\n");
169 BUG_ON(in_atomic());
160 170
161 return error; 171 return error;
162} 172}
diff --git a/kernel/pm_qos_params.c b/kernel/power/qos.c
index 37f05d0f0793..1c1797dd1d1d 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/power/qos.c
@@ -29,7 +29,7 @@
29 29
30/*#define DEBUG*/ 30/*#define DEBUG*/
31 31
32#include <linux/pm_qos_params.h> 32#include <linux/pm_qos.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/spinlock.h> 34#include <linux/spinlock.h>
35#include <linux/slab.h> 35#include <linux/slab.h>
@@ -45,62 +45,57 @@
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46 46
47/* 47/*
48 * locking rule: all changes to requests or notifiers lists 48 * locking rule: all changes to constraints or notifiers lists
49 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 49 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
50 * held, taken with _irqsave. One lock to rule them all 50 * held, taken with _irqsave. One lock to rule them all
51 */ 51 */
52enum pm_qos_type {
53 PM_QOS_MAX, /* return the largest value */
54 PM_QOS_MIN /* return the smallest value */
55};
56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
60 * types linux supports for 32 bit quantites
61 */
62struct pm_qos_object { 52struct pm_qos_object {
63 struct plist_head requests; 53 struct pm_qos_constraints *constraints;
64 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 54 struct miscdevice pm_qos_power_miscdev;
66 char *name; 55 char *name;
67 s32 target_value; /* Do not change to 64 bit */
68 s32 default_value;
69 enum pm_qos_type type;
70}; 56};
71 57
72static DEFINE_SPINLOCK(pm_qos_lock); 58static DEFINE_SPINLOCK(pm_qos_lock);
73 59
74static struct pm_qos_object null_pm_qos; 60static struct pm_qos_object null_pm_qos;
61
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 62static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 63static struct pm_qos_constraints cpu_dma_constraints = {
77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), 64 .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
78 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency",
80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 65 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 66 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
82 .type = PM_QOS_MIN, 67 .type = PM_QOS_MIN,
68 .notifiers = &cpu_dma_lat_notifier,
69};
70static struct pm_qos_object cpu_dma_pm_qos = {
71 .constraints = &cpu_dma_constraints,
83}; 72};
84 73
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 74static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 75static struct pm_qos_constraints network_lat_constraints = {
87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), 76 .list = PLIST_HEAD_INIT(network_lat_constraints.list),
88 .notifiers = &network_lat_notifier,
89 .name = "network_latency",
90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 77 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 78 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
92 .type = PM_QOS_MIN 79 .type = PM_QOS_MIN,
80 .notifiers = &network_lat_notifier,
81};
82static struct pm_qos_object network_lat_pm_qos = {
83 .constraints = &network_lat_constraints,
84 .name = "network_latency",
93}; 85};
94 86
95 87
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 88static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 89static struct pm_qos_constraints network_tput_constraints = {
98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), 90 .list = PLIST_HEAD_INIT(network_tput_constraints.list),
99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput",
101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 91 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 92 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
103 .type = PM_QOS_MAX, 93 .type = PM_QOS_MAX,
94 .notifiers = &network_throughput_notifier,
95};
96static struct pm_qos_object network_throughput_pm_qos = {
97 .constraints = &network_tput_constraints,
98 .name = "network_throughput",
104}; 99};
105 100
106 101
@@ -127,17 +122,17 @@ static const struct file_operations pm_qos_power_fops = {
127}; 122};
128 123
129/* unlocked internal variant */ 124/* unlocked internal variant */
130static inline int pm_qos_get_value(struct pm_qos_object *o) 125static inline int pm_qos_get_value(struct pm_qos_constraints *c)
131{ 126{
132 if (plist_head_empty(&o->requests)) 127 if (plist_head_empty(&c->list))
133 return o->default_value; 128 return c->default_value;
134 129
135 switch (o->type) { 130 switch (c->type) {
136 case PM_QOS_MIN: 131 case PM_QOS_MIN:
137 return plist_first(&o->requests)->prio; 132 return plist_first(&c->list)->prio;
138 133
139 case PM_QOS_MAX: 134 case PM_QOS_MAX:
140 return plist_last(&o->requests)->prio; 135 return plist_last(&c->list)->prio;
141 136
142 default: 137 default:
143 /* runtime check for not using enum */ 138 /* runtime check for not using enum */
@@ -145,69 +140,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
145 } 140 }
146} 141}
147 142
148static inline s32 pm_qos_read_value(struct pm_qos_object *o) 143s32 pm_qos_read_value(struct pm_qos_constraints *c)
149{ 144{
150 return o->target_value; 145 return c->target_value;
151} 146}
152 147
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) 148static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
154{ 149{
155 o->target_value = value; 150 c->target_value = value;
156} 151}
157 152
158static void update_target(struct pm_qos_object *o, struct plist_node *node, 153/**
159 int del, int value) 154 * pm_qos_update_target - manages the constraints list and calls the notifiers
155 * if needed
156 * @c: constraints data struct
157 * @node: request to add to the list, to update or to remove
158 * @action: action to take on the constraints list
159 * @value: value of the request to add or update
160 *
161 * This function returns 1 if the aggregated constraint value has changed, 0
162 * otherwise.
163 */
164int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
165 enum pm_qos_req_action action, int value)
160{ 166{
161 unsigned long flags; 167 unsigned long flags;
162 int prev_value, curr_value; 168 int prev_value, curr_value, new_value;
163 169
164 spin_lock_irqsave(&pm_qos_lock, flags); 170 spin_lock_irqsave(&pm_qos_lock, flags);
165 prev_value = pm_qos_get_value(o); 171 prev_value = pm_qos_get_value(c);
166 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ 172 if (value == PM_QOS_DEFAULT_VALUE)
167 if (value != PM_QOS_DEFAULT_VALUE) { 173 new_value = c->default_value;
174 else
175 new_value = value;
176
177 switch (action) {
178 case PM_QOS_REMOVE_REQ:
179 plist_del(node, &c->list);
180 break;
181 case PM_QOS_UPDATE_REQ:
168 /* 182 /*
169 * to change the list, we atomically remove, reinit 183 * to change the list, we atomically remove, reinit
170 * with new value and add, then see if the extremal 184 * with new value and add, then see if the extremal
171 * changed 185 * changed
172 */ 186 */
173 plist_del(node, &o->requests); 187 plist_del(node, &c->list);
174 plist_node_init(node, value); 188 case PM_QOS_ADD_REQ:
175 plist_add(node, &o->requests); 189 plist_node_init(node, new_value);
176 } else if (del) { 190 plist_add(node, &c->list);
177 plist_del(node, &o->requests); 191 break;
178 } else { 192 default:
179 plist_add(node, &o->requests); 193 /* no action */
194 ;
180 } 195 }
181 curr_value = pm_qos_get_value(o); 196
182 pm_qos_set_value(o, curr_value); 197 curr_value = pm_qos_get_value(c);
198 pm_qos_set_value(c, curr_value);
199
183 spin_unlock_irqrestore(&pm_qos_lock, flags); 200 spin_unlock_irqrestore(&pm_qos_lock, flags);
184 201
185 if (prev_value != curr_value) 202 if (prev_value != curr_value) {
186 blocking_notifier_call_chain(o->notifiers, 203 blocking_notifier_call_chain(c->notifiers,
187 (unsigned long)curr_value, 204 (unsigned long)curr_value,
188 NULL); 205 NULL);
189} 206 return 1;
190 207 } else {
191static int register_pm_qos_misc(struct pm_qos_object *qos) 208 return 0;
192{
193 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
194 qos->pm_qos_power_miscdev.name = qos->name;
195 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
196
197 return misc_register(&qos->pm_qos_power_miscdev);
198}
199
200static int find_pm_qos_object_by_minor(int minor)
201{
202 int pm_qos_class;
203
204 for (pm_qos_class = 0;
205 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
206 if (minor ==
207 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
208 return pm_qos_class;
209 } 209 }
210 return -1;
211} 210}
212 211
213/** 212/**
@@ -218,11 +217,11 @@ static int find_pm_qos_object_by_minor(int minor)
218 */ 217 */
219int pm_qos_request(int pm_qos_class) 218int pm_qos_request(int pm_qos_class)
220{ 219{
221 return pm_qos_read_value(pm_qos_array[pm_qos_class]); 220 return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
222} 221}
223EXPORT_SYMBOL_GPL(pm_qos_request); 222EXPORT_SYMBOL_GPL(pm_qos_request);
224 223
225int pm_qos_request_active(struct pm_qos_request_list *req) 224int pm_qos_request_active(struct pm_qos_request *req)
226{ 225{
227 return req->pm_qos_class != 0; 226 return req->pm_qos_class != 0;
228} 227}
@@ -230,40 +229,36 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active);
230 229
231/** 230/**
232 * pm_qos_add_request - inserts new qos request into the list 231 * pm_qos_add_request - inserts new qos request into the list
233 * @dep: pointer to a preallocated handle 232 * @req: pointer to a preallocated handle
234 * @pm_qos_class: identifies which list of qos request to use 233 * @pm_qos_class: identifies which list of qos request to use
235 * @value: defines the qos request 234 * @value: defines the qos request
236 * 235 *
237 * This function inserts a new entry in the pm_qos_class list of requested qos 236 * This function inserts a new entry in the pm_qos_class list of requested qos
238 * performance characteristics. It recomputes the aggregate QoS expectations 237 * performance characteristics. It recomputes the aggregate QoS expectations
239 * for the pm_qos_class of parameters and initializes the pm_qos_request_list 238 * for the pm_qos_class of parameters and initializes the pm_qos_request
240 * handle. Caller needs to save this handle for later use in updates and 239 * handle. Caller needs to save this handle for later use in updates and
241 * removal. 240 * removal.
242 */ 241 */
243 242
244void pm_qos_add_request(struct pm_qos_request_list *dep, 243void pm_qos_add_request(struct pm_qos_request *req,
245 int pm_qos_class, s32 value) 244 int pm_qos_class, s32 value)
246{ 245{
247 struct pm_qos_object *o = pm_qos_array[pm_qos_class]; 246 if (!req) /*guard against callers passing in null */
248 int new_value; 247 return;
249 248
250 if (pm_qos_request_active(dep)) { 249 if (pm_qos_request_active(req)) {
251 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); 250 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
252 return; 251 return;
253 } 252 }
254 if (value == PM_QOS_DEFAULT_VALUE) 253 req->pm_qos_class = pm_qos_class;
255 new_value = o->default_value; 254 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
256 else 255 &req->node, PM_QOS_ADD_REQ, value);
257 new_value = value;
258 plist_node_init(&dep->list, new_value);
259 dep->pm_qos_class = pm_qos_class;
260 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
261} 256}
262EXPORT_SYMBOL_GPL(pm_qos_add_request); 257EXPORT_SYMBOL_GPL(pm_qos_add_request);
263 258
264/** 259/**
265 * pm_qos_update_request - modifies an existing qos request 260 * pm_qos_update_request - modifies an existing qos request
266 * @pm_qos_req : handle to list element holding a pm_qos request to use 261 * @req : handle to list element holding a pm_qos request to use
267 * @value: defines the qos request 262 * @value: defines the qos request
268 * 263 *
269 * Updates an existing qos request for the pm_qos_class of parameters along 264 * Updates an existing qos request for the pm_qos_class of parameters along
@@ -271,56 +266,47 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
271 * 266 *
272 * Attempts are made to make this code callable on hot code paths. 267 * Attempts are made to make this code callable on hot code paths.
273 */ 268 */
274void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, 269void pm_qos_update_request(struct pm_qos_request *req,
275 s32 new_value) 270 s32 new_value)
276{ 271{
277 s32 temp; 272 if (!req) /*guard against callers passing in null */
278 struct pm_qos_object *o;
279
280 if (!pm_qos_req) /*guard against callers passing in null */
281 return; 273 return;
282 274
283 if (!pm_qos_request_active(pm_qos_req)) { 275 if (!pm_qos_request_active(req)) {
284 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); 276 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
285 return; 277 return;
286 } 278 }
287 279
288 o = pm_qos_array[pm_qos_req->pm_qos_class]; 280 if (new_value != req->node.prio)
289 281 pm_qos_update_target(
290 if (new_value == PM_QOS_DEFAULT_VALUE) 282 pm_qos_array[req->pm_qos_class]->constraints,
291 temp = o->default_value; 283 &req->node, PM_QOS_UPDATE_REQ, new_value);
292 else
293 temp = new_value;
294
295 if (temp != pm_qos_req->list.prio)
296 update_target(o, &pm_qos_req->list, 0, temp);
297} 284}
298EXPORT_SYMBOL_GPL(pm_qos_update_request); 285EXPORT_SYMBOL_GPL(pm_qos_update_request);
299 286
300/** 287/**
301 * pm_qos_remove_request - modifies an existing qos request 288 * pm_qos_remove_request - modifies an existing qos request
302 * @pm_qos_req: handle to request list element 289 * @req: handle to request list element
303 * 290 *
304 * Will remove pm qos request from the list of requests and 291 * Will remove pm qos request from the list of constraints and
305 * recompute the current target value for the pm_qos_class. Call this 292 * recompute the current target value for the pm_qos_class. Call this
306 * on slow code paths. 293 * on slow code paths.
307 */ 294 */
308void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) 295void pm_qos_remove_request(struct pm_qos_request *req)
309{ 296{
310 struct pm_qos_object *o; 297 if (!req) /*guard against callers passing in null */
311
312 if (pm_qos_req == NULL)
313 return; 298 return;
314 /* silent return to keep pcm code cleaner */ 299 /* silent return to keep pcm code cleaner */
315 300
316 if (!pm_qos_request_active(pm_qos_req)) { 301 if (!pm_qos_request_active(req)) {
317 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); 302 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
318 return; 303 return;
319 } 304 }
320 305
321 o = pm_qos_array[pm_qos_req->pm_qos_class]; 306 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
322 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); 307 &req->node, PM_QOS_REMOVE_REQ,
323 memset(pm_qos_req, 0, sizeof(*pm_qos_req)); 308 PM_QOS_DEFAULT_VALUE);
309 memset(req, 0, sizeof(*req));
324} 310}
325EXPORT_SYMBOL_GPL(pm_qos_remove_request); 311EXPORT_SYMBOL_GPL(pm_qos_remove_request);
326 312
@@ -337,7 +323,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
337 int retval; 323 int retval;
338 324
339 retval = blocking_notifier_chain_register( 325 retval = blocking_notifier_chain_register(
340 pm_qos_array[pm_qos_class]->notifiers, notifier); 326 pm_qos_array[pm_qos_class]->constraints->notifiers,
327 notifier);
341 328
342 return retval; 329 return retval;
343} 330}
@@ -356,19 +343,43 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
356 int retval; 343 int retval;
357 344
358 retval = blocking_notifier_chain_unregister( 345 retval = blocking_notifier_chain_unregister(
359 pm_qos_array[pm_qos_class]->notifiers, notifier); 346 pm_qos_array[pm_qos_class]->constraints->notifiers,
347 notifier);
360 348
361 return retval; 349 return retval;
362} 350}
363EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 351EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
364 352
353/* User space interface to PM QoS classes via misc devices */
354static int register_pm_qos_misc(struct pm_qos_object *qos)
355{
356 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
357 qos->pm_qos_power_miscdev.name = qos->name;
358 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
359
360 return misc_register(&qos->pm_qos_power_miscdev);
361}
362
363static int find_pm_qos_object_by_minor(int minor)
364{
365 int pm_qos_class;
366
367 for (pm_qos_class = 0;
368 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
369 if (minor ==
370 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
371 return pm_qos_class;
372 }
373 return -1;
374}
375
365static int pm_qos_power_open(struct inode *inode, struct file *filp) 376static int pm_qos_power_open(struct inode *inode, struct file *filp)
366{ 377{
367 long pm_qos_class; 378 long pm_qos_class;
368 379
369 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 380 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
370 if (pm_qos_class >= 0) { 381 if (pm_qos_class >= 0) {
371 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); 382 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
372 if (!req) 383 if (!req)
373 return -ENOMEM; 384 return -ENOMEM;
374 385
@@ -383,7 +394,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
383 394
384static int pm_qos_power_release(struct inode *inode, struct file *filp) 395static int pm_qos_power_release(struct inode *inode, struct file *filp)
385{ 396{
386 struct pm_qos_request_list *req; 397 struct pm_qos_request *req;
387 398
388 req = filp->private_data; 399 req = filp->private_data;
389 pm_qos_remove_request(req); 400 pm_qos_remove_request(req);
@@ -398,17 +409,15 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
398{ 409{
399 s32 value; 410 s32 value;
400 unsigned long flags; 411 unsigned long flags;
401 struct pm_qos_object *o; 412 struct pm_qos_request *req = filp->private_data;
402 struct pm_qos_request_list *pm_qos_req = filp->private_data;
403 413
404 if (!pm_qos_req) 414 if (!req)
405 return -EINVAL; 415 return -EINVAL;
406 if (!pm_qos_request_active(pm_qos_req)) 416 if (!pm_qos_request_active(req))
407 return -EINVAL; 417 return -EINVAL;
408 418
409 o = pm_qos_array[pm_qos_req->pm_qos_class];
410 spin_lock_irqsave(&pm_qos_lock, flags); 419 spin_lock_irqsave(&pm_qos_lock, flags);
411 value = pm_qos_get_value(o); 420 value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
412 spin_unlock_irqrestore(&pm_qos_lock, flags); 421 spin_unlock_irqrestore(&pm_qos_lock, flags);
413 422
414 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); 423 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
@@ -418,7 +427,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
418 size_t count, loff_t *f_pos) 427 size_t count, loff_t *f_pos)
419{ 428{
420 s32 value; 429 s32 value;
421 struct pm_qos_request_list *pm_qos_req; 430 struct pm_qos_request *req;
422 431
423 if (count == sizeof(s32)) { 432 if (count == sizeof(s32)) {
424 if (copy_from_user(&value, buf, sizeof(s32))) 433 if (copy_from_user(&value, buf, sizeof(s32)))
@@ -449,8 +458,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
449 return -EINVAL; 458 return -EINVAL;
450 } 459 }
451 460
452 pm_qos_req = filp->private_data; 461 req = filp->private_data;
453 pm_qos_update_request(pm_qos_req, value); 462 pm_qos_update_request(req, value);
454 463
455 return count; 464 return count;
456} 465}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 06efa54f93d6..cbe2c1441392 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1339,6 +1339,9 @@ int hibernate_preallocate_memory(void)
1339 count += highmem; 1339 count += highmem;
1340 count -= totalreserve_pages; 1340 count -= totalreserve_pages;
1341 1341
1342 /* Add number of pages required for page keys (s390 only). */
1343 size += page_key_additional_pages(saveable);
1344
1342 /* Compute the maximum number of saveable pages to leave in memory. */ 1345 /* Compute the maximum number of saveable pages to leave in memory. */
1343 max_size = (count - (size + PAGES_FOR_IO)) / 2 1346 max_size = (count - (size + PAGES_FOR_IO)) / 2
1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); 1347 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
@@ -1662,6 +1665,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1662 buf[j] = memory_bm_next_pfn(bm); 1665 buf[j] = memory_bm_next_pfn(bm);
1663 if (unlikely(buf[j] == BM_END_OF_MAP)) 1666 if (unlikely(buf[j] == BM_END_OF_MAP))
1664 break; 1667 break;
1668 /* Save page key for data page (s390 only). */
1669 page_key_read(buf + j);
1665 } 1670 }
1666} 1671}
1667 1672
@@ -1821,6 +1826,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1821 if (unlikely(buf[j] == BM_END_OF_MAP)) 1826 if (unlikely(buf[j] == BM_END_OF_MAP))
1822 break; 1827 break;
1823 1828
1829 /* Extract and buffer page key for data page (s390 only). */
1830 page_key_memorize(buf + j);
1831
1824 if (memory_bm_pfn_present(bm, buf[j])) 1832 if (memory_bm_pfn_present(bm, buf[j]))
1825 memory_bm_set_bit(bm, buf[j]); 1833 memory_bm_set_bit(bm, buf[j]);
1826 else 1834 else
@@ -2223,6 +2231,11 @@ int snapshot_write_next(struct snapshot_handle *handle)
2223 if (error) 2231 if (error)
2224 return error; 2232 return error;
2225 2233
2234 /* Allocate buffer for page keys. */
2235 error = page_key_alloc(nr_copy_pages);
2236 if (error)
2237 return error;
2238
2226 } else if (handle->cur <= nr_meta_pages + 1) { 2239 } else if (handle->cur <= nr_meta_pages + 1) {
2227 error = unpack_orig_pfns(buffer, &copy_bm); 2240 error = unpack_orig_pfns(buffer, &copy_bm);
2228 if (error) 2241 if (error)
@@ -2243,6 +2256,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
2243 } 2256 }
2244 } else { 2257 } else {
2245 copy_last_highmem_page(); 2258 copy_last_highmem_page();
2259 /* Restore page key for data page (s390 only). */
2260 page_key_write(handle->buffer);
2246 handle->buffer = get_buffer(&orig_bm, &ca); 2261 handle->buffer = get_buffer(&orig_bm, &ca);
2247 if (IS_ERR(handle->buffer)) 2262 if (IS_ERR(handle->buffer))
2248 return PTR_ERR(handle->buffer); 2263 return PTR_ERR(handle->buffer);
@@ -2264,6 +2279,9 @@ int snapshot_write_next(struct snapshot_handle *handle)
2264void snapshot_write_finalize(struct snapshot_handle *handle) 2279void snapshot_write_finalize(struct snapshot_handle *handle)
2265{ 2280{
2266 copy_last_highmem_page(); 2281 copy_last_highmem_page();
2282 /* Restore page key for data page (s390 only). */
2283 page_key_write(handle->buffer);
2284 page_key_free();
2267 /* Free only if we have loaded the image entirely */ 2285 /* Free only if we have loaded the image entirely */
2268 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { 2286 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2269 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2287 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b6b71ad2208f..fdd4263b995d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -104,7 +104,10 @@ static int suspend_prepare(void)
104 goto Finish; 104 goto Finish;
105 105
106 error = suspend_freeze_processes(); 106 error = suspend_freeze_processes();
107 if (!error) 107 if (error) {
108 suspend_stats.failed_freeze++;
109 dpm_save_failed_step(SUSPEND_FREEZE);
110 } else
108 return 0; 111 return 0;
109 112
110 suspend_thaw_processes(); 113 suspend_thaw_processes();
@@ -315,8 +318,16 @@ int enter_state(suspend_state_t state)
315 */ 318 */
316int pm_suspend(suspend_state_t state) 319int pm_suspend(suspend_state_t state)
317{ 320{
318 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) 321 int ret;
319 return enter_state(state); 322 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) {
323 ret = enter_state(state);
324 if (ret) {
325 suspend_stats.fail++;
326 dpm_save_failed_errno(ret);
327 } else
328 suspend_stats.success++;
329 return ret;
330 }
320 return -EINVAL; 331 return -EINVAL;
321} 332}
322EXPORT_SYMBOL(pm_suspend); 333EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c97c3a0eee3..11a594c4ba25 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -27,6 +27,10 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/lzo.h> 28#include <linux/lzo.h>
29#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
30#include <linux/cpumask.h>
31#include <linux/atomic.h>
32#include <linux/kthread.h>
33#include <linux/crc32.h>
30 34
31#include "power.h" 35#include "power.h"
32 36
@@ -43,8 +47,7 @@
43 * allocated and populated one at a time, so we only need one memory 47 * allocated and populated one at a time, so we only need one memory
44 * page to set up the entire structure. 48 * page to set up the entire structure.
45 * 49 *
46 * During resume we also only need to use one swap_map_page structure 50 * During resume we pick up all swap_map_page structures into a list.
47 * at a time.
48 */ 51 */
49 52
50#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) 53#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
@@ -54,6 +57,11 @@ struct swap_map_page {
54 sector_t next_swap; 57 sector_t next_swap;
55}; 58};
56 59
60struct swap_map_page_list {
61 struct swap_map_page *map;
62 struct swap_map_page_list *next;
63};
64
57/** 65/**
58 * The swap_map_handle structure is used for handling swap in 66 * The swap_map_handle structure is used for handling swap in
59 * a file-alike way 67 * a file-alike way
@@ -61,13 +69,18 @@ struct swap_map_page {
61 69
62struct swap_map_handle { 70struct swap_map_handle {
63 struct swap_map_page *cur; 71 struct swap_map_page *cur;
72 struct swap_map_page_list *maps;
64 sector_t cur_swap; 73 sector_t cur_swap;
65 sector_t first_sector; 74 sector_t first_sector;
66 unsigned int k; 75 unsigned int k;
76 unsigned long nr_free_pages, written;
77 u32 crc32;
67}; 78};
68 79
69struct swsusp_header { 80struct swsusp_header {
70 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 81 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
82 sizeof(u32)];
83 u32 crc32;
71 sector_t image; 84 sector_t image;
72 unsigned int flags; /* Flags to pass to the "boot" kernel */ 85 unsigned int flags; /* Flags to pass to the "boot" kernel */
73 char orig_sig[10]; 86 char orig_sig[10];
@@ -199,6 +212,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
199 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); 212 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
200 swsusp_header->image = handle->first_sector; 213 swsusp_header->image = handle->first_sector;
201 swsusp_header->flags = flags; 214 swsusp_header->flags = flags;
215 if (flags & SF_CRC32_MODE)
216 swsusp_header->crc32 = handle->crc32;
202 error = hib_bio_write_page(swsusp_resume_block, 217 error = hib_bio_write_page(swsusp_resume_block,
203 swsusp_header, NULL); 218 swsusp_header, NULL);
204 } else { 219 } else {
@@ -245,6 +260,7 @@ static int swsusp_swap_check(void)
245static int write_page(void *buf, sector_t offset, struct bio **bio_chain) 260static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
246{ 261{
247 void *src; 262 void *src;
263 int ret;
248 264
249 if (!offset) 265 if (!offset)
250 return -ENOSPC; 266 return -ENOSPC;
@@ -254,9 +270,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
254 if (src) { 270 if (src) {
255 copy_page(src, buf); 271 copy_page(src, buf);
256 } else { 272 } else {
257 WARN_ON_ONCE(1); 273 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
258 bio_chain = NULL; /* Go synchronous */ 274 if (ret)
259 src = buf; 275 return ret;
276 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
277 if (src) {
278 copy_page(src, buf);
279 } else {
280 WARN_ON_ONCE(1);
281 bio_chain = NULL; /* Go synchronous */
282 src = buf;
283 }
260 } 284 }
261 } else { 285 } else {
262 src = buf; 286 src = buf;
@@ -293,6 +317,8 @@ static int get_swap_writer(struct swap_map_handle *handle)
293 goto err_rel; 317 goto err_rel;
294 } 318 }
295 handle->k = 0; 319 handle->k = 0;
320 handle->nr_free_pages = nr_free_pages() >> 1;
321 handle->written = 0;
296 handle->first_sector = handle->cur_swap; 322 handle->first_sector = handle->cur_swap;
297 return 0; 323 return 0;
298err_rel: 324err_rel:
@@ -316,20 +342,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
316 return error; 342 return error;
317 handle->cur->entries[handle->k++] = offset; 343 handle->cur->entries[handle->k++] = offset;
318 if (handle->k >= MAP_PAGE_ENTRIES) { 344 if (handle->k >= MAP_PAGE_ENTRIES) {
319 error = hib_wait_on_bio_chain(bio_chain);
320 if (error)
321 goto out;
322 offset = alloc_swapdev_block(root_swap); 345 offset = alloc_swapdev_block(root_swap);
323 if (!offset) 346 if (!offset)
324 return -ENOSPC; 347 return -ENOSPC;
325 handle->cur->next_swap = offset; 348 handle->cur->next_swap = offset;
326 error = write_page(handle->cur, handle->cur_swap, NULL); 349 error = write_page(handle->cur, handle->cur_swap, bio_chain);
327 if (error) 350 if (error)
328 goto out; 351 goto out;
329 clear_page(handle->cur); 352 clear_page(handle->cur);
330 handle->cur_swap = offset; 353 handle->cur_swap = offset;
331 handle->k = 0; 354 handle->k = 0;
332 } 355 }
356 if (bio_chain && ++handle->written > handle->nr_free_pages) {
357 error = hib_wait_on_bio_chain(bio_chain);
358 if (error)
359 goto out;
360 handle->written = 0;
361 }
333 out: 362 out:
334 return error; 363 return error;
335} 364}
@@ -372,6 +401,13 @@ static int swap_writer_finish(struct swap_map_handle *handle,
372 LZO_HEADER, PAGE_SIZE) 401 LZO_HEADER, PAGE_SIZE)
373#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) 402#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
374 403
404/* Maximum number of threads for compression/decompression. */
405#define LZO_THREADS 3
406
407/* Maximum number of pages for read buffering. */
408#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8)
409
410
375/** 411/**
376 * save_image - save the suspend image data 412 * save_image - save the suspend image data
377 */ 413 */
@@ -419,6 +455,92 @@ static int save_image(struct swap_map_handle *handle,
419 return ret; 455 return ret;
420} 456}
421 457
458/**
459 * Structure used for CRC32.
460 */
461struct crc_data {
462 struct task_struct *thr; /* thread */
463 atomic_t ready; /* ready to start flag */
464 atomic_t stop; /* ready to stop flag */
465 unsigned run_threads; /* nr current threads */
466 wait_queue_head_t go; /* start crc update */
467 wait_queue_head_t done; /* crc update done */
468 u32 *crc32; /* points to handle's crc32 */
469 size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
470 unsigned char *unc[LZO_THREADS]; /* uncompressed data */
471};
472
473/**
474 * CRC32 update function that runs in its own thread.
475 */
476static int crc32_threadfn(void *data)
477{
478 struct crc_data *d = data;
479 unsigned i;
480
481 while (1) {
482 wait_event(d->go, atomic_read(&d->ready) ||
483 kthread_should_stop());
484 if (kthread_should_stop()) {
485 d->thr = NULL;
486 atomic_set(&d->stop, 1);
487 wake_up(&d->done);
488 break;
489 }
490 atomic_set(&d->ready, 0);
491
492 for (i = 0; i < d->run_threads; i++)
493 *d->crc32 = crc32_le(*d->crc32,
494 d->unc[i], *d->unc_len[i]);
495 atomic_set(&d->stop, 1);
496 wake_up(&d->done);
497 }
498 return 0;
499}
500/**
501 * Structure used for LZO data compression.
502 */
503struct cmp_data {
504 struct task_struct *thr; /* thread */
505 atomic_t ready; /* ready to start flag */
506 atomic_t stop; /* ready to stop flag */
507 int ret; /* return code */
508 wait_queue_head_t go; /* start compression */
509 wait_queue_head_t done; /* compression done */
510 size_t unc_len; /* uncompressed length */
511 size_t cmp_len; /* compressed length */
512 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
513 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
514 unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
515};
516
517/**
518 * Compression function that runs in its own thread.
519 */
520static int lzo_compress_threadfn(void *data)
521{
522 struct cmp_data *d = data;
523
524 while (1) {
525 wait_event(d->go, atomic_read(&d->ready) ||
526 kthread_should_stop());
527 if (kthread_should_stop()) {
528 d->thr = NULL;
529 d->ret = -1;
530 atomic_set(&d->stop, 1);
531 wake_up(&d->done);
532 break;
533 }
534 atomic_set(&d->ready, 0);
535
536 d->ret = lzo1x_1_compress(d->unc, d->unc_len,
537 d->cmp + LZO_HEADER, &d->cmp_len,
538 d->wrk);
539 atomic_set(&d->stop, 1);
540 wake_up(&d->done);
541 }
542 return 0;
543}
422 544
423/** 545/**
424 * save_image_lzo - Save the suspend image data compressed with LZO. 546 * save_image_lzo - Save the suspend image data compressed with LZO.
@@ -437,42 +559,93 @@ static int save_image_lzo(struct swap_map_handle *handle,
437 struct bio *bio; 559 struct bio *bio;
438 struct timeval start; 560 struct timeval start;
439 struct timeval stop; 561 struct timeval stop;
440 size_t off, unc_len, cmp_len; 562 size_t off;
441 unsigned char *unc, *cmp, *wrk, *page; 563 unsigned thr, run_threads, nr_threads;
564 unsigned char *page = NULL;
565 struct cmp_data *data = NULL;
566 struct crc_data *crc = NULL;
567
568 /*
569 * We'll limit the number of threads for compression to limit memory
570 * footprint.
571 */
572 nr_threads = num_online_cpus() - 1;
573 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
442 574
443 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 575 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
444 if (!page) { 576 if (!page) {
445 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 577 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
446 return -ENOMEM; 578 ret = -ENOMEM;
579 goto out_clean;
447 } 580 }
448 581
449 wrk = vmalloc(LZO1X_1_MEM_COMPRESS); 582 data = vmalloc(sizeof(*data) * nr_threads);
450 if (!wrk) { 583 if (!data) {
451 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); 584 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
452 free_page((unsigned long)page); 585 ret = -ENOMEM;
453 return -ENOMEM; 586 goto out_clean;
454 } 587 }
588 for (thr = 0; thr < nr_threads; thr++)
589 memset(&data[thr], 0, offsetof(struct cmp_data, go));
455 590
456 unc = vmalloc(LZO_UNC_SIZE); 591 crc = kmalloc(sizeof(*crc), GFP_KERNEL);
457 if (!unc) { 592 if (!crc) {
458 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 593 printk(KERN_ERR "PM: Failed to allocate crc\n");
459 vfree(wrk); 594 ret = -ENOMEM;
460 free_page((unsigned long)page); 595 goto out_clean;
461 return -ENOMEM; 596 }
597 memset(crc, 0, offsetof(struct crc_data, go));
598
599 /*
600 * Start the compression threads.
601 */
602 for (thr = 0; thr < nr_threads; thr++) {
603 init_waitqueue_head(&data[thr].go);
604 init_waitqueue_head(&data[thr].done);
605
606 data[thr].thr = kthread_run(lzo_compress_threadfn,
607 &data[thr],
608 "image_compress/%u", thr);
609 if (IS_ERR(data[thr].thr)) {
610 data[thr].thr = NULL;
611 printk(KERN_ERR
612 "PM: Cannot start compression threads\n");
613 ret = -ENOMEM;
614 goto out_clean;
615 }
462 } 616 }
463 617
464 cmp = vmalloc(LZO_CMP_SIZE); 618 /*
465 if (!cmp) { 619 * Adjust number of free pages after all allocations have been done.
466 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 620 * We don't want to run out of pages when writing.
467 vfree(unc); 621 */
468 vfree(wrk); 622 handle->nr_free_pages = nr_free_pages() >> 1;
469 free_page((unsigned long)page); 623
470 return -ENOMEM; 624 /*
625 * Start the CRC32 thread.
626 */
627 init_waitqueue_head(&crc->go);
628 init_waitqueue_head(&crc->done);
629
630 handle->crc32 = 0;
631 crc->crc32 = &handle->crc32;
632 for (thr = 0; thr < nr_threads; thr++) {
633 crc->unc[thr] = data[thr].unc;
634 crc->unc_len[thr] = &data[thr].unc_len;
635 }
636
637 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
638 if (IS_ERR(crc->thr)) {
639 crc->thr = NULL;
640 printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
641 ret = -ENOMEM;
642 goto out_clean;
471 } 643 }
472 644
473 printk(KERN_INFO 645 printk(KERN_INFO
646 "PM: Using %u thread(s) for compression.\n"
474 "PM: Compressing and saving image data (%u pages) ... ", 647 "PM: Compressing and saving image data (%u pages) ... ",
475 nr_to_write); 648 nr_threads, nr_to_write);
476 m = nr_to_write / 100; 649 m = nr_to_write / 100;
477 if (!m) 650 if (!m)
478 m = 1; 651 m = 1;
@@ -480,55 +653,83 @@ static int save_image_lzo(struct swap_map_handle *handle,
480 bio = NULL; 653 bio = NULL;
481 do_gettimeofday(&start); 654 do_gettimeofday(&start);
482 for (;;) { 655 for (;;) {
483 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 656 for (thr = 0; thr < nr_threads; thr++) {
484 ret = snapshot_read_next(snapshot); 657 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
485 if (ret < 0) 658 ret = snapshot_read_next(snapshot);
486 goto out_finish; 659 if (ret < 0)
487 660 goto out_finish;
488 if (!ret) 661
662 if (!ret)
663 break;
664
665 memcpy(data[thr].unc + off,
666 data_of(*snapshot), PAGE_SIZE);
667
668 if (!(nr_pages % m))
669 printk(KERN_CONT "\b\b\b\b%3d%%",
670 nr_pages / m);
671 nr_pages++;
672 }
673 if (!off)
489 break; 674 break;
490 675
491 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); 676 data[thr].unc_len = off;
492 677
493 if (!(nr_pages % m)) 678 atomic_set(&data[thr].ready, 1);
494 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 679 wake_up(&data[thr].go);
495 nr_pages++;
496 } 680 }
497 681
498 if (!off) 682 if (!thr)
499 break; 683 break;
500 684
501 unc_len = off; 685 crc->run_threads = thr;
502 ret = lzo1x_1_compress(unc, unc_len, 686 atomic_set(&crc->ready, 1);
503 cmp + LZO_HEADER, &cmp_len, wrk); 687 wake_up(&crc->go);
504 if (ret < 0) {
505 printk(KERN_ERR "PM: LZO compression failed\n");
506 break;
507 }
508 688
509 if (unlikely(!cmp_len || 689 for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
510 cmp_len > lzo1x_worst_compress(unc_len))) { 690 wait_event(data[thr].done,
511 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 691 atomic_read(&data[thr].stop));
512 ret = -1; 692 atomic_set(&data[thr].stop, 0);
513 break;
514 }
515 693
516 *(size_t *)cmp = cmp_len; 694 ret = data[thr].ret;
517 695
518 /* 696 if (ret < 0) {
519 * Given we are writing one page at a time to disk, we copy 697 printk(KERN_ERR "PM: LZO compression failed\n");
520 * that much from the buffer, although the last bit will likely 698 goto out_finish;
521 * be smaller than full page. This is OK - we saved the length 699 }
522 * of the compressed data, so any garbage at the end will be
523 * discarded when we read it.
524 */
525 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
526 memcpy(page, cmp + off, PAGE_SIZE);
527 700
528 ret = swap_write_page(handle, page, &bio); 701 if (unlikely(!data[thr].cmp_len ||
529 if (ret) 702 data[thr].cmp_len >
703 lzo1x_worst_compress(data[thr].unc_len))) {
704 printk(KERN_ERR
705 "PM: Invalid LZO compressed length\n");
706 ret = -1;
530 goto out_finish; 707 goto out_finish;
708 }
709
710 *(size_t *)data[thr].cmp = data[thr].cmp_len;
711
712 /*
713 * Given we are writing one page at a time to disk, we
714 * copy that much from the buffer, although the last
715 * bit will likely be smaller than full page. This is
716 * OK - we saved the length of the compressed data, so
717 * any garbage at the end will be discarded when we
718 * read it.
719 */
720 for (off = 0;
721 off < LZO_HEADER + data[thr].cmp_len;
722 off += PAGE_SIZE) {
723 memcpy(page, data[thr].cmp + off, PAGE_SIZE);
724
725 ret = swap_write_page(handle, page, &bio);
726 if (ret)
727 goto out_finish;
728 }
531 } 729 }
730
731 wait_event(crc->done, atomic_read(&crc->stop));
732 atomic_set(&crc->stop, 0);
532 } 733 }
533 734
534out_finish: 735out_finish:
@@ -536,16 +737,25 @@ out_finish:
536 do_gettimeofday(&stop); 737 do_gettimeofday(&stop);
537 if (!ret) 738 if (!ret)
538 ret = err2; 739 ret = err2;
539 if (!ret) 740 if (!ret) {
540 printk(KERN_CONT "\b\b\b\bdone\n"); 741 printk(KERN_CONT "\b\b\b\bdone\n");
541 else 742 } else {
542 printk(KERN_CONT "\n"); 743 printk(KERN_CONT "\n");
744 }
543 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 745 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
544 746out_clean:
545 vfree(cmp); 747 if (crc) {
546 vfree(unc); 748 if (crc->thr)
547 vfree(wrk); 749 kthread_stop(crc->thr);
548 free_page((unsigned long)page); 750 kfree(crc);
751 }
752 if (data) {
753 for (thr = 0; thr < nr_threads; thr++)
754 if (data[thr].thr)
755 kthread_stop(data[thr].thr);
756 vfree(data);
757 }
758 if (page) free_page((unsigned long)page);
549 759
550 return ret; 760 return ret;
551} 761}
@@ -625,8 +835,15 @@ out_finish:
625 835
626static void release_swap_reader(struct swap_map_handle *handle) 836static void release_swap_reader(struct swap_map_handle *handle)
627{ 837{
628 if (handle->cur) 838 struct swap_map_page_list *tmp;
629 free_page((unsigned long)handle->cur); 839
840 while (handle->maps) {
841 if (handle->maps->map)
842 free_page((unsigned long)handle->maps->map);
843 tmp = handle->maps;
844 handle->maps = handle->maps->next;
845 kfree(tmp);
846 }
630 handle->cur = NULL; 847 handle->cur = NULL;
631} 848}
632 849
@@ -634,22 +851,46 @@ static int get_swap_reader(struct swap_map_handle *handle,
634 unsigned int *flags_p) 851 unsigned int *flags_p)
635{ 852{
636 int error; 853 int error;
854 struct swap_map_page_list *tmp, *last;
855 sector_t offset;
637 856
638 *flags_p = swsusp_header->flags; 857 *flags_p = swsusp_header->flags;
639 858
640 if (!swsusp_header->image) /* how can this happen? */ 859 if (!swsusp_header->image) /* how can this happen? */
641 return -EINVAL; 860 return -EINVAL;
642 861
643 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 862 handle->cur = NULL;
644 if (!handle->cur) 863 last = handle->maps = NULL;
645 return -ENOMEM; 864 offset = swsusp_header->image;
865 while (offset) {
866 tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
867 if (!tmp) {
868 release_swap_reader(handle);
869 return -ENOMEM;
870 }
871 memset(tmp, 0, sizeof(*tmp));
872 if (!handle->maps)
873 handle->maps = tmp;
874 if (last)
875 last->next = tmp;
876 last = tmp;
877
878 tmp->map = (struct swap_map_page *)
879 __get_free_page(__GFP_WAIT | __GFP_HIGH);
880 if (!tmp->map) {
881 release_swap_reader(handle);
882 return -ENOMEM;
883 }
646 884
647 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); 885 error = hib_bio_read_page(offset, tmp->map, NULL);
648 if (error) { 886 if (error) {
649 release_swap_reader(handle); 887 release_swap_reader(handle);
650 return error; 888 return error;
889 }
890 offset = tmp->map->next_swap;
651 } 891 }
652 handle->k = 0; 892 handle->k = 0;
893 handle->cur = handle->maps->map;
653 return 0; 894 return 0;
654} 895}
655 896
@@ -658,6 +899,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
658{ 899{
659 sector_t offset; 900 sector_t offset;
660 int error; 901 int error;
902 struct swap_map_page_list *tmp;
661 903
662 if (!handle->cur) 904 if (!handle->cur)
663 return -EINVAL; 905 return -EINVAL;
@@ -668,13 +910,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
668 if (error) 910 if (error)
669 return error; 911 return error;
670 if (++handle->k >= MAP_PAGE_ENTRIES) { 912 if (++handle->k >= MAP_PAGE_ENTRIES) {
671 error = hib_wait_on_bio_chain(bio_chain);
672 handle->k = 0; 913 handle->k = 0;
673 offset = handle->cur->next_swap; 914 free_page((unsigned long)handle->maps->map);
674 if (!offset) 915 tmp = handle->maps;
916 handle->maps = handle->maps->next;
917 kfree(tmp);
918 if (!handle->maps)
675 release_swap_reader(handle); 919 release_swap_reader(handle);
676 else if (!error) 920 else
677 error = hib_bio_read_page(offset, handle->cur, NULL); 921 handle->cur = handle->maps->map;
678 } 922 }
679 return error; 923 return error;
680} 924}
@@ -697,7 +941,7 @@ static int load_image(struct swap_map_handle *handle,
697 unsigned int nr_to_read) 941 unsigned int nr_to_read)
698{ 942{
699 unsigned int m; 943 unsigned int m;
700 int error = 0; 944 int ret = 0;
701 struct timeval start; 945 struct timeval start;
702 struct timeval stop; 946 struct timeval stop;
703 struct bio *bio; 947 struct bio *bio;
@@ -713,15 +957,15 @@ static int load_image(struct swap_map_handle *handle,
713 bio = NULL; 957 bio = NULL;
714 do_gettimeofday(&start); 958 do_gettimeofday(&start);
715 for ( ; ; ) { 959 for ( ; ; ) {
716 error = snapshot_write_next(snapshot); 960 ret = snapshot_write_next(snapshot);
717 if (error <= 0) 961 if (ret <= 0)
718 break; 962 break;
719 error = swap_read_page(handle, data_of(*snapshot), &bio); 963 ret = swap_read_page(handle, data_of(*snapshot), &bio);
720 if (error) 964 if (ret)
721 break; 965 break;
722 if (snapshot->sync_read) 966 if (snapshot->sync_read)
723 error = hib_wait_on_bio_chain(&bio); 967 ret = hib_wait_on_bio_chain(&bio);
724 if (error) 968 if (ret)
725 break; 969 break;
726 if (!(nr_pages % m)) 970 if (!(nr_pages % m))
727 printk("\b\b\b\b%3d%%", nr_pages / m); 971 printk("\b\b\b\b%3d%%", nr_pages / m);
@@ -729,17 +973,61 @@ static int load_image(struct swap_map_handle *handle,
729 } 973 }
730 err2 = hib_wait_on_bio_chain(&bio); 974 err2 = hib_wait_on_bio_chain(&bio);
731 do_gettimeofday(&stop); 975 do_gettimeofday(&stop);
732 if (!error) 976 if (!ret)
733 error = err2; 977 ret = err2;
734 if (!error) { 978 if (!ret) {
735 printk("\b\b\b\bdone\n"); 979 printk("\b\b\b\bdone\n");
736 snapshot_write_finalize(snapshot); 980 snapshot_write_finalize(snapshot);
737 if (!snapshot_image_loaded(snapshot)) 981 if (!snapshot_image_loaded(snapshot))
738 error = -ENODATA; 982 ret = -ENODATA;
739 } else 983 } else
740 printk("\n"); 984 printk("\n");
741 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 985 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
742 return error; 986 return ret;
987}
988
989/**
990 * Structure used for LZO data decompression.
991 */
992struct dec_data {
993 struct task_struct *thr; /* thread */
994 atomic_t ready; /* ready to start flag */
995 atomic_t stop; /* ready to stop flag */
996 int ret; /* return code */
997 wait_queue_head_t go; /* start decompression */
998 wait_queue_head_t done; /* decompression done */
999 size_t unc_len; /* uncompressed length */
1000 size_t cmp_len; /* compressed length */
1001 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
1002 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
1003};
1004
1005/**
1006 * Decompression function that runs in its own thread.
1007 */
1008static int lzo_decompress_threadfn(void *data)
1009{
1010 struct dec_data *d = data;
1011
1012 while (1) {
1013 wait_event(d->go, atomic_read(&d->ready) ||
1014 kthread_should_stop());
1015 if (kthread_should_stop()) {
1016 d->thr = NULL;
1017 d->ret = -1;
1018 atomic_set(&d->stop, 1);
1019 wake_up(&d->done);
1020 break;
1021 }
1022 atomic_set(&d->ready, 0);
1023
1024 d->unc_len = LZO_UNC_SIZE;
1025 d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
1026 d->unc, &d->unc_len);
1027 atomic_set(&d->stop, 1);
1028 wake_up(&d->done);
1029 }
1030 return 0;
743} 1031}
744 1032
745/** 1033/**
@@ -753,50 +1041,120 @@ static int load_image_lzo(struct swap_map_handle *handle,
753 unsigned int nr_to_read) 1041 unsigned int nr_to_read)
754{ 1042{
755 unsigned int m; 1043 unsigned int m;
756 int error = 0; 1044 int ret = 0;
1045 int eof = 0;
757 struct bio *bio; 1046 struct bio *bio;
758 struct timeval start; 1047 struct timeval start;
759 struct timeval stop; 1048 struct timeval stop;
760 unsigned nr_pages; 1049 unsigned nr_pages;
761 size_t i, off, unc_len, cmp_len; 1050 size_t off;
762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; 1051 unsigned i, thr, run_threads, nr_threads;
763 1052 unsigned ring = 0, pg = 0, ring_size = 0,
764 for (i = 0; i < LZO_CMP_PAGES; i++) { 1053 have = 0, want, need, asked = 0;
765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 1054 unsigned long read_pages;
766 if (!page[i]) { 1055 unsigned char **page = NULL;
767 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 1056 struct dec_data *data = NULL;
1057 struct crc_data *crc = NULL;
1058
1059 /*
1060 * We'll limit the number of threads for decompression to limit memory
1061 * footprint.
1062 */
1063 nr_threads = num_online_cpus() - 1;
1064 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
1065
1066 page = vmalloc(sizeof(*page) * LZO_READ_PAGES);
1067 if (!page) {
1068 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1069 ret = -ENOMEM;
1070 goto out_clean;
1071 }
768 1072
769 while (i) 1073 data = vmalloc(sizeof(*data) * nr_threads);
770 free_page((unsigned long)page[--i]); 1074 if (!data) {
1075 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
1076 ret = -ENOMEM;
1077 goto out_clean;
1078 }
1079 for (thr = 0; thr < nr_threads; thr++)
1080 memset(&data[thr], 0, offsetof(struct dec_data, go));
771 1081
772 return -ENOMEM; 1082 crc = kmalloc(sizeof(*crc), GFP_KERNEL);
1083 if (!crc) {
1084 printk(KERN_ERR "PM: Failed to allocate crc\n");
1085 ret = -ENOMEM;
1086 goto out_clean;
1087 }
1088 memset(crc, 0, offsetof(struct crc_data, go));
1089
1090 /*
1091 * Start the decompression threads.
1092 */
1093 for (thr = 0; thr < nr_threads; thr++) {
1094 init_waitqueue_head(&data[thr].go);
1095 init_waitqueue_head(&data[thr].done);
1096
1097 data[thr].thr = kthread_run(lzo_decompress_threadfn,
1098 &data[thr],
1099 "image_decompress/%u", thr);
1100 if (IS_ERR(data[thr].thr)) {
1101 data[thr].thr = NULL;
1102 printk(KERN_ERR
1103 "PM: Cannot start decompression threads\n");
1104 ret = -ENOMEM;
1105 goto out_clean;
773 } 1106 }
774 } 1107 }
775 1108
776 unc = vmalloc(LZO_UNC_SIZE); 1109 /*
777 if (!unc) { 1110 * Start the CRC32 thread.
778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 1111 */
779 1112 init_waitqueue_head(&crc->go);
780 for (i = 0; i < LZO_CMP_PAGES; i++) 1113 init_waitqueue_head(&crc->done);
781 free_page((unsigned long)page[i]); 1114
782 1115 handle->crc32 = 0;
783 return -ENOMEM; 1116 crc->crc32 = &handle->crc32;
1117 for (thr = 0; thr < nr_threads; thr++) {
1118 crc->unc[thr] = data[thr].unc;
1119 crc->unc_len[thr] = &data[thr].unc_len;
784 } 1120 }
785 1121
786 cmp = vmalloc(LZO_CMP_SIZE); 1122 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
787 if (!cmp) { 1123 if (IS_ERR(crc->thr)) {
788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 1124 crc->thr = NULL;
1125 printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
1126 ret = -ENOMEM;
1127 goto out_clean;
1128 }
789 1129
790 vfree(unc); 1130 /*
791 for (i = 0; i < LZO_CMP_PAGES; i++) 1131 * Adjust number of pages for read buffering, in case we are short.
792 free_page((unsigned long)page[i]); 1132 */
1133 read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1;
1134 read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES);
793 1135
794 return -ENOMEM; 1136 for (i = 0; i < read_pages; i++) {
1137 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1138 __GFP_WAIT | __GFP_HIGH :
1139 __GFP_WAIT);
1140 if (!page[i]) {
1141 if (i < LZO_CMP_PAGES) {
1142 ring_size = i;
1143 printk(KERN_ERR
1144 "PM: Failed to allocate LZO pages\n");
1145 ret = -ENOMEM;
1146 goto out_clean;
1147 } else {
1148 break;
1149 }
1150 }
795 } 1151 }
1152 want = ring_size = i;
796 1153
797 printk(KERN_INFO 1154 printk(KERN_INFO
1155 "PM: Using %u thread(s) for decompression.\n"
798 "PM: Loading and decompressing image data (%u pages) ... ", 1156 "PM: Loading and decompressing image data (%u pages) ... ",
799 nr_to_read); 1157 nr_threads, nr_to_read);
800 m = nr_to_read / 100; 1158 m = nr_to_read / 100;
801 if (!m) 1159 if (!m)
802 m = 1; 1160 m = 1;
@@ -804,85 +1162,189 @@ static int load_image_lzo(struct swap_map_handle *handle,
804 bio = NULL; 1162 bio = NULL;
805 do_gettimeofday(&start); 1163 do_gettimeofday(&start);
806 1164
807 error = snapshot_write_next(snapshot); 1165 ret = snapshot_write_next(snapshot);
808 if (error <= 0) 1166 if (ret <= 0)
809 goto out_finish; 1167 goto out_finish;
810 1168
811 for (;;) { 1169 for(;;) {
812 error = swap_read_page(handle, page[0], NULL); /* sync */ 1170 for (i = 0; !eof && i < want; i++) {
813 if (error) 1171 ret = swap_read_page(handle, page[ring], &bio);
814 break; 1172 if (ret) {
815 1173 /*
816 cmp_len = *(size_t *)page[0]; 1174 * On real read error, finish. On end of data,
817 if (unlikely(!cmp_len || 1175 * set EOF flag and just exit the read loop.
818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { 1176 */
819 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 1177 if (handle->cur &&
820 error = -1; 1178 handle->cur->entries[handle->k]) {
821 break; 1179 goto out_finish;
1180 } else {
1181 eof = 1;
1182 break;
1183 }
1184 }
1185 if (++ring >= ring_size)
1186 ring = 0;
822 } 1187 }
1188 asked += i;
1189 want -= i;
823 1190
824 for (off = PAGE_SIZE, i = 1; 1191 /*
825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { 1192 * We are out of data, wait for some more.
826 error = swap_read_page(handle, page[i], &bio); 1193 */
827 if (error) 1194 if (!have) {
1195 if (!asked)
1196 break;
1197
1198 ret = hib_wait_on_bio_chain(&bio);
1199 if (ret)
828 goto out_finish; 1200 goto out_finish;
1201 have += asked;
1202 asked = 0;
1203 if (eof)
1204 eof = 2;
829 } 1205 }
830 1206
831 error = hib_wait_on_bio_chain(&bio); /* need all data now */ 1207 if (crc->run_threads) {
832 if (error) 1208 wait_event(crc->done, atomic_read(&crc->stop));
833 goto out_finish; 1209 atomic_set(&crc->stop, 0);
834 1210 crc->run_threads = 0;
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
838 } 1211 }
839 1212
840 unc_len = LZO_UNC_SIZE; 1213 for (thr = 0; have && thr < nr_threads; thr++) {
841 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, 1214 data[thr].cmp_len = *(size_t *)page[pg];
842 unc, &unc_len); 1215 if (unlikely(!data[thr].cmp_len ||
843 if (error < 0) { 1216 data[thr].cmp_len >
844 printk(KERN_ERR "PM: LZO decompression failed\n"); 1217 lzo1x_worst_compress(LZO_UNC_SIZE))) {
845 break; 1218 printk(KERN_ERR
1219 "PM: Invalid LZO compressed length\n");
1220 ret = -1;
1221 goto out_finish;
1222 }
1223
1224 need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
1225 PAGE_SIZE);
1226 if (need > have) {
1227 if (eof > 1) {
1228 ret = -1;
1229 goto out_finish;
1230 }
1231 break;
1232 }
1233
1234 for (off = 0;
1235 off < LZO_HEADER + data[thr].cmp_len;
1236 off += PAGE_SIZE) {
1237 memcpy(data[thr].cmp + off,
1238 page[pg], PAGE_SIZE);
1239 have--;
1240 want++;
1241 if (++pg >= ring_size)
1242 pg = 0;
1243 }
1244
1245 atomic_set(&data[thr].ready, 1);
1246 wake_up(&data[thr].go);
846 } 1247 }
847 1248
848 if (unlikely(!unc_len || 1249 /*
849 unc_len > LZO_UNC_SIZE || 1250 * Wait for more data while we are decompressing.
850 unc_len & (PAGE_SIZE - 1))) { 1251 */
851 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); 1252 if (have < LZO_CMP_PAGES && asked) {
852 error = -1; 1253 ret = hib_wait_on_bio_chain(&bio);
853 break; 1254 if (ret)
1255 goto out_finish;
1256 have += asked;
1257 asked = 0;
1258 if (eof)
1259 eof = 2;
854 } 1260 }
855 1261
856 for (off = 0; off < unc_len; off += PAGE_SIZE) { 1262 for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
857 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); 1263 wait_event(data[thr].done,
1264 atomic_read(&data[thr].stop));
1265 atomic_set(&data[thr].stop, 0);
1266
1267 ret = data[thr].ret;
858 1268
859 if (!(nr_pages % m)) 1269 if (ret < 0) {
860 printk("\b\b\b\b%3d%%", nr_pages / m); 1270 printk(KERN_ERR
861 nr_pages++; 1271 "PM: LZO decompression failed\n");
1272 goto out_finish;
1273 }
862 1274
863 error = snapshot_write_next(snapshot); 1275 if (unlikely(!data[thr].unc_len ||
864 if (error <= 0) 1276 data[thr].unc_len > LZO_UNC_SIZE ||
1277 data[thr].unc_len & (PAGE_SIZE - 1))) {
1278 printk(KERN_ERR
1279 "PM: Invalid LZO uncompressed length\n");
1280 ret = -1;
865 goto out_finish; 1281 goto out_finish;
1282 }
1283
1284 for (off = 0;
1285 off < data[thr].unc_len; off += PAGE_SIZE) {
1286 memcpy(data_of(*snapshot),
1287 data[thr].unc + off, PAGE_SIZE);
1288
1289 if (!(nr_pages % m))
1290 printk("\b\b\b\b%3d%%", nr_pages / m);
1291 nr_pages++;
1292
1293 ret = snapshot_write_next(snapshot);
1294 if (ret <= 0) {
1295 crc->run_threads = thr + 1;
1296 atomic_set(&crc->ready, 1);
1297 wake_up(&crc->go);
1298 goto out_finish;
1299 }
1300 }
866 } 1301 }
1302
1303 crc->run_threads = thr;
1304 atomic_set(&crc->ready, 1);
1305 wake_up(&crc->go);
867 } 1306 }
868 1307
869out_finish: 1308out_finish:
1309 if (crc->run_threads) {
1310 wait_event(crc->done, atomic_read(&crc->stop));
1311 atomic_set(&crc->stop, 0);
1312 }
870 do_gettimeofday(&stop); 1313 do_gettimeofday(&stop);
871 if (!error) { 1314 if (!ret) {
872 printk("\b\b\b\bdone\n"); 1315 printk("\b\b\b\bdone\n");
873 snapshot_write_finalize(snapshot); 1316 snapshot_write_finalize(snapshot);
874 if (!snapshot_image_loaded(snapshot)) 1317 if (!snapshot_image_loaded(snapshot))
875 error = -ENODATA; 1318 ret = -ENODATA;
1319 if (!ret) {
1320 if (swsusp_header->flags & SF_CRC32_MODE) {
1321 if(handle->crc32 != swsusp_header->crc32) {
1322 printk(KERN_ERR
1323 "PM: Invalid image CRC32!\n");
1324 ret = -ENODATA;
1325 }
1326 }
1327 }
876 } else 1328 } else
877 printk("\n"); 1329 printk("\n");
878 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1330 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
879 1331out_clean:
880 vfree(cmp); 1332 for (i = 0; i < ring_size; i++)
881 vfree(unc);
882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]); 1333 free_page((unsigned long)page[i]);
1334 if (crc) {
1335 if (crc->thr)
1336 kthread_stop(crc->thr);
1337 kfree(crc);
1338 }
1339 if (data) {
1340 for (thr = 0; thr < nr_threads; thr++)
1341 if (data[thr].thr)
1342 kthread_stop(data[thr].thr);
1343 vfree(data);
1344 }
1345 if (page) vfree(page);
884 1346
885 return error; 1347 return ret;
886} 1348}
887 1349
888/** 1350/**
diff --git a/kernel/printk.c b/kernel/printk.c
index 28a40d8171b8..b7da18391c38 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -100,7 +100,7 @@ static int console_locked, console_suspended;
100 * It is also used in interesting ways to provide interlocking in 100 * It is also used in interesting ways to provide interlocking in
101 * console_unlock();. 101 * console_unlock();.
102 */ 102 */
103static DEFINE_SPINLOCK(logbuf_lock); 103static DEFINE_RAW_SPINLOCK(logbuf_lock);
104 104
105#define LOG_BUF_MASK (log_buf_len-1) 105#define LOG_BUF_MASK (log_buf_len-1)
106#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) 106#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
@@ -212,7 +212,7 @@ void __init setup_log_buf(int early)
212 return; 212 return;
213 } 213 }
214 214
215 spin_lock_irqsave(&logbuf_lock, flags); 215 raw_spin_lock_irqsave(&logbuf_lock, flags);
216 log_buf_len = new_log_buf_len; 216 log_buf_len = new_log_buf_len;
217 log_buf = new_log_buf; 217 log_buf = new_log_buf;
218 new_log_buf_len = 0; 218 new_log_buf_len = 0;
@@ -230,7 +230,7 @@ void __init setup_log_buf(int early)
230 log_start -= offset; 230 log_start -= offset;
231 con_start -= offset; 231 con_start -= offset;
232 log_end -= offset; 232 log_end -= offset;
233 spin_unlock_irqrestore(&logbuf_lock, flags); 233 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
234 234
235 pr_info("log_buf_len: %d\n", log_buf_len); 235 pr_info("log_buf_len: %d\n", log_buf_len);
236 pr_info("early log buf free: %d(%d%%)\n", 236 pr_info("early log buf free: %d(%d%%)\n",
@@ -365,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
365 if (error) 365 if (error)
366 goto out; 366 goto out;
367 i = 0; 367 i = 0;
368 spin_lock_irq(&logbuf_lock); 368 raw_spin_lock_irq(&logbuf_lock);
369 while (!error && (log_start != log_end) && i < len) { 369 while (!error && (log_start != log_end) && i < len) {
370 c = LOG_BUF(log_start); 370 c = LOG_BUF(log_start);
371 log_start++; 371 log_start++;
372 spin_unlock_irq(&logbuf_lock); 372 raw_spin_unlock_irq(&logbuf_lock);
373 error = __put_user(c,buf); 373 error = __put_user(c,buf);
374 buf++; 374 buf++;
375 i++; 375 i++;
376 cond_resched(); 376 cond_resched();
377 spin_lock_irq(&logbuf_lock); 377 raw_spin_lock_irq(&logbuf_lock);
378 } 378 }
379 spin_unlock_irq(&logbuf_lock); 379 raw_spin_unlock_irq(&logbuf_lock);
380 if (!error) 380 if (!error)
381 error = i; 381 error = i;
382 break; 382 break;
@@ -399,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
399 count = len; 399 count = len;
400 if (count > log_buf_len) 400 if (count > log_buf_len)
401 count = log_buf_len; 401 count = log_buf_len;
402 spin_lock_irq(&logbuf_lock); 402 raw_spin_lock_irq(&logbuf_lock);
403 if (count > logged_chars) 403 if (count > logged_chars)
404 count = logged_chars; 404 count = logged_chars;
405 if (do_clear) 405 if (do_clear)
@@ -416,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
416 if (j + log_buf_len < log_end) 416 if (j + log_buf_len < log_end)
417 break; 417 break;
418 c = LOG_BUF(j); 418 c = LOG_BUF(j);
419 spin_unlock_irq(&logbuf_lock); 419 raw_spin_unlock_irq(&logbuf_lock);
420 error = __put_user(c,&buf[count-1-i]); 420 error = __put_user(c,&buf[count-1-i]);
421 cond_resched(); 421 cond_resched();
422 spin_lock_irq(&logbuf_lock); 422 raw_spin_lock_irq(&logbuf_lock);
423 } 423 }
424 spin_unlock_irq(&logbuf_lock); 424 raw_spin_unlock_irq(&logbuf_lock);
425 if (error) 425 if (error)
426 break; 426 break;
427 error = i; 427 error = i;
@@ -689,7 +689,7 @@ static void zap_locks(void)
689 oops_timestamp = jiffies; 689 oops_timestamp = jiffies;
690 690
691 /* If a crash is occurring, make sure we can't deadlock */ 691 /* If a crash is occurring, make sure we can't deadlock */
692 spin_lock_init(&logbuf_lock); 692 raw_spin_lock_init(&logbuf_lock);
693 /* And make sure that we print immediately */ 693 /* And make sure that we print immediately */
694 sema_init(&console_sem, 1); 694 sema_init(&console_sem, 1);
695} 695}
@@ -802,9 +802,9 @@ static int console_trylock_for_printk(unsigned int cpu)
802 } 802 }
803 } 803 }
804 printk_cpu = UINT_MAX; 804 printk_cpu = UINT_MAX;
805 spin_unlock(&logbuf_lock);
806 if (wake) 805 if (wake)
807 up(&console_sem); 806 up(&console_sem);
807 raw_spin_unlock(&logbuf_lock);
808 return retval; 808 return retval;
809} 809}
810static const char recursion_bug_msg [] = 810static const char recursion_bug_msg [] =
@@ -864,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
864 } 864 }
865 865
866 lockdep_off(); 866 lockdep_off();
867 spin_lock(&logbuf_lock); 867 raw_spin_lock(&logbuf_lock);
868 printk_cpu = this_cpu; 868 printk_cpu = this_cpu;
869 869
870 if (recursion_bug) { 870 if (recursion_bug) {
@@ -1257,14 +1257,14 @@ void console_unlock(void)
1257 1257
1258again: 1258again:
1259 for ( ; ; ) { 1259 for ( ; ; ) {
1260 spin_lock_irqsave(&logbuf_lock, flags); 1260 raw_spin_lock_irqsave(&logbuf_lock, flags);
1261 wake_klogd |= log_start - log_end; 1261 wake_klogd |= log_start - log_end;
1262 if (con_start == log_end) 1262 if (con_start == log_end)
1263 break; /* Nothing to print */ 1263 break; /* Nothing to print */
1264 _con_start = con_start; 1264 _con_start = con_start;
1265 _log_end = log_end; 1265 _log_end = log_end;
1266 con_start = log_end; /* Flush */ 1266 con_start = log_end; /* Flush */
1267 spin_unlock(&logbuf_lock); 1267 raw_spin_unlock(&logbuf_lock);
1268 stop_critical_timings(); /* don't trace print latency */ 1268 stop_critical_timings(); /* don't trace print latency */
1269 call_console_drivers(_con_start, _log_end); 1269 call_console_drivers(_con_start, _log_end);
1270 start_critical_timings(); 1270 start_critical_timings();
@@ -1276,7 +1276,7 @@ again:
1276 if (unlikely(exclusive_console)) 1276 if (unlikely(exclusive_console))
1277 exclusive_console = NULL; 1277 exclusive_console = NULL;
1278 1278
1279 spin_unlock(&logbuf_lock); 1279 raw_spin_unlock(&logbuf_lock);
1280 1280
1281 up(&console_sem); 1281 up(&console_sem);
1282 1282
@@ -1286,13 +1286,13 @@ again:
1286 * there's a new owner and the console_unlock() from them will do the 1286 * there's a new owner and the console_unlock() from them will do the
1287 * flush, no worries. 1287 * flush, no worries.
1288 */ 1288 */
1289 spin_lock(&logbuf_lock); 1289 raw_spin_lock(&logbuf_lock);
1290 if (con_start != log_end) 1290 if (con_start != log_end)
1291 retry = 1; 1291 retry = 1;
1292 spin_unlock_irqrestore(&logbuf_lock, flags);
1293 if (retry && console_trylock()) 1292 if (retry && console_trylock())
1294 goto again; 1293 goto again;
1295 1294
1295 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1296 if (wake_klogd) 1296 if (wake_klogd)
1297 wake_up_klogd(); 1297 wake_up_klogd();
1298} 1298}
@@ -1522,9 +1522,9 @@ void register_console(struct console *newcon)
1522 * console_unlock(); will print out the buffered messages 1522 * console_unlock(); will print out the buffered messages
1523 * for us. 1523 * for us.
1524 */ 1524 */
1525 spin_lock_irqsave(&logbuf_lock, flags); 1525 raw_spin_lock_irqsave(&logbuf_lock, flags);
1526 con_start = log_start; 1526 con_start = log_start;
1527 spin_unlock_irqrestore(&logbuf_lock, flags); 1527 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1528 /* 1528 /*
1529 * We're about to replay the log buffer. Only do this to the 1529 * We're about to replay the log buffer. Only do this to the
1530 * just-registered console to avoid excessive message spam to 1530 * just-registered console to avoid excessive message spam to
@@ -1731,10 +1731,10 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1731 /* Theoretically, the log could move on after we do this, but 1731 /* Theoretically, the log could move on after we do this, but
1732 there's not a lot we can do about that. The new messages 1732 there's not a lot we can do about that. The new messages
1733 will overwrite the start of what we dump. */ 1733 will overwrite the start of what we dump. */
1734 spin_lock_irqsave(&logbuf_lock, flags); 1734 raw_spin_lock_irqsave(&logbuf_lock, flags);
1735 end = log_end & LOG_BUF_MASK; 1735 end = log_end & LOG_BUF_MASK;
1736 chars = logged_chars; 1736 chars = logged_chars;
1737 spin_unlock_irqrestore(&logbuf_lock, flags); 1737 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1738 1738
1739 if (chars > end) { 1739 if (chars > end) {
1740 s1 = log_buf + log_buf_len - chars + end; 1740 s1 = log_buf + log_buf_len - chars + end;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 9de3ecfd20f9..a70d2a5d8c7b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -744,20 +744,17 @@ int ptrace_request(struct task_struct *child, long request,
744 break; 744 break;
745 745
746 si = child->last_siginfo; 746 si = child->last_siginfo;
747 if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) 747 if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) {
748 break; 748 child->jobctl |= JOBCTL_LISTENING;
749 749 /*
750 child->jobctl |= JOBCTL_LISTENING; 750 * If NOTIFY is set, it means event happened between
751 751 * start of this trap and now. Trigger re-trap.
752 /* 752 */
753 * If NOTIFY is set, it means event happened between start 753 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
754 * of this trap and now. Trigger re-trap immediately. 754 signal_wake_up(child, true);
755 */ 755 ret = 0;
756 if (child->jobctl & JOBCTL_TRAP_NOTIFY) 756 }
757 signal_wake_up(child, true);
758
759 unlock_task_sighand(child, &flags); 757 unlock_task_sighand(child, &flags);
760 ret = 0;
761 break; 758 break;
762 759
763 case PTRACE_DETACH: /* detach a process that was attached. */ 760 case PTRACE_DETACH: /* detach a process that was attached. */
diff --git a/kernel/rcu.h b/kernel/rcu.h
new file mode 100644
index 000000000000..f600868d550d
--- /dev/null
+++ b/kernel/rcu.h
@@ -0,0 +1,85 @@
1/*
2 * Read-Copy Update definitions shared among RCU implementations.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2011
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#ifndef __LINUX_RCU_H
24#define __LINUX_RCU_H
25
26#ifdef CONFIG_RCU_TRACE
27#define RCU_TRACE(stmt) stmt
28#else /* #ifdef CONFIG_RCU_TRACE */
29#define RCU_TRACE(stmt)
30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31
32/*
33 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
34 * by call_rcu() and RCU callback execution, and are therefore not part of the
35 * RCU API. They live in this shared header because all RCU flavors use them.
36 */
37
38#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
39# define STATE_RCU_HEAD_READY 0
40# define STATE_RCU_HEAD_QUEUED 1
41
42extern struct debug_obj_descr rcuhead_debug_descr;
43
44static inline void debug_rcu_head_queue(struct rcu_head *head)
45{
46 WARN_ON_ONCE((unsigned long)head & 0x3);
47 debug_object_activate(head, &rcuhead_debug_descr);
48 debug_object_active_state(head, &rcuhead_debug_descr,
49 STATE_RCU_HEAD_READY,
50 STATE_RCU_HEAD_QUEUED);
51}
52
53static inline void debug_rcu_head_unqueue(struct rcu_head *head)
54{
55 debug_object_active_state(head, &rcuhead_debug_descr,
56 STATE_RCU_HEAD_QUEUED,
57 STATE_RCU_HEAD_READY);
58 debug_object_deactivate(head, &rcuhead_debug_descr);
59}
60#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
61static inline void debug_rcu_head_queue(struct rcu_head *head)
62{
63}
64
65static inline void debug_rcu_head_unqueue(struct rcu_head *head)
66{
67}
68#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
69
70extern void kfree(const void *);
71
72static inline void __rcu_reclaim(char *rn, struct rcu_head *head)
73{
74 unsigned long offset = (unsigned long)head->func;
75
76 if (__is_kfree_rcu_offset(offset)) {
77 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
78 kfree((void *)head - offset);
79 } else {
80 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
81 head->func(head);
82 }
83}
84
85#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ddddb320be61..ca0d23b6b3e8 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,6 +46,11 @@
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/rcu.h>
51
52#include "rcu.h"
53
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 54#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 55static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 56struct lockdep_map rcu_lock_map =
@@ -94,11 +99,16 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
94 99
95#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 100#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
96 101
102struct rcu_synchronize {
103 struct rcu_head head;
104 struct completion completion;
105};
106
97/* 107/*
98 * Awaken the corresponding synchronize_rcu() instance now that a 108 * Awaken the corresponding synchronize_rcu() instance now that a
99 * grace period has elapsed. 109 * grace period has elapsed.
100 */ 110 */
101void wakeme_after_rcu(struct rcu_head *head) 111static void wakeme_after_rcu(struct rcu_head *head)
102{ 112{
103 struct rcu_synchronize *rcu; 113 struct rcu_synchronize *rcu;
104 114
@@ -106,6 +116,20 @@ void wakeme_after_rcu(struct rcu_head *head)
106 complete(&rcu->completion); 116 complete(&rcu->completion);
107} 117}
108 118
119void wait_rcu_gp(call_rcu_func_t crf)
120{
121 struct rcu_synchronize rcu;
122
123 init_rcu_head_on_stack(&rcu.head);
124 init_completion(&rcu.completion);
125 /* Will wake me after RCU finished. */
126 crf(&rcu.head, wakeme_after_rcu);
127 /* Wait for it. */
128 wait_for_completion(&rcu.completion);
129 destroy_rcu_head_on_stack(&rcu.head);
130}
131EXPORT_SYMBOL_GPL(wait_rcu_gp);
132
109#ifdef CONFIG_PROVE_RCU 133#ifdef CONFIG_PROVE_RCU
110/* 134/*
111 * wrapper function to avoid #include problems. 135 * wrapper function to avoid #include problems.
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 7bbac7d0f5ab..da775c87f27f 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -37,16 +37,17 @@
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/prefetch.h> 38#include <linux/prefetch.h>
39 39
40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ 40#ifdef CONFIG_RCU_TRACE
41static struct task_struct *rcu_kthread_task; 41#include <trace/events/rcu.h>
42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 42#endif /* #ifdef CONFIG_RCU_TRACE */
43static unsigned long have_rcu_kthread_work; 43
44#include "rcu.h"
44 45
45/* Forward declarations for rcutiny_plugin.h. */ 46/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 47struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void); 48static void invoke_rcu_callbacks(void);
48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 49static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static int rcu_kthread(void *arg); 50static void rcu_process_callbacks(struct softirq_action *unused);
50static void __call_rcu(struct rcu_head *head, 51static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 52 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 53 struct rcu_ctrlblk *rcp);
@@ -96,16 +97,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
96} 97}
97 98
98/* 99/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 100 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
110 * are at it, given that any rcu quiescent state is also an rcu_bh 101 * are at it, given that any rcu quiescent state is also an rcu_bh
111 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 102 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
@@ -117,7 +108,7 @@ void rcu_sched_qs(int cpu)
117 local_irq_save(flags); 108 local_irq_save(flags);
118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
119 rcu_qsctr_help(&rcu_bh_ctrlblk)) 110 rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 111 invoke_rcu_callbacks();
121 local_irq_restore(flags); 112 local_irq_restore(flags);
122} 113}
123 114
@@ -130,7 +121,7 @@ void rcu_bh_qs(int cpu)
130 121
131 local_irq_save(flags); 122 local_irq_save(flags);
132 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 123 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
133 invoke_rcu_kthread(); 124 invoke_rcu_callbacks();
134 local_irq_restore(flags); 125 local_irq_restore(flags);
135} 126}
136 127
@@ -154,18 +145,23 @@ void rcu_check_callbacks(int cpu, int user)
154 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure 145 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
155 * whose grace period has elapsed. 146 * whose grace period has elapsed.
156 */ 147 */
157static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) 148static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
158{ 149{
150 char *rn = NULL;
159 struct rcu_head *next, *list; 151 struct rcu_head *next, *list;
160 unsigned long flags; 152 unsigned long flags;
161 RCU_TRACE(int cb_count = 0); 153 RCU_TRACE(int cb_count = 0);
162 154
163 /* If no RCU callbacks ready to invoke, just return. */ 155 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 156 if (&rcp->rcucblist == rcp->donetail) {
157 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
158 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0));
165 return; 159 return;
160 }
166 161
167 /* Move the ready-to-invoke callbacks to a local list. */ 162 /* Move the ready-to-invoke callbacks to a local list. */
168 local_irq_save(flags); 163 local_irq_save(flags);
164 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
169 list = rcp->rcucblist; 165 list = rcp->rcucblist;
170 rcp->rcucblist = *rcp->donetail; 166 rcp->rcucblist = *rcp->donetail;
171 *rcp->donetail = NULL; 167 *rcp->donetail = NULL;
@@ -176,49 +172,26 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
176 local_irq_restore(flags); 172 local_irq_restore(flags);
177 173
178 /* Invoke the callbacks on the local list. */ 174 /* Invoke the callbacks on the local list. */
175 RCU_TRACE(rn = rcp->name);
179 while (list) { 176 while (list) {
180 next = list->next; 177 next = list->next;
181 prefetch(next); 178 prefetch(next);
182 debug_rcu_head_unqueue(list); 179 debug_rcu_head_unqueue(list);
183 local_bh_disable(); 180 local_bh_disable();
184 __rcu_reclaim(list); 181 __rcu_reclaim(rn, list);
185 local_bh_enable(); 182 local_bh_enable();
186 list = next; 183 list = next;
187 RCU_TRACE(cb_count++); 184 RCU_TRACE(cb_count++);
188 } 185 }
189 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 186 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
187 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count));
190} 188}
191 189
192/* 190static void rcu_process_callbacks(struct softirq_action *unused)
193 * This kthread invokes RCU callbacks whose grace periods have
194 * elapsed. It is awakened as needed, and takes the place of the
195 * RCU_SOFTIRQ that was used previously for this purpose.
196 * This is a kthread, but it is never stopped, at least not until
197 * the system goes down.
198 */
199static int rcu_kthread(void *arg)
200{ 191{
201 unsigned long work; 192 __rcu_process_callbacks(&rcu_sched_ctrlblk);
202 unsigned long morework; 193 __rcu_process_callbacks(&rcu_bh_ctrlblk);
203 unsigned long flags; 194 rcu_preempt_process_callbacks();
204
205 for (;;) {
206 wait_event_interruptible(rcu_kthread_wq,
207 have_rcu_kthread_work != 0);
208 morework = rcu_boost();
209 local_irq_save(flags);
210 work = have_rcu_kthread_work;
211 have_rcu_kthread_work = morework;
212 local_irq_restore(flags);
213 if (work) {
214 rcu_process_callbacks(&rcu_sched_ctrlblk);
215 rcu_process_callbacks(&rcu_bh_ctrlblk);
216 rcu_preempt_process_callbacks();
217 }
218 schedule_timeout_interruptible(1); /* Leave CPU for others. */
219 }
220
221 return 0; /* Not reached, but needed to shut gcc up. */
222} 195}
223 196
224/* 197/*
@@ -280,45 +253,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
280 __call_rcu(head, func, &rcu_bh_ctrlblk); 253 __call_rcu(head, func, &rcu_bh_ctrlblk);
281} 254}
282EXPORT_SYMBOL_GPL(call_rcu_bh); 255EXPORT_SYMBOL_GPL(call_rcu_bh);
283
284void rcu_barrier_bh(void)
285{
286 struct rcu_synchronize rcu;
287
288 init_rcu_head_on_stack(&rcu.head);
289 init_completion(&rcu.completion);
290 /* Will wake me after RCU finished. */
291 call_rcu_bh(&rcu.head, wakeme_after_rcu);
292 /* Wait for it. */
293 wait_for_completion(&rcu.completion);
294 destroy_rcu_head_on_stack(&rcu.head);
295}
296EXPORT_SYMBOL_GPL(rcu_barrier_bh);
297
298void rcu_barrier_sched(void)
299{
300 struct rcu_synchronize rcu;
301
302 init_rcu_head_on_stack(&rcu.head);
303 init_completion(&rcu.completion);
304 /* Will wake me after RCU finished. */
305 call_rcu_sched(&rcu.head, wakeme_after_rcu);
306 /* Wait for it. */
307 wait_for_completion(&rcu.completion);
308 destroy_rcu_head_on_stack(&rcu.head);
309}
310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
311
312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
316{
317 struct sched_param sp;
318
319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
323}
324early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f259c676195f..02aa7139861c 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -26,29 +26,26 @@
26#include <linux/debugfs.h> 26#include <linux/debugfs.h>
27#include <linux/seq_file.h> 27#include <linux/seq_file.h>
28 28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */ 29/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk { 30struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 31 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 32 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */ 33 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */ 34 RCU_TRACE(long qlen); /* Number of pending CBs. */
35 RCU_TRACE(char *name); /* Name of RCU type. */
41}; 36};
42 37
43/* Definition for rcupdate control block. */ 38/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = { 39static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist, 40 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist, 41 .curtail = &rcu_sched_ctrlblk.rcucblist,
42 RCU_TRACE(.name = "rcu_sched")
47}; 43};
48 44
49static struct rcu_ctrlblk rcu_bh_ctrlblk = { 45static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist, 46 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist, 47 .curtail = &rcu_bh_ctrlblk.rcucblist,
48 RCU_TRACE(.name = "rcu_bh")
52}; 49};
53 50
54#ifdef CONFIG_DEBUG_LOCK_ALLOC 51#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -131,6 +128,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
131 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, 128 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
132 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, 129 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
133 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), 130 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
131 RCU_TRACE(.rcb.name = "rcu_preempt")
134}; 132};
135 133
136static int rcu_preempted_readers_exp(void); 134static int rcu_preempted_readers_exp(void);
@@ -247,6 +245,13 @@ static void show_tiny_preempt_stats(struct seq_file *m)
247 245
248#include "rtmutex_common.h" 246#include "rtmutex_common.h"
249 247
248#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
249
250/* Controls for rcu_kthread() kthread. */
251static struct task_struct *rcu_kthread_task;
252static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
253static unsigned long have_rcu_kthread_work;
254
250/* 255/*
251 * Carry out RCU priority boosting on the task indicated by ->boost_tasks, 256 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
252 * and advance ->boost_tasks to the next task in the ->blkd_tasks list. 257 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
@@ -334,7 +339,7 @@ static int rcu_initiate_boost(void)
334 if (rcu_preempt_ctrlblk.exp_tasks == NULL) 339 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks = 340 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks; 341 rcu_preempt_ctrlblk.gp_tasks;
337 invoke_rcu_kthread(); 342 invoke_rcu_callbacks();
338 } else 343 } else
339 RCU_TRACE(rcu_initiate_boost_trace()); 344 RCU_TRACE(rcu_initiate_boost_trace());
340 return 1; 345 return 1;
@@ -353,14 +358,6 @@ static void rcu_preempt_boost_start_gp(void)
353#else /* #ifdef CONFIG_RCU_BOOST */ 358#else /* #ifdef CONFIG_RCU_BOOST */
354 359
355/* 360/*
356 * If there is no RCU priority boosting, we don't boost.
357 */
358static int rcu_boost(void)
359{
360 return 0;
361}
362
363/*
364 * If there is no RCU priority boosting, we don't initiate boosting, 361 * If there is no RCU priority boosting, we don't initiate boosting,
365 * but we do indicate whether there are blocked readers blocking the 362 * but we do indicate whether there are blocked readers blocking the
366 * current grace period. 363 * current grace period.
@@ -427,7 +424,7 @@ static void rcu_preempt_cpu_qs(void)
427 424
428 /* If there are done callbacks, cause them to be invoked. */ 425 /* If there are done callbacks, cause them to be invoked. */
429 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 426 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
430 invoke_rcu_kthread(); 427 invoke_rcu_callbacks();
431} 428}
432 429
433/* 430/*
@@ -648,7 +645,7 @@ static void rcu_preempt_check_callbacks(void)
648 rcu_preempt_cpu_qs(); 645 rcu_preempt_cpu_qs();
649 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 646 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
650 rcu_preempt_ctrlblk.rcb.donetail) 647 rcu_preempt_ctrlblk.rcb.donetail)
651 invoke_rcu_kthread(); 648 invoke_rcu_callbacks();
652 if (rcu_preempt_gp_in_progress() && 649 if (rcu_preempt_gp_in_progress() &&
653 rcu_cpu_blocking_cur_gp() && 650 rcu_cpu_blocking_cur_gp() &&
654 rcu_preempt_running_reader()) 651 rcu_preempt_running_reader())
@@ -674,7 +671,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
674 */ 671 */
675static void rcu_preempt_process_callbacks(void) 672static void rcu_preempt_process_callbacks(void)
676{ 673{
677 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 674 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
678} 675}
679 676
680/* 677/*
@@ -697,20 +694,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
697} 694}
698EXPORT_SYMBOL_GPL(call_rcu); 695EXPORT_SYMBOL_GPL(call_rcu);
699 696
700void rcu_barrier(void)
701{
702 struct rcu_synchronize rcu;
703
704 init_rcu_head_on_stack(&rcu.head);
705 init_completion(&rcu.completion);
706 /* Will wake me after RCU finished. */
707 call_rcu(&rcu.head, wakeme_after_rcu);
708 /* Wait for it. */
709 wait_for_completion(&rcu.completion);
710 destroy_rcu_head_on_stack(&rcu.head);
711}
712EXPORT_SYMBOL_GPL(rcu_barrier);
713
714/* 697/*
715 * synchronize_rcu - wait until a grace period has elapsed. 698 * synchronize_rcu - wait until a grace period has elapsed.
716 * 699 *
@@ -864,15 +847,6 @@ static void show_tiny_preempt_stats(struct seq_file *m)
864#endif /* #ifdef CONFIG_RCU_TRACE */ 847#endif /* #ifdef CONFIG_RCU_TRACE */
865 848
866/* 849/*
867 * Because preemptible RCU does not exist, it is never necessary to
868 * boost preempted RCU readers.
869 */
870static int rcu_boost(void)
871{
872 return 0;
873}
874
875/*
876 * Because preemptible RCU does not exist, it never has any callbacks 850 * Because preemptible RCU does not exist, it never has any callbacks
877 * to check. 851 * to check.
878 */ 852 */
@@ -898,6 +872,78 @@ static void rcu_preempt_process_callbacks(void)
898 872
899#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 873#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
900 874
875#ifdef CONFIG_RCU_BOOST
876
877/*
878 * Wake up rcu_kthread() to process callbacks now eligible for invocation
879 * or to boost readers.
880 */
881static void invoke_rcu_callbacks(void)
882{
883 have_rcu_kthread_work = 1;
884 wake_up(&rcu_kthread_wq);
885}
886
887/*
888 * This kthread invokes RCU callbacks whose grace periods have
889 * elapsed. It is awakened as needed, and takes the place of the
890 * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
891 * This is a kthread, but it is never stopped, at least not until
892 * the system goes down.
893 */
894static int rcu_kthread(void *arg)
895{
896 unsigned long work;
897 unsigned long morework;
898 unsigned long flags;
899
900 for (;;) {
901 wait_event_interruptible(rcu_kthread_wq,
902 have_rcu_kthread_work != 0);
903 morework = rcu_boost();
904 local_irq_save(flags);
905 work = have_rcu_kthread_work;
906 have_rcu_kthread_work = morework;
907 local_irq_restore(flags);
908 if (work)
909 rcu_process_callbacks(NULL);
910 schedule_timeout_interruptible(1); /* Leave CPU for others. */
911 }
912
913 return 0; /* Not reached, but needed to shut gcc up. */
914}
915
916/*
917 * Spawn the kthread that invokes RCU callbacks.
918 */
919static int __init rcu_spawn_kthreads(void)
920{
921 struct sched_param sp;
922
923 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
924 sp.sched_priority = RCU_BOOST_PRIO;
925 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
926 return 0;
927}
928early_initcall(rcu_spawn_kthreads);
929
930#else /* #ifdef CONFIG_RCU_BOOST */
931
932/*
933 * Start up softirq processing of callbacks.
934 */
935void invoke_rcu_callbacks(void)
936{
937 raise_softirq(RCU_SOFTIRQ);
938}
939
940void rcu_init(void)
941{
942 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
943}
944
945#endif /* #else #ifdef CONFIG_RCU_BOOST */
946
901#ifdef CONFIG_DEBUG_LOCK_ALLOC 947#ifdef CONFIG_DEBUG_LOCK_ALLOC
902#include <linux/kernel_stat.h> 948#include <linux/kernel_stat.h>
903 949
@@ -913,12 +959,6 @@ void __init rcu_scheduler_starting(void)
913 959
914#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 960#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
915 961
916#ifdef CONFIG_RCU_BOOST
917#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
918#else /* #ifdef CONFIG_RCU_BOOST */
919#define RCU_BOOST_PRIO 1
920#endif /* #else #ifdef CONFIG_RCU_BOOST */
921
922#ifdef CONFIG_RCU_TRACE 962#ifdef CONFIG_RCU_TRACE
923 963
924#ifdef CONFIG_RCU_BOOST 964#ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 98f51b13bb7e..764825c2685c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -73,7 +73,7 @@ module_param(nreaders, int, 0444);
73MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 73MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
74module_param(nfakewriters, int, 0444); 74module_param(nfakewriters, int, 0444);
75MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 75MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
76module_param(stat_interval, int, 0444); 76module_param(stat_interval, int, 0644);
77MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 77MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
78module_param(verbose, bool, 0444); 78module_param(verbose, bool, 0444);
79MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 79MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
@@ -480,30 +480,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
481} 481}
482 482
483struct rcu_bh_torture_synchronize {
484 struct rcu_head head;
485 struct completion completion;
486};
487
488static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
489{
490 struct rcu_bh_torture_synchronize *rcu;
491
492 rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
493 complete(&rcu->completion);
494}
495
496static void rcu_bh_torture_synchronize(void)
497{
498 struct rcu_bh_torture_synchronize rcu;
499
500 init_rcu_head_on_stack(&rcu.head);
501 init_completion(&rcu.completion);
502 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
503 wait_for_completion(&rcu.completion);
504 destroy_rcu_head_on_stack(&rcu.head);
505}
506
507static struct rcu_torture_ops rcu_bh_ops = { 483static struct rcu_torture_ops rcu_bh_ops = {
508 .init = NULL, 484 .init = NULL,
509 .cleanup = NULL, 485 .cleanup = NULL,
@@ -512,7 +488,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
512 .readunlock = rcu_bh_torture_read_unlock, 488 .readunlock = rcu_bh_torture_read_unlock,
513 .completed = rcu_bh_torture_completed, 489 .completed = rcu_bh_torture_completed,
514 .deferred_free = rcu_bh_torture_deferred_free, 490 .deferred_free = rcu_bh_torture_deferred_free,
515 .sync = rcu_bh_torture_synchronize, 491 .sync = synchronize_rcu_bh,
516 .cb_barrier = rcu_barrier_bh, 492 .cb_barrier = rcu_barrier_bh,
517 .fqs = rcu_bh_force_quiescent_state, 493 .fqs = rcu_bh_force_quiescent_state,
518 .stats = NULL, 494 .stats = NULL,
@@ -528,7 +504,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
528 .readunlock = rcu_bh_torture_read_unlock, 504 .readunlock = rcu_bh_torture_read_unlock,
529 .completed = rcu_bh_torture_completed, 505 .completed = rcu_bh_torture_completed,
530 .deferred_free = rcu_sync_torture_deferred_free, 506 .deferred_free = rcu_sync_torture_deferred_free,
531 .sync = rcu_bh_torture_synchronize, 507 .sync = synchronize_rcu_bh,
532 .cb_barrier = NULL, 508 .cb_barrier = NULL,
533 .fqs = rcu_bh_force_quiescent_state, 509 .fqs = rcu_bh_force_quiescent_state,
534 .stats = NULL, 510 .stats = NULL,
@@ -536,6 +512,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
536 .name = "rcu_bh_sync" 512 .name = "rcu_bh_sync"
537}; 513};
538 514
515static struct rcu_torture_ops rcu_bh_expedited_ops = {
516 .init = rcu_sync_torture_init,
517 .cleanup = NULL,
518 .readlock = rcu_bh_torture_read_lock,
519 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
520 .readunlock = rcu_bh_torture_read_unlock,
521 .completed = rcu_bh_torture_completed,
522 .deferred_free = rcu_sync_torture_deferred_free,
523 .sync = synchronize_rcu_bh_expedited,
524 .cb_barrier = NULL,
525 .fqs = rcu_bh_force_quiescent_state,
526 .stats = NULL,
527 .irq_capable = 1,
528 .name = "rcu_bh_expedited"
529};
530
539/* 531/*
540 * Definitions for srcu torture testing. 532 * Definitions for srcu torture testing.
541 */ 533 */
@@ -659,11 +651,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
659 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 651 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
660} 652}
661 653
662static void sched_torture_synchronize(void)
663{
664 synchronize_sched();
665}
666
667static struct rcu_torture_ops sched_ops = { 654static struct rcu_torture_ops sched_ops = {
668 .init = rcu_sync_torture_init, 655 .init = rcu_sync_torture_init,
669 .cleanup = NULL, 656 .cleanup = NULL,
@@ -672,7 +659,7 @@ static struct rcu_torture_ops sched_ops = {
672 .readunlock = sched_torture_read_unlock, 659 .readunlock = sched_torture_read_unlock,
673 .completed = rcu_no_completed, 660 .completed = rcu_no_completed,
674 .deferred_free = rcu_sched_torture_deferred_free, 661 .deferred_free = rcu_sched_torture_deferred_free,
675 .sync = sched_torture_synchronize, 662 .sync = synchronize_sched,
676 .cb_barrier = rcu_barrier_sched, 663 .cb_barrier = rcu_barrier_sched,
677 .fqs = rcu_sched_force_quiescent_state, 664 .fqs = rcu_sched_force_quiescent_state,
678 .stats = NULL, 665 .stats = NULL,
@@ -688,7 +675,7 @@ static struct rcu_torture_ops sched_sync_ops = {
688 .readunlock = sched_torture_read_unlock, 675 .readunlock = sched_torture_read_unlock,
689 .completed = rcu_no_completed, 676 .completed = rcu_no_completed,
690 .deferred_free = rcu_sync_torture_deferred_free, 677 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = sched_torture_synchronize, 678 .sync = synchronize_sched,
692 .cb_barrier = NULL, 679 .cb_barrier = NULL,
693 .fqs = rcu_sched_force_quiescent_state, 680 .fqs = rcu_sched_force_quiescent_state,
694 .stats = NULL, 681 .stats = NULL,
@@ -754,7 +741,7 @@ static int rcu_torture_boost(void *arg)
754 do { 741 do {
755 /* Wait for the next test interval. */ 742 /* Wait for the next test interval. */
756 oldstarttime = boost_starttime; 743 oldstarttime = boost_starttime;
757 while (jiffies - oldstarttime > ULONG_MAX / 2) { 744 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
758 schedule_timeout_uninterruptible(1); 745 schedule_timeout_uninterruptible(1);
759 rcu_stutter_wait("rcu_torture_boost"); 746 rcu_stutter_wait("rcu_torture_boost");
760 if (kthread_should_stop() || 747 if (kthread_should_stop() ||
@@ -765,7 +752,7 @@ static int rcu_torture_boost(void *arg)
765 /* Do one boost-test interval. */ 752 /* Do one boost-test interval. */
766 endtime = oldstarttime + test_boost_duration * HZ; 753 endtime = oldstarttime + test_boost_duration * HZ;
767 call_rcu_time = jiffies; 754 call_rcu_time = jiffies;
768 while (jiffies - endtime > ULONG_MAX / 2) { 755 while (ULONG_CMP_LT(jiffies, endtime)) {
769 /* If we don't have a callback in flight, post one. */ 756 /* If we don't have a callback in flight, post one. */
770 if (!rbi.inflight) { 757 if (!rbi.inflight) {
771 smp_mb(); /* RCU core before ->inflight = 1. */ 758 smp_mb(); /* RCU core before ->inflight = 1. */
@@ -792,7 +779,8 @@ static int rcu_torture_boost(void *arg)
792 * interval. Besides, we are running at RT priority, 779 * interval. Besides, we are running at RT priority,
793 * so delays should be relatively rare. 780 * so delays should be relatively rare.
794 */ 781 */
795 while (oldstarttime == boost_starttime) { 782 while (oldstarttime == boost_starttime &&
783 !kthread_should_stop()) {
796 if (mutex_trylock(&boost_mutex)) { 784 if (mutex_trylock(&boost_mutex)) {
797 boost_starttime = jiffies + 785 boost_starttime = jiffies +
798 test_boost_interval * HZ; 786 test_boost_interval * HZ;
@@ -809,11 +797,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
809 797
810 /* Clean up and exit. */ 798 /* Clean up and exit. */
811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 799 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 800 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 801 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 802 schedule_timeout_uninterruptible(1);
816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 803 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
804 destroy_rcu_head_on_stack(&rbi.rcu);
817 return 0; 805 return 0;
818} 806}
819 807
@@ -831,11 +819,13 @@ rcu_torture_fqs(void *arg)
831 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 819 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
832 do { 820 do {
833 fqs_resume_time = jiffies + fqs_stutter * HZ; 821 fqs_resume_time = jiffies + fqs_stutter * HZ;
834 while (jiffies - fqs_resume_time > LONG_MAX) { 822 while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
823 !kthread_should_stop()) {
835 schedule_timeout_interruptible(1); 824 schedule_timeout_interruptible(1);
836 } 825 }
837 fqs_burst_remaining = fqs_duration; 826 fqs_burst_remaining = fqs_duration;
838 while (fqs_burst_remaining > 0) { 827 while (fqs_burst_remaining > 0 &&
828 !kthread_should_stop()) {
839 cur_ops->fqs(); 829 cur_ops->fqs();
840 udelay(fqs_holdoff); 830 udelay(fqs_holdoff);
841 fqs_burst_remaining -= fqs_holdoff; 831 fqs_burst_remaining -= fqs_holdoff;
@@ -1280,8 +1270,9 @@ static int rcutorture_booster_init(int cpu)
1280 /* Don't allow time recalculation while creating a new task. */ 1270 /* Don't allow time recalculation while creating a new task. */
1281 mutex_lock(&boost_mutex); 1271 mutex_lock(&boost_mutex);
1282 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1272 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1283 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, 1273 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
1284 "rcu_torture_boost"); 1274 cpu_to_node(cpu),
1275 "rcu_torture_boost");
1285 if (IS_ERR(boost_tasks[cpu])) { 1276 if (IS_ERR(boost_tasks[cpu])) {
1286 retval = PTR_ERR(boost_tasks[cpu]); 1277 retval = PTR_ERR(boost_tasks[cpu]);
1287 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1278 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
@@ -1424,7 +1415,7 @@ rcu_torture_init(void)
1424 int firsterr = 0; 1415 int firsterr = 0;
1425 static struct rcu_torture_ops *torture_ops[] = 1416 static struct rcu_torture_ops *torture_ops[] =
1426 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1417 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1427 &rcu_bh_ops, &rcu_bh_sync_ops, 1418 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1428 &srcu_ops, &srcu_expedited_ops, 1419 &srcu_ops, &srcu_expedited_ops,
1429 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1420 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1430 1421
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ba06207b1dd3..e234eb92a177 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -52,13 +52,16 @@
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53 53
54#include "rcutree.h" 54#include "rcutree.h"
55#include <trace/events/rcu.h>
56
57#include "rcu.h"
55 58
56/* Data structures. */ 59/* Data structures. */
57 60
58static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 61static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
59 62
60#define RCU_STATE_INITIALIZER(structname) { \ 63#define RCU_STATE_INITIALIZER(structname) { \
61 .level = { &structname.node[0] }, \ 64 .level = { &structname##_state.node[0] }, \
62 .levelcnt = { \ 65 .levelcnt = { \
63 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 66 NUM_RCU_LVL_0, /* root of hierarchy. */ \
64 NUM_RCU_LVL_1, \ 67 NUM_RCU_LVL_1, \
@@ -69,17 +72,17 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
69 .signaled = RCU_GP_IDLE, \ 72 .signaled = RCU_GP_IDLE, \
70 .gpnum = -300, \ 73 .gpnum = -300, \
71 .completed = -300, \ 74 .completed = -300, \
72 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 76 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
74 .n_force_qs = 0, \ 77 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 78 .n_force_qs_ngp = 0, \
76 .name = #structname, \ 79 .name = #structname, \
77} 80}
78 81
79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 82struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
80DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81 84
82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
84 87
85static struct rcu_state *rcu_state; 88static struct rcu_state *rcu_state;
@@ -128,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
128static void invoke_rcu_core(void); 131static void invoke_rcu_core(void);
129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 132static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
130 133
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
133/* 134/*
134 * Track the rcutorture test sequence number and the update version 135 * Track the rcutorture test sequence number and the update version
135 * number within a given test. The rcutorture_testseq is incremented 136 * number within a given test. The rcutorture_testseq is incremented
@@ -156,33 +157,41 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
156 * Note a quiescent state. Because we do not need to know 157 * Note a quiescent state. Because we do not need to know
157 * how many quiescent states passed, just if there was at least 158 * how many quiescent states passed, just if there was at least
158 * one since the start of the grace period, this just sets a flag. 159 * one since the start of the grace period, this just sets a flag.
160 * The caller must have disabled preemption.
159 */ 161 */
160void rcu_sched_qs(int cpu) 162void rcu_sched_qs(int cpu)
161{ 163{
162 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 164 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
163 165
164 rdp->passed_quiesc_completed = rdp->gpnum - 1; 166 rdp->passed_quiesce_gpnum = rdp->gpnum;
165 barrier(); 167 barrier();
166 rdp->passed_quiesc = 1; 168 if (rdp->passed_quiesce == 0)
169 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
170 rdp->passed_quiesce = 1;
167} 171}
168 172
169void rcu_bh_qs(int cpu) 173void rcu_bh_qs(int cpu)
170{ 174{
171 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 175 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
172 176
173 rdp->passed_quiesc_completed = rdp->gpnum - 1; 177 rdp->passed_quiesce_gpnum = rdp->gpnum;
174 barrier(); 178 barrier();
175 rdp->passed_quiesc = 1; 179 if (rdp->passed_quiesce == 0)
180 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
181 rdp->passed_quiesce = 1;
176} 182}
177 183
178/* 184/*
179 * Note a context switch. This is a quiescent state for RCU-sched, 185 * Note a context switch. This is a quiescent state for RCU-sched,
180 * and requires special handling for preemptible RCU. 186 * and requires special handling for preemptible RCU.
187 * The caller must have disabled preemption.
181 */ 188 */
182void rcu_note_context_switch(int cpu) 189void rcu_note_context_switch(int cpu)
183{ 190{
191 trace_rcu_utilization("Start context switch");
184 rcu_sched_qs(cpu); 192 rcu_sched_qs(cpu);
185 rcu_preempt_note_context_switch(cpu); 193 rcu_preempt_note_context_switch(cpu);
194 trace_rcu_utilization("End context switch");
186} 195}
187EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196EXPORT_SYMBOL_GPL(rcu_note_context_switch);
188 197
@@ -193,7 +202,7 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
193}; 202};
194#endif /* #ifdef CONFIG_NO_HZ */ 203#endif /* #ifdef CONFIG_NO_HZ */
195 204
196static int blimit = 10; /* Maximum callbacks per softirq. */ 205static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
197static int qhimark = 10000; /* If this many pending, ignore blimit. */ 206static int qhimark = 10000; /* If this many pending, ignore blimit. */
198static int qlowmark = 100; /* Once only this many pending, use blimit. */ 207static int qlowmark = 100; /* Once only this many pending, use blimit. */
199 208
@@ -314,6 +323,7 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
314 * trust its state not to change because interrupts are disabled. 323 * trust its state not to change because interrupts are disabled.
315 */ 324 */
316 if (cpu_is_offline(rdp->cpu)) { 325 if (cpu_is_offline(rdp->cpu)) {
326 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
317 rdp->offline_fqs++; 327 rdp->offline_fqs++;
318 return 1; 328 return 1;
319 } 329 }
@@ -354,19 +364,13 @@ void rcu_enter_nohz(void)
354 local_irq_restore(flags); 364 local_irq_restore(flags);
355 return; 365 return;
356 } 366 }
367 trace_rcu_dyntick("Start");
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 368 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */ 369 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks); 370 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ 371 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 372 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
362 local_irq_restore(flags); 373 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
370} 374}
371 375
372/* 376/*
@@ -391,6 +395,7 @@ void rcu_exit_nohz(void)
391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 395 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
392 smp_mb__after_atomic_inc(); /* See above. */ 396 smp_mb__after_atomic_inc(); /* See above. */
393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 397 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
398 trace_rcu_dyntick("End");
394 local_irq_restore(flags); 399 local_irq_restore(flags);
395} 400}
396 401
@@ -481,11 +486,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
481 */ 486 */
482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 487static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
483{ 488{
484 unsigned long curr; 489 unsigned int curr;
485 unsigned long snap; 490 unsigned int snap;
486 491
487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); 492 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
488 snap = (unsigned long)rdp->dynticks_snap; 493 snap = (unsigned int)rdp->dynticks_snap;
489 494
490 /* 495 /*
491 * If the CPU passed through or entered a dynticks idle phase with 496 * If the CPU passed through or entered a dynticks idle phase with
@@ -495,7 +500,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
495 * read-side critical section that started before the beginning 500 * read-side critical section that started before the beginning
496 * of the current RCU grace period. 501 * of the current RCU grace period.
497 */ 502 */
498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { 503 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
504 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
499 rdp->dynticks_fqs++; 505 rdp->dynticks_fqs++;
500 return 1; 506 return 1;
501 } 507 }
@@ -537,6 +543,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
537 int cpu; 543 int cpu;
538 long delta; 544 long delta;
539 unsigned long flags; 545 unsigned long flags;
546 int ndetected;
540 struct rcu_node *rnp = rcu_get_root(rsp); 547 struct rcu_node *rnp = rcu_get_root(rsp);
541 548
542 /* Only let one CPU complain about others per time interval. */ 549 /* Only let one CPU complain about others per time interval. */
@@ -553,7 +560,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
553 * Now rat on any tasks that got kicked up to the root rcu_node 560 * Now rat on any tasks that got kicked up to the root rcu_node
554 * due to CPU offlining. 561 * due to CPU offlining.
555 */ 562 */
556 rcu_print_task_stall(rnp); 563 ndetected = rcu_print_task_stall(rnp);
557 raw_spin_unlock_irqrestore(&rnp->lock, flags); 564 raw_spin_unlock_irqrestore(&rnp->lock, flags);
558 565
559 /* 566 /*
@@ -565,17 +572,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
565 rsp->name); 572 rsp->name);
566 rcu_for_each_leaf_node(rsp, rnp) { 573 rcu_for_each_leaf_node(rsp, rnp) {
567 raw_spin_lock_irqsave(&rnp->lock, flags); 574 raw_spin_lock_irqsave(&rnp->lock, flags);
568 rcu_print_task_stall(rnp); 575 ndetected += rcu_print_task_stall(rnp);
569 raw_spin_unlock_irqrestore(&rnp->lock, flags); 576 raw_spin_unlock_irqrestore(&rnp->lock, flags);
570 if (rnp->qsmask == 0) 577 if (rnp->qsmask == 0)
571 continue; 578 continue;
572 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 579 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
573 if (rnp->qsmask & (1UL << cpu)) 580 if (rnp->qsmask & (1UL << cpu)) {
574 printk(" %d", rnp->grplo + cpu); 581 printk(" %d", rnp->grplo + cpu);
582 ndetected++;
583 }
575 } 584 }
576 printk("} (detected by %d, t=%ld jiffies)\n", 585 printk("} (detected by %d, t=%ld jiffies)\n",
577 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 586 smp_processor_id(), (long)(jiffies - rsp->gp_start));
578 trigger_all_cpu_backtrace(); 587 if (ndetected == 0)
588 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
589 else if (!trigger_all_cpu_backtrace())
590 dump_stack();
579 591
580 /* If so configured, complain about tasks blocking the grace period. */ 592 /* If so configured, complain about tasks blocking the grace period. */
581 593
@@ -596,7 +608,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
596 */ 608 */
597 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 609 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
598 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 610 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
599 trigger_all_cpu_backtrace(); 611 if (!trigger_all_cpu_backtrace())
612 dump_stack();
600 613
601 raw_spin_lock_irqsave(&rnp->lock, flags); 614 raw_spin_lock_irqsave(&rnp->lock, flags);
602 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 615 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
@@ -678,9 +691,10 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
678 * go looking for one. 691 * go looking for one.
679 */ 692 */
680 rdp->gpnum = rnp->gpnum; 693 rdp->gpnum = rnp->gpnum;
694 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
681 if (rnp->qsmask & rdp->grpmask) { 695 if (rnp->qsmask & rdp->grpmask) {
682 rdp->qs_pending = 1; 696 rdp->qs_pending = 1;
683 rdp->passed_quiesc = 0; 697 rdp->passed_quiesce = 0;
684 } else 698 } else
685 rdp->qs_pending = 0; 699 rdp->qs_pending = 0;
686 } 700 }
@@ -741,6 +755,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
741 755
742 /* Remember that we saw this grace-period completion. */ 756 /* Remember that we saw this grace-period completion. */
743 rdp->completed = rnp->completed; 757 rdp->completed = rnp->completed;
758 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
744 759
745 /* 760 /*
746 * If we were in an extended quiescent state, we may have 761 * If we were in an extended quiescent state, we may have
@@ -826,31 +841,31 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
826 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 841 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
827 struct rcu_node *rnp = rcu_get_root(rsp); 842 struct rcu_node *rnp = rcu_get_root(rsp);
828 843
829 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 844 if (!rcu_scheduler_fully_active ||
830 if (cpu_needs_another_gp(rsp, rdp)) 845 !cpu_needs_another_gp(rsp, rdp)) {
831 rsp->fqs_need_gp = 1; 846 /*
832 if (rnp->completed == rsp->completed) { 847 * Either the scheduler hasn't yet spawned the first
833 raw_spin_unlock_irqrestore(&rnp->lock, flags); 848 * non-idle task or this CPU does not need another
834 return; 849 * grace period. Either way, don't start a new grace
835 } 850 * period.
836 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 851 */
852 raw_spin_unlock_irqrestore(&rnp->lock, flags);
853 return;
854 }
837 855
856 if (rsp->fqs_active) {
838 /* 857 /*
839 * Propagate new ->completed value to rcu_node structures 858 * This CPU needs a grace period, but force_quiescent_state()
840 * so that other CPUs don't have to wait until the start 859 * is running. Tell it to start one on this CPU's behalf.
841 * of the next grace period to process their callbacks.
842 */ 860 */
843 rcu_for_each_node_breadth_first(rsp, rnp) { 861 rsp->fqs_need_gp = 1;
844 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 862 raw_spin_unlock_irqrestore(&rnp->lock, flags);
845 rnp->completed = rsp->completed;
846 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
847 }
848 local_irq_restore(flags);
849 return; 863 return;
850 } 864 }
851 865
852 /* Advance to a new grace period and initialize state. */ 866 /* Advance to a new grace period and initialize state. */
853 rsp->gpnum++; 867 rsp->gpnum++;
868 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
854 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 869 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
855 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 870 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
856 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 871 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
@@ -865,6 +880,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 880 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
866 rcu_start_gp_per_cpu(rsp, rnp, rdp); 881 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp); 882 rcu_preempt_boost_start_gp(rnp);
883 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
884 rnp->level, rnp->grplo,
885 rnp->grphi, rnp->qsmask);
868 raw_spin_unlock_irqrestore(&rnp->lock, flags); 886 raw_spin_unlock_irqrestore(&rnp->lock, flags);
869 return; 887 return;
870 } 888 }
@@ -901,6 +919,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
901 if (rnp == rdp->mynode) 919 if (rnp == rdp->mynode)
902 rcu_start_gp_per_cpu(rsp, rnp, rdp); 920 rcu_start_gp_per_cpu(rsp, rnp, rdp);
903 rcu_preempt_boost_start_gp(rnp); 921 rcu_preempt_boost_start_gp(rnp);
922 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
923 rnp->level, rnp->grplo,
924 rnp->grphi, rnp->qsmask);
904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 925 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
905 } 926 }
906 927
@@ -922,6 +943,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
922 __releases(rcu_get_root(rsp)->lock) 943 __releases(rcu_get_root(rsp)->lock)
923{ 944{
924 unsigned long gp_duration; 945 unsigned long gp_duration;
946 struct rcu_node *rnp = rcu_get_root(rsp);
947 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
925 948
926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 949 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
927 950
@@ -933,7 +956,41 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
933 gp_duration = jiffies - rsp->gp_start; 956 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max) 957 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration; 958 rsp->gp_max = gp_duration;
936 rsp->completed = rsp->gpnum; 959
960 /*
961 * We know the grace period is complete, but to everyone else
962 * it appears to still be ongoing. But it is also the case
963 * that to everyone else it looks like there is nothing that
964 * they can do to advance the grace period. It is therefore
965 * safe for us to drop the lock in order to mark the grace
966 * period as completed in all of the rcu_node structures.
967 *
968 * But if this CPU needs another grace period, it will take
969 * care of this while initializing the next grace period.
970 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
971 * because the callbacks have not yet been advanced: Those
972 * callbacks are waiting on the grace period that just now
973 * completed.
974 */
975 if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
976 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
977
978 /*
979 * Propagate new ->completed value to rcu_node structures
980 * so that other CPUs don't have to wait until the start
981 * of the next grace period to process their callbacks.
982 */
983 rcu_for_each_node_breadth_first(rsp, rnp) {
984 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
985 rnp->completed = rsp->gpnum;
986 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
987 }
988 rnp = rcu_get_root(rsp);
989 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
990 }
991
992 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
993 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
937 rsp->signaled = RCU_GP_IDLE; 994 rsp->signaled = RCU_GP_IDLE;
938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 995 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
939} 996}
@@ -962,6 +1019,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
962 return; 1019 return;
963 } 1020 }
964 rnp->qsmask &= ~mask; 1021 rnp->qsmask &= ~mask;
1022 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
1023 mask, rnp->qsmask, rnp->level,
1024 rnp->grplo, rnp->grphi,
1025 !!rnp->gp_tasks);
965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 1026 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
966 1027
967 /* Other bits still set at this level, so done. */ 1028 /* Other bits still set at this level, so done. */
@@ -1000,7 +1061,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1000 * based on quiescent states detected in an earlier grace period! 1061 * based on quiescent states detected in an earlier grace period!
1001 */ 1062 */
1002static void 1063static void
1003rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 1064rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
1004{ 1065{
1005 unsigned long flags; 1066 unsigned long flags;
1006 unsigned long mask; 1067 unsigned long mask;
@@ -1008,17 +1069,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
1008 1069
1009 rnp = rdp->mynode; 1070 rnp = rdp->mynode;
1010 raw_spin_lock_irqsave(&rnp->lock, flags); 1071 raw_spin_lock_irqsave(&rnp->lock, flags);
1011 if (lastcomp != rnp->completed) { 1072 if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
1012 1073
1013 /* 1074 /*
1014 * Someone beat us to it for this grace period, so leave. 1075 * The grace period in which this quiescent state was
1015 * The race with GP start is resolved by the fact that we 1076 * recorded has ended, so don't report it upwards.
1016 * hold the leaf rcu_node lock, so that the per-CPU bits 1077 * We will instead need a new quiescent state that lies
1017 * cannot yet be initialized -- so we would simply find our 1078 * within the current grace period.
1018 * CPU's bit already cleared in rcu_report_qs_rnp() if this
1019 * race occurred.
1020 */ 1079 */
1021 rdp->passed_quiesc = 0; /* try again later! */ 1080 rdp->passed_quiesce = 0; /* need qs for new gp. */
1022 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1081 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1023 return; 1082 return;
1024 } 1083 }
@@ -1062,14 +1121,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1062 * Was there a quiescent state since the beginning of the grace 1121 * Was there a quiescent state since the beginning of the grace
1063 * period? If no, then exit and wait for the next call. 1122 * period? If no, then exit and wait for the next call.
1064 */ 1123 */
1065 if (!rdp->passed_quiesc) 1124 if (!rdp->passed_quiesce)
1066 return; 1125 return;
1067 1126
1068 /* 1127 /*
1069 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 1128 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1070 * judge of that). 1129 * judge of that).
1071 */ 1130 */
1072 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 1131 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
1073} 1132}
1074 1133
1075#ifdef CONFIG_HOTPLUG_CPU 1134#ifdef CONFIG_HOTPLUG_CPU
@@ -1130,11 +1189,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1130 if (rnp->qsmaskinit != 0) { 1189 if (rnp->qsmaskinit != 0) {
1131 if (rnp != rdp->mynode) 1190 if (rnp != rdp->mynode)
1132 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1191 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1192 else
1193 trace_rcu_grace_period(rsp->name,
1194 rnp->gpnum + 1 -
1195 !!(rnp->qsmask & mask),
1196 "cpuofl");
1133 break; 1197 break;
1134 } 1198 }
1135 if (rnp == rdp->mynode) 1199 if (rnp == rdp->mynode) {
1200 trace_rcu_grace_period(rsp->name,
1201 rnp->gpnum + 1 -
1202 !!(rnp->qsmask & mask),
1203 "cpuofl");
1136 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1204 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
1137 else 1205 } else
1138 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1206 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1139 mask = rnp->grpmask; 1207 mask = rnp->grpmask;
1140 rnp = rnp->parent; 1208 rnp = rnp->parent;
@@ -1190,17 +1258,22 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1190{ 1258{
1191 unsigned long flags; 1259 unsigned long flags;
1192 struct rcu_head *next, *list, **tail; 1260 struct rcu_head *next, *list, **tail;
1193 int count; 1261 int bl, count;
1194 1262
1195 /* If no callbacks are ready, just return.*/ 1263 /* If no callbacks are ready, just return.*/
1196 if (!cpu_has_callbacks_ready_to_invoke(rdp)) 1264 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1265 trace_rcu_batch_start(rsp->name, 0, 0);
1266 trace_rcu_batch_end(rsp->name, 0);
1197 return; 1267 return;
1268 }
1198 1269
1199 /* 1270 /*
1200 * Extract the list of ready callbacks, disabling to prevent 1271 * Extract the list of ready callbacks, disabling to prevent
1201 * races with call_rcu() from interrupt handlers. 1272 * races with call_rcu() from interrupt handlers.
1202 */ 1273 */
1203 local_irq_save(flags); 1274 local_irq_save(flags);
1275 bl = rdp->blimit;
1276 trace_rcu_batch_start(rsp->name, rdp->qlen, bl);
1204 list = rdp->nxtlist; 1277 list = rdp->nxtlist;
1205 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1278 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1206 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1279 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1216,13 +1289,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1216 next = list->next; 1289 next = list->next;
1217 prefetch(next); 1290 prefetch(next);
1218 debug_rcu_head_unqueue(list); 1291 debug_rcu_head_unqueue(list);
1219 __rcu_reclaim(list); 1292 __rcu_reclaim(rsp->name, list);
1220 list = next; 1293 list = next;
1221 if (++count >= rdp->blimit) 1294 if (++count >= bl)
1222 break; 1295 break;
1223 } 1296 }
1224 1297
1225 local_irq_save(flags); 1298 local_irq_save(flags);
1299 trace_rcu_batch_end(rsp->name, count);
1226 1300
1227 /* Update count, and requeue any remaining callbacks. */ 1301 /* Update count, and requeue any remaining callbacks. */
1228 rdp->qlen -= count; 1302 rdp->qlen -= count;
@@ -1250,7 +1324,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1250 1324
1251 local_irq_restore(flags); 1325 local_irq_restore(flags);
1252 1326
1253 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1327 /* Re-invoke RCU core processing if there are callbacks remaining. */
1254 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1328 if (cpu_has_callbacks_ready_to_invoke(rdp))
1255 invoke_rcu_core(); 1329 invoke_rcu_core();
1256} 1330}
@@ -1258,7 +1332,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1258/* 1332/*
1259 * Check to see if this CPU is in a non-context-switch quiescent state 1333 * Check to see if this CPU is in a non-context-switch quiescent state
1260 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1334 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1261 * Also schedule the RCU softirq handler. 1335 * Also schedule RCU core processing.
1262 * 1336 *
1263 * This function must be called with hardirqs disabled. It is normally 1337 * This function must be called with hardirqs disabled. It is normally
1264 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1338 * invoked from the scheduling-clock interrupt. If rcu_pending returns
@@ -1266,6 +1340,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1266 */ 1340 */
1267void rcu_check_callbacks(int cpu, int user) 1341void rcu_check_callbacks(int cpu, int user)
1268{ 1342{
1343 trace_rcu_utilization("Start scheduler-tick");
1269 if (user || 1344 if (user ||
1270 (idle_cpu(cpu) && rcu_scheduler_active && 1345 (idle_cpu(cpu) && rcu_scheduler_active &&
1271 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1346 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1299,6 +1374,7 @@ void rcu_check_callbacks(int cpu, int user)
1299 rcu_preempt_check_callbacks(cpu); 1374 rcu_preempt_check_callbacks(cpu);
1300 if (rcu_pending(cpu)) 1375 if (rcu_pending(cpu))
1301 invoke_rcu_core(); 1376 invoke_rcu_core();
1377 trace_rcu_utilization("End scheduler-tick");
1302} 1378}
1303 1379
1304#ifdef CONFIG_SMP 1380#ifdef CONFIG_SMP
@@ -1360,10 +1436,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1360 unsigned long flags; 1436 unsigned long flags;
1361 struct rcu_node *rnp = rcu_get_root(rsp); 1437 struct rcu_node *rnp = rcu_get_root(rsp);
1362 1438
1363 if (!rcu_gp_in_progress(rsp)) 1439 trace_rcu_utilization("Start fqs");
1440 if (!rcu_gp_in_progress(rsp)) {
1441 trace_rcu_utilization("End fqs");
1364 return; /* No grace period in progress, nothing to force. */ 1442 return; /* No grace period in progress, nothing to force. */
1443 }
1365 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { 1444 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1366 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1445 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1446 trace_rcu_utilization("End fqs");
1367 return; /* Someone else is already on the job. */ 1447 return; /* Someone else is already on the job. */
1368 } 1448 }
1369 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) 1449 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
@@ -1412,11 +1492,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1412 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ 1492 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1413 rsp->fqs_need_gp = 0; 1493 rsp->fqs_need_gp = 0;
1414 rcu_start_gp(rsp, flags); /* releases rnp->lock */ 1494 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1495 trace_rcu_utilization("End fqs");
1415 return; 1496 return;
1416 } 1497 }
1417 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1498 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1418unlock_fqs_ret: 1499unlock_fqs_ret:
1419 raw_spin_unlock_irqrestore(&rsp->fqslock, flags); 1500 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1501 trace_rcu_utilization("End fqs");
1420} 1502}
1421 1503
1422#else /* #ifdef CONFIG_SMP */ 1504#else /* #ifdef CONFIG_SMP */
@@ -1429,9 +1511,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1429#endif /* #else #ifdef CONFIG_SMP */ 1511#endif /* #else #ifdef CONFIG_SMP */
1430 1512
1431/* 1513/*
1432 * This does the RCU processing work from softirq context for the 1514 * This does the RCU core processing work for the specified rcu_state
1433 * specified rcu_state and rcu_data structures. This may be called 1515 * and rcu_data structures. This may be called only from the CPU to
1434 * only from the CPU to whom the rdp belongs. 1516 * whom the rdp belongs.
1435 */ 1517 */
1436static void 1518static void
1437__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1519__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -1468,24 +1550,24 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1468} 1550}
1469 1551
1470/* 1552/*
1471 * Do softirq processing for the current CPU. 1553 * Do RCU core processing for the current CPU.
1472 */ 1554 */
1473static void rcu_process_callbacks(struct softirq_action *unused) 1555static void rcu_process_callbacks(struct softirq_action *unused)
1474{ 1556{
1557 trace_rcu_utilization("Start RCU core");
1475 __rcu_process_callbacks(&rcu_sched_state, 1558 __rcu_process_callbacks(&rcu_sched_state,
1476 &__get_cpu_var(rcu_sched_data)); 1559 &__get_cpu_var(rcu_sched_data));
1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1560 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1478 rcu_preempt_process_callbacks(); 1561 rcu_preempt_process_callbacks();
1479 1562 trace_rcu_utilization("End RCU core");
1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1481 rcu_needs_cpu_flush();
1482} 1563}
1483 1564
1484/* 1565/*
1485 * Wake up the current CPU's kthread. This replaces raise_softirq() 1566 * Schedule RCU callback invocation. If the specified type of RCU
1486 * in earlier versions of RCU. Note that because we are running on 1567 * does not support RCU priority boosting, just do a direct call,
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task 1568 * otherwise wake up the per-CPU kernel kthread. Note that because we
1488 * cannot disappear out from under us. 1569 * are running on the current CPU with interrupts disabled, the
1570 * rcu_cpu_kthread_task cannot disappear out from under us.
1489 */ 1571 */
1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1572static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1491{ 1573{
@@ -1530,6 +1612,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1612 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++; 1613 rdp->qlen++;
1532 1614
1615 if (__is_kfree_rcu_offset((unsigned long)func))
1616 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
1617 rdp->qlen);
1618 else
1619 trace_rcu_callback(rsp->name, head, rdp->qlen);
1620
1533 /* If interrupts were disabled, don't dive into RCU core. */ 1621 /* If interrupts were disabled, don't dive into RCU core. */
1534 if (irqs_disabled_flags(flags)) { 1622 if (irqs_disabled_flags(flags)) {
1535 local_irq_restore(flags); 1623 local_irq_restore(flags);
@@ -1613,18 +1701,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1613 */ 1701 */
1614void synchronize_sched(void) 1702void synchronize_sched(void)
1615{ 1703{
1616 struct rcu_synchronize rcu;
1617
1618 if (rcu_blocking_is_gp()) 1704 if (rcu_blocking_is_gp())
1619 return; 1705 return;
1620 1706 wait_rcu_gp(call_rcu_sched);
1621 init_rcu_head_on_stack(&rcu.head);
1622 init_completion(&rcu.completion);
1623 /* Will wake me after RCU finished. */
1624 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1625 /* Wait for it. */
1626 wait_for_completion(&rcu.completion);
1627 destroy_rcu_head_on_stack(&rcu.head);
1628} 1707}
1629EXPORT_SYMBOL_GPL(synchronize_sched); 1708EXPORT_SYMBOL_GPL(synchronize_sched);
1630 1709
@@ -1639,18 +1718,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
1639 */ 1718 */
1640void synchronize_rcu_bh(void) 1719void synchronize_rcu_bh(void)
1641{ 1720{
1642 struct rcu_synchronize rcu;
1643
1644 if (rcu_blocking_is_gp()) 1721 if (rcu_blocking_is_gp())
1645 return; 1722 return;
1646 1723 wait_rcu_gp(call_rcu_bh);
1647 init_rcu_head_on_stack(&rcu.head);
1648 init_completion(&rcu.completion);
1649 /* Will wake me after RCU finished. */
1650 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1651 /* Wait for it. */
1652 wait_for_completion(&rcu.completion);
1653 destroy_rcu_head_on_stack(&rcu.head);
1654} 1724}
1655EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1725EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1656 1726
@@ -1671,7 +1741,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1671 check_cpu_stall(rsp, rdp); 1741 check_cpu_stall(rsp, rdp);
1672 1742
1673 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1743 /* Is the RCU core waiting for a quiescent state from this CPU? */
1674 if (rdp->qs_pending && !rdp->passed_quiesc) { 1744 if (rcu_scheduler_fully_active &&
1745 rdp->qs_pending && !rdp->passed_quiesce) {
1675 1746
1676 /* 1747 /*
1677 * If force_quiescent_state() coming soon and this CPU 1748 * If force_quiescent_state() coming soon and this CPU
@@ -1683,7 +1754,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1754 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1684 jiffies)) 1755 jiffies))
1685 set_need_resched(); 1756 set_need_resched();
1686 } else if (rdp->qs_pending && rdp->passed_quiesc) { 1757 } else if (rdp->qs_pending && rdp->passed_quiesce) {
1687 rdp->n_rp_report_qs++; 1758 rdp->n_rp_report_qs++;
1688 return 1; 1759 return 1;
1689 } 1760 }
@@ -1846,6 +1917,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1846 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1917 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1847#endif /* #ifdef CONFIG_NO_HZ */ 1918#endif /* #ifdef CONFIG_NO_HZ */
1848 rdp->cpu = cpu; 1919 rdp->cpu = cpu;
1920 rdp->rsp = rsp;
1849 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1921 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1850} 1922}
1851 1923
@@ -1865,8 +1937,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1865 1937
1866 /* Set up local state, ensuring consistent view of global state. */ 1938 /* Set up local state, ensuring consistent view of global state. */
1867 raw_spin_lock_irqsave(&rnp->lock, flags); 1939 raw_spin_lock_irqsave(&rnp->lock, flags);
1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1870 rdp->beenonline = 1; /* We have now been online. */ 1940 rdp->beenonline = 1; /* We have now been online. */
1871 rdp->preemptible = preemptible; 1941 rdp->preemptible = preemptible;
1872 rdp->qlen_last_fqs_check = 0; 1942 rdp->qlen_last_fqs_check = 0;
@@ -1891,9 +1961,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1891 rnp->qsmaskinit |= mask; 1961 rnp->qsmaskinit |= mask;
1892 mask = rnp->grpmask; 1962 mask = rnp->grpmask;
1893 if (rnp == rdp->mynode) { 1963 if (rnp == rdp->mynode) {
1894 rdp->gpnum = rnp->completed; /* if GP in progress... */ 1964 /*
1965 * If there is a grace period in progress, we will
1966 * set up to wait for it next time we run the
1967 * RCU core code.
1968 */
1969 rdp->gpnum = rnp->completed;
1895 rdp->completed = rnp->completed; 1970 rdp->completed = rnp->completed;
1896 rdp->passed_quiesc_completed = rnp->completed - 1; 1971 rdp->passed_quiesce = 0;
1972 rdp->qs_pending = 0;
1973 rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
1974 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
1897 } 1975 }
1898 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 1976 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1899 rnp = rnp->parent; 1977 rnp = rnp->parent;
@@ -1919,6 +1997,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1997 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1920 struct rcu_node *rnp = rdp->mynode; 1998 struct rcu_node *rnp = rdp->mynode;
1921 1999
2000 trace_rcu_utilization("Start CPU hotplug");
1922 switch (action) { 2001 switch (action) {
1923 case CPU_UP_PREPARE: 2002 case CPU_UP_PREPARE:
1924 case CPU_UP_PREPARE_FROZEN: 2003 case CPU_UP_PREPARE_FROZEN:
@@ -1954,6 +2033,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1954 default: 2033 default:
1955 break; 2034 break;
1956 } 2035 }
2036 trace_rcu_utilization("End CPU hotplug");
1957 return NOTIFY_OK; 2037 return NOTIFY_OK;
1958} 2038}
1959 2039
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 01b2ccda26fb..849ce9ec51fe 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -230,9 +230,9 @@ struct rcu_data {
230 /* in order to detect GP end. */ 230 /* in order to detect GP end. */
231 unsigned long gpnum; /* Highest gp number that this CPU */ 231 unsigned long gpnum; /* Highest gp number that this CPU */
232 /* is aware of having started. */ 232 /* is aware of having started. */
233 unsigned long passed_quiesc_completed; 233 unsigned long passed_quiesce_gpnum;
234 /* Value of completed at time of qs. */ 234 /* gpnum at time of quiescent state. */
235 bool passed_quiesc; /* User-mode/idle loop etc. */ 235 bool passed_quiesce; /* User-mode/idle loop etc. */
236 bool qs_pending; /* Core waits for quiesc state. */ 236 bool qs_pending; /* Core waits for quiesc state. */
237 bool beenonline; /* CPU online at least once. */ 237 bool beenonline; /* CPU online at least once. */
238 bool preemptible; /* Preemptible RCU? */ 238 bool preemptible; /* Preemptible RCU? */
@@ -299,6 +299,7 @@ struct rcu_data {
299 unsigned long n_rp_need_nothing; 299 unsigned long n_rp_need_nothing;
300 300
301 int cpu; 301 int cpu;
302 struct rcu_state *rsp;
302}; 303};
303 304
304/* Values for signaled field in struct rcu_state. */ 305/* Values for signaled field in struct rcu_state. */
@@ -417,6 +418,13 @@ extern struct rcu_state rcu_preempt_state;
417DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 418DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
418#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 419#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
419 420
421#ifdef CONFIG_RCU_BOOST
422DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
423DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
424DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
425DECLARE_PER_CPU(char, rcu_cpu_has_work);
426#endif /* #ifdef CONFIG_RCU_BOOST */
427
420#ifndef RCU_TREE_NONCORE 428#ifndef RCU_TREE_NONCORE
421 429
422/* Forward declarations for rcutree_plugin.h */ 430/* Forward declarations for rcutree_plugin.h */
@@ -430,7 +438,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
430static void rcu_stop_cpu_kthread(int cpu); 438static void rcu_stop_cpu_kthread(int cpu);
431#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 439#endif /* #ifdef CONFIG_HOTPLUG_CPU */
432static void rcu_print_detail_task_stall(struct rcu_state *rsp); 440static void rcu_print_detail_task_stall(struct rcu_state *rsp);
433static void rcu_print_task_stall(struct rcu_node *rnp); 441static int rcu_print_task_stall(struct rcu_node *rnp);
434static void rcu_preempt_stall_reset(void); 442static void rcu_preempt_stall_reset(void);
435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 443static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
436#ifdef CONFIG_HOTPLUG_CPU 444#ifdef CONFIG_HOTPLUG_CPU
@@ -450,7 +458,6 @@ static int rcu_preempt_needs_cpu(int cpu);
450static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 458static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
451static void rcu_preempt_send_cbs_to_online(void); 459static void rcu_preempt_send_cbs_to_online(void);
452static void __init __rcu_init_preempt(void); 460static void __init __rcu_init_preempt(void);
453static void rcu_needs_cpu_flush(void);
454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 461static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 462static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
456static void invoke_rcu_callbacks_kthread(void); 463static void invoke_rcu_callbacks_kthread(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8aafbb80b8b0..4b9b9f8a4184 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -27,6 +27,14 @@
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h> 28#include <linux/stop_machine.h>
29 29
30#define RCU_KTHREAD_PRIO 1
31
32#ifdef CONFIG_RCU_BOOST
33#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
34#else
35#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
36#endif
37
30/* 38/*
31 * Check the RCU kernel configuration parameters and print informative 39 * Check the RCU kernel configuration parameters and print informative
32 * messages about anything out of the ordinary. If you like #ifdef, you 40 * messages about anything out of the ordinary. If you like #ifdef, you
@@ -64,7 +72,7 @@ static void __init rcu_bootup_announce_oddness(void)
64 72
65#ifdef CONFIG_TREE_PREEMPT_RCU 73#ifdef CONFIG_TREE_PREEMPT_RCU
66 74
67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 75struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 76DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state; 77static struct rcu_state *rcu_state = &rcu_preempt_state;
70 78
@@ -122,9 +130,11 @@ static void rcu_preempt_qs(int cpu)
122{ 130{
123 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 131 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
124 132
125 rdp->passed_quiesc_completed = rdp->gpnum - 1; 133 rdp->passed_quiesce_gpnum = rdp->gpnum;
126 barrier(); 134 barrier();
127 rdp->passed_quiesc = 1; 135 if (rdp->passed_quiesce == 0)
136 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
137 rdp->passed_quiesce = 1;
128 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 138 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
129} 139}
130 140
@@ -190,6 +200,11 @@ static void rcu_preempt_note_context_switch(int cpu)
190 if (rnp->qsmask & rdp->grpmask) 200 if (rnp->qsmask & rdp->grpmask)
191 rnp->gp_tasks = &t->rcu_node_entry; 201 rnp->gp_tasks = &t->rcu_node_entry;
192 } 202 }
203 trace_rcu_preempt_task(rdp->rsp->name,
204 t->pid,
205 (rnp->qsmask & rdp->grpmask)
206 ? rnp->gpnum
207 : rnp->gpnum + 1);
193 raw_spin_unlock_irqrestore(&rnp->lock, flags); 208 raw_spin_unlock_irqrestore(&rnp->lock, flags);
194 } else if (t->rcu_read_lock_nesting < 0 && 209 } else if (t->rcu_read_lock_nesting < 0 &&
195 t->rcu_read_unlock_special) { 210 t->rcu_read_unlock_special) {
@@ -299,6 +314,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
299 int empty_exp; 314 int empty_exp;
300 unsigned long flags; 315 unsigned long flags;
301 struct list_head *np; 316 struct list_head *np;
317#ifdef CONFIG_RCU_BOOST
318 struct rt_mutex *rbmp = NULL;
319#endif /* #ifdef CONFIG_RCU_BOOST */
302 struct rcu_node *rnp; 320 struct rcu_node *rnp;
303 int special; 321 int special;
304 322
@@ -344,6 +362,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
344 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 362 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
345 np = rcu_next_node_entry(t, rnp); 363 np = rcu_next_node_entry(t, rnp);
346 list_del_init(&t->rcu_node_entry); 364 list_del_init(&t->rcu_node_entry);
365 t->rcu_blocked_node = NULL;
366 trace_rcu_unlock_preempted_task("rcu_preempt",
367 rnp->gpnum, t->pid);
347 if (&t->rcu_node_entry == rnp->gp_tasks) 368 if (&t->rcu_node_entry == rnp->gp_tasks)
348 rnp->gp_tasks = np; 369 rnp->gp_tasks = np;
349 if (&t->rcu_node_entry == rnp->exp_tasks) 370 if (&t->rcu_node_entry == rnp->exp_tasks)
@@ -351,30 +372,34 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
351#ifdef CONFIG_RCU_BOOST 372#ifdef CONFIG_RCU_BOOST
352 if (&t->rcu_node_entry == rnp->boost_tasks) 373 if (&t->rcu_node_entry == rnp->boost_tasks)
353 rnp->boost_tasks = np; 374 rnp->boost_tasks = np;
354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ 375 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
355 if (t->rcu_boosted) { 376 if (t->rcu_boost_mutex) {
356 special |= RCU_READ_UNLOCK_BOOSTED; 377 rbmp = t->rcu_boost_mutex;
357 t->rcu_boosted = 0; 378 t->rcu_boost_mutex = NULL;
358 } 379 }
359#endif /* #ifdef CONFIG_RCU_BOOST */ 380#endif /* #ifdef CONFIG_RCU_BOOST */
360 t->rcu_blocked_node = NULL;
361 381
362 /* 382 /*
363 * If this was the last task on the current list, and if 383 * If this was the last task on the current list, and if
364 * we aren't waiting on any CPUs, report the quiescent state. 384 * we aren't waiting on any CPUs, report the quiescent state.
365 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 385 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
366 */ 386 */
367 if (empty) 387 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
368 raw_spin_unlock_irqrestore(&rnp->lock, flags); 388 trace_rcu_quiescent_state_report("preempt_rcu",
369 else 389 rnp->gpnum,
390 0, rnp->qsmask,
391 rnp->level,
392 rnp->grplo,
393 rnp->grphi,
394 !!rnp->gp_tasks);
370 rcu_report_unblock_qs_rnp(rnp, flags); 395 rcu_report_unblock_qs_rnp(rnp, flags);
396 } else
397 raw_spin_unlock_irqrestore(&rnp->lock, flags);
371 398
372#ifdef CONFIG_RCU_BOOST 399#ifdef CONFIG_RCU_BOOST
373 /* Unboost if we were boosted. */ 400 /* Unboost if we were boosted. */
374 if (special & RCU_READ_UNLOCK_BOOSTED) { 401 if (rbmp)
375 rt_mutex_unlock(t->rcu_boost_mutex); 402 rt_mutex_unlock(rbmp);
376 t->rcu_boost_mutex = NULL;
377 }
378#endif /* #ifdef CONFIG_RCU_BOOST */ 403#endif /* #ifdef CONFIG_RCU_BOOST */
379 404
380 /* 405 /*
@@ -399,10 +424,10 @@ void __rcu_read_unlock(void)
399{ 424{
400 struct task_struct *t = current; 425 struct task_struct *t = current;
401 426
402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
403 if (t->rcu_read_lock_nesting != 1) 427 if (t->rcu_read_lock_nesting != 1)
404 --t->rcu_read_lock_nesting; 428 --t->rcu_read_lock_nesting;
405 else { 429 else {
430 barrier(); /* critical section before exit code. */
406 t->rcu_read_lock_nesting = INT_MIN; 431 t->rcu_read_lock_nesting = INT_MIN;
407 barrier(); /* assign before ->rcu_read_unlock_special load */ 432 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 433 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
@@ -466,16 +491,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
466 * Scan the current list of tasks blocked within RCU read-side critical 491 * Scan the current list of tasks blocked within RCU read-side critical
467 * sections, printing out the tid of each. 492 * sections, printing out the tid of each.
468 */ 493 */
469static void rcu_print_task_stall(struct rcu_node *rnp) 494static int rcu_print_task_stall(struct rcu_node *rnp)
470{ 495{
471 struct task_struct *t; 496 struct task_struct *t;
497 int ndetected = 0;
472 498
473 if (!rcu_preempt_blocked_readers_cgp(rnp)) 499 if (!rcu_preempt_blocked_readers_cgp(rnp))
474 return; 500 return 0;
475 t = list_entry(rnp->gp_tasks, 501 t = list_entry(rnp->gp_tasks,
476 struct task_struct, rcu_node_entry); 502 struct task_struct, rcu_node_entry);
477 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 503 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
478 printk(" P%d", t->pid); 504 printk(" P%d", t->pid);
505 ndetected++;
506 }
507 return ndetected;
479} 508}
480 509
481/* 510/*
@@ -656,18 +685,9 @@ EXPORT_SYMBOL_GPL(call_rcu);
656 */ 685 */
657void synchronize_rcu(void) 686void synchronize_rcu(void)
658{ 687{
659 struct rcu_synchronize rcu;
660
661 if (!rcu_scheduler_active) 688 if (!rcu_scheduler_active)
662 return; 689 return;
663 690 wait_rcu_gp(call_rcu);
664 init_rcu_head_on_stack(&rcu.head);
665 init_completion(&rcu.completion);
666 /* Will wake me after RCU finished. */
667 call_rcu(&rcu.head, wakeme_after_rcu);
668 /* Wait for it. */
669 wait_for_completion(&rcu.completion);
670 destroy_rcu_head_on_stack(&rcu.head);
671} 691}
672EXPORT_SYMBOL_GPL(synchronize_rcu); 692EXPORT_SYMBOL_GPL(synchronize_rcu);
673 693
@@ -968,8 +988,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
968 * Because preemptible RCU does not exist, we never have to check for 988 * Because preemptible RCU does not exist, we never have to check for
969 * tasks blocked within RCU read-side critical sections. 989 * tasks blocked within RCU read-side critical sections.
970 */ 990 */
971static void rcu_print_task_stall(struct rcu_node *rnp) 991static int rcu_print_task_stall(struct rcu_node *rnp)
972{ 992{
993 return 0;
973} 994}
974 995
975/* 996/*
@@ -1136,6 +1157,8 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1136 1157
1137#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1158#endif /* #else #ifdef CONFIG_RCU_TRACE */
1138 1159
1160static struct lock_class_key rcu_boost_class;
1161
1139/* 1162/*
1140 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1163 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1141 * or ->boost_tasks, advancing the pointer to the next task in the 1164 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1198,8 +1221,10 @@ static int rcu_boost(struct rcu_node *rnp)
1198 */ 1221 */
1199 t = container_of(tb, struct task_struct, rcu_node_entry); 1222 t = container_of(tb, struct task_struct, rcu_node_entry);
1200 rt_mutex_init_proxy_locked(&mtx, t); 1223 rt_mutex_init_proxy_locked(&mtx, t);
1224 /* Avoid lockdep false positives. This rt_mutex is its own thing. */
1225 lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
1226 "rcu_boost_mutex");
1201 t->rcu_boost_mutex = &mtx; 1227 t->rcu_boost_mutex = &mtx;
1202 t->rcu_boosted = 1;
1203 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1228 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1229 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1230 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
@@ -1228,9 +1253,12 @@ static int rcu_boost_kthread(void *arg)
1228 int spincnt = 0; 1253 int spincnt = 0;
1229 int more2boost; 1254 int more2boost;
1230 1255
1256 trace_rcu_utilization("Start boost kthread@init");
1231 for (;;) { 1257 for (;;) {
1232 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1258 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1259 trace_rcu_utilization("End boost kthread@rcu_wait");
1233 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1260 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1261 trace_rcu_utilization("Start boost kthread@rcu_wait");
1234 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1262 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1235 more2boost = rcu_boost(rnp); 1263 more2boost = rcu_boost(rnp);
1236 if (more2boost) 1264 if (more2boost)
@@ -1238,11 +1266,14 @@ static int rcu_boost_kthread(void *arg)
1238 else 1266 else
1239 spincnt = 0; 1267 spincnt = 0;
1240 if (spincnt > 10) { 1268 if (spincnt > 10) {
1269 trace_rcu_utilization("End boost kthread@rcu_yield");
1241 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); 1270 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1271 trace_rcu_utilization("Start boost kthread@rcu_yield");
1242 spincnt = 0; 1272 spincnt = 0;
1243 } 1273 }
1244 } 1274 }
1245 /* NOTREACHED */ 1275 /* NOTREACHED */
1276 trace_rcu_utilization("End boost kthread@notreached");
1246 return 0; 1277 return 0;
1247} 1278}
1248 1279
@@ -1291,11 +1322,9 @@ static void invoke_rcu_callbacks_kthread(void)
1291 1322
1292 local_irq_save(flags); 1323 local_irq_save(flags);
1293 __this_cpu_write(rcu_cpu_has_work, 1); 1324 __this_cpu_write(rcu_cpu_has_work, 1);
1294 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { 1325 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1295 local_irq_restore(flags); 1326 current != __this_cpu_read(rcu_cpu_kthread_task))
1296 return; 1327 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1297 }
1298 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1299 local_irq_restore(flags); 1328 local_irq_restore(flags);
1300} 1329}
1301 1330
@@ -1343,13 +1372,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1343 if (rnp->boost_kthread_task != NULL) 1372 if (rnp->boost_kthread_task != NULL)
1344 return 0; 1373 return 0;
1345 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1374 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1346 "rcub%d", rnp_index); 1375 "rcub/%d", rnp_index);
1347 if (IS_ERR(t)) 1376 if (IS_ERR(t))
1348 return PTR_ERR(t); 1377 return PTR_ERR(t);
1349 raw_spin_lock_irqsave(&rnp->lock, flags); 1378 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rnp->boost_kthread_task = t; 1379 rnp->boost_kthread_task = t;
1351 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1380 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1352 sp.sched_priority = RCU_KTHREAD_PRIO; 1381 sp.sched_priority = RCU_BOOST_PRIO;
1353 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1382 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1354 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1383 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1355 return 0; 1384 return 0;
@@ -1444,6 +1473,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1444{ 1473{
1445 struct sched_param sp; 1474 struct sched_param sp;
1446 struct timer_list yield_timer; 1475 struct timer_list yield_timer;
1476 int prio = current->rt_priority;
1447 1477
1448 setup_timer_on_stack(&yield_timer, f, arg); 1478 setup_timer_on_stack(&yield_timer, f, arg);
1449 mod_timer(&yield_timer, jiffies + 2); 1479 mod_timer(&yield_timer, jiffies + 2);
@@ -1451,7 +1481,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1451 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); 1481 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452 set_user_nice(current, 19); 1482 set_user_nice(current, 19);
1453 schedule(); 1483 schedule();
1454 sp.sched_priority = RCU_KTHREAD_PRIO; 1484 set_user_nice(current, 0);
1485 sp.sched_priority = prio;
1455 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1486 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456 del_timer(&yield_timer); 1487 del_timer(&yield_timer);
1457} 1488}
@@ -1489,7 +1520,8 @@ static int rcu_cpu_kthread_should_stop(int cpu)
1489 1520
1490/* 1521/*
1491 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the 1522 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1492 * earlier RCU softirq. 1523 * RCU softirq used in flavors and configurations of RCU that do not
1524 * support RCU priority boosting.
1493 */ 1525 */
1494static int rcu_cpu_kthread(void *arg) 1526static int rcu_cpu_kthread(void *arg)
1495{ 1527{
@@ -1500,9 +1532,12 @@ static int rcu_cpu_kthread(void *arg)
1500 char work; 1532 char work;
1501 char *workp = &per_cpu(rcu_cpu_has_work, cpu); 1533 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1502 1534
1535 trace_rcu_utilization("Start CPU kthread@init");
1503 for (;;) { 1536 for (;;) {
1504 *statusp = RCU_KTHREAD_WAITING; 1537 *statusp = RCU_KTHREAD_WAITING;
1538 trace_rcu_utilization("End CPU kthread@rcu_wait");
1505 rcu_wait(*workp != 0 || kthread_should_stop()); 1539 rcu_wait(*workp != 0 || kthread_should_stop());
1540 trace_rcu_utilization("Start CPU kthread@rcu_wait");
1506 local_bh_disable(); 1541 local_bh_disable();
1507 if (rcu_cpu_kthread_should_stop(cpu)) { 1542 if (rcu_cpu_kthread_should_stop(cpu)) {
1508 local_bh_enable(); 1543 local_bh_enable();
@@ -1523,11 +1558,14 @@ static int rcu_cpu_kthread(void *arg)
1523 spincnt = 0; 1558 spincnt = 0;
1524 if (spincnt > 10) { 1559 if (spincnt > 10) {
1525 *statusp = RCU_KTHREAD_YIELDING; 1560 *statusp = RCU_KTHREAD_YIELDING;
1561 trace_rcu_utilization("End CPU kthread@rcu_yield");
1526 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); 1562 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1563 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1527 spincnt = 0; 1564 spincnt = 0;
1528 } 1565 }
1529 } 1566 }
1530 *statusp = RCU_KTHREAD_STOPPED; 1567 *statusp = RCU_KTHREAD_STOPPED;
1568 trace_rcu_utilization("End CPU kthread@term");
1531 return 0; 1569 return 0;
1532} 1570}
1533 1571
@@ -1560,7 +1598,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1560 if (!rcu_scheduler_fully_active || 1598 if (!rcu_scheduler_fully_active ||
1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL) 1599 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562 return 0; 1600 return 0;
1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); 1601 t = kthread_create_on_node(rcu_cpu_kthread,
1602 (void *)(long)cpu,
1603 cpu_to_node(cpu),
1604 "rcuc/%d", cpu);
1564 if (IS_ERR(t)) 1605 if (IS_ERR(t))
1565 return PTR_ERR(t); 1606 return PTR_ERR(t);
1566 if (cpu_online(cpu)) 1607 if (cpu_online(cpu))
@@ -1669,7 +1710,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1669 return 0; 1710 return 0;
1670 if (rnp->node_kthread_task == NULL) { 1711 if (rnp->node_kthread_task == NULL) {
1671 t = kthread_create(rcu_node_kthread, (void *)rnp, 1712 t = kthread_create(rcu_node_kthread, (void *)rnp,
1672 "rcun%d", rnp_index); 1713 "rcun/%d", rnp_index);
1673 if (IS_ERR(t)) 1714 if (IS_ERR(t))
1674 return PTR_ERR(t); 1715 return PTR_ERR(t);
1675 raw_spin_lock_irqsave(&rnp->lock, flags); 1716 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1907,15 +1948,6 @@ int rcu_needs_cpu(int cpu)
1907 return rcu_needs_cpu_quick_check(cpu); 1948 return rcu_needs_cpu_quick_check(cpu);
1908} 1949}
1909 1950
1910/*
1911 * Check to see if we need to continue a callback-flush operations to
1912 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
1913 * entry is not configured, so we never do need to.
1914 */
1915static void rcu_needs_cpu_flush(void)
1916{
1917}
1918
1919#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1951#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1920 1952
1921#define RCU_NEEDS_CPU_FLUSHES 5 1953#define RCU_NEEDS_CPU_FLUSHES 5
@@ -1991,20 +2023,4 @@ int rcu_needs_cpu(int cpu)
1991 return c; 2023 return c;
1992} 2024}
1993 2025
1994/*
1995 * Check to see if we need to continue a callback-flush operations to
1996 * allow the last CPU to enter dyntick-idle mode.
1997 */
1998static void rcu_needs_cpu_flush(void)
1999{
2000 int cpu = smp_processor_id();
2001 unsigned long flags;
2002
2003 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
2004 return;
2005 local_irq_save(flags);
2006 (void)rcu_needs_cpu(cpu);
2007 local_irq_restore(flags);
2008}
2009
2010#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2026#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 3b0c0986afc0..9feffa4c0695 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -48,11 +48,6 @@
48 48
49#ifdef CONFIG_RCU_BOOST 49#ifdef CONFIG_RCU_BOOST
50 50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
56static char convert_kthread_status(unsigned int kthread_status) 51static char convert_kthread_status(unsigned int kthread_status)
57{ 52{
58 if (kthread_status > RCU_KTHREAD_MAX) 53 if (kthread_status > RCU_KTHREAD_MAX)
@@ -66,11 +61,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
66{ 61{
67 if (!rdp->beenonline) 62 if (!rdp->beenonline)
68 return; 63 return;
69 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", 64 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d",
70 rdp->cpu, 65 rdp->cpu,
71 cpu_is_offline(rdp->cpu) ? '!' : ' ', 66 cpu_is_offline(rdp->cpu) ? '!' : ' ',
72 rdp->completed, rdp->gpnum, 67 rdp->completed, rdp->gpnum,
73 rdp->passed_quiesc, rdp->passed_quiesc_completed, 68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
74 rdp->qs_pending); 69 rdp->qs_pending);
75#ifdef CONFIG_NO_HZ 70#ifdef CONFIG_NO_HZ
76 seq_printf(m, " dt=%d/%d/%d df=%lu", 71 seq_printf(m, " dt=%d/%d/%d df=%lu",
@@ -144,7 +139,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
144 rdp->cpu, 139 rdp->cpu,
145 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 140 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
146 rdp->completed, rdp->gpnum, 141 rdp->completed, rdp->gpnum,
147 rdp->passed_quiesc, rdp->passed_quiesc_completed, 142 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
148 rdp->qs_pending); 143 rdp->qs_pending);
149#ifdef CONFIG_NO_HZ 144#ifdef CONFIG_NO_HZ
150 seq_printf(m, ",%d,%d,%d,%lu", 145 seq_printf(m, ",%d,%d,%d,%lu",
@@ -175,7 +170,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
175 170
176static int show_rcudata_csv(struct seq_file *m, void *unused) 171static int show_rcudata_csv(struct seq_file *m, void *unused)
177{ 172{
178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 173 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
179#ifdef CONFIG_NO_HZ 174#ifdef CONFIG_NO_HZ
180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 175 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
181#endif /* #ifdef CONFIG_NO_HZ */ 176#endif /* #ifdef CONFIG_NO_HZ */
diff --git a/kernel/resource.c b/kernel/resource.c
index 3b3cedc52592..c8dc249da5ce 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -419,6 +419,9 @@ static int __find_resource(struct resource *root, struct resource *old,
419 else 419 else
420 tmp.end = root->end; 420 tmp.end = root->end;
421 421
422 if (tmp.end < tmp.start)
423 goto next;
424
422 resource_clip(&tmp, constraint->min, constraint->max); 425 resource_clip(&tmp, constraint->min, constraint->max);
423 arch_remove_reservations(&tmp); 426 arch_remove_reservations(&tmp);
424 427
@@ -436,8 +439,10 @@ static int __find_resource(struct resource *root, struct resource *old,
436 return 0; 439 return 0;
437 } 440 }
438 } 441 }
439 if (!this) 442
443next: if (!this || this->end == root->end)
440 break; 444 break;
445
441 if (this != old) 446 if (this != old)
442 tmp.start = this->end + 1; 447 tmp.start = this->end + 1;
443 this = this->sibling; 448 this = this->sibling;
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 3c7cbc2c33be..a2e7e7210f3e 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -29,61 +29,6 @@
29 29
30#include "rtmutex_common.h" 30#include "rtmutex_common.h"
31 31
32# define TRACE_WARN_ON(x) WARN_ON(x)
33# define TRACE_BUG_ON(x) BUG_ON(x)
34
35# define TRACE_OFF() \
36do { \
37 if (rt_trace_on) { \
38 rt_trace_on = 0; \
39 console_verbose(); \
40 if (raw_spin_is_locked(&current->pi_lock)) \
41 raw_spin_unlock(&current->pi_lock); \
42 } \
43} while (0)
44
45# define TRACE_OFF_NOLOCK() \
46do { \
47 if (rt_trace_on) { \
48 rt_trace_on = 0; \
49 console_verbose(); \
50 } \
51} while (0)
52
53# define TRACE_BUG_LOCKED() \
54do { \
55 TRACE_OFF(); \
56 BUG(); \
57} while (0)
58
59# define TRACE_WARN_ON_LOCKED(c) \
60do { \
61 if (unlikely(c)) { \
62 TRACE_OFF(); \
63 WARN_ON(1); \
64 } \
65} while (0)
66
67# define TRACE_BUG_ON_LOCKED(c) \
68do { \
69 if (unlikely(c)) \
70 TRACE_BUG_LOCKED(); \
71} while (0)
72
73#ifdef CONFIG_SMP
74# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
75#else
76# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
77#endif
78
79/*
80 * deadlock detection flag. We turn it off when we detect
81 * the first problem because we dont want to recurse back
82 * into the tracing code when doing error printk or
83 * executing a BUG():
84 */
85static int rt_trace_on = 1;
86
87static void printk_task(struct task_struct *p) 32static void printk_task(struct task_struct *p)
88{ 33{
89 if (p) 34 if (p)
@@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
111 56
112void rt_mutex_debug_task_free(struct task_struct *task) 57void rt_mutex_debug_task_free(struct task_struct *task)
113{ 58{
114 WARN_ON(!plist_head_empty(&task->pi_waiters)); 59 DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
115 WARN_ON(task->pi_blocked_on); 60 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
116} 61}
117 62
118/* 63/*
@@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
125{ 70{
126 struct task_struct *task; 71 struct task_struct *task;
127 72
128 if (!rt_trace_on || detect || !act_waiter) 73 if (!debug_locks || detect || !act_waiter)
129 return; 74 return;
130 75
131 task = rt_mutex_owner(act_waiter->lock); 76 task = rt_mutex_owner(act_waiter->lock);
@@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
139{ 84{
140 struct task_struct *task; 85 struct task_struct *task;
141 86
142 if (!waiter->deadlock_lock || !rt_trace_on) 87 if (!waiter->deadlock_lock || !debug_locks)
143 return; 88 return;
144 89
145 rcu_read_lock(); 90 rcu_read_lock();
@@ -149,7 +94,10 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
149 return; 94 return;
150 } 95 }
151 96
152 TRACE_OFF_NOLOCK(); 97 if (!debug_locks_off()) {
98 rcu_read_unlock();
99 return;
100 }
153 101
154 printk("\n============================================\n"); 102 printk("\n============================================\n");
155 printk( "[ BUG: circular locking deadlock detected! ]\n"); 103 printk( "[ BUG: circular locking deadlock detected! ]\n");
@@ -180,7 +128,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
180 128
181 printk("[ turning off deadlock detection." 129 printk("[ turning off deadlock detection."
182 "Please report this trace. ]\n\n"); 130 "Please report this trace. ]\n\n");
183 local_irq_disable();
184} 131}
185 132
186void debug_rt_mutex_lock(struct rt_mutex *lock) 133void debug_rt_mutex_lock(struct rt_mutex *lock)
@@ -189,7 +136,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock)
189 136
190void debug_rt_mutex_unlock(struct rt_mutex *lock) 137void debug_rt_mutex_unlock(struct rt_mutex *lock)
191{ 138{
192 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); 139 DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
193} 140}
194 141
195void 142void
@@ -199,7 +146,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
199 146
200void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) 147void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
201{ 148{
202 TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); 149 DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
203} 150}
204 151
205void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 152void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -213,8 +160,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
213void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 160void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
214{ 161{
215 put_pid(waiter->deadlock_task_pid); 162 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 163 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 164 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 memset(waiter, 0x22, sizeof(*waiter)); 165 memset(waiter, 0x22, sizeof(*waiter));
219} 166}
220 167
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 255e1662acdb..5e8d9cce7470 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -579,6 +579,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
579 struct rt_mutex_waiter *waiter) 579 struct rt_mutex_waiter *waiter)
580{ 580{
581 int ret = 0; 581 int ret = 0;
582 int was_disabled;
582 583
583 for (;;) { 584 for (;;) {
584 /* Try to acquire the lock: */ 585 /* Try to acquire the lock: */
@@ -601,10 +602,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
601 602
602 raw_spin_unlock(&lock->wait_lock); 603 raw_spin_unlock(&lock->wait_lock);
603 604
605 was_disabled = irqs_disabled();
606 if (was_disabled)
607 local_irq_enable();
608
604 debug_rt_mutex_print_deadlock(waiter); 609 debug_rt_mutex_print_deadlock(waiter);
605 610
606 schedule_rt_mutex(lock); 611 schedule_rt_mutex(lock);
607 612
613 if (was_disabled)
614 local_irq_disable();
615
608 raw_spin_lock(&lock->wait_lock); 616 raw_spin_lock(&lock->wait_lock);
609 set_current_state(state); 617 set_current_state(state);
610 } 618 }
diff --git a/kernel/sched.c b/kernel/sched.c
index ec5f472bc5b9..d87c6e5d4e8c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
196 return sysctl_sched_rt_runtime >= 0; 196 return sysctl_sched_rt_runtime >= 0;
197} 197}
198 198
199static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 199static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
200{ 200{
201 ktime_t now; 201 unsigned long delta;
202 ktime_t soft, hard, now;
203
204 for (;;) {
205 if (hrtimer_active(period_timer))
206 break;
207
208 now = hrtimer_cb_get_time(period_timer);
209 hrtimer_forward(period_timer, now, period);
202 210
211 soft = hrtimer_get_softexpires(period_timer);
212 hard = hrtimer_get_expires(period_timer);
213 delta = ktime_to_ns(ktime_sub(hard, soft));
214 __hrtimer_start_range_ns(period_timer, soft, delta,
215 HRTIMER_MODE_ABS_PINNED, 0);
216 }
217}
218
219static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
220{
203 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 221 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
204 return; 222 return;
205 223
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
207 return; 225 return;
208 226
209 raw_spin_lock(&rt_b->rt_runtime_lock); 227 raw_spin_lock(&rt_b->rt_runtime_lock);
210 for (;;) { 228 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
211 unsigned long delta;
212 ktime_t soft, hard;
213
214 if (hrtimer_active(&rt_b->rt_period_timer))
215 break;
216
217 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
218 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
219
220 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
221 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
222 delta = ktime_to_ns(ktime_sub(hard, soft));
223 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
224 HRTIMER_MODE_ABS_PINNED, 0);
225 }
226 raw_spin_unlock(&rt_b->rt_runtime_lock); 229 raw_spin_unlock(&rt_b->rt_runtime_lock);
227} 230}
228 231
@@ -247,6 +250,24 @@ struct cfs_rq;
247 250
248static LIST_HEAD(task_groups); 251static LIST_HEAD(task_groups);
249 252
253struct cfs_bandwidth {
254#ifdef CONFIG_CFS_BANDWIDTH
255 raw_spinlock_t lock;
256 ktime_t period;
257 u64 quota, runtime;
258 s64 hierarchal_quota;
259 u64 runtime_expires;
260
261 int idle, timer_active;
262 struct hrtimer period_timer, slack_timer;
263 struct list_head throttled_cfs_rq;
264
265 /* statistics */
266 int nr_periods, nr_throttled;
267 u64 throttled_time;
268#endif
269};
270
250/* task group related information */ 271/* task group related information */
251struct task_group { 272struct task_group {
252 struct cgroup_subsys_state css; 273 struct cgroup_subsys_state css;
@@ -278,6 +299,8 @@ struct task_group {
278#ifdef CONFIG_SCHED_AUTOGROUP 299#ifdef CONFIG_SCHED_AUTOGROUP
279 struct autogroup *autogroup; 300 struct autogroup *autogroup;
280#endif 301#endif
302
303 struct cfs_bandwidth cfs_bandwidth;
281}; 304};
282 305
283/* task_group_lock serializes the addition/removal of task groups */ 306/* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +334,7 @@ struct task_group root_task_group;
311/* CFS-related fields in a runqueue */ 334/* CFS-related fields in a runqueue */
312struct cfs_rq { 335struct cfs_rq {
313 struct load_weight load; 336 struct load_weight load;
314 unsigned long nr_running; 337 unsigned long nr_running, h_nr_running;
315 338
316 u64 exec_clock; 339 u64 exec_clock;
317 u64 min_vruntime; 340 u64 min_vruntime;
@@ -377,9 +400,120 @@ struct cfs_rq {
377 400
378 unsigned long load_contribution; 401 unsigned long load_contribution;
379#endif 402#endif
403#ifdef CONFIG_CFS_BANDWIDTH
404 int runtime_enabled;
405 u64 runtime_expires;
406 s64 runtime_remaining;
407
408 u64 throttled_timestamp;
409 int throttled, throttle_count;
410 struct list_head throttled_list;
411#endif
380#endif 412#endif
381}; 413};
382 414
415#ifdef CONFIG_FAIR_GROUP_SCHED
416#ifdef CONFIG_CFS_BANDWIDTH
417static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
418{
419 return &tg->cfs_bandwidth;
420}
421
422static inline u64 default_cfs_period(void);
423static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
424static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
425
426static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
427{
428 struct cfs_bandwidth *cfs_b =
429 container_of(timer, struct cfs_bandwidth, slack_timer);
430 do_sched_cfs_slack_timer(cfs_b);
431
432 return HRTIMER_NORESTART;
433}
434
435static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
436{
437 struct cfs_bandwidth *cfs_b =
438 container_of(timer, struct cfs_bandwidth, period_timer);
439 ktime_t now;
440 int overrun;
441 int idle = 0;
442
443 for (;;) {
444 now = hrtimer_cb_get_time(timer);
445 overrun = hrtimer_forward(timer, now, cfs_b->period);
446
447 if (!overrun)
448 break;
449
450 idle = do_sched_cfs_period_timer(cfs_b, overrun);
451 }
452
453 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
454}
455
456static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
457{
458 raw_spin_lock_init(&cfs_b->lock);
459 cfs_b->runtime = 0;
460 cfs_b->quota = RUNTIME_INF;
461 cfs_b->period = ns_to_ktime(default_cfs_period());
462
463 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
464 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
465 cfs_b->period_timer.function = sched_cfs_period_timer;
466 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
467 cfs_b->slack_timer.function = sched_cfs_slack_timer;
468}
469
470static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
471{
472 cfs_rq->runtime_enabled = 0;
473 INIT_LIST_HEAD(&cfs_rq->throttled_list);
474}
475
476/* requires cfs_b->lock, may release to reprogram timer */
477static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
478{
479 /*
480 * The timer may be active because we're trying to set a new bandwidth
481 * period or because we're racing with the tear-down path
482 * (timer_active==0 becomes visible before the hrtimer call-back
483 * terminates). In either case we ensure that it's re-programmed
484 */
485 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
486 raw_spin_unlock(&cfs_b->lock);
487 /* ensure cfs_b->lock is available while we wait */
488 hrtimer_cancel(&cfs_b->period_timer);
489
490 raw_spin_lock(&cfs_b->lock);
491 /* if someone else restarted the timer then we're done */
492 if (cfs_b->timer_active)
493 return;
494 }
495
496 cfs_b->timer_active = 1;
497 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
498}
499
500static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
501{
502 hrtimer_cancel(&cfs_b->period_timer);
503 hrtimer_cancel(&cfs_b->slack_timer);
504}
505#else
506static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
507static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
508static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509
510static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
511{
512 return NULL;
513}
514#endif /* CONFIG_CFS_BANDWIDTH */
515#endif /* CONFIG_FAIR_GROUP_SCHED */
516
383/* Real-Time classes' related field in a runqueue: */ 517/* Real-Time classes' related field in a runqueue: */
384struct rt_rq { 518struct rt_rq {
385 struct rt_prio_array active; 519 struct rt_prio_array active;
@@ -510,7 +644,7 @@ struct rq {
510 644
511 unsigned long cpu_power; 645 unsigned long cpu_power;
512 646
513 unsigned char idle_at_tick; 647 unsigned char idle_balance;
514 /* For active balancing */ 648 /* For active balancing */
515 int post_schedule; 649 int post_schedule;
516 int active_balance; 650 int active_balance;
@@ -520,8 +654,6 @@ struct rq {
520 int cpu; 654 int cpu;
521 int online; 655 int online;
522 656
523 unsigned long avg_load_per_task;
524
525 u64 rt_avg; 657 u64 rt_avg;
526 u64 age_stamp; 658 u64 age_stamp;
527 u64 idle_stamp; 659 u64 idle_stamp;
@@ -570,7 +702,7 @@ struct rq {
570#endif 702#endif
571 703
572#ifdef CONFIG_SMP 704#ifdef CONFIG_SMP
573 struct task_struct *wake_list; 705 struct llist_head wake_list;
574#endif 706#endif
575}; 707};
576 708
@@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu)
1272 smp_send_reschedule(cpu); 1404 smp_send_reschedule(cpu);
1273} 1405}
1274 1406
1407static inline bool got_nohz_idle_kick(void)
1408{
1409 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
1410}
1411
1412#else /* CONFIG_NO_HZ */
1413
1414static inline bool got_nohz_idle_kick(void)
1415{
1416 return false;
1417}
1418
1275#endif /* CONFIG_NO_HZ */ 1419#endif /* CONFIG_NO_HZ */
1276 1420
1277static u64 sched_avg_period(void) 1421static u64 sched_avg_period(void)
@@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1471 update_load_sub(&rq->load, load); 1615 update_load_sub(&rq->load, load);
1472} 1616}
1473 1617
1474#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1618#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1619 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1475typedef int (*tg_visitor)(struct task_group *, void *); 1620typedef int (*tg_visitor)(struct task_group *, void *);
1476 1621
1477/* 1622/*
1478 * Iterate the full tree, calling @down when first entering a node and @up when 1623 * Iterate task_group tree rooted at *from, calling @down when first entering a
1479 * leaving it for the final time. 1624 * node and @up when leaving it for the final time.
1625 *
1626 * Caller must hold rcu_lock or sufficient equivalent.
1480 */ 1627 */
1481static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1628static int walk_tg_tree_from(struct task_group *from,
1629 tg_visitor down, tg_visitor up, void *data)
1482{ 1630{
1483 struct task_group *parent, *child; 1631 struct task_group *parent, *child;
1484 int ret; 1632 int ret;
1485 1633
1486 rcu_read_lock(); 1634 parent = from;
1487 parent = &root_task_group; 1635
1488down: 1636down:
1489 ret = (*down)(parent, data); 1637 ret = (*down)(parent, data);
1490 if (ret) 1638 if (ret)
1491 goto out_unlock; 1639 goto out;
1492 list_for_each_entry_rcu(child, &parent->children, siblings) { 1640 list_for_each_entry_rcu(child, &parent->children, siblings) {
1493 parent = child; 1641 parent = child;
1494 goto down; 1642 goto down;
@@ -1497,19 +1645,29 @@ up:
1497 continue; 1645 continue;
1498 } 1646 }
1499 ret = (*up)(parent, data); 1647 ret = (*up)(parent, data);
1500 if (ret) 1648 if (ret || parent == from)
1501 goto out_unlock; 1649 goto out;
1502 1650
1503 child = parent; 1651 child = parent;
1504 parent = parent->parent; 1652 parent = parent->parent;
1505 if (parent) 1653 if (parent)
1506 goto up; 1654 goto up;
1507out_unlock: 1655out:
1508 rcu_read_unlock();
1509
1510 return ret; 1656 return ret;
1511} 1657}
1512 1658
1659/*
1660 * Iterate the full tree, calling @down when first entering a node and @up when
1661 * leaving it for the final time.
1662 *
1663 * Caller must hold rcu_lock or sufficient equivalent.
1664 */
1665
1666static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1667{
1668 return walk_tg_tree_from(&root_task_group, down, up, data);
1669}
1670
1513static int tg_nop(struct task_group *tg, void *data) 1671static int tg_nop(struct task_group *tg, void *data)
1514{ 1672{
1515 return 0; 1673 return 0;
@@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1569 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1727 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1570 1728
1571 if (nr_running) 1729 if (nr_running)
1572 rq->avg_load_per_task = rq->load.weight / nr_running; 1730 return rq->load.weight / nr_running;
1573 else
1574 rq->avg_load_per_task = 0;
1575 1731
1576 return rq->avg_load_per_task; 1732 return 0;
1577} 1733}
1578 1734
1579#ifdef CONFIG_PREEMPT 1735#ifdef CONFIG_PREEMPT
@@ -1739,7 +1895,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1739#ifdef CONFIG_SMP 1895#ifdef CONFIG_SMP
1740 /* 1896 /*
1741 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1897 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1742 * successfuly executed on another CPU. We must ensure that updates of 1898 * successfully executed on another CPU. We must ensure that updates of
1743 * per-task data have been completed by this moment. 1899 * per-task data have been completed by this moment.
1744 */ 1900 */
1745 smp_wmb(); 1901 smp_wmb();
@@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1806 rq->nr_uninterruptible--; 1962 rq->nr_uninterruptible--;
1807 1963
1808 enqueue_task(rq, p, flags); 1964 enqueue_task(rq, p, flags);
1809 inc_nr_running(rq);
1810} 1965}
1811 1966
1812/* 1967/*
@@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1818 rq->nr_uninterruptible++; 1973 rq->nr_uninterruptible++;
1819 1974
1820 dequeue_task(rq, p, flags); 1975 dequeue_task(rq, p, flags);
1821 dec_nr_running(rq);
1822} 1976}
1823 1977
1824#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1978#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2390 2544
2391 /* Look for allowed, online CPU in same node. */ 2545 /* Look for allowed, online CPU in same node. */
2392 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2546 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2393 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2547 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
2394 return dest_cpu; 2548 return dest_cpu;
2395 2549
2396 /* Any allowed, online CPU? */ 2550 /* Any allowed, online CPU? */
2397 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2551 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
2398 if (dest_cpu < nr_cpu_ids) 2552 if (dest_cpu < nr_cpu_ids)
2399 return dest_cpu; 2553 return dest_cpu;
2400 2554
@@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2431 * [ this allows ->select_task() to simply return task_cpu(p) and 2585 * [ this allows ->select_task() to simply return task_cpu(p) and
2432 * not worry about this generic constraint ] 2586 * not worry about this generic constraint ]
2433 */ 2587 */
2434 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2588 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
2435 !cpu_online(cpu))) 2589 !cpu_online(cpu)))
2436 cpu = select_fallback_rq(task_cpu(p), p); 2590 cpu = select_fallback_rq(task_cpu(p), p);
2437 2591
@@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
2556} 2710}
2557 2711
2558#ifdef CONFIG_SMP 2712#ifdef CONFIG_SMP
2559static void sched_ttwu_do_pending(struct task_struct *list) 2713static void sched_ttwu_pending(void)
2560{ 2714{
2561 struct rq *rq = this_rq(); 2715 struct rq *rq = this_rq();
2716 struct llist_node *llist = llist_del_all(&rq->wake_list);
2717 struct task_struct *p;
2562 2718
2563 raw_spin_lock(&rq->lock); 2719 raw_spin_lock(&rq->lock);
2564 2720
2565 while (list) { 2721 while (llist) {
2566 struct task_struct *p = list; 2722 p = llist_entry(llist, struct task_struct, wake_entry);
2567 list = list->wake_entry; 2723 llist = llist_next(llist);
2568 ttwu_do_activate(rq, p, 0); 2724 ttwu_do_activate(rq, p, 0);
2569 } 2725 }
2570 2726
2571 raw_spin_unlock(&rq->lock); 2727 raw_spin_unlock(&rq->lock);
2572} 2728}
2573 2729
2574#ifdef CONFIG_HOTPLUG_CPU
2575
2576static void sched_ttwu_pending(void)
2577{
2578 struct rq *rq = this_rq();
2579 struct task_struct *list = xchg(&rq->wake_list, NULL);
2580
2581 if (!list)
2582 return;
2583
2584 sched_ttwu_do_pending(list);
2585}
2586
2587#endif /* CONFIG_HOTPLUG_CPU */
2588
2589void scheduler_ipi(void) 2730void scheduler_ipi(void)
2590{ 2731{
2591 struct rq *rq = this_rq(); 2732 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2592 struct task_struct *list = xchg(&rq->wake_list, NULL);
2593
2594 if (!list)
2595 return; 2733 return;
2596 2734
2597 /* 2735 /*
@@ -2608,25 +2746,21 @@ void scheduler_ipi(void)
2608 * somewhat pessimize the simple resched case. 2746 * somewhat pessimize the simple resched case.
2609 */ 2747 */
2610 irq_enter(); 2748 irq_enter();
2611 sched_ttwu_do_pending(list); 2749 sched_ttwu_pending();
2750
2751 /*
2752 * Check if someone kicked us for doing the nohz idle load balance.
2753 */
2754 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
2755 this_rq()->idle_balance = 1;
2756 raise_softirq_irqoff(SCHED_SOFTIRQ);
2757 }
2612 irq_exit(); 2758 irq_exit();
2613} 2759}
2614 2760
2615static void ttwu_queue_remote(struct task_struct *p, int cpu) 2761static void ttwu_queue_remote(struct task_struct *p, int cpu)
2616{ 2762{
2617 struct rq *rq = cpu_rq(cpu); 2763 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
2618 struct task_struct *next = rq->wake_list;
2619
2620 for (;;) {
2621 struct task_struct *old = next;
2622
2623 p->wake_entry = next;
2624 next = cmpxchg(&rq->wake_list, old, p);
2625 if (next == old)
2626 break;
2627 }
2628
2629 if (!next)
2630 smp_send_reschedule(cpu); 2764 smp_send_reschedule(cpu);
2631} 2765}
2632 2766
@@ -2848,19 +2982,23 @@ void sched_fork(struct task_struct *p)
2848 p->state = TASK_RUNNING; 2982 p->state = TASK_RUNNING;
2849 2983
2850 /* 2984 /*
2985 * Make sure we do not leak PI boosting priority to the child.
2986 */
2987 p->prio = current->normal_prio;
2988
2989 /*
2851 * Revert to default priority/policy on fork if requested. 2990 * Revert to default priority/policy on fork if requested.
2852 */ 2991 */
2853 if (unlikely(p->sched_reset_on_fork)) { 2992 if (unlikely(p->sched_reset_on_fork)) {
2854 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2993 if (task_has_rt_policy(p)) {
2855 p->policy = SCHED_NORMAL; 2994 p->policy = SCHED_NORMAL;
2856 p->normal_prio = p->static_prio;
2857 }
2858
2859 if (PRIO_TO_NICE(p->static_prio) < 0) {
2860 p->static_prio = NICE_TO_PRIO(0); 2995 p->static_prio = NICE_TO_PRIO(0);
2861 p->normal_prio = p->static_prio; 2996 p->rt_priority = 0;
2862 set_load_weight(p); 2997 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2863 } 2998 p->static_prio = NICE_TO_PRIO(0);
2999
3000 p->prio = p->normal_prio = __normal_prio(p);
3001 set_load_weight(p);
2864 3002
2865 /* 3003 /*
2866 * We don't need the reset flag anymore after the fork. It has 3004 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +3007,6 @@ void sched_fork(struct task_struct *p)
2869 p->sched_reset_on_fork = 0; 3007 p->sched_reset_on_fork = 0;
2870 } 3008 }
2871 3009
2872 /*
2873 * Make sure we do not leak PI boosting priority to the child.
2874 */
2875 p->prio = current->normal_prio;
2876
2877 if (!rt_prio(p->prio)) 3010 if (!rt_prio(p->prio))
2878 p->sched_class = &fair_sched_class; 3011 p->sched_class = &fair_sched_class;
2879 3012
@@ -3725,30 +3858,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3725} 3858}
3726 3859
3727/* 3860/*
3728 * Return sum_exec_runtime for the thread group.
3729 * In case the task is currently running, return the sum plus current's
3730 * pending runtime that have not been accounted yet.
3731 *
3732 * Note that the thread group might have other running tasks as well,
3733 * so the return value not includes other pending runtime that other
3734 * running tasks might have.
3735 */
3736unsigned long long thread_group_sched_runtime(struct task_struct *p)
3737{
3738 struct task_cputime totals;
3739 unsigned long flags;
3740 struct rq *rq;
3741 u64 ns;
3742
3743 rq = task_rq_lock(p, &flags);
3744 thread_group_cputime(p, &totals);
3745 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3746 task_rq_unlock(rq, p, &flags);
3747
3748 return ns;
3749}
3750
3751/*
3752 * Account user cpu time to a process. 3861 * Account user cpu time to a process.
3753 * @p: the process that the cpu time gets accounted to 3862 * @p: the process that the cpu time gets accounted to
3754 * @cputime: the cpu time spent in user space since the last update 3863 * @cputime: the cpu time spent in user space since the last update
@@ -4140,7 +4249,7 @@ void scheduler_tick(void)
4140 perf_event_task_tick(); 4249 perf_event_task_tick();
4141 4250
4142#ifdef CONFIG_SMP 4251#ifdef CONFIG_SMP
4143 rq->idle_at_tick = idle_cpu(cpu); 4252 rq->idle_balance = idle_cpu(cpu);
4144 trigger_load_balance(rq, cpu); 4253 trigger_load_balance(rq, cpu);
4145#endif 4254#endif
4146} 4255}
@@ -4237,6 +4346,7 @@ static inline void schedule_debug(struct task_struct *prev)
4237 */ 4346 */
4238 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 4347 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4239 __schedule_bug(prev); 4348 __schedule_bug(prev);
4349 rcu_sleep_check();
4240 4350
4241 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4351 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4242 4352
@@ -4263,7 +4373,7 @@ pick_next_task(struct rq *rq)
4263 * Optimization: we know that if all tasks are in 4373 * Optimization: we know that if all tasks are in
4264 * the fair class we can call that function directly: 4374 * the fair class we can call that function directly:
4265 */ 4375 */
4266 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4376 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
4267 p = fair_sched_class.pick_next_task(rq); 4377 p = fair_sched_class.pick_next_task(rq);
4268 if (likely(p)) 4378 if (likely(p))
4269 return p; 4379 return p;
@@ -4372,7 +4482,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
4372 blk_schedule_flush_plug(tsk); 4482 blk_schedule_flush_plug(tsk);
4373} 4483}
4374 4484
4375asmlinkage void schedule(void) 4485asmlinkage void __sched schedule(void)
4376{ 4486{
4377 struct task_struct *tsk = current; 4487 struct task_struct *tsk = current;
4378 4488
@@ -5049,7 +5159,20 @@ EXPORT_SYMBOL(task_nice);
5049 */ 5159 */
5050int idle_cpu(int cpu) 5160int idle_cpu(int cpu)
5051{ 5161{
5052 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 5162 struct rq *rq = cpu_rq(cpu);
5163
5164 if (rq->curr != rq->idle)
5165 return 0;
5166
5167 if (rq->nr_running)
5168 return 0;
5169
5170#ifdef CONFIG_SMP
5171 if (!llist_empty(&rq->wake_list))
5172 return 0;
5173#endif
5174
5175 return 1;
5053} 5176}
5054 5177
5055/** 5178/**
@@ -5899,7 +6022,7 @@ void show_state_filter(unsigned long state_filter)
5899 printk(KERN_INFO 6022 printk(KERN_INFO
5900 " task PC stack pid father\n"); 6023 " task PC stack pid father\n");
5901#endif 6024#endif
5902 read_lock(&tasklist_lock); 6025 rcu_read_lock();
5903 do_each_thread(g, p) { 6026 do_each_thread(g, p) {
5904 /* 6027 /*
5905 * reset the NMI-timeout, listing all files on a slow 6028 * reset the NMI-timeout, listing all files on a slow
@@ -5915,7 +6038,7 @@ void show_state_filter(unsigned long state_filter)
5915#ifdef CONFIG_SCHED_DEBUG 6038#ifdef CONFIG_SCHED_DEBUG
5916 sysrq_sched_debug_show(); 6039 sysrq_sched_debug_show();
5917#endif 6040#endif
5918 read_unlock(&tasklist_lock); 6041 rcu_read_unlock();
5919 /* 6042 /*
5920 * Only show locks if all tasks are dumped: 6043 * Only show locks if all tasks are dumped:
5921 */ 6044 */
@@ -5979,15 +6102,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5979} 6102}
5980 6103
5981/* 6104/*
5982 * In a system that switches off the HZ timer nohz_cpu_mask
5983 * indicates which cpus entered this state. This is used
5984 * in the rcu update to wait only for active cpus. For system
5985 * which do not switch off the HZ timer nohz_cpu_mask should
5986 * always be CPU_BITS_NONE.
5987 */
5988cpumask_var_t nohz_cpu_mask;
5989
5990/*
5991 * Increase the granularity value when there are more CPUs, 6105 * Increase the granularity value when there are more CPUs,
5992 * because with more CPUs the 'effective latency' as visible 6106 * because with more CPUs the 'effective latency' as visible
5993 * to users decreases. But the relationship is not linear, 6107 * to users decreases. But the relationship is not linear,
@@ -6039,10 +6153,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6039{ 6153{
6040 if (p->sched_class && p->sched_class->set_cpus_allowed) 6154 if (p->sched_class && p->sched_class->set_cpus_allowed)
6041 p->sched_class->set_cpus_allowed(p, new_mask); 6155 p->sched_class->set_cpus_allowed(p, new_mask);
6042 else { 6156
6043 cpumask_copy(&p->cpus_allowed, new_mask); 6157 cpumask_copy(&p->cpus_allowed, new_mask);
6044 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 6158 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6045 }
6046} 6159}
6047 6160
6048/* 6161/*
@@ -6140,7 +6253,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6140 if (task_cpu(p) != src_cpu) 6253 if (task_cpu(p) != src_cpu)
6141 goto done; 6254 goto done;
6142 /* Affinity changed (again). */ 6255 /* Affinity changed (again). */
6143 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 6256 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
6144 goto fail; 6257 goto fail;
6145 6258
6146 /* 6259 /*
@@ -6221,6 +6334,30 @@ static void calc_global_load_remove(struct rq *rq)
6221 rq->calc_load_active = 0; 6334 rq->calc_load_active = 0;
6222} 6335}
6223 6336
6337#ifdef CONFIG_CFS_BANDWIDTH
6338static void unthrottle_offline_cfs_rqs(struct rq *rq)
6339{
6340 struct cfs_rq *cfs_rq;
6341
6342 for_each_leaf_cfs_rq(rq, cfs_rq) {
6343 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6344
6345 if (!cfs_rq->runtime_enabled)
6346 continue;
6347
6348 /*
6349 * clock_task is not advancing so we just need to make sure
6350 * there's some valid quota amount
6351 */
6352 cfs_rq->runtime_remaining = cfs_b->quota;
6353 if (cfs_rq_throttled(cfs_rq))
6354 unthrottle_cfs_rq(cfs_rq);
6355 }
6356}
6357#else
6358static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6359#endif
6360
6224/* 6361/*
6225 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6362 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6226 * try_to_wake_up()->select_task_rq(). 6363 * try_to_wake_up()->select_task_rq().
@@ -6246,6 +6383,9 @@ static void migrate_tasks(unsigned int dead_cpu)
6246 */ 6383 */
6247 rq->stop = NULL; 6384 rq->stop = NULL;
6248 6385
6386 /* Ensure any throttled groups are reachable by pick_next_task */
6387 unthrottle_offline_cfs_rqs(rq);
6388
6249 for ( ; ; ) { 6389 for ( ; ; ) {
6250 /* 6390 /*
6251 * There's this thread running, bail when that's the only 6391 * There's this thread running, bail when that's the only
@@ -7989,6 +8129,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7989 /* allow initial update_cfs_load() to truncate */ 8129 /* allow initial update_cfs_load() to truncate */
7990 cfs_rq->load_stamp = 1; 8130 cfs_rq->load_stamp = 1;
7991#endif 8131#endif
8132 init_cfs_rq_runtime(cfs_rq);
7992 8133
7993 tg->cfs_rq[cpu] = cfs_rq; 8134 tg->cfs_rq[cpu] = cfs_rq;
7994 tg->se[cpu] = se; 8135 tg->se[cpu] = se;
@@ -8128,6 +8269,7 @@ void __init sched_init(void)
8128 * We achieve this by letting root_task_group's tasks sit 8269 * We achieve this by letting root_task_group's tasks sit
8129 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 8270 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8130 */ 8271 */
8272 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8131 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 8273 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8132#endif /* CONFIG_FAIR_GROUP_SCHED */ 8274#endif /* CONFIG_FAIR_GROUP_SCHED */
8133 8275
@@ -8157,7 +8299,6 @@ void __init sched_init(void)
8157 rq_attach_root(rq, &def_root_domain); 8299 rq_attach_root(rq, &def_root_domain);
8158#ifdef CONFIG_NO_HZ 8300#ifdef CONFIG_NO_HZ
8159 rq->nohz_balance_kick = 0; 8301 rq->nohz_balance_kick = 0;
8160 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8161#endif 8302#endif
8162#endif 8303#endif
8163 init_rq_hrtick(rq); 8304 init_rq_hrtick(rq);
@@ -8199,8 +8340,6 @@ void __init sched_init(void)
8199 */ 8340 */
8200 current->sched_class = &fair_sched_class; 8341 current->sched_class = &fair_sched_class;
8201 8342
8202 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8203 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8204#ifdef CONFIG_SMP 8343#ifdef CONFIG_SMP
8205 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 8344 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8206#ifdef CONFIG_NO_HZ 8345#ifdef CONFIG_NO_HZ
@@ -8230,6 +8369,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
8230{ 8369{
8231 static unsigned long prev_jiffy; /* ratelimiting */ 8370 static unsigned long prev_jiffy; /* ratelimiting */
8232 8371
8372 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
8233 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8373 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8234 system_state != SYSTEM_RUNNING || oops_in_progress) 8374 system_state != SYSTEM_RUNNING || oops_in_progress)
8235 return; 8375 return;
@@ -8369,6 +8509,8 @@ static void free_fair_sched_group(struct task_group *tg)
8369{ 8509{
8370 int i; 8510 int i;
8371 8511
8512 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8513
8372 for_each_possible_cpu(i) { 8514 for_each_possible_cpu(i) {
8373 if (tg->cfs_rq) 8515 if (tg->cfs_rq)
8374 kfree(tg->cfs_rq[i]); 8516 kfree(tg->cfs_rq[i]);
@@ -8396,6 +8538,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8396 8538
8397 tg->shares = NICE_0_LOAD; 8539 tg->shares = NICE_0_LOAD;
8398 8540
8541 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8542
8399 for_each_possible_cpu(i) { 8543 for_each_possible_cpu(i) {
8400 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8544 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8401 GFP_KERNEL, cpu_to_node(i)); 8545 GFP_KERNEL, cpu_to_node(i));
@@ -8671,12 +8815,7 @@ unsigned long sched_group_shares(struct task_group *tg)
8671} 8815}
8672#endif 8816#endif
8673 8817
8674#ifdef CONFIG_RT_GROUP_SCHED 8818#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8675/*
8676 * Ensure that the real time constraints are schedulable.
8677 */
8678static DEFINE_MUTEX(rt_constraints_mutex);
8679
8680static unsigned long to_ratio(u64 period, u64 runtime) 8819static unsigned long to_ratio(u64 period, u64 runtime)
8681{ 8820{
8682 if (runtime == RUNTIME_INF) 8821 if (runtime == RUNTIME_INF)
@@ -8684,6 +8823,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8684 8823
8685 return div64_u64(runtime << 20, period); 8824 return div64_u64(runtime << 20, period);
8686} 8825}
8826#endif
8827
8828#ifdef CONFIG_RT_GROUP_SCHED
8829/*
8830 * Ensure that the real time constraints are schedulable.
8831 */
8832static DEFINE_MUTEX(rt_constraints_mutex);
8687 8833
8688/* Must be called with tasklist_lock held */ 8834/* Must be called with tasklist_lock held */
8689static inline int tg_has_rt_tasks(struct task_group *tg) 8835static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8704,7 +8850,7 @@ struct rt_schedulable_data {
8704 u64 rt_runtime; 8850 u64 rt_runtime;
8705}; 8851};
8706 8852
8707static int tg_schedulable(struct task_group *tg, void *data) 8853static int tg_rt_schedulable(struct task_group *tg, void *data)
8708{ 8854{
8709 struct rt_schedulable_data *d = data; 8855 struct rt_schedulable_data *d = data;
8710 struct task_group *child; 8856 struct task_group *child;
@@ -8762,16 +8908,22 @@ static int tg_schedulable(struct task_group *tg, void *data)
8762 8908
8763static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8909static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8764{ 8910{
8911 int ret;
8912
8765 struct rt_schedulable_data data = { 8913 struct rt_schedulable_data data = {
8766 .tg = tg, 8914 .tg = tg,
8767 .rt_period = period, 8915 .rt_period = period,
8768 .rt_runtime = runtime, 8916 .rt_runtime = runtime,
8769 }; 8917 };
8770 8918
8771 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8919 rcu_read_lock();
8920 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8921 rcu_read_unlock();
8922
8923 return ret;
8772} 8924}
8773 8925
8774static int tg_set_bandwidth(struct task_group *tg, 8926static int tg_set_rt_bandwidth(struct task_group *tg,
8775 u64 rt_period, u64 rt_runtime) 8927 u64 rt_period, u64 rt_runtime)
8776{ 8928{
8777 int i, err = 0; 8929 int i, err = 0;
@@ -8810,7 +8962,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8810 if (rt_runtime_us < 0) 8962 if (rt_runtime_us < 0)
8811 rt_runtime = RUNTIME_INF; 8963 rt_runtime = RUNTIME_INF;
8812 8964
8813 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8965 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8814} 8966}
8815 8967
8816long sched_group_rt_runtime(struct task_group *tg) 8968long sched_group_rt_runtime(struct task_group *tg)
@@ -8835,7 +8987,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8835 if (rt_period == 0) 8987 if (rt_period == 0)
8836 return -EINVAL; 8988 return -EINVAL;
8837 8989
8838 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8990 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8839} 8991}
8840 8992
8841long sched_group_rt_period(struct task_group *tg) 8993long sched_group_rt_period(struct task_group *tg)
@@ -9025,6 +9177,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9025 9177
9026 return (u64) scale_load_down(tg->shares); 9178 return (u64) scale_load_down(tg->shares);
9027} 9179}
9180
9181#ifdef CONFIG_CFS_BANDWIDTH
9182static DEFINE_MUTEX(cfs_constraints_mutex);
9183
9184const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
9185const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9186
9187static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9188
9189static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9190{
9191 int i, ret = 0, runtime_enabled;
9192 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9193
9194 if (tg == &root_task_group)
9195 return -EINVAL;
9196
9197 /*
9198 * Ensure we have at some amount of bandwidth every period. This is
9199 * to prevent reaching a state of large arrears when throttled via
9200 * entity_tick() resulting in prolonged exit starvation.
9201 */
9202 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9203 return -EINVAL;
9204
9205 /*
9206 * Likewise, bound things on the otherside by preventing insane quota
9207 * periods. This also allows us to normalize in computing quota
9208 * feasibility.
9209 */
9210 if (period > max_cfs_quota_period)
9211 return -EINVAL;
9212
9213 mutex_lock(&cfs_constraints_mutex);
9214 ret = __cfs_schedulable(tg, period, quota);
9215 if (ret)
9216 goto out_unlock;
9217
9218 runtime_enabled = quota != RUNTIME_INF;
9219 raw_spin_lock_irq(&cfs_b->lock);
9220 cfs_b->period = ns_to_ktime(period);
9221 cfs_b->quota = quota;
9222
9223 __refill_cfs_bandwidth_runtime(cfs_b);
9224 /* restart the period timer (if active) to handle new period expiry */
9225 if (runtime_enabled && cfs_b->timer_active) {
9226 /* force a reprogram */
9227 cfs_b->timer_active = 0;
9228 __start_cfs_bandwidth(cfs_b);
9229 }
9230 raw_spin_unlock_irq(&cfs_b->lock);
9231
9232 for_each_possible_cpu(i) {
9233 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9234 struct rq *rq = rq_of(cfs_rq);
9235
9236 raw_spin_lock_irq(&rq->lock);
9237 cfs_rq->runtime_enabled = runtime_enabled;
9238 cfs_rq->runtime_remaining = 0;
9239
9240 if (cfs_rq_throttled(cfs_rq))
9241 unthrottle_cfs_rq(cfs_rq);
9242 raw_spin_unlock_irq(&rq->lock);
9243 }
9244out_unlock:
9245 mutex_unlock(&cfs_constraints_mutex);
9246
9247 return ret;
9248}
9249
9250int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9251{
9252 u64 quota, period;
9253
9254 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9255 if (cfs_quota_us < 0)
9256 quota = RUNTIME_INF;
9257 else
9258 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9259
9260 return tg_set_cfs_bandwidth(tg, period, quota);
9261}
9262
9263long tg_get_cfs_quota(struct task_group *tg)
9264{
9265 u64 quota_us;
9266
9267 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
9268 return -1;
9269
9270 quota_us = tg_cfs_bandwidth(tg)->quota;
9271 do_div(quota_us, NSEC_PER_USEC);
9272
9273 return quota_us;
9274}
9275
9276int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9277{
9278 u64 quota, period;
9279
9280 period = (u64)cfs_period_us * NSEC_PER_USEC;
9281 quota = tg_cfs_bandwidth(tg)->quota;
9282
9283 if (period <= 0)
9284 return -EINVAL;
9285
9286 return tg_set_cfs_bandwidth(tg, period, quota);
9287}
9288
9289long tg_get_cfs_period(struct task_group *tg)
9290{
9291 u64 cfs_period_us;
9292
9293 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9294 do_div(cfs_period_us, NSEC_PER_USEC);
9295
9296 return cfs_period_us;
9297}
9298
9299static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
9300{
9301 return tg_get_cfs_quota(cgroup_tg(cgrp));
9302}
9303
9304static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
9305 s64 cfs_quota_us)
9306{
9307 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
9308}
9309
9310static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
9311{
9312 return tg_get_cfs_period(cgroup_tg(cgrp));
9313}
9314
9315static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9316 u64 cfs_period_us)
9317{
9318 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
9319}
9320
9321struct cfs_schedulable_data {
9322 struct task_group *tg;
9323 u64 period, quota;
9324};
9325
9326/*
9327 * normalize group quota/period to be quota/max_period
9328 * note: units are usecs
9329 */
9330static u64 normalize_cfs_quota(struct task_group *tg,
9331 struct cfs_schedulable_data *d)
9332{
9333 u64 quota, period;
9334
9335 if (tg == d->tg) {
9336 period = d->period;
9337 quota = d->quota;
9338 } else {
9339 period = tg_get_cfs_period(tg);
9340 quota = tg_get_cfs_quota(tg);
9341 }
9342
9343 /* note: these should typically be equivalent */
9344 if (quota == RUNTIME_INF || quota == -1)
9345 return RUNTIME_INF;
9346
9347 return to_ratio(period, quota);
9348}
9349
9350static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9351{
9352 struct cfs_schedulable_data *d = data;
9353 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9354 s64 quota = 0, parent_quota = -1;
9355
9356 if (!tg->parent) {
9357 quota = RUNTIME_INF;
9358 } else {
9359 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
9360
9361 quota = normalize_cfs_quota(tg, d);
9362 parent_quota = parent_b->hierarchal_quota;
9363
9364 /*
9365 * ensure max(child_quota) <= parent_quota, inherit when no
9366 * limit is set
9367 */
9368 if (quota == RUNTIME_INF)
9369 quota = parent_quota;
9370 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9371 return -EINVAL;
9372 }
9373 cfs_b->hierarchal_quota = quota;
9374
9375 return 0;
9376}
9377
9378static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9379{
9380 int ret;
9381 struct cfs_schedulable_data data = {
9382 .tg = tg,
9383 .period = period,
9384 .quota = quota,
9385 };
9386
9387 if (quota != RUNTIME_INF) {
9388 do_div(data.period, NSEC_PER_USEC);
9389 do_div(data.quota, NSEC_PER_USEC);
9390 }
9391
9392 rcu_read_lock();
9393 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9394 rcu_read_unlock();
9395
9396 return ret;
9397}
9398
9399static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9400 struct cgroup_map_cb *cb)
9401{
9402 struct task_group *tg = cgroup_tg(cgrp);
9403 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9404
9405 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9406 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
9407 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
9408
9409 return 0;
9410}
9411#endif /* CONFIG_CFS_BANDWIDTH */
9028#endif /* CONFIG_FAIR_GROUP_SCHED */ 9412#endif /* CONFIG_FAIR_GROUP_SCHED */
9029 9413
9030#ifdef CONFIG_RT_GROUP_SCHED 9414#ifdef CONFIG_RT_GROUP_SCHED
@@ -9059,6 +9443,22 @@ static struct cftype cpu_files[] = {
9059 .write_u64 = cpu_shares_write_u64, 9443 .write_u64 = cpu_shares_write_u64,
9060 }, 9444 },
9061#endif 9445#endif
9446#ifdef CONFIG_CFS_BANDWIDTH
9447 {
9448 .name = "cfs_quota_us",
9449 .read_s64 = cpu_cfs_quota_read_s64,
9450 .write_s64 = cpu_cfs_quota_write_s64,
9451 },
9452 {
9453 .name = "cfs_period_us",
9454 .read_u64 = cpu_cfs_period_read_u64,
9455 .write_u64 = cpu_cfs_period_write_u64,
9456 },
9457 {
9458 .name = "stat",
9459 .read_map = cpu_stats_show,
9460 },
9461#endif
9062#ifdef CONFIG_RT_GROUP_SCHED 9462#ifdef CONFIG_RT_GROUP_SCHED
9063 { 9463 {
9064 .name = "rt_runtime_us", 9464 .name = "rt_runtime_us",
@@ -9368,4 +9768,3 @@ struct cgroup_subsys cpuacct_subsys = {
9368 .subsys_id = cpuacct_subsys_id, 9768 .subsys_id = cpuacct_subsys_id,
9369}; 9769};
9370#endif /* CONFIG_CGROUP_CPUACCT */ 9770#endif /* CONFIG_CGROUP_CPUACCT */
9371
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 2722dc1b4138..a86cf9d9eb11 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
47 return cpupri; 47 return cpupri;
48} 48}
49 49
50#define for_each_cpupri_active(array, idx) \
51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
52
53/** 50/**
54 * cpupri_find - find the best (lowest-pri) CPU in the system 51 * cpupri_find - find the best (lowest-pri) CPU in the system
55 * @cp: The cpupri context 52 * @cp: The cpupri context
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
71 int idx = 0; 68 int idx = 0;
72 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
73 70
74 for_each_cpupri_active(cp->pri_active, idx) { 71 if (task_pri >= MAX_RT_PRIO)
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 72 return 0;
76 73
77 if (idx >= task_pri) 74 for (idx = 0; idx < task_pri; idx++) {
78 break; 75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
84 * Note: This is still all racey, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
79 103
80 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
81 continue; 105 continue;
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
115{ 139{
116 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
117 int oldpri = *currpri; 141 int oldpri = *currpri;
118 unsigned long flags; 142 int do_mb = 0;
119 143
120 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
121 145
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
128 * If the cpu was currently mapped to a different value, we 152 * If the cpu was currently mapped to a different value, we
129 * need to map it to the new value then remove the old value. 153 * need to map it to the new value then remove the old value.
130 * Note, we must add the new value first, otherwise we risk the 154 * Note, we must add the new value first, otherwise we risk the
131 * cpu being cleared from pri_active, and this cpu could be 155 * cpu being missed by the priority loop in cpupri_find.
132 * missed for a push or pull.
133 */ 156 */
134 if (likely(newpri != CPUPRI_INVALID)) { 157 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136 159
137 raw_spin_lock_irqsave(&vec->lock, flags);
138
139 cpumask_set_cpu(cpu, vec->mask); 160 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 161 /*
141 if (vec->count == 1) 162 * When adding a new vector, we update the mask first,
142 set_bit(newpri, cp->pri_active); 163 * do a write memory barrier, and then update the count, to
143 164 * make sure the vector is visible when count is set.
144 raw_spin_unlock_irqrestore(&vec->lock, flags); 165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
145 } 169 }
146 if (likely(oldpri != CPUPRI_INVALID)) { 170 if (likely(oldpri != CPUPRI_INVALID)) {
147 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
148 172
149 raw_spin_lock_irqsave(&vec->lock, flags); 173 /*
150 174 * Because the order of modification of the vec->count
151 vec->count--; 175 * is important, we must make sure that the update
152 if (!vec->count) 176 * of the new prio is seen before we decrement the
153 clear_bit(oldpri, cp->pri_active); 177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189 * When removing from the vector, we decrement the counter first
190 * do a memory barrier and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
154 cpumask_clear_cpu(cpu, vec->mask); 194 cpumask_clear_cpu(cpu, vec->mask);
155
156 raw_spin_unlock_irqrestore(&vec->lock, flags);
157 } 195 }
158 196
159 *currpri = newpri; 197 *currpri = newpri;
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp)
175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 213 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
176 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 214 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
177 215
178 raw_spin_lock_init(&vec->lock); 216 atomic_set(&vec->count, 0);
179 vec->count = 0;
180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) 217 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
181 goto cleanup; 218 goto cleanup;
182 } 219 }
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9fc7d386fea4..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -4,7 +4,6 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5 5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8 7
9#define CPUPRI_INVALID -1 8#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 9#define CPUPRI_IDLE 0
@@ -12,14 +11,12 @@
12/* values 2-101 are RT priorities 0-99 */ 11/* values 2-101 are RT priorities 0-99 */
13 12
14struct cpupri_vec { 13struct cpupri_vec {
15 raw_spinlock_t lock; 14 atomic_t count;
16 int count; 15 cpumask_var_t mask;
17 cpumask_var_t mask;
18}; 16};
19 17
20struct cpupri { 18struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS]; 20 int cpu_to_pri[NR_CPUS];
24}; 21};
25 22
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee9993814..5c9e67923b7c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
89 */ 89 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; 90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91 91
92#ifdef CONFIG_CFS_BANDWIDTH
93/*
94 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
95 * each time a cfs_rq requests quota.
96 *
97 * Note: in the case that the slice exceeds the runtime remaining (either due
98 * to consumption or the quota being specified to be smaller than the slice)
99 * we will always only issue the remaining available time.
100 *
101 * default: 5 msec, units: microseconds
102 */
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif
105
92static const struct sched_class fair_sched_class; 106static const struct sched_class fair_sched_class;
93 107
94/************************************************************** 108/**************************************************************
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
292 306
293#endif /* CONFIG_FAIR_GROUP_SCHED */ 307#endif /* CONFIG_FAIR_GROUP_SCHED */
294 308
309static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
310 unsigned long delta_exec);
295 311
296/************************************************************** 312/**************************************************************
297 * Scheduling class tree data structure manipulation methods: 313 * Scheduling class tree data structure manipulation methods:
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
583 cpuacct_charge(curtask, delta_exec); 599 cpuacct_charge(curtask, delta_exec);
584 account_group_exec_runtime(curtask, delta_exec); 600 account_group_exec_runtime(curtask, delta_exec);
585 } 601 }
602
603 account_cfs_rq_runtime(cfs_rq, delta_exec);
586} 604}
587 605
588static inline void 606static inline void
@@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
688} 706}
689 707
690#ifdef CONFIG_FAIR_GROUP_SCHED 708#ifdef CONFIG_FAIR_GROUP_SCHED
709/* we need this in update_cfs_load and load-balance functions below */
710static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
691# ifdef CONFIG_SMP 711# ifdef CONFIG_SMP
692static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 712static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
693 int global_update) 713 int global_update)
@@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
710 u64 now, delta; 730 u64 now, delta;
711 unsigned long load = cfs_rq->load.weight; 731 unsigned long load = cfs_rq->load.weight;
712 732
713 if (cfs_rq->tg == &root_task_group) 733 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
714 return; 734 return;
715 735
716 now = rq_of(cfs_rq)->clock_task; 736 now = rq_of(cfs_rq)->clock_task;
@@ -819,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
819 839
820 tg = cfs_rq->tg; 840 tg = cfs_rq->tg;
821 se = tg->se[cpu_of(rq_of(cfs_rq))]; 841 se = tg->se[cpu_of(rq_of(cfs_rq))];
822 if (!se) 842 if (!se || throttled_hierarchy(cfs_rq))
823 return; 843 return;
824#ifndef CONFIG_SMP 844#ifndef CONFIG_SMP
825 if (likely(se->load.weight == tg->shares)) 845 if (likely(se->load.weight == tg->shares))
@@ -950,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
950 se->vruntime = vruntime; 970 se->vruntime = vruntime;
951} 971}
952 972
973static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
974
953static void 975static void
954enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 976enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
955{ 977{
@@ -979,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
979 __enqueue_entity(cfs_rq, se); 1001 __enqueue_entity(cfs_rq, se);
980 se->on_rq = 1; 1002 se->on_rq = 1;
981 1003
982 if (cfs_rq->nr_running == 1) 1004 if (cfs_rq->nr_running == 1) {
983 list_add_leaf_cfs_rq(cfs_rq); 1005 list_add_leaf_cfs_rq(cfs_rq);
1006 check_enqueue_throttle(cfs_rq);
1007 }
984} 1008}
985 1009
986static void __clear_buddies_last(struct sched_entity *se) 1010static void __clear_buddies_last(struct sched_entity *se)
@@ -1028,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1028 __clear_buddies_skip(se); 1052 __clear_buddies_skip(se);
1029} 1053}
1030 1054
1055static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1056
1031static void 1057static void
1032dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1058dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1033{ 1059{
@@ -1066,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1066 if (!(flags & DEQUEUE_SLEEP)) 1092 if (!(flags & DEQUEUE_SLEEP))
1067 se->vruntime -= cfs_rq->min_vruntime; 1093 se->vruntime -= cfs_rq->min_vruntime;
1068 1094
1095 /* return excess runtime on last dequeue */
1096 return_cfs_rq_runtime(cfs_rq);
1097
1069 update_min_vruntime(cfs_rq); 1098 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq); 1099 update_cfs_shares(cfs_rq);
1071} 1100}
@@ -1077,6 +1106,8 @@ static void
1077check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1106check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1078{ 1107{
1079 unsigned long ideal_runtime, delta_exec; 1108 unsigned long ideal_runtime, delta_exec;
1109 struct sched_entity *se;
1110 s64 delta;
1080 1111
1081 ideal_runtime = sched_slice(cfs_rq, curr); 1112 ideal_runtime = sched_slice(cfs_rq, curr);
1082 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 1113 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1095,22 +1126,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1095 * narrow margin doesn't have to wait for a full slice. 1126 * narrow margin doesn't have to wait for a full slice.
1096 * This also mitigates buddy induced latencies under load. 1127 * This also mitigates buddy induced latencies under load.
1097 */ 1128 */
1098 if (!sched_feat(WAKEUP_PREEMPT))
1099 return;
1100
1101 if (delta_exec < sysctl_sched_min_granularity) 1129 if (delta_exec < sysctl_sched_min_granularity)
1102 return; 1130 return;
1103 1131
1104 if (cfs_rq->nr_running > 1) { 1132 se = __pick_first_entity(cfs_rq);
1105 struct sched_entity *se = __pick_first_entity(cfs_rq); 1133 delta = curr->vruntime - se->vruntime;
1106 s64 delta = curr->vruntime - se->vruntime;
1107 1134
1108 if (delta < 0) 1135 if (delta < 0)
1109 return; 1136 return;
1110 1137
1111 if (delta > ideal_runtime) 1138 if (delta > ideal_runtime)
1112 resched_task(rq_of(cfs_rq)->curr); 1139 resched_task(rq_of(cfs_rq)->curr);
1113 }
1114} 1140}
1115 1141
1116static void 1142static void
@@ -1185,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1185 return se; 1211 return se;
1186} 1212}
1187 1213
1214static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1215
1188static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1216static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1189{ 1217{
1190 /* 1218 /*
@@ -1194,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1194 if (prev->on_rq) 1222 if (prev->on_rq)
1195 update_curr(cfs_rq); 1223 update_curr(cfs_rq);
1196 1224
1225 /* throttle cfs_rqs exceeding runtime */
1226 check_cfs_rq_runtime(cfs_rq);
1227
1197 check_spread(cfs_rq, prev); 1228 check_spread(cfs_rq, prev);
1198 if (prev->on_rq) { 1229 if (prev->on_rq) {
1199 update_stats_wait_start(cfs_rq, prev); 1230 update_stats_wait_start(cfs_rq, prev);
@@ -1233,10 +1264,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1233 return; 1264 return;
1234#endif 1265#endif
1235 1266
1236 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 1267 if (cfs_rq->nr_running > 1)
1237 check_preempt_tick(cfs_rq, curr); 1268 check_preempt_tick(cfs_rq, curr);
1238} 1269}
1239 1270
1271
1272/**************************************************
1273 * CFS bandwidth control machinery
1274 */
1275
1276#ifdef CONFIG_CFS_BANDWIDTH
1277/*
1278 * default period for cfs group bandwidth.
1279 * default: 0.1s, units: nanoseconds
1280 */
1281static inline u64 default_cfs_period(void)
1282{
1283 return 100000000ULL;
1284}
1285
1286static inline u64 sched_cfs_bandwidth_slice(void)
1287{
1288 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1289}
1290
1291/*
1292 * Replenish runtime according to assigned quota and update expiration time.
1293 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1294 * additional synchronization around rq->lock.
1295 *
1296 * requires cfs_b->lock
1297 */
1298static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1299{
1300 u64 now;
1301
1302 if (cfs_b->quota == RUNTIME_INF)
1303 return;
1304
1305 now = sched_clock_cpu(smp_processor_id());
1306 cfs_b->runtime = cfs_b->quota;
1307 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1308}
1309
1310/* returns 0 on failure to allocate runtime */
1311static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1312{
1313 struct task_group *tg = cfs_rq->tg;
1314 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1315 u64 amount = 0, min_amount, expires;
1316
1317 /* note: this is a positive sum as runtime_remaining <= 0 */
1318 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1319
1320 raw_spin_lock(&cfs_b->lock);
1321 if (cfs_b->quota == RUNTIME_INF)
1322 amount = min_amount;
1323 else {
1324 /*
1325 * If the bandwidth pool has become inactive, then at least one
1326 * period must have elapsed since the last consumption.
1327 * Refresh the global state and ensure bandwidth timer becomes
1328 * active.
1329 */
1330 if (!cfs_b->timer_active) {
1331 __refill_cfs_bandwidth_runtime(cfs_b);
1332 __start_cfs_bandwidth(cfs_b);
1333 }
1334
1335 if (cfs_b->runtime > 0) {
1336 amount = min(cfs_b->runtime, min_amount);
1337 cfs_b->runtime -= amount;
1338 cfs_b->idle = 0;
1339 }
1340 }
1341 expires = cfs_b->runtime_expires;
1342 raw_spin_unlock(&cfs_b->lock);
1343
1344 cfs_rq->runtime_remaining += amount;
1345 /*
1346 * we may have advanced our local expiration to account for allowed
1347 * spread between our sched_clock and the one on which runtime was
1348 * issued.
1349 */
1350 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1351 cfs_rq->runtime_expires = expires;
1352
1353 return cfs_rq->runtime_remaining > 0;
1354}
1355
1356/*
1357 * Note: This depends on the synchronization provided by sched_clock and the
1358 * fact that rq->clock snapshots this value.
1359 */
1360static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1361{
1362 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1363 struct rq *rq = rq_of(cfs_rq);
1364
1365 /* if the deadline is ahead of our clock, nothing to do */
1366 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1367 return;
1368
1369 if (cfs_rq->runtime_remaining < 0)
1370 return;
1371
1372 /*
1373 * If the local deadline has passed we have to consider the
1374 * possibility that our sched_clock is 'fast' and the global deadline
1375 * has not truly expired.
1376 *
1377 * Fortunately we can determine whether this is the case by checking
1378 * whether the global deadline has advanced.
1379 */
1380
1381 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1382 /* extend local deadline, drift is bounded above by 2 ticks */
1383 cfs_rq->runtime_expires += TICK_NSEC;
1384 } else {
1385 /* global deadline is ahead, expiration has passed */
1386 cfs_rq->runtime_remaining = 0;
1387 }
1388}
1389
1390static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1391 unsigned long delta_exec)
1392{
1393 /* dock delta_exec before expiring quota (as it could span periods) */
1394 cfs_rq->runtime_remaining -= delta_exec;
1395 expire_cfs_rq_runtime(cfs_rq);
1396
1397 if (likely(cfs_rq->runtime_remaining > 0))
1398 return;
1399
1400 /*
1401 * if we're unable to extend our runtime we resched so that the active
1402 * hierarchy can be throttled
1403 */
1404 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1405 resched_task(rq_of(cfs_rq)->curr);
1406}
1407
1408static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1409 unsigned long delta_exec)
1410{
1411 if (!cfs_rq->runtime_enabled)
1412 return;
1413
1414 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1415}
1416
1417static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1418{
1419 return cfs_rq->throttled;
1420}
1421
1422/* check whether cfs_rq, or any parent, is throttled */
1423static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1424{
1425 return cfs_rq->throttle_count;
1426}
1427
1428/*
1429 * Ensure that neither of the group entities corresponding to src_cpu or
1430 * dest_cpu are members of a throttled hierarchy when performing group
1431 * load-balance operations.
1432 */
1433static inline int throttled_lb_pair(struct task_group *tg,
1434 int src_cpu, int dest_cpu)
1435{
1436 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1437
1438 src_cfs_rq = tg->cfs_rq[src_cpu];
1439 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1440
1441 return throttled_hierarchy(src_cfs_rq) ||
1442 throttled_hierarchy(dest_cfs_rq);
1443}
1444
1445/* updated child weight may affect parent so we have to do this bottom up */
1446static int tg_unthrottle_up(struct task_group *tg, void *data)
1447{
1448 struct rq *rq = data;
1449 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1450
1451 cfs_rq->throttle_count--;
1452#ifdef CONFIG_SMP
1453 if (!cfs_rq->throttle_count) {
1454 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1455
1456 /* leaving throttled state, advance shares averaging windows */
1457 cfs_rq->load_stamp += delta;
1458 cfs_rq->load_last += delta;
1459
1460 /* update entity weight now that we are on_rq again */
1461 update_cfs_shares(cfs_rq);
1462 }
1463#endif
1464
1465 return 0;
1466}
1467
1468static int tg_throttle_down(struct task_group *tg, void *data)
1469{
1470 struct rq *rq = data;
1471 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1472
1473 /* group is entering throttled state, record last load */
1474 if (!cfs_rq->throttle_count)
1475 update_cfs_load(cfs_rq, 0);
1476 cfs_rq->throttle_count++;
1477
1478 return 0;
1479}
1480
1481static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1482{
1483 struct rq *rq = rq_of(cfs_rq);
1484 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1485 struct sched_entity *se;
1486 long task_delta, dequeue = 1;
1487
1488 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1489
1490 /* account load preceding throttle */
1491 rcu_read_lock();
1492 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1493 rcu_read_unlock();
1494
1495 task_delta = cfs_rq->h_nr_running;
1496 for_each_sched_entity(se) {
1497 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1498 /* throttled entity or throttle-on-deactivate */
1499 if (!se->on_rq)
1500 break;
1501
1502 if (dequeue)
1503 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1504 qcfs_rq->h_nr_running -= task_delta;
1505
1506 if (qcfs_rq->load.weight)
1507 dequeue = 0;
1508 }
1509
1510 if (!se)
1511 rq->nr_running -= task_delta;
1512
1513 cfs_rq->throttled = 1;
1514 cfs_rq->throttled_timestamp = rq->clock;
1515 raw_spin_lock(&cfs_b->lock);
1516 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1517 raw_spin_unlock(&cfs_b->lock);
1518}
1519
1520static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1521{
1522 struct rq *rq = rq_of(cfs_rq);
1523 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1524 struct sched_entity *se;
1525 int enqueue = 1;
1526 long task_delta;
1527
1528 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1529
1530 cfs_rq->throttled = 0;
1531 raw_spin_lock(&cfs_b->lock);
1532 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1533 list_del_rcu(&cfs_rq->throttled_list);
1534 raw_spin_unlock(&cfs_b->lock);
1535 cfs_rq->throttled_timestamp = 0;
1536
1537 update_rq_clock(rq);
1538 /* update hierarchical throttle state */
1539 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1540
1541 if (!cfs_rq->load.weight)
1542 return;
1543
1544 task_delta = cfs_rq->h_nr_running;
1545 for_each_sched_entity(se) {
1546 if (se->on_rq)
1547 enqueue = 0;
1548
1549 cfs_rq = cfs_rq_of(se);
1550 if (enqueue)
1551 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1552 cfs_rq->h_nr_running += task_delta;
1553
1554 if (cfs_rq_throttled(cfs_rq))
1555 break;
1556 }
1557
1558 if (!se)
1559 rq->nr_running += task_delta;
1560
1561 /* determine whether we need to wake up potentially idle cpu */
1562 if (rq->curr == rq->idle && rq->cfs.nr_running)
1563 resched_task(rq->curr);
1564}
1565
1566static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1567 u64 remaining, u64 expires)
1568{
1569 struct cfs_rq *cfs_rq;
1570 u64 runtime = remaining;
1571
1572 rcu_read_lock();
1573 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1574 throttled_list) {
1575 struct rq *rq = rq_of(cfs_rq);
1576
1577 raw_spin_lock(&rq->lock);
1578 if (!cfs_rq_throttled(cfs_rq))
1579 goto next;
1580
1581 runtime = -cfs_rq->runtime_remaining + 1;
1582 if (runtime > remaining)
1583 runtime = remaining;
1584 remaining -= runtime;
1585
1586 cfs_rq->runtime_remaining += runtime;
1587 cfs_rq->runtime_expires = expires;
1588
1589 /* we check whether we're throttled above */
1590 if (cfs_rq->runtime_remaining > 0)
1591 unthrottle_cfs_rq(cfs_rq);
1592
1593next:
1594 raw_spin_unlock(&rq->lock);
1595
1596 if (!remaining)
1597 break;
1598 }
1599 rcu_read_unlock();
1600
1601 return remaining;
1602}
1603
1604/*
1605 * Responsible for refilling a task_group's bandwidth and unthrottling its
1606 * cfs_rqs as appropriate. If there has been no activity within the last
1607 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1608 * used to track this state.
1609 */
1610static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1611{
1612 u64 runtime, runtime_expires;
1613 int idle = 1, throttled;
1614
1615 raw_spin_lock(&cfs_b->lock);
1616 /* no need to continue the timer with no bandwidth constraint */
1617 if (cfs_b->quota == RUNTIME_INF)
1618 goto out_unlock;
1619
1620 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1621 /* idle depends on !throttled (for the case of a large deficit) */
1622 idle = cfs_b->idle && !throttled;
1623 cfs_b->nr_periods += overrun;
1624
1625 /* if we're going inactive then everything else can be deferred */
1626 if (idle)
1627 goto out_unlock;
1628
1629 __refill_cfs_bandwidth_runtime(cfs_b);
1630
1631 if (!throttled) {
1632 /* mark as potentially idle for the upcoming period */
1633 cfs_b->idle = 1;
1634 goto out_unlock;
1635 }
1636
1637 /* account preceding periods in which throttling occurred */
1638 cfs_b->nr_throttled += overrun;
1639
1640 /*
1641 * There are throttled entities so we must first use the new bandwidth
1642 * to unthrottle them before making it generally available. This
1643 * ensures that all existing debts will be paid before a new cfs_rq is
1644 * allowed to run.
1645 */
1646 runtime = cfs_b->runtime;
1647 runtime_expires = cfs_b->runtime_expires;
1648 cfs_b->runtime = 0;
1649
1650 /*
1651 * This check is repeated as we are holding onto the new bandwidth
1652 * while we unthrottle. This can potentially race with an unthrottled
1653 * group trying to acquire new bandwidth from the global pool.
1654 */
1655 while (throttled && runtime > 0) {
1656 raw_spin_unlock(&cfs_b->lock);
1657 /* we can't nest cfs_b->lock while distributing bandwidth */
1658 runtime = distribute_cfs_runtime(cfs_b, runtime,
1659 runtime_expires);
1660 raw_spin_lock(&cfs_b->lock);
1661
1662 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1663 }
1664
1665 /* return (any) remaining runtime */
1666 cfs_b->runtime = runtime;
1667 /*
1668 * While we are ensured activity in the period following an
1669 * unthrottle, this also covers the case in which the new bandwidth is
1670 * insufficient to cover the existing bandwidth deficit. (Forcing the
1671 * timer to remain active while there are any throttled entities.)
1672 */
1673 cfs_b->idle = 0;
1674out_unlock:
1675 if (idle)
1676 cfs_b->timer_active = 0;
1677 raw_spin_unlock(&cfs_b->lock);
1678
1679 return idle;
1680}
1681
1682/* a cfs_rq won't donate quota below this amount */
1683static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1684/* minimum remaining period time to redistribute slack quota */
1685static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1686/* how long we wait to gather additional slack before distributing */
1687static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1688
1689/* are we near the end of the current quota period? */
1690static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1691{
1692 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1693 u64 remaining;
1694
1695 /* if the call-back is running a quota refresh is already occurring */
1696 if (hrtimer_callback_running(refresh_timer))
1697 return 1;
1698
1699 /* is a quota refresh about to occur? */
1700 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1701 if (remaining < min_expire)
1702 return 1;
1703
1704 return 0;
1705}
1706
1707static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1708{
1709 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1710
1711 /* if there's a quota refresh soon don't bother with slack */
1712 if (runtime_refresh_within(cfs_b, min_left))
1713 return;
1714
1715 start_bandwidth_timer(&cfs_b->slack_timer,
1716 ns_to_ktime(cfs_bandwidth_slack_period));
1717}
1718
1719/* we know any runtime found here is valid as update_curr() precedes return */
1720static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1721{
1722 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1723 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1724
1725 if (slack_runtime <= 0)
1726 return;
1727
1728 raw_spin_lock(&cfs_b->lock);
1729 if (cfs_b->quota != RUNTIME_INF &&
1730 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1731 cfs_b->runtime += slack_runtime;
1732
1733 /* we are under rq->lock, defer unthrottling using a timer */
1734 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1735 !list_empty(&cfs_b->throttled_cfs_rq))
1736 start_cfs_slack_bandwidth(cfs_b);
1737 }
1738 raw_spin_unlock(&cfs_b->lock);
1739
1740 /* even if it's not valid for return we don't want to try again */
1741 cfs_rq->runtime_remaining -= slack_runtime;
1742}
1743
1744static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1745{
1746 if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
1747 return;
1748
1749 __return_cfs_rq_runtime(cfs_rq);
1750}
1751
1752/*
1753 * This is done with a timer (instead of inline with bandwidth return) since
1754 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1755 */
1756static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1757{
1758 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1759 u64 expires;
1760
1761 /* confirm we're still not at a refresh boundary */
1762 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1763 return;
1764
1765 raw_spin_lock(&cfs_b->lock);
1766 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1767 runtime = cfs_b->runtime;
1768 cfs_b->runtime = 0;
1769 }
1770 expires = cfs_b->runtime_expires;
1771 raw_spin_unlock(&cfs_b->lock);
1772
1773 if (!runtime)
1774 return;
1775
1776 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1777
1778 raw_spin_lock(&cfs_b->lock);
1779 if (expires == cfs_b->runtime_expires)
1780 cfs_b->runtime = runtime;
1781 raw_spin_unlock(&cfs_b->lock);
1782}
1783
1784/*
1785 * When a group wakes up we want to make sure that its quota is not already
1786 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
1787 * runtime as update_curr() throttling can not trigger until it's on-rq.
1788 */
1789static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1790{
1791 /* an active group must be handled by the update_curr()->put() path */
1792 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1793 return;
1794
1795 /* ensure the group is not already throttled */
1796 if (cfs_rq_throttled(cfs_rq))
1797 return;
1798
1799 /* update runtime allocation */
1800 account_cfs_rq_runtime(cfs_rq, 0);
1801 if (cfs_rq->runtime_remaining <= 0)
1802 throttle_cfs_rq(cfs_rq);
1803}
1804
1805/* conditionally throttle active cfs_rq's from put_prev_entity() */
1806static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1807{
1808 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1809 return;
1810
1811 /*
1812 * it's possible for a throttled entity to be forced into a running
1813 * state (e.g. set_curr_task), in this case we're finished.
1814 */
1815 if (cfs_rq_throttled(cfs_rq))
1816 return;
1817
1818 throttle_cfs_rq(cfs_rq);
1819}
1820#else
1821static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1822 unsigned long delta_exec) {}
1823static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1824static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
1825static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1826
1827static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1828{
1829 return 0;
1830}
1831
1832static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1833{
1834 return 0;
1835}
1836
1837static inline int throttled_lb_pair(struct task_group *tg,
1838 int src_cpu, int dest_cpu)
1839{
1840 return 0;
1841}
1842#endif
1843
1240/************************************************** 1844/**************************************************
1241 * CFS operations on tasks: 1845 * CFS operations on tasks:
1242 */ 1846 */
@@ -1313,16 +1917,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1313 break; 1917 break;
1314 cfs_rq = cfs_rq_of(se); 1918 cfs_rq = cfs_rq_of(se);
1315 enqueue_entity(cfs_rq, se, flags); 1919 enqueue_entity(cfs_rq, se, flags);
1920
1921 /*
1922 * end evaluation on encountering a throttled cfs_rq
1923 *
1924 * note: in the case of encountering a throttled cfs_rq we will
1925 * post the final h_nr_running increment below.
1926 */
1927 if (cfs_rq_throttled(cfs_rq))
1928 break;
1929 cfs_rq->h_nr_running++;
1930
1316 flags = ENQUEUE_WAKEUP; 1931 flags = ENQUEUE_WAKEUP;
1317 } 1932 }
1318 1933
1319 for_each_sched_entity(se) { 1934 for_each_sched_entity(se) {
1320 cfs_rq = cfs_rq_of(se); 1935 cfs_rq = cfs_rq_of(se);
1936 cfs_rq->h_nr_running++;
1937
1938 if (cfs_rq_throttled(cfs_rq))
1939 break;
1321 1940
1322 update_cfs_load(cfs_rq, 0); 1941 update_cfs_load(cfs_rq, 0);
1323 update_cfs_shares(cfs_rq); 1942 update_cfs_shares(cfs_rq);
1324 } 1943 }
1325 1944
1945 if (!se)
1946 inc_nr_running(rq);
1326 hrtick_update(rq); 1947 hrtick_update(rq);
1327} 1948}
1328 1949
@@ -1343,6 +1964,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1343 cfs_rq = cfs_rq_of(se); 1964 cfs_rq = cfs_rq_of(se);
1344 dequeue_entity(cfs_rq, se, flags); 1965 dequeue_entity(cfs_rq, se, flags);
1345 1966
1967 /*
1968 * end evaluation on encountering a throttled cfs_rq
1969 *
1970 * note: in the case of encountering a throttled cfs_rq we will
1971 * post the final h_nr_running decrement below.
1972 */
1973 if (cfs_rq_throttled(cfs_rq))
1974 break;
1975 cfs_rq->h_nr_running--;
1976
1346 /* Don't dequeue parent if it has other entities besides us */ 1977 /* Don't dequeue parent if it has other entities besides us */
1347 if (cfs_rq->load.weight) { 1978 if (cfs_rq->load.weight) {
1348 /* 1979 /*
@@ -1361,11 +1992,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1361 1992
1362 for_each_sched_entity(se) { 1993 for_each_sched_entity(se) {
1363 cfs_rq = cfs_rq_of(se); 1994 cfs_rq = cfs_rq_of(se);
1995 cfs_rq->h_nr_running--;
1996
1997 if (cfs_rq_throttled(cfs_rq))
1998 break;
1364 1999
1365 update_cfs_load(cfs_rq, 0); 2000 update_cfs_load(cfs_rq, 0);
1366 update_cfs_shares(cfs_rq); 2001 update_cfs_shares(cfs_rq);
1367 } 2002 }
1368 2003
2004 if (!se)
2005 dec_nr_running(rq);
1369 hrtick_update(rq); 2006 hrtick_update(rq);
1370} 2007}
1371 2008
@@ -1434,7 +2071,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1434 2071
1435 return wl; 2072 return wl;
1436} 2073}
1437
1438#else 2074#else
1439 2075
1440static inline unsigned long effective_load(struct task_group *tg, int cpu, 2076static inline unsigned long effective_load(struct task_group *tg, int cpu,
@@ -1547,7 +2183,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1547 2183
1548 /* Skip over this group if it has no CPUs allowed */ 2184 /* Skip over this group if it has no CPUs allowed */
1549 if (!cpumask_intersects(sched_group_cpus(group), 2185 if (!cpumask_intersects(sched_group_cpus(group),
1550 &p->cpus_allowed)) 2186 tsk_cpus_allowed(p)))
1551 continue; 2187 continue;
1552 2188
1553 local_group = cpumask_test_cpu(this_cpu, 2189 local_group = cpumask_test_cpu(this_cpu,
@@ -1593,7 +2229,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1593 int i; 2229 int i;
1594 2230
1595 /* Traverse only the allowed CPUs */ 2231 /* Traverse only the allowed CPUs */
1596 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { 2232 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
1597 load = weighted_cpuload(i); 2233 load = weighted_cpuload(i);
1598 2234
1599 if (load < min_load || (load == min_load && i == this_cpu)) { 2235 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1637,7 +2273,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1637 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 2273 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1638 break; 2274 break;
1639 2275
1640 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 2276 for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
1641 if (idle_cpu(i)) { 2277 if (idle_cpu(i)) {
1642 target = i; 2278 target = i;
1643 break; 2279 break;
@@ -1680,7 +2316,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1680 int sync = wake_flags & WF_SYNC; 2316 int sync = wake_flags & WF_SYNC;
1681 2317
1682 if (sd_flag & SD_BALANCE_WAKE) { 2318 if (sd_flag & SD_BALANCE_WAKE) {
1683 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) 2319 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1684 want_affine = 1; 2320 want_affine = 1;
1685 new_cpu = prev_cpu; 2321 new_cpu = prev_cpu;
1686 } 2322 }
@@ -1875,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1875 if (unlikely(se == pse)) 2511 if (unlikely(se == pse))
1876 return; 2512 return;
1877 2513
2514 /*
2515 * This is possible from callers such as pull_task(), in which we
2516 * unconditionally check_preempt_curr() after an enqueue (which may have
2517 * led to a throttle). This both saves work and prevents false
2518 * next-buddy nomination below.
2519 */
2520 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2521 return;
2522
1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 2523 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1879 set_next_buddy(pse); 2524 set_next_buddy(pse);
1880 next_buddy_marked = 1; 2525 next_buddy_marked = 1;
@@ -1883,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1883 /* 2528 /*
1884 * We can come here with TIF_NEED_RESCHED already set from new task 2529 * We can come here with TIF_NEED_RESCHED already set from new task
1885 * wake up path. 2530 * wake up path.
2531 *
2532 * Note: this also catches the edge-case of curr being in a throttled
2533 * group (e.g. via set_curr_task), since update_curr() (in the
2534 * enqueue of curr) will have resulted in resched being set. This
2535 * prevents us from potentially nominating it as a false LAST_BUDDY
2536 * below.
1886 */ 2537 */
1887 if (test_tsk_need_resched(curr)) 2538 if (test_tsk_need_resched(curr))
1888 return; 2539 return;
@@ -1899,10 +2550,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1899 if (unlikely(p->policy != SCHED_NORMAL)) 2550 if (unlikely(p->policy != SCHED_NORMAL))
1900 return; 2551 return;
1901 2552
1902
1903 if (!sched_feat(WAKEUP_PREEMPT))
1904 return;
1905
1906 find_matching_se(&se, &pse); 2553 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se)); 2554 update_curr(cfs_rq_of(se));
1908 BUG_ON(!pse); 2555 BUG_ON(!pse);
@@ -2005,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
2005{ 2652{
2006 struct sched_entity *se = &p->se; 2653 struct sched_entity *se = &p->se;
2007 2654
2008 if (!se->on_rq) 2655 /* throttled hierarchies are not runnable */
2656 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
2009 return false; 2657 return false;
2010 2658
2011 /* Tell the scheduler that we'd really like pse to run next. */ 2659 /* Tell the scheduler that we'd really like pse to run next. */
@@ -2049,7 +2697,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2049 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2697 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2050 * 3) are cache-hot on their current CPU. 2698 * 3) are cache-hot on their current CPU.
2051 */ 2699 */
2052 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 2700 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
2053 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 2701 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2054 return 0; 2702 return 0;
2055 } 2703 }
@@ -2102,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2102 2750
2103 for_each_leaf_cfs_rq(busiest, cfs_rq) { 2751 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2104 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 2752 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
2753 if (throttled_lb_pair(task_group(p),
2754 busiest->cpu, this_cpu))
2755 break;
2105 2756
2106 if (!can_migrate_task(p, busiest, this_cpu, 2757 if (!can_migrate_task(p, busiest, this_cpu,
2107 sd, idle, &pinned)) 2758 sd, idle, &pinned))
@@ -2217,8 +2868,13 @@ static void update_shares(int cpu)
2217 * Iterates the task_group tree in a bottom up fashion, see 2868 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details. 2869 * list_add_leaf_cfs_rq() for details.
2219 */ 2870 */
2220 for_each_leaf_cfs_rq(rq, cfs_rq) 2871 for_each_leaf_cfs_rq(rq, cfs_rq) {
2872 /* throttled entities do not contribute to load */
2873 if (throttled_hierarchy(cfs_rq))
2874 continue;
2875
2221 update_shares_cpu(cfs_rq->tg, cpu); 2876 update_shares_cpu(cfs_rq->tg, cpu);
2877 }
2222 rcu_read_unlock(); 2878 rcu_read_unlock();
2223} 2879}
2224 2880
@@ -2268,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2268 u64 rem_load, moved_load; 2924 u64 rem_load, moved_load;
2269 2925
2270 /* 2926 /*
2271 * empty group 2927 * empty group or part of a throttled hierarchy
2272 */ 2928 */
2273 if (!busiest_cfs_rq->task_weight) 2929 if (!busiest_cfs_rq->task_weight ||
2930 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
2274 continue; 2931 continue;
2275 2932
2276 rem_load = (u64)rem_load_move * busiest_weight; 2933 rem_load = (u64)rem_load_move * busiest_weight;
@@ -3430,7 +4087,7 @@ redo:
3430 * moved to this_cpu 4087 * moved to this_cpu
3431 */ 4088 */
3432 if (!cpumask_test_cpu(this_cpu, 4089 if (!cpumask_test_cpu(this_cpu,
3433 &busiest->curr->cpus_allowed)) { 4090 tsk_cpus_allowed(busiest->curr))) {
3434 raw_spin_unlock_irqrestore(&busiest->lock, 4091 raw_spin_unlock_irqrestore(&busiest->lock,
3435 flags); 4092 flags);
3436 all_pinned = 1; 4093 all_pinned = 1;
@@ -3612,22 +4269,6 @@ out_unlock:
3612} 4269}
3613 4270
3614#ifdef CONFIG_NO_HZ 4271#ifdef CONFIG_NO_HZ
3615
3616static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3617
3618static void trigger_sched_softirq(void *data)
3619{
3620 raise_softirq_irqoff(SCHED_SOFTIRQ);
3621}
3622
3623static inline void init_sched_softirq_csd(struct call_single_data *csd)
3624{
3625 csd->func = trigger_sched_softirq;
3626 csd->info = NULL;
3627 csd->flags = 0;
3628 csd->priv = 0;
3629}
3630
3631/* 4272/*
3632 * idle load balancing details 4273 * idle load balancing details
3633 * - One of the idle CPUs nominates itself as idle load_balancer, while 4274 * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -3667,7 +4308,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3667 struct sched_domain *sd; 4308 struct sched_domain *sd;
3668 4309
3669 for_each_domain(cpu, sd) 4310 for_each_domain(cpu, sd)
3670 if (sd && (sd->flags & flag)) 4311 if (sd->flags & flag)
3671 break; 4312 break;
3672 4313
3673 return sd; 4314 return sd;
@@ -3793,11 +4434,16 @@ static void nohz_balancer_kick(int cpu)
3793 } 4434 }
3794 4435
3795 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4436 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3796 struct call_single_data *cp;
3797
3798 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4437 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3799 cp = &per_cpu(remote_sched_softirq_cb, cpu); 4438
3800 __smp_call_function_single(ilb_cpu, cp, 0); 4439 smp_mb();
4440 /*
4441 * Use smp_send_reschedule() instead of resched_cpu().
4442 * This way we generate a sched IPI on the target cpu which
4443 * is idle. And the softirq performing nohz idle load balance
4444 * will be run before returning from the IPI.
4445 */
4446 smp_send_reschedule(ilb_cpu);
3801 } 4447 }
3802 return; 4448 return;
3803} 4449}
@@ -4030,7 +4676,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
4030 if (time_before(now, nohz.next_balance)) 4676 if (time_before(now, nohz.next_balance))
4031 return 0; 4677 return 0;
4032 4678
4033 if (rq->idle_at_tick) 4679 if (idle_cpu(cpu))
4034 return 0; 4680 return 0;
4035 4681
4036 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 4682 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -4066,7 +4712,7 @@ static void run_rebalance_domains(struct softirq_action *h)
4066{ 4712{
4067 int this_cpu = smp_processor_id(); 4713 int this_cpu = smp_processor_id();
4068 struct rq *this_rq = cpu_rq(this_cpu); 4714 struct rq *this_rq = cpu_rq(this_cpu);
4069 enum cpu_idle_type idle = this_rq->idle_at_tick ? 4715 enum cpu_idle_type idle = this_rq->idle_balance ?
4070 CPU_IDLE : CPU_NOT_IDLE; 4716 CPU_IDLE : CPU_NOT_IDLE;
4071 4717
4072 rebalance_domains(this_cpu, idle); 4718 rebalance_domains(this_cpu, idle);
@@ -4251,8 +4897,13 @@ static void set_curr_task_fair(struct rq *rq)
4251{ 4897{
4252 struct sched_entity *se = &rq->curr->se; 4898 struct sched_entity *se = &rq->curr->se;
4253 4899
4254 for_each_sched_entity(se) 4900 for_each_sched_entity(se) {
4255 set_next_entity(cfs_rq_of(se), se); 4901 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4902
4903 set_next_entity(cfs_rq, se);
4904 /* ensure bandwidth has been allocated on our new cfs_rq */
4905 account_cfs_rq_runtime(cfs_rq, 0);
4906 }
4256} 4907}
4257 4908
4258#ifdef CONFIG_FAIR_GROUP_SCHED 4909#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677cb040..efa0a7b75dde 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, 1)
13 13
14/* 14/*
15 * Should wakeups try to preempt running tasks.
16 */
17SCHED_FEAT(WAKEUP_PREEMPT, 1)
18
19/*
20 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
21 * a newly woken task on the same cpu as the task that woke it -- 16 * a newly woken task on the same cpu as the task that woke it --
22 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 97540f0c9e47..056cbd2e2a27 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
124 update_rt_migration(rt_rq); 124 update_rt_migration(rt_rq);
125} 125}
126 126
127static inline int has_pushable_tasks(struct rq *rq)
128{
129 return !plist_head_empty(&rq->rt.pushable_tasks);
130}
131
127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 132static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
128{ 133{
129 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 134 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
130 plist_node_init(&p->pushable_tasks, p->prio); 135 plist_node_init(&p->pushable_tasks, p->prio);
131 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
137
138 /* Update the highest prio pushable task */
139 if (p->prio < rq->rt.highest_prio.next)
140 rq->rt.highest_prio.next = p->prio;
132} 141}
133 142
134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 143static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
135{ 144{
136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 145 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
137}
138 146
139static inline int has_pushable_tasks(struct rq *rq) 147 /* Update the new highest prio pushable task */
140{ 148 if (has_pushable_tasks(rq)) {
141 return !plist_head_empty(&rq->rt.pushable_tasks); 149 p = plist_first_entry(&rq->rt.pushable_tasks,
150 struct task_struct, pushable_tasks);
151 rq->rt.highest_prio.next = p->prio;
152 } else
153 rq->rt.highest_prio.next = MAX_RT_PRIO;
142} 154}
143 155
144#else 156#else
@@ -643,6 +655,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
643 655
644 if (rt_rq->rt_time > runtime) { 656 if (rt_rq->rt_time > runtime) {
645 rt_rq->rt_throttled = 1; 657 rt_rq->rt_throttled = 1;
658 printk_once(KERN_WARNING "sched: RT throttling activated\n");
646 if (rt_rq_throttled(rt_rq)) { 659 if (rt_rq_throttled(rt_rq)) {
647 sched_rt_rq_dequeue(rt_rq); 660 sched_rt_rq_dequeue(rt_rq);
648 return 1; 661 return 1;
@@ -698,47 +711,13 @@ static void update_curr_rt(struct rq *rq)
698 711
699#if defined CONFIG_SMP 712#if defined CONFIG_SMP
700 713
701static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
702
703static inline int next_prio(struct rq *rq)
704{
705 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
706
707 if (next && rt_prio(next->prio))
708 return next->prio;
709 else
710 return MAX_RT_PRIO;
711}
712
713static void 714static void
714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 715inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
715{ 716{
716 struct rq *rq = rq_of_rt_rq(rt_rq); 717 struct rq *rq = rq_of_rt_rq(rt_rq);
717 718
718 if (prio < prev_prio) { 719 if (rq->online && prio < prev_prio)
719 720 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
720 /*
721 * If the new task is higher in priority than anything on the
722 * run-queue, we know that the previous high becomes our
723 * next-highest.
724 */
725 rt_rq->highest_prio.next = prev_prio;
726
727 if (rq->online)
728 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
729
730 } else if (prio == rt_rq->highest_prio.curr)
731 /*
732 * If the next task is equal in priority to the highest on
733 * the run-queue, then we implicitly know that the next highest
734 * task cannot be any lower than current
735 */
736 rt_rq->highest_prio.next = prio;
737 else if (prio < rt_rq->highest_prio.next)
738 /*
739 * Otherwise, we need to recompute next-highest
740 */
741 rt_rq->highest_prio.next = next_prio(rq);
742} 721}
743 722
744static void 723static void
@@ -746,9 +725,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
746{ 725{
747 struct rq *rq = rq_of_rt_rq(rt_rq); 726 struct rq *rq = rq_of_rt_rq(rt_rq);
748 727
749 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
750 rt_rq->highest_prio.next = next_prio(rq);
751
752 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 728 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
753 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 729 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
754} 730}
@@ -961,6 +937,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
961 937
962 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 938 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
963 enqueue_pushable_task(rq, p); 939 enqueue_pushable_task(rq, p);
940
941 inc_nr_running(rq);
964} 942}
965 943
966static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 944static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -971,6 +949,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
971 dequeue_rt_entity(rt_se); 949 dequeue_rt_entity(rt_se);
972 950
973 dequeue_pushable_task(rq, p); 951 dequeue_pushable_task(rq, p);
952
953 dec_nr_running(rq);
974} 954}
975 955
976/* 956/*
@@ -1017,10 +997,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1017 struct rq *rq; 997 struct rq *rq;
1018 int cpu; 998 int cpu;
1019 999
1020 if (sd_flag != SD_BALANCE_WAKE)
1021 return smp_processor_id();
1022
1023 cpu = task_cpu(p); 1000 cpu = task_cpu(p);
1001
1002 /* For anything but wake ups, just return the task_cpu */
1003 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1004 goto out;
1005
1024 rq = cpu_rq(cpu); 1006 rq = cpu_rq(cpu);
1025 1007
1026 rcu_read_lock(); 1008 rcu_read_lock();
@@ -1050,7 +1032,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1050 */ 1032 */
1051 if (curr && unlikely(rt_task(curr)) && 1033 if (curr && unlikely(rt_task(curr)) &&
1052 (curr->rt.nr_cpus_allowed < 2 || 1034 (curr->rt.nr_cpus_allowed < 2 ||
1053 curr->prio < p->prio) && 1035 curr->prio <= p->prio) &&
1054 (p->rt.nr_cpus_allowed > 1)) { 1036 (p->rt.nr_cpus_allowed > 1)) {
1055 int target = find_lowest_rq(p); 1037 int target = find_lowest_rq(p);
1056 1038
@@ -1059,6 +1041,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1059 } 1041 }
1060 rcu_read_unlock(); 1042 rcu_read_unlock();
1061 1043
1044out:
1062 return cpu; 1045 return cpu;
1063} 1046}
1064 1047
@@ -1178,7 +1161,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1178static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1161static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1179{ 1162{
1180 update_curr_rt(rq); 1163 update_curr_rt(rq);
1181 p->se.exec_start = 0;
1182 1164
1183 /* 1165 /*
1184 * The previous task needs to be made eligible for pushing 1166 * The previous task needs to be made eligible for pushing
@@ -1198,7 +1180,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1198static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1180static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1199{ 1181{
1200 if (!task_running(rq, p) && 1182 if (!task_running(rq, p) &&
1201 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1183 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1202 (p->rt.nr_cpus_allowed > 1)) 1184 (p->rt.nr_cpus_allowed > 1))
1203 return 1; 1185 return 1;
1204 return 0; 1186 return 0;
@@ -1343,7 +1325,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1343 */ 1325 */
1344 if (unlikely(task_rq(task) != rq || 1326 if (unlikely(task_rq(task) != rq ||
1345 !cpumask_test_cpu(lowest_rq->cpu, 1327 !cpumask_test_cpu(lowest_rq->cpu,
1346 &task->cpus_allowed) || 1328 tsk_cpus_allowed(task)) ||
1347 task_running(rq, task) || 1329 task_running(rq, task) ||
1348 !task->on_rq)) { 1330 !task->on_rq)) {
1349 1331
@@ -1394,6 +1376,7 @@ static int push_rt_task(struct rq *rq)
1394{ 1376{
1395 struct task_struct *next_task; 1377 struct task_struct *next_task;
1396 struct rq *lowest_rq; 1378 struct rq *lowest_rq;
1379 int ret = 0;
1397 1380
1398 if (!rq->rt.overloaded) 1381 if (!rq->rt.overloaded)
1399 return 0; 1382 return 0;
@@ -1426,7 +1409,7 @@ retry:
1426 if (!lowest_rq) { 1409 if (!lowest_rq) {
1427 struct task_struct *task; 1410 struct task_struct *task;
1428 /* 1411 /*
1429 * find lock_lowest_rq releases rq->lock 1412 * find_lock_lowest_rq releases rq->lock
1430 * so it is possible that next_task has migrated. 1413 * so it is possible that next_task has migrated.
1431 * 1414 *
1432 * We need to make sure that the task is still on the same 1415 * We need to make sure that the task is still on the same
@@ -1436,12 +1419,11 @@ retry:
1436 task = pick_next_pushable_task(rq); 1419 task = pick_next_pushable_task(rq);
1437 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1420 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1438 /* 1421 /*
1439 * If we get here, the task hasn't moved at all, but 1422 * The task hasn't migrated, and is still the next
1440 * it has failed to push. We will not try again, 1423 * eligible task, but we failed to find a run-queue
1441 * since the other cpus will pull from us when they 1424 * to push it to. Do not retry in this case, since
1442 * are ready. 1425 * other cpus will pull from us when ready.
1443 */ 1426 */
1444 dequeue_pushable_task(rq, next_task);
1445 goto out; 1427 goto out;
1446 } 1428 }
1447 1429
@@ -1460,6 +1442,7 @@ retry:
1460 deactivate_task(rq, next_task, 0); 1442 deactivate_task(rq, next_task, 0);
1461 set_task_cpu(next_task, lowest_rq->cpu); 1443 set_task_cpu(next_task, lowest_rq->cpu);
1462 activate_task(lowest_rq, next_task, 0); 1444 activate_task(lowest_rq, next_task, 0);
1445 ret = 1;
1463 1446
1464 resched_task(lowest_rq->curr); 1447 resched_task(lowest_rq->curr);
1465 1448
@@ -1468,7 +1451,7 @@ retry:
1468out: 1451out:
1469 put_task_struct(next_task); 1452 put_task_struct(next_task);
1470 1453
1471 return 1; 1454 return ret;
1472} 1455}
1473 1456
1474static void push_rt_tasks(struct rq *rq) 1457static void push_rt_tasks(struct rq *rq)
@@ -1581,7 +1564,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1581 p->rt.nr_cpus_allowed > 1 && 1564 p->rt.nr_cpus_allowed > 1 &&
1582 rt_task(rq->curr) && 1565 rt_task(rq->curr) &&
1583 (rq->curr->rt.nr_cpus_allowed < 2 || 1566 (rq->curr->rt.nr_cpus_allowed < 2 ||
1584 rq->curr->prio < p->prio)) 1567 rq->curr->prio <= p->prio))
1585 push_rt_tasks(rq); 1568 push_rt_tasks(rq);
1586} 1569}
1587 1570
@@ -1626,9 +1609,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1626 1609
1627 update_rt_migration(&rq->rt); 1610 update_rt_migration(&rq->rt);
1628 } 1611 }
1629
1630 cpumask_copy(&p->cpus_allowed, new_mask);
1631 p->rt.nr_cpus_allowed = weight;
1632} 1612}
1633 1613
1634/* Assumes rq->lock is held */ 1614/* Assumes rq->lock is held */
@@ -1863,4 +1843,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1863 rcu_read_unlock(); 1843 rcu_read_unlock();
1864} 1844}
1865#endif /* CONFIG_SCHED_DEBUG */ 1845#endif /* CONFIG_SCHED_DEBUG */
1866
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 331e01bcd026..87f9e36ea56e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -282,10 +282,10 @@ static inline void account_group_user_time(struct task_struct *tsk,
282 if (!cputimer->running) 282 if (!cputimer->running)
283 return; 283 return;
284 284
285 spin_lock(&cputimer->lock); 285 raw_spin_lock(&cputimer->lock);
286 cputimer->cputime.utime = 286 cputimer->cputime.utime =
287 cputime_add(cputimer->cputime.utime, cputime); 287 cputime_add(cputimer->cputime.utime, cputime);
288 spin_unlock(&cputimer->lock); 288 raw_spin_unlock(&cputimer->lock);
289} 289}
290 290
291/** 291/**
@@ -306,10 +306,10 @@ static inline void account_group_system_time(struct task_struct *tsk,
306 if (!cputimer->running) 306 if (!cputimer->running)
307 return; 307 return;
308 308
309 spin_lock(&cputimer->lock); 309 raw_spin_lock(&cputimer->lock);
310 cputimer->cputime.stime = 310 cputimer->cputime.stime =
311 cputime_add(cputimer->cputime.stime, cputime); 311 cputime_add(cputimer->cputime.stime, cputime);
312 spin_unlock(&cputimer->lock); 312 raw_spin_unlock(&cputimer->lock);
313} 313}
314 314
315/** 315/**
@@ -330,7 +330,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
330 if (!cputimer->running) 330 if (!cputimer->running)
331 return; 331 return;
332 332
333 spin_lock(&cputimer->lock); 333 raw_spin_lock(&cputimer->lock);
334 cputimer->cputime.sum_exec_runtime += ns; 334 cputimer->cputime.sum_exec_runtime += ns;
335 spin_unlock(&cputimer->lock); 335 raw_spin_unlock(&cputimer->lock);
336} 336}
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 6f437632afab..8b44e7fa7fb3 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
34static void 34static void
35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 inc_nr_running(rq);
37} 38}
38 39
39static void 40static void
40dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
41{ 42{
43 dec_nr_running(rq);
42} 44}
43 45
44static void yield_task_stop(struct rq *rq) 46static void yield_task_stop(struct rq *rq)
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 94a62c0d4ade..d831841e55a7 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -54,12 +54,12 @@ void down(struct semaphore *sem)
54{ 54{
55 unsigned long flags; 55 unsigned long flags;
56 56
57 spin_lock_irqsave(&sem->lock, flags); 57 raw_spin_lock_irqsave(&sem->lock, flags);
58 if (likely(sem->count > 0)) 58 if (likely(sem->count > 0))
59 sem->count--; 59 sem->count--;
60 else 60 else
61 __down(sem); 61 __down(sem);
62 spin_unlock_irqrestore(&sem->lock, flags); 62 raw_spin_unlock_irqrestore(&sem->lock, flags);
63} 63}
64EXPORT_SYMBOL(down); 64EXPORT_SYMBOL(down);
65 65
@@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem)
77 unsigned long flags; 77 unsigned long flags;
78 int result = 0; 78 int result = 0;
79 79
80 spin_lock_irqsave(&sem->lock, flags); 80 raw_spin_lock_irqsave(&sem->lock, flags);
81 if (likely(sem->count > 0)) 81 if (likely(sem->count > 0))
82 sem->count--; 82 sem->count--;
83 else 83 else
84 result = __down_interruptible(sem); 84 result = __down_interruptible(sem);
85 spin_unlock_irqrestore(&sem->lock, flags); 85 raw_spin_unlock_irqrestore(&sem->lock, flags);
86 86
87 return result; 87 return result;
88} 88}
@@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem)
103 unsigned long flags; 103 unsigned long flags;
104 int result = 0; 104 int result = 0;
105 105
106 spin_lock_irqsave(&sem->lock, flags); 106 raw_spin_lock_irqsave(&sem->lock, flags);
107 if (likely(sem->count > 0)) 107 if (likely(sem->count > 0))
108 sem->count--; 108 sem->count--;
109 else 109 else
110 result = __down_killable(sem); 110 result = __down_killable(sem);
111 spin_unlock_irqrestore(&sem->lock, flags); 111 raw_spin_unlock_irqrestore(&sem->lock, flags);
112 112
113 return result; 113 return result;
114} 114}
@@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem)
132 unsigned long flags; 132 unsigned long flags;
133 int count; 133 int count;
134 134
135 spin_lock_irqsave(&sem->lock, flags); 135 raw_spin_lock_irqsave(&sem->lock, flags);
136 count = sem->count - 1; 136 count = sem->count - 1;
137 if (likely(count >= 0)) 137 if (likely(count >= 0))
138 sem->count = count; 138 sem->count = count;
139 spin_unlock_irqrestore(&sem->lock, flags); 139 raw_spin_unlock_irqrestore(&sem->lock, flags);
140 140
141 return (count < 0); 141 return (count < 0);
142} 142}
@@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies)
157 unsigned long flags; 157 unsigned long flags;
158 int result = 0; 158 int result = 0;
159 159
160 spin_lock_irqsave(&sem->lock, flags); 160 raw_spin_lock_irqsave(&sem->lock, flags);
161 if (likely(sem->count > 0)) 161 if (likely(sem->count > 0))
162 sem->count--; 162 sem->count--;
163 else 163 else
164 result = __down_timeout(sem, jiffies); 164 result = __down_timeout(sem, jiffies);
165 spin_unlock_irqrestore(&sem->lock, flags); 165 raw_spin_unlock_irqrestore(&sem->lock, flags);
166 166
167 return result; 167 return result;
168} 168}
@@ -179,12 +179,12 @@ void up(struct semaphore *sem)
179{ 179{
180 unsigned long flags; 180 unsigned long flags;
181 181
182 spin_lock_irqsave(&sem->lock, flags); 182 raw_spin_lock_irqsave(&sem->lock, flags);
183 if (likely(list_empty(&sem->wait_list))) 183 if (likely(list_empty(&sem->wait_list)))
184 sem->count++; 184 sem->count++;
185 else 185 else
186 __up(sem); 186 __up(sem);
187 spin_unlock_irqrestore(&sem->lock, flags); 187 raw_spin_unlock_irqrestore(&sem->lock, flags);
188} 188}
189EXPORT_SYMBOL(up); 189EXPORT_SYMBOL(up);
190 190
@@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
217 if (timeout <= 0) 217 if (timeout <= 0)
218 goto timed_out; 218 goto timed_out;
219 __set_task_state(task, state); 219 __set_task_state(task, state);
220 spin_unlock_irq(&sem->lock); 220 raw_spin_unlock_irq(&sem->lock);
221 timeout = schedule_timeout(timeout); 221 timeout = schedule_timeout(timeout);
222 spin_lock_irq(&sem->lock); 222 raw_spin_lock_irq(&sem->lock);
223 if (waiter.up) 223 if (waiter.up)
224 return 0; 224 return 0;
225 } 225 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 291c9700be75..d252be2d3de5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1344,13 +1344,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1344 return error; 1344 return error;
1345} 1345}
1346 1346
1347static int kill_as_cred_perm(const struct cred *cred,
1348 struct task_struct *target)
1349{
1350 const struct cred *pcred = __task_cred(target);
1351 if (cred->user_ns != pcred->user_ns)
1352 return 0;
1353 if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
1354 cred->uid != pcred->suid && cred->uid != pcred->uid)
1355 return 0;
1356 return 1;
1357}
1358
1347/* like kill_pid_info(), but doesn't use uid/euid of "current" */ 1359/* like kill_pid_info(), but doesn't use uid/euid of "current" */
1348int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, 1360int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
1349 uid_t uid, uid_t euid, u32 secid) 1361 const struct cred *cred, u32 secid)
1350{ 1362{
1351 int ret = -EINVAL; 1363 int ret = -EINVAL;
1352 struct task_struct *p; 1364 struct task_struct *p;
1353 const struct cred *pcred;
1354 unsigned long flags; 1365 unsigned long flags;
1355 1366
1356 if (!valid_signal(sig)) 1367 if (!valid_signal(sig))
@@ -1362,10 +1373,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1362 ret = -ESRCH; 1373 ret = -ESRCH;
1363 goto out_unlock; 1374 goto out_unlock;
1364 } 1375 }
1365 pcred = __task_cred(p); 1376 if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
1366 if (si_fromuser(info) &&
1367 euid != pcred->suid && euid != pcred->uid &&
1368 uid != pcred->suid && uid != pcred->uid) {
1369 ret = -EPERM; 1377 ret = -EPERM;
1370 goto out_unlock; 1378 goto out_unlock;
1371 } 1379 }
@@ -1384,7 +1392,7 @@ out_unlock:
1384 rcu_read_unlock(); 1392 rcu_read_unlock();
1385 return ret; 1393 return ret;
1386} 1394}
1387EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1395EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
1388 1396
1389/* 1397/*
1390 * kill_something_info() interprets pid in interesting ways just like kill(2). 1398 * kill_something_info() interprets pid in interesting ways just like kill(2).
diff --git a/kernel/sys.c b/kernel/sys.c
index 18ee1d2f6474..58459509b14c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1172,7 +1172,7 @@ DECLARE_RWSEM(uts_sem);
1172static int override_release(char __user *release, int len) 1172static int override_release(char __user *release, int len)
1173{ 1173{
1174 int ret = 0; 1174 int ret = 0;
1175 char buf[len]; 1175 char buf[65];
1176 1176
1177 if (current->personality & UNAME26) { 1177 if (current->personality & UNAME26) {
1178 char *rest = UTS_RELEASE; 1178 char *rest = UTS_RELEASE;
@@ -1759,6 +1759,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1759 sizeof(me->comm) - 1) < 0) 1759 sizeof(me->comm) - 1) < 0)
1760 return -EFAULT; 1760 return -EFAULT;
1761 set_task_comm(me, comm); 1761 set_task_comm(me, comm);
1762 proc_comm_connector(me);
1762 return 0; 1763 return 0;
1763 case PR_GET_NAME: 1764 case PR_GET_NAME:
1764 get_task_comm(comm, me); 1765 get_task_comm(comm, me);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b531e50..2d2ecdcc8cdb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = {
379 .extra2 = &one, 379 .extra2 = &one,
380 }, 380 },
381#endif 381#endif
382#ifdef CONFIG_CFS_BANDWIDTH
383 {
384 .procname = "sched_cfs_bandwidth_slice_us",
385 .data = &sysctl_sched_cfs_bandwidth_slice,
386 .maxlen = sizeof(unsigned int),
387 .mode = 0644,
388 .proc_handler = proc_dointvec_minmax,
389 .extra1 = &one,
390 },
391#endif
382#ifdef CONFIG_PROVE_LOCKING 392#ifdef CONFIG_PROVE_LOCKING
383 { 393 {
384 .procname = "prove_locking", 394 .procname = "prove_locking",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e8bffbe2ba4b..6318b511afa1 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, 214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, 215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, 216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
217 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, 217 /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, 218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, 219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, 220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e19ce1454ee1..e66046456f4f 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -655,6 +655,7 @@ static struct genl_ops taskstats_ops = {
655 .cmd = TASKSTATS_CMD_GET, 655 .cmd = TASKSTATS_CMD_GET,
656 .doit = taskstats_user_cmd, 656 .doit = taskstats_user_cmd,
657 .policy = taskstats_cmd_get_policy, 657 .policy = taskstats_cmd_get_policy,
658 .flags = GENL_ADMIN_PERM,
658}; 659};
659 660
660static struct genl_ops cgroupstats_ops = { 661static struct genl_ops cgroupstats_ops = {
diff --git a/kernel/time.c b/kernel/time.c
index 8e8dc6d705c9..d77606214529 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval);
575/* 575/*
576 * Convert jiffies/jiffies_64 to clock_t and back. 576 * Convert jiffies/jiffies_64 to clock_t and back.
577 */ 577 */
578clock_t jiffies_to_clock_t(long x) 578clock_t jiffies_to_clock_t(unsigned long x)
579{ 579{
580#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 580#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
581# if HZ < USER_HZ 581# if HZ < USER_HZ
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f06a8a365648..b26c2228fe92 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -27,3 +27,5 @@ config GENERIC_CLOCKEVENTS_BUILD
27 default y 27 default y
28 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR 28 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
29 29
30config GENERIC_CLOCKEVENTS_MIN_ADJUST
31 bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ea5e1a928d5b..c436e790b21b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -53,27 +53,6 @@ static struct rtc_device *rtcdev;
53static DEFINE_SPINLOCK(rtcdev_lock); 53static DEFINE_SPINLOCK(rtcdev_lock);
54 54
55/** 55/**
56 * has_wakealarm - check rtc device has wakealarm ability
57 * @dev: current device
58 * @name_ptr: name to be returned
59 *
60 * This helper function checks to see if the rtc device can wake
61 * from suspend.
62 */
63static int has_wakealarm(struct device *dev, void *name_ptr)
64{
65 struct rtc_device *candidate = to_rtc_device(dev);
66
67 if (!candidate->ops->set_alarm)
68 return 0;
69 if (!device_may_wakeup(candidate->dev.parent))
70 return 0;
71
72 *(const char **)name_ptr = dev_name(dev);
73 return 1;
74}
75
76/**
77 * alarmtimer_get_rtcdev - Return selected rtcdevice 56 * alarmtimer_get_rtcdev - Return selected rtcdevice
78 * 57 *
79 * This function returns the rtc device to use for wakealarms. 58 * This function returns the rtc device to use for wakealarms.
@@ -82,37 +61,64 @@ static int has_wakealarm(struct device *dev, void *name_ptr)
82 */ 61 */
83static struct rtc_device *alarmtimer_get_rtcdev(void) 62static struct rtc_device *alarmtimer_get_rtcdev(void)
84{ 63{
85 struct device *dev;
86 char *str;
87 unsigned long flags; 64 unsigned long flags;
88 struct rtc_device *ret; 65 struct rtc_device *ret;
89 66
90 spin_lock_irqsave(&rtcdev_lock, flags); 67 spin_lock_irqsave(&rtcdev_lock, flags);
91 if (!rtcdev) {
92 /* Find an rtc device and init the rtc_timer */
93 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
94 /* If we have a device then str is valid. See has_wakealarm() */
95 if (dev) {
96 rtcdev = rtc_class_open(str);
97 /*
98 * Drop the reference we got in class_find_device,
99 * rtc_open takes its own.
100 */
101 put_device(dev);
102 rtc_timer_init(&rtctimer, NULL, NULL);
103 }
104 }
105 ret = rtcdev; 68 ret = rtcdev;
106 spin_unlock_irqrestore(&rtcdev_lock, flags); 69 spin_unlock_irqrestore(&rtcdev_lock, flags);
107 70
108 return ret; 71 return ret;
109} 72}
73
74
75static int alarmtimer_rtc_add_device(struct device *dev,
76 struct class_interface *class_intf)
77{
78 unsigned long flags;
79 struct rtc_device *rtc = to_rtc_device(dev);
80
81 if (rtcdev)
82 return -EBUSY;
83
84 if (!rtc->ops->set_alarm)
85 return -1;
86 if (!device_may_wakeup(rtc->dev.parent))
87 return -1;
88
89 spin_lock_irqsave(&rtcdev_lock, flags);
90 if (!rtcdev) {
91 rtcdev = rtc;
92 /* hold a reference so it doesn't go away */
93 get_device(dev);
94 }
95 spin_unlock_irqrestore(&rtcdev_lock, flags);
96 return 0;
97}
98
99static struct class_interface alarmtimer_rtc_interface = {
100 .add_dev = &alarmtimer_rtc_add_device,
101};
102
103static int alarmtimer_rtc_interface_setup(void)
104{
105 alarmtimer_rtc_interface.class = rtc_class;
106 return class_interface_register(&alarmtimer_rtc_interface);
107}
108static void alarmtimer_rtc_interface_remove(void)
109{
110 class_interface_unregister(&alarmtimer_rtc_interface);
111}
110#else 112#else
111#define alarmtimer_get_rtcdev() (0) 113static inline struct rtc_device *alarmtimer_get_rtcdev(void)
112#define rtcdev (0) 114{
115 return NULL;
116}
117#define rtcdev (NULL)
118static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
119static inline void alarmtimer_rtc_interface_remove(void) { }
113#endif 120#endif
114 121
115
116/** 122/**
117 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue 123 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
118 * @base: pointer to the base where the timer is being run 124 * @base: pointer to the base where the timer is being run
@@ -126,6 +132,8 @@ static struct rtc_device *alarmtimer_get_rtcdev(void)
126static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) 132static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
127{ 133{
128 timerqueue_add(&base->timerqueue, &alarm->node); 134 timerqueue_add(&base->timerqueue, &alarm->node);
135 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
136
129 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { 137 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
130 hrtimer_try_to_cancel(&base->timer); 138 hrtimer_try_to_cancel(&base->timer);
131 hrtimer_start(&base->timer, alarm->node.expires, 139 hrtimer_start(&base->timer, alarm->node.expires,
@@ -147,7 +155,12 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
147{ 155{
148 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); 156 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
149 157
158 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
159 return;
160
150 timerqueue_del(&base->timerqueue, &alarm->node); 161 timerqueue_del(&base->timerqueue, &alarm->node);
162 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
163
151 if (next == &alarm->node) { 164 if (next == &alarm->node) {
152 hrtimer_try_to_cancel(&base->timer); 165 hrtimer_try_to_cancel(&base->timer);
153 next = timerqueue_getnext(&base->timerqueue); 166 next = timerqueue_getnext(&base->timerqueue);
@@ -174,6 +187,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
174 unsigned long flags; 187 unsigned long flags;
175 ktime_t now; 188 ktime_t now;
176 int ret = HRTIMER_NORESTART; 189 int ret = HRTIMER_NORESTART;
190 int restart = ALARMTIMER_NORESTART;
177 191
178 spin_lock_irqsave(&base->lock, flags); 192 spin_lock_irqsave(&base->lock, flags);
179 now = base->gettime(); 193 now = base->gettime();
@@ -187,17 +201,19 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
187 alarm = container_of(next, struct alarm, node); 201 alarm = container_of(next, struct alarm, node);
188 202
189 timerqueue_del(&base->timerqueue, &alarm->node); 203 timerqueue_del(&base->timerqueue, &alarm->node);
190 alarm->enabled = 0; 204 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
191 /* Re-add periodic timers */ 205
192 if (alarm->period.tv64) { 206 alarm->state |= ALARMTIMER_STATE_CALLBACK;
193 alarm->node.expires = ktime_add(expired, alarm->period);
194 timerqueue_add(&base->timerqueue, &alarm->node);
195 alarm->enabled = 1;
196 }
197 spin_unlock_irqrestore(&base->lock, flags); 207 spin_unlock_irqrestore(&base->lock, flags);
198 if (alarm->function) 208 if (alarm->function)
199 alarm->function(alarm); 209 restart = alarm->function(alarm, now);
200 spin_lock_irqsave(&base->lock, flags); 210 spin_lock_irqsave(&base->lock, flags);
211 alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
212
213 if (restart != ALARMTIMER_NORESTART) {
214 timerqueue_add(&base->timerqueue, &alarm->node);
215 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
216 }
201 } 217 }
202 218
203 if (next) { 219 if (next) {
@@ -234,7 +250,7 @@ static int alarmtimer_suspend(struct device *dev)
234 freezer_delta = ktime_set(0, 0); 250 freezer_delta = ktime_set(0, 0);
235 spin_unlock_irqrestore(&freezer_delta_lock, flags); 251 spin_unlock_irqrestore(&freezer_delta_lock, flags);
236 252
237 rtc = rtcdev; 253 rtc = alarmtimer_get_rtcdev();
238 /* If we have no rtcdev, just return */ 254 /* If we have no rtcdev, just return */
239 if (!rtc) 255 if (!rtc)
240 return 0; 256 return 0;
@@ -299,53 +315,111 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
299 * @function: callback that is run when the alarm fires 315 * @function: callback that is run when the alarm fires
300 */ 316 */
301void alarm_init(struct alarm *alarm, enum alarmtimer_type type, 317void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
302 void (*function)(struct alarm *)) 318 enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
303{ 319{
304 timerqueue_init(&alarm->node); 320 timerqueue_init(&alarm->node);
305 alarm->period = ktime_set(0, 0);
306 alarm->function = function; 321 alarm->function = function;
307 alarm->type = type; 322 alarm->type = type;
308 alarm->enabled = 0; 323 alarm->state = ALARMTIMER_STATE_INACTIVE;
309} 324}
310 325
311/** 326/**
312 * alarm_start - Sets an alarm to fire 327 * alarm_start - Sets an alarm to fire
313 * @alarm: ptr to alarm to set 328 * @alarm: ptr to alarm to set
314 * @start: time to run the alarm 329 * @start: time to run the alarm
315 * @period: period at which the alarm will recur
316 */ 330 */
317void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) 331void alarm_start(struct alarm *alarm, ktime_t start)
318{ 332{
319 struct alarm_base *base = &alarm_bases[alarm->type]; 333 struct alarm_base *base = &alarm_bases[alarm->type];
320 unsigned long flags; 334 unsigned long flags;
321 335
322 spin_lock_irqsave(&base->lock, flags); 336 spin_lock_irqsave(&base->lock, flags);
323 if (alarm->enabled) 337 if (alarmtimer_active(alarm))
324 alarmtimer_remove(base, alarm); 338 alarmtimer_remove(base, alarm);
325 alarm->node.expires = start; 339 alarm->node.expires = start;
326 alarm->period = period;
327 alarmtimer_enqueue(base, alarm); 340 alarmtimer_enqueue(base, alarm);
328 alarm->enabled = 1;
329 spin_unlock_irqrestore(&base->lock, flags); 341 spin_unlock_irqrestore(&base->lock, flags);
330} 342}
331 343
332/** 344/**
333 * alarm_cancel - Tries to cancel an alarm timer 345 * alarm_try_to_cancel - Tries to cancel an alarm timer
334 * @alarm: ptr to alarm to be canceled 346 * @alarm: ptr to alarm to be canceled
347 *
348 * Returns 1 if the timer was canceled, 0 if it was not running,
349 * and -1 if the callback was running
335 */ 350 */
336void alarm_cancel(struct alarm *alarm) 351int alarm_try_to_cancel(struct alarm *alarm)
337{ 352{
338 struct alarm_base *base = &alarm_bases[alarm->type]; 353 struct alarm_base *base = &alarm_bases[alarm->type];
339 unsigned long flags; 354 unsigned long flags;
340 355 int ret = -1;
341 spin_lock_irqsave(&base->lock, flags); 356 spin_lock_irqsave(&base->lock, flags);
342 if (alarm->enabled) 357
358 if (alarmtimer_callback_running(alarm))
359 goto out;
360
361 if (alarmtimer_is_queued(alarm)) {
343 alarmtimer_remove(base, alarm); 362 alarmtimer_remove(base, alarm);
344 alarm->enabled = 0; 363 ret = 1;
364 } else
365 ret = 0;
366out:
345 spin_unlock_irqrestore(&base->lock, flags); 367 spin_unlock_irqrestore(&base->lock, flags);
368 return ret;
369}
370
371
372/**
373 * alarm_cancel - Spins trying to cancel an alarm timer until it is done
374 * @alarm: ptr to alarm to be canceled
375 *
376 * Returns 1 if the timer was canceled, 0 if it was not active.
377 */
378int alarm_cancel(struct alarm *alarm)
379{
380 for (;;) {
381 int ret = alarm_try_to_cancel(alarm);
382 if (ret >= 0)
383 return ret;
384 cpu_relax();
385 }
386}
387
388
389u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
390{
391 u64 overrun = 1;
392 ktime_t delta;
393
394 delta = ktime_sub(now, alarm->node.expires);
395
396 if (delta.tv64 < 0)
397 return 0;
398
399 if (unlikely(delta.tv64 >= interval.tv64)) {
400 s64 incr = ktime_to_ns(interval);
401
402 overrun = ktime_divns(delta, incr);
403
404 alarm->node.expires = ktime_add_ns(alarm->node.expires,
405 incr*overrun);
406
407 if (alarm->node.expires.tv64 > now.tv64)
408 return overrun;
409 /*
410 * This (and the ktime_add() below) is the
411 * correction for exact:
412 */
413 overrun++;
414 }
415
416 alarm->node.expires = ktime_add(alarm->node.expires, interval);
417 return overrun;
346} 418}
347 419
348 420
421
422
349/** 423/**
350 * clock2alarm - helper that converts from clockid to alarmtypes 424 * clock2alarm - helper that converts from clockid to alarmtypes
351 * @clockid: clockid. 425 * @clockid: clockid.
@@ -365,12 +439,21 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
365 * 439 *
366 * Posix timer callback for expired alarm timers. 440 * Posix timer callback for expired alarm timers.
367 */ 441 */
368static void alarm_handle_timer(struct alarm *alarm) 442static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
443 ktime_t now)
369{ 444{
370 struct k_itimer *ptr = container_of(alarm, struct k_itimer, 445 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
371 it.alarmtimer); 446 it.alarm.alarmtimer);
372 if (posix_timer_event(ptr, 0) != 0) 447 if (posix_timer_event(ptr, 0) != 0)
373 ptr->it_overrun++; 448 ptr->it_overrun++;
449
450 /* Re-add periodic timers */
451 if (ptr->it.alarm.interval.tv64) {
452 ptr->it_overrun += alarm_forward(alarm, now,
453 ptr->it.alarm.interval);
454 return ALARMTIMER_RESTART;
455 }
456 return ALARMTIMER_NORESTART;
374} 457}
375 458
376/** 459/**
@@ -427,7 +510,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
427 510
428 type = clock2alarm(new_timer->it_clock); 511 type = clock2alarm(new_timer->it_clock);
429 base = &alarm_bases[type]; 512 base = &alarm_bases[type];
430 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); 513 alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
431 return 0; 514 return 0;
432} 515}
433 516
@@ -444,9 +527,9 @@ static void alarm_timer_get(struct k_itimer *timr,
444 memset(cur_setting, 0, sizeof(struct itimerspec)); 527 memset(cur_setting, 0, sizeof(struct itimerspec));
445 528
446 cur_setting->it_interval = 529 cur_setting->it_interval =
447 ktime_to_timespec(timr->it.alarmtimer.period); 530 ktime_to_timespec(timr->it.alarm.interval);
448 cur_setting->it_value = 531 cur_setting->it_value =
449 ktime_to_timespec(timr->it.alarmtimer.node.expires); 532 ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires);
450 return; 533 return;
451} 534}
452 535
@@ -461,7 +544,9 @@ static int alarm_timer_del(struct k_itimer *timr)
461 if (!rtcdev) 544 if (!rtcdev)
462 return -ENOTSUPP; 545 return -ENOTSUPP;
463 546
464 alarm_cancel(&timr->it.alarmtimer); 547 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
548 return TIMER_RETRY;
549
465 return 0; 550 return 0;
466} 551}
467 552
@@ -481,25 +566,17 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
481 if (!rtcdev) 566 if (!rtcdev)
482 return -ENOTSUPP; 567 return -ENOTSUPP;
483 568
484 /*
485 * XXX HACK! Currently we can DOS a system if the interval
486 * period on alarmtimers is too small. Cap the interval here
487 * to 100us and solve this properly in a future patch! -jstultz
488 */
489 if ((new_setting->it_interval.tv_sec == 0) &&
490 (new_setting->it_interval.tv_nsec < 100000))
491 new_setting->it_interval.tv_nsec = 100000;
492
493 if (old_setting) 569 if (old_setting)
494 alarm_timer_get(timr, old_setting); 570 alarm_timer_get(timr, old_setting);
495 571
496 /* If the timer was already set, cancel it */ 572 /* If the timer was already set, cancel it */
497 alarm_cancel(&timr->it.alarmtimer); 573 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
574 return TIMER_RETRY;
498 575
499 /* start the timer */ 576 /* start the timer */
500 alarm_start(&timr->it.alarmtimer, 577 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
501 timespec_to_ktime(new_setting->it_value), 578 alarm_start(&timr->it.alarm.alarmtimer,
502 timespec_to_ktime(new_setting->it_interval)); 579 timespec_to_ktime(new_setting->it_value));
503 return 0; 580 return 0;
504} 581}
505 582
@@ -509,13 +586,15 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
509 * 586 *
510 * Wakes up the task that set the alarmtimer 587 * Wakes up the task that set the alarmtimer
511 */ 588 */
512static void alarmtimer_nsleep_wakeup(struct alarm *alarm) 589static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
590 ktime_t now)
513{ 591{
514 struct task_struct *task = (struct task_struct *)alarm->data; 592 struct task_struct *task = (struct task_struct *)alarm->data;
515 593
516 alarm->data = NULL; 594 alarm->data = NULL;
517 if (task) 595 if (task)
518 wake_up_process(task); 596 wake_up_process(task);
597 return ALARMTIMER_NORESTART;
519} 598}
520 599
521/** 600/**
@@ -530,7 +609,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
530 alarm->data = (void *)current; 609 alarm->data = (void *)current;
531 do { 610 do {
532 set_current_state(TASK_INTERRUPTIBLE); 611 set_current_state(TASK_INTERRUPTIBLE);
533 alarm_start(alarm, absexp, ktime_set(0, 0)); 612 alarm_start(alarm, absexp);
534 if (likely(alarm->data)) 613 if (likely(alarm->data))
535 schedule(); 614 schedule();
536 615
@@ -691,6 +770,7 @@ static struct platform_driver alarmtimer_driver = {
691 */ 770 */
692static int __init alarmtimer_init(void) 771static int __init alarmtimer_init(void)
693{ 772{
773 struct platform_device *pdev;
694 int error = 0; 774 int error = 0;
695 int i; 775 int i;
696 struct k_clock alarm_clock = { 776 struct k_clock alarm_clock = {
@@ -719,10 +799,26 @@ static int __init alarmtimer_init(void)
719 HRTIMER_MODE_ABS); 799 HRTIMER_MODE_ABS);
720 alarm_bases[i].timer.function = alarmtimer_fired; 800 alarm_bases[i].timer.function = alarmtimer_fired;
721 } 801 }
802
803 error = alarmtimer_rtc_interface_setup();
804 if (error)
805 return error;
806
722 error = platform_driver_register(&alarmtimer_driver); 807 error = platform_driver_register(&alarmtimer_driver);
723 platform_device_register_simple("alarmtimer", -1, NULL, 0); 808 if (error)
809 goto out_if;
724 810
811 pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
812 if (IS_ERR(pdev)) {
813 error = PTR_ERR(pdev);
814 goto out_drv;
815 }
816 return 0;
817
818out_drv:
819 platform_driver_unregister(&alarmtimer_driver);
820out_if:
821 alarmtimer_rtc_interface_remove();
725 return error; 822 return error;
726} 823}
727device_initcall(alarmtimer_init); 824device_initcall(alarmtimer_init);
728
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index e4c699dfa4e8..1ecd6ba36d6c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,42 +94,143 @@ void clockevents_shutdown(struct clock_event_device *dev)
94 dev->next_event.tv64 = KTIME_MAX; 94 dev->next_event.tv64 = KTIME_MAX;
95} 95}
96 96
97#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
98
99/* Limit min_delta to a jiffie */
100#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
101
102/**
103 * clockevents_increase_min_delta - raise minimum delta of a clock event device
104 * @dev: device to increase the minimum delta
105 *
106 * Returns 0 on success, -ETIME when the minimum delta reached the limit.
107 */
108static int clockevents_increase_min_delta(struct clock_event_device *dev)
109{
110 /* Nothing to do if we already reached the limit */
111 if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
112 printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
113 dev->next_event.tv64 = KTIME_MAX;
114 return -ETIME;
115 }
116
117 if (dev->min_delta_ns < 5000)
118 dev->min_delta_ns = 5000;
119 else
120 dev->min_delta_ns += dev->min_delta_ns >> 1;
121
122 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
123 dev->min_delta_ns = MIN_DELTA_LIMIT;
124
125 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
126 dev->name ? dev->name : "?",
127 (unsigned long long) dev->min_delta_ns);
128 return 0;
129}
130
131/**
132 * clockevents_program_min_delta - Set clock event device to the minimum delay.
133 * @dev: device to program
134 *
135 * Returns 0 on success, -ETIME when the retry loop failed.
136 */
137static int clockevents_program_min_delta(struct clock_event_device *dev)
138{
139 unsigned long long clc;
140 int64_t delta;
141 int i;
142
143 for (i = 0;;) {
144 delta = dev->min_delta_ns;
145 dev->next_event = ktime_add_ns(ktime_get(), delta);
146
147 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
148 return 0;
149
150 dev->retries++;
151 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
152 if (dev->set_next_event((unsigned long) clc, dev) == 0)
153 return 0;
154
155 if (++i > 2) {
156 /*
157 * We tried 3 times to program the device with the
158 * given min_delta_ns. Try to increase the minimum
159 * delta, if that fails as well get out of here.
160 */
161 if (clockevents_increase_min_delta(dev))
162 return -ETIME;
163 i = 0;
164 }
165 }
166}
167
168#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
169
170/**
171 * clockevents_program_min_delta - Set clock event device to the minimum delay.
172 * @dev: device to program
173 *
174 * Returns 0 on success, -ETIME when the retry loop failed.
175 */
176static int clockevents_program_min_delta(struct clock_event_device *dev)
177{
178 unsigned long long clc;
179 int64_t delta;
180
181 delta = dev->min_delta_ns;
182 dev->next_event = ktime_add_ns(ktime_get(), delta);
183
184 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
185 return 0;
186
187 dev->retries++;
188 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
189 return dev->set_next_event((unsigned long) clc, dev);
190}
191
192#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
193
97/** 194/**
98 * clockevents_program_event - Reprogram the clock event device. 195 * clockevents_program_event - Reprogram the clock event device.
196 * @dev: device to program
99 * @expires: absolute expiry time (monotonic clock) 197 * @expires: absolute expiry time (monotonic clock)
198 * @force: program minimum delay if expires can not be set
100 * 199 *
101 * Returns 0 on success, -ETIME when the event is in the past. 200 * Returns 0 on success, -ETIME when the event is in the past.
102 */ 201 */
103int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, 202int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
104 ktime_t now) 203 bool force)
105{ 204{
106 unsigned long long clc; 205 unsigned long long clc;
107 int64_t delta; 206 int64_t delta;
207 int rc;
108 208
109 if (unlikely(expires.tv64 < 0)) { 209 if (unlikely(expires.tv64 < 0)) {
110 WARN_ON_ONCE(1); 210 WARN_ON_ONCE(1);
111 return -ETIME; 211 return -ETIME;
112 } 212 }
113 213
114 delta = ktime_to_ns(ktime_sub(expires, now));
115
116 if (delta <= 0)
117 return -ETIME;
118
119 dev->next_event = expires; 214 dev->next_event = expires;
120 215
121 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) 216 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
122 return 0; 217 return 0;
123 218
124 if (delta > dev->max_delta_ns) 219 /* Shortcut for clockevent devices that can deal with ktime. */
125 delta = dev->max_delta_ns; 220 if (dev->features & CLOCK_EVT_FEAT_KTIME)
126 if (delta < dev->min_delta_ns) 221 return dev->set_next_ktime(expires, dev);
127 delta = dev->min_delta_ns; 222
223 delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
224 if (delta <= 0)
225 return force ? clockevents_program_min_delta(dev) : -ETIME;
128 226
129 clc = delta * dev->mult; 227 delta = min(delta, (int64_t) dev->max_delta_ns);
130 clc >>= dev->shift; 228 delta = max(delta, (int64_t) dev->min_delta_ns);
131 229
132 return dev->set_next_event((unsigned long) clc, dev); 230 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
231 rc = dev->set_next_event((unsigned long) clc, dev);
232
233 return (rc && force) ? clockevents_program_min_delta(dev) : rc;
133} 234}
134 235
135/** 236/**
@@ -258,7 +359,7 @@ int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
258 if (dev->mode != CLOCK_EVT_MODE_ONESHOT) 359 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
259 return 0; 360 return 0;
260 361
261 return clockevents_program_event(dev, dev->next_event, ktime_get()); 362 return clockevents_program_event(dev, dev->next_event, false);
262} 363}
263 364
264/* 365/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e0980f0d9a0a..cf52fda2e096 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -186,6 +186,7 @@ static struct timer_list watchdog_timer;
186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); 186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
187static DEFINE_SPINLOCK(watchdog_lock); 187static DEFINE_SPINLOCK(watchdog_lock);
188static int watchdog_running; 188static int watchdog_running;
189static atomic_t watchdog_reset_pending;
189 190
190static int clocksource_watchdog_kthread(void *data); 191static int clocksource_watchdog_kthread(void *data);
191static void __clocksource_change_rating(struct clocksource *cs, int rating); 192static void __clocksource_change_rating(struct clocksource *cs, int rating);
@@ -247,12 +248,14 @@ static void clocksource_watchdog(unsigned long data)
247 struct clocksource *cs; 248 struct clocksource *cs;
248 cycle_t csnow, wdnow; 249 cycle_t csnow, wdnow;
249 int64_t wd_nsec, cs_nsec; 250 int64_t wd_nsec, cs_nsec;
250 int next_cpu; 251 int next_cpu, reset_pending;
251 252
252 spin_lock(&watchdog_lock); 253 spin_lock(&watchdog_lock);
253 if (!watchdog_running) 254 if (!watchdog_running)
254 goto out; 255 goto out;
255 256
257 reset_pending = atomic_read(&watchdog_reset_pending);
258
256 list_for_each_entry(cs, &watchdog_list, wd_list) { 259 list_for_each_entry(cs, &watchdog_list, wd_list) {
257 260
258 /* Clocksource already marked unstable? */ 261 /* Clocksource already marked unstable? */
@@ -268,7 +271,8 @@ static void clocksource_watchdog(unsigned long data)
268 local_irq_enable(); 271 local_irq_enable();
269 272
270 /* Clocksource initialized ? */ 273 /* Clocksource initialized ? */
271 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 274 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
275 atomic_read(&watchdog_reset_pending)) {
272 cs->flags |= CLOCK_SOURCE_WATCHDOG; 276 cs->flags |= CLOCK_SOURCE_WATCHDOG;
273 cs->wd_last = wdnow; 277 cs->wd_last = wdnow;
274 cs->cs_last = csnow; 278 cs->cs_last = csnow;
@@ -283,8 +287,11 @@ static void clocksource_watchdog(unsigned long data)
283 cs->cs_last = csnow; 287 cs->cs_last = csnow;
284 cs->wd_last = wdnow; 288 cs->wd_last = wdnow;
285 289
290 if (atomic_read(&watchdog_reset_pending))
291 continue;
292
286 /* Check the deviation from the watchdog clocksource. */ 293 /* Check the deviation from the watchdog clocksource. */
287 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { 294 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
288 clocksource_unstable(cs, cs_nsec - wd_nsec); 295 clocksource_unstable(cs, cs_nsec - wd_nsec);
289 continue; 296 continue;
290 } 297 }
@@ -303,6 +310,13 @@ static void clocksource_watchdog(unsigned long data)
303 } 310 }
304 311
305 /* 312 /*
313 * We only clear the watchdog_reset_pending, when we did a
314 * full cycle through all clocksources.
315 */
316 if (reset_pending)
317 atomic_dec(&watchdog_reset_pending);
318
319 /*
306 * Cycle through CPUs to check if the CPUs stay synchronized 320 * Cycle through CPUs to check if the CPUs stay synchronized
307 * to each other. 321 * to each other.
308 */ 322 */
@@ -344,23 +358,7 @@ static inline void clocksource_reset_watchdog(void)
344 358
345static void clocksource_resume_watchdog(void) 359static void clocksource_resume_watchdog(void)
346{ 360{
347 unsigned long flags; 361 atomic_inc(&watchdog_reset_pending);
348
349 /*
350 * We use trylock here to avoid a potential dead lock when
351 * kgdb calls this code after the kernel has been stopped with
352 * watchdog_lock held. When watchdog_lock is held we just
353 * return and accept, that the watchdog might trigger and mark
354 * the monitored clock source (usually TSC) unstable.
355 *
356 * This does not affect the other caller clocksource_resume()
357 * because at this point the kernel is UP, interrupts are
358 * disabled and nothing can hold watchdog_lock.
359 */
360 if (!spin_trylock_irqsave(&watchdog_lock, flags))
361 return;
362 clocksource_reset_watchdog();
363 spin_unlock_irqrestore(&watchdog_lock, flags);
364} 362}
365 363
366static void clocksource_enqueue_watchdog(struct clocksource *cs) 364static void clocksource_enqueue_watchdog(struct clocksource *cs)
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c7218d132738..f954282d9a82 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -194,7 +194,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
194 for (next = dev->next_event; ;) { 194 for (next = dev->next_event; ;) {
195 next = ktime_add(next, tick_period); 195 next = ktime_add(next, tick_period);
196 196
197 if (!clockevents_program_event(dev, next, ktime_get())) 197 if (!clockevents_program_event(dev, next, false))
198 return; 198 return;
199 tick_do_periodic_broadcast(); 199 tick_do_periodic_broadcast();
200 } 200 }
@@ -373,7 +373,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
373{ 373{
374 struct clock_event_device *bc = tick_broadcast_device.evtdev; 374 struct clock_event_device *bc = tick_broadcast_device.evtdev;
375 375
376 return tick_dev_program_event(bc, expires, force); 376 return clockevents_program_event(bc, expires, force);
377} 377}
378 378
379int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 379int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 119528de8235..da6c9ecad4e4 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -94,7 +94,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
94 */ 94 */
95 next = ktime_add(dev->next_event, tick_period); 95 next = ktime_add(dev->next_event, tick_period);
96 for (;;) { 96 for (;;) {
97 if (!clockevents_program_event(dev, next, ktime_get())) 97 if (!clockevents_program_event(dev, next, false))
98 return; 98 return;
99 /* 99 /*
100 * Have to be careful here. If we're in oneshot mode, 100 * Have to be careful here. If we're in oneshot mode,
@@ -137,7 +137,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
138 138
139 for (;;) { 139 for (;;) {
140 if (!clockevents_program_event(dev, next, ktime_get())) 140 if (!clockevents_program_event(dev, next, false))
141 return; 141 return;
142 next = ktime_add(next, tick_period); 142 next = ktime_add(next, tick_period);
143 } 143 }
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 1009b06d6f89..4e265b901fed 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -26,8 +26,6 @@ extern void clockevents_shutdown(struct clock_event_device *dev);
26extern void tick_setup_oneshot(struct clock_event_device *newdev, 26extern void tick_setup_oneshot(struct clock_event_device *newdev,
27 void (*handler)(struct clock_event_device *), 27 void (*handler)(struct clock_event_device *),
28 ktime_t nextevt); 28 ktime_t nextevt);
29extern int tick_dev_program_event(struct clock_event_device *dev,
30 ktime_t expires, int force);
31extern int tick_program_event(ktime_t expires, int force); 29extern int tick_program_event(ktime_t expires, int force);
32extern void tick_oneshot_notify(void); 30extern void tick_oneshot_notify(void);
33extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 31extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2d04411a5f05..824109060a33 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -21,74 +21,6 @@
21 21
22#include "tick-internal.h" 22#include "tick-internal.h"
23 23
24/* Limit min_delta to a jiffie */
25#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
26
27static int tick_increase_min_delta(struct clock_event_device *dev)
28{
29 /* Nothing to do if we already reached the limit */
30 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
31 return -ETIME;
32
33 if (dev->min_delta_ns < 5000)
34 dev->min_delta_ns = 5000;
35 else
36 dev->min_delta_ns += dev->min_delta_ns >> 1;
37
38 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
39 dev->min_delta_ns = MIN_DELTA_LIMIT;
40
41 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
42 dev->name ? dev->name : "?",
43 (unsigned long long) dev->min_delta_ns);
44 return 0;
45}
46
47/**
48 * tick_program_event internal worker function
49 */
50int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
51 int force)
52{
53 ktime_t now = ktime_get();
54 int i;
55
56 for (i = 0;;) {
57 int ret = clockevents_program_event(dev, expires, now);
58
59 if (!ret || !force)
60 return ret;
61
62 dev->retries++;
63 /*
64 * We tried 3 times to program the device with the given
65 * min_delta_ns. If that's not working then we increase it
66 * and emit a warning.
67 */
68 if (++i > 2) {
69 /* Increase the min. delta and try again */
70 if (tick_increase_min_delta(dev)) {
71 /*
72 * Get out of the loop if min_delta_ns
73 * hit the limit already. That's
74 * better than staying here forever.
75 *
76 * We clear next_event so we have a
77 * chance that the box survives.
78 */
79 printk(KERN_WARNING
80 "CE: Reprogramming failure. Giving up\n");
81 dev->next_event.tv64 = KTIME_MAX;
82 return -ETIME;
83 }
84 i = 0;
85 }
86
87 now = ktime_get();
88 expires = ktime_add_ns(now, dev->min_delta_ns);
89 }
90}
91
92/** 24/**
93 * tick_program_event 25 * tick_program_event
94 */ 26 */
@@ -96,7 +28,7 @@ int tick_program_event(ktime_t expires, int force)
96{ 28{
97 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 29 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
98 30
99 return tick_dev_program_event(dev, expires, force); 31 return clockevents_program_event(dev, expires, force);
100} 32}
101 33
102/** 34/**
@@ -104,11 +36,10 @@ int tick_program_event(ktime_t expires, int force)
104 */ 36 */
105void tick_resume_oneshot(void) 37void tick_resume_oneshot(void)
106{ 38{
107 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 39 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
108 struct clock_event_device *dev = td->evtdev;
109 40
110 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 41 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
111 tick_program_event(ktime_get(), 1); 42 clockevents_program_event(dev, ktime_get(), true);
112} 43}
113 44
114/** 45/**
@@ -120,7 +51,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
120{ 51{
121 newdev->event_handler = handler; 52 newdev->event_handler = handler;
122 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 53 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
123 tick_dev_program_event(newdev, next_event, 1); 54 clockevents_program_event(newdev, next_event, true);
124} 55}
125 56
126/** 57/**
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d5097c44b407..40420644d0ba 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now)
139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
140 unsigned long flags; 140 unsigned long flags;
141 141
142 cpumask_clear_cpu(cpu, nohz_cpu_mask);
143 ts->idle_waketime = now; 142 ts->idle_waketime = now;
144 143
145 local_irq_save(flags); 144 local_irq_save(flags);
@@ -159,9 +158,10 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
159 158
160 if (ts->idle_active) { 159 if (ts->idle_active) {
161 delta = ktime_sub(now, ts->idle_entrytime); 160 delta = ktime_sub(now, ts->idle_entrytime);
162 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
163 if (nr_iowait_cpu(cpu) > 0) 161 if (nr_iowait_cpu(cpu) > 0)
164 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); 162 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
163 else
164 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
165 ts->idle_entrytime = now; 165 ts->idle_entrytime = now;
166 } 166 }
167 167
@@ -197,11 +197,11 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
197/** 197/**
198 * get_cpu_idle_time_us - get the total idle time of a cpu 198 * get_cpu_idle_time_us - get the total idle time of a cpu
199 * @cpu: CPU number to query 199 * @cpu: CPU number to query
200 * @last_update_time: variable to store update time in 200 * @last_update_time: variable to store update time in. Do not update
201 * counters if NULL.
201 * 202 *
202 * Return the cummulative idle time (since boot) for a given 203 * Return the cummulative idle time (since boot) for a given
203 * CPU, in microseconds. The idle time returned includes 204 * CPU, in microseconds.
204 * the iowait time (unlike what "top" and co report).
205 * 205 *
206 * This time is measured via accounting rather than sampling, 206 * This time is measured via accounting rather than sampling,
207 * and is as accurate as ktime_get() is. 207 * and is as accurate as ktime_get() is.
@@ -211,20 +211,35 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
211u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 211u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
212{ 212{
213 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 213 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
214 ktime_t now, idle;
214 215
215 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
216 return -1; 217 return -1;
217 218
218 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); 219 now = ktime_get();
220 if (last_update_time) {
221 update_ts_time_stats(cpu, ts, now, last_update_time);
222 idle = ts->idle_sleeptime;
223 } else {
224 if (ts->idle_active && !nr_iowait_cpu(cpu)) {
225 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
226
227 idle = ktime_add(ts->idle_sleeptime, delta);
228 } else {
229 idle = ts->idle_sleeptime;
230 }
231 }
232
233 return ktime_to_us(idle);
219 234
220 return ktime_to_us(ts->idle_sleeptime);
221} 235}
222EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 236EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
223 237
224/* 238/**
225 * get_cpu_iowait_time_us - get the total iowait time of a cpu 239 * get_cpu_iowait_time_us - get the total iowait time of a cpu
226 * @cpu: CPU number to query 240 * @cpu: CPU number to query
227 * @last_update_time: variable to store update time in 241 * @last_update_time: variable to store update time in. Do not update
242 * counters if NULL.
228 * 243 *
229 * Return the cummulative iowait time (since boot) for a given 244 * Return the cummulative iowait time (since boot) for a given
230 * CPU, in microseconds. 245 * CPU, in microseconds.
@@ -237,13 +252,26 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
237u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) 252u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
238{ 253{
239 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 254 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
255 ktime_t now, iowait;
240 256
241 if (!tick_nohz_enabled) 257 if (!tick_nohz_enabled)
242 return -1; 258 return -1;
243 259
244 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); 260 now = ktime_get();
261 if (last_update_time) {
262 update_ts_time_stats(cpu, ts, now, last_update_time);
263 iowait = ts->iowait_sleeptime;
264 } else {
265 if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
266 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
267
268 iowait = ktime_add(ts->iowait_sleeptime, delta);
269 } else {
270 iowait = ts->iowait_sleeptime;
271 }
272 }
245 273
246 return ktime_to_us(ts->iowait_sleeptime); 274 return ktime_to_us(iowait);
247} 275}
248EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
249 277
@@ -389,9 +417,6 @@ void tick_nohz_stop_sched_tick(int inidle)
389 else 417 else
390 expires.tv64 = KTIME_MAX; 418 expires.tv64 = KTIME_MAX;
391 419
392 if (delta_jiffies > 1)
393 cpumask_set_cpu(cpu, nohz_cpu_mask);
394
395 /* Skip reprogram of event if its not changed */ 420 /* Skip reprogram of event if its not changed */
396 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 421 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
397 goto out; 422 goto out;
@@ -441,7 +466,6 @@ void tick_nohz_stop_sched_tick(int inidle)
441 * softirq. 466 * softirq.
442 */ 467 */
443 tick_do_update_jiffies64(ktime_get()); 468 tick_do_update_jiffies64(ktime_get());
444 cpumask_clear_cpu(cpu, nohz_cpu_mask);
445 } 469 }
446 raise_softirq_irqoff(TIMER_SOFTIRQ); 470 raise_softirq_irqoff(TIMER_SOFTIRQ);
447out: 471out:
@@ -524,7 +548,6 @@ void tick_nohz_restart_sched_tick(void)
524 /* Update jiffies first */ 548 /* Update jiffies first */
525 select_nohz_load_balancer(0); 549 select_nohz_load_balancer(0);
526 tick_do_update_jiffies64(now); 550 tick_do_update_jiffies64(now);
527 cpumask_clear_cpu(cpu, nohz_cpu_mask);
528 551
529#ifndef CONFIG_VIRT_CPU_ACCOUNTING 552#ifndef CONFIG_VIRT_CPU_ACCOUNTING
530 /* 553 /*
@@ -640,8 +663,6 @@ static void tick_nohz_switch_to_nohz(void)
640 next = ktime_add(next, tick_period); 663 next = ktime_add(next, tick_period);
641 } 664 }
642 local_irq_enable(); 665 local_irq_enable();
643
644 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
645} 666}
646 667
647/* 668/*
@@ -793,10 +814,8 @@ void tick_setup_sched_timer(void)
793 } 814 }
794 815
795#ifdef CONFIG_NO_HZ 816#ifdef CONFIG_NO_HZ
796 if (tick_nohz_enabled) { 817 if (tick_nohz_enabled)
797 ts->nohz_mode = NOHZ_MODE_HIGHRES; 818 ts->nohz_mode = NOHZ_MODE_HIGHRES;
798 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
799 }
800#endif 819#endif
801} 820}
802#endif /* HIGH_RES_TIMERS */ 821#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index a5d0a3a85dd8..0b537f27b559 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -81,7 +81,7 @@ struct entry {
81/* 81/*
82 * Spinlock protecting the tables - not taken during lookup: 82 * Spinlock protecting the tables - not taken during lookup:
83 */ 83 */
84static DEFINE_SPINLOCK(table_lock); 84static DEFINE_RAW_SPINLOCK(table_lock);
85 85
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
@@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
188 prev = NULL; 188 prev = NULL;
189 curr = *head; 189 curr = *head;
190 190
191 spin_lock(&table_lock); 191 raw_spin_lock(&table_lock);
192 /* 192 /*
193 * Make sure we have not raced with another CPU: 193 * Make sure we have not raced with another CPU:
194 */ 194 */
@@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
215 *head = curr; 215 *head = curr;
216 } 216 }
217 out_unlock: 217 out_unlock:
218 spin_unlock(&table_lock); 218 raw_spin_unlock(&table_lock);
219 219
220 return curr; 220 return curr;
221} 221}
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c5..5f39a07fe5ea 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif 16endif
17 17
18CFLAGS_trace_events_filter.o := -I$(src)
19
18# 20#
19# Make the trace clocks available generally: it's infrastructure 21# Make the trace clocks available generally: it's infrastructure
20# relied on by ptrace for example: 22# relied on by ptrace for example:
@@ -53,6 +55,9 @@ endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_TRACEPOINTS) += power-traces.o 57obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM_RUNTIME),y)
59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
60endif
56ifeq ($(CONFIG_TRACING),y) 61ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 62obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 63endif
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c3e4575e7829..077d85387908 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3863,6 +3863,14 @@ void ftrace_kill(void)
3863} 3863}
3864 3864
3865/** 3865/**
3866 * Test if ftrace is dead or not.
3867 */
3868int ftrace_is_dead(void)
3869{
3870 return ftrace_disabled;
3871}
3872
3873/**
3866 * register_ftrace_function - register a function for profiling 3874 * register_ftrace_function - register a function for profiling
3867 * @ops - ops structure that holds the function for profiling. 3875 * @ops - ops structure that holds the function for profiling.
3868 * 3876 *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 731201bf4acc..f5b7b5c1195b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -478,7 +478,7 @@ struct ring_buffer_per_cpu {
478 int cpu; 478 int cpu;
479 atomic_t record_disabled; 479 atomic_t record_disabled;
480 struct ring_buffer *buffer; 480 struct ring_buffer *buffer;
481 spinlock_t reader_lock; /* serialize readers */ 481 raw_spinlock_t reader_lock; /* serialize readers */
482 arch_spinlock_t lock; 482 arch_spinlock_t lock;
483 struct lock_class_key lock_key; 483 struct lock_class_key lock_key;
484 struct list_head *pages; 484 struct list_head *pages;
@@ -488,12 +488,14 @@ struct ring_buffer_per_cpu {
488 struct buffer_page *reader_page; 488 struct buffer_page *reader_page;
489 unsigned long lost_events; 489 unsigned long lost_events;
490 unsigned long last_overrun; 490 unsigned long last_overrun;
491 local_t entries_bytes;
491 local_t commit_overrun; 492 local_t commit_overrun;
492 local_t overrun; 493 local_t overrun;
493 local_t entries; 494 local_t entries;
494 local_t committing; 495 local_t committing;
495 local_t commits; 496 local_t commits;
496 unsigned long read; 497 unsigned long read;
498 unsigned long read_bytes;
497 u64 write_stamp; 499 u64 write_stamp;
498 u64 read_stamp; 500 u64 read_stamp;
499}; 501};
@@ -1062,7 +1064,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1062 1064
1063 cpu_buffer->cpu = cpu; 1065 cpu_buffer->cpu = cpu;
1064 cpu_buffer->buffer = buffer; 1066 cpu_buffer->buffer = buffer;
1065 spin_lock_init(&cpu_buffer->reader_lock); 1067 raw_spin_lock_init(&cpu_buffer->reader_lock);
1066 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1068 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1067 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1069 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1068 1070
@@ -1259,7 +1261,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1259 struct list_head *p; 1261 struct list_head *p;
1260 unsigned i; 1262 unsigned i;
1261 1263
1262 spin_lock_irq(&cpu_buffer->reader_lock); 1264 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1263 rb_head_page_deactivate(cpu_buffer); 1265 rb_head_page_deactivate(cpu_buffer);
1264 1266
1265 for (i = 0; i < nr_pages; i++) { 1267 for (i = 0; i < nr_pages; i++) {
@@ -1277,7 +1279,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1277 rb_check_pages(cpu_buffer); 1279 rb_check_pages(cpu_buffer);
1278 1280
1279out: 1281out:
1280 spin_unlock_irq(&cpu_buffer->reader_lock); 1282 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1281} 1283}
1282 1284
1283static void 1285static void
@@ -1288,7 +1290,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1288 struct list_head *p; 1290 struct list_head *p;
1289 unsigned i; 1291 unsigned i;
1290 1292
1291 spin_lock_irq(&cpu_buffer->reader_lock); 1293 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1292 rb_head_page_deactivate(cpu_buffer); 1294 rb_head_page_deactivate(cpu_buffer);
1293 1295
1294 for (i = 0; i < nr_pages; i++) { 1296 for (i = 0; i < nr_pages; i++) {
@@ -1303,7 +1305,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1303 rb_check_pages(cpu_buffer); 1305 rb_check_pages(cpu_buffer);
1304 1306
1305out: 1307out:
1306 spin_unlock_irq(&cpu_buffer->reader_lock); 1308 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1307} 1309}
1308 1310
1309/** 1311/**
@@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1708 * the counters. 1710 * the counters.
1709 */ 1711 */
1710 local_add(entries, &cpu_buffer->overrun); 1712 local_add(entries, &cpu_buffer->overrun);
1713 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1711 1714
1712 /* 1715 /*
1713 * The entries will be zeroed out when we move the 1716 * The entries will be zeroed out when we move the
@@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1863 event = __rb_page_index(tail_page, tail); 1866 event = __rb_page_index(tail_page, tail);
1864 kmemcheck_annotate_bitfield(event, bitfield); 1867 kmemcheck_annotate_bitfield(event, bitfield);
1865 1868
1869 /* account for padding bytes */
1870 local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
1871
1866 /* 1872 /*
1867 * Save the original length to the meta data. 1873 * Save the original length to the meta data.
1868 * This will be used by the reader to add lost event 1874 * This will be used by the reader to add lost event
@@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2054 if (!tail) 2060 if (!tail)
2055 tail_page->page->time_stamp = ts; 2061 tail_page->page->time_stamp = ts;
2056 2062
2063 /* account for these added bytes */
2064 local_add(length, &cpu_buffer->entries_bytes);
2065
2057 return event; 2066 return event;
2058} 2067}
2059 2068
@@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2076 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 2085 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
2077 unsigned long write_mask = 2086 unsigned long write_mask =
2078 local_read(&bpage->write) & ~RB_WRITE_MASK; 2087 local_read(&bpage->write) & ~RB_WRITE_MASK;
2088 unsigned long event_length = rb_event_length(event);
2079 /* 2089 /*
2080 * This is on the tail page. It is possible that 2090 * This is on the tail page. It is possible that
2081 * a write could come in and move the tail page 2091 * a write could come in and move the tail page
@@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2085 old_index += write_mask; 2095 old_index += write_mask;
2086 new_index += write_mask; 2096 new_index += write_mask;
2087 index = local_cmpxchg(&bpage->write, old_index, new_index); 2097 index = local_cmpxchg(&bpage->write, old_index, new_index);
2088 if (index == old_index) 2098 if (index == old_index) {
2099 /* update counters */
2100 local_sub(event_length, &cpu_buffer->entries_bytes);
2089 return 1; 2101 return 1;
2102 }
2090 } 2103 }
2091 2104
2092 /* could not discard */ 2105 /* could not discard */
@@ -2661,6 +2674,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2661} 2674}
2662 2675
2663/** 2676/**
2677 * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
2678 * @buffer: The ring buffer
2679 * @cpu: The per CPU buffer to read from.
2680 */
2681unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2682{
2683 unsigned long flags;
2684 struct ring_buffer_per_cpu *cpu_buffer;
2685 struct buffer_page *bpage;
2686 unsigned long ret;
2687
2688 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2689 return 0;
2690
2691 cpu_buffer = buffer->buffers[cpu];
2692 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2693 /*
2694 * if the tail is on reader_page, oldest time stamp is on the reader
2695 * page
2696 */
2697 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
2698 bpage = cpu_buffer->reader_page;
2699 else
2700 bpage = rb_set_head_page(cpu_buffer);
2701 ret = bpage->page->time_stamp;
2702 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2703
2704 return ret;
2705}
2706EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
2707
2708/**
2709 * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
2710 * @buffer: The ring buffer
2711 * @cpu: The per CPU buffer to read from.
2712 */
2713unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
2714{
2715 struct ring_buffer_per_cpu *cpu_buffer;
2716 unsigned long ret;
2717
2718 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2719 return 0;
2720
2721 cpu_buffer = buffer->buffers[cpu];
2722 ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
2723
2724 return ret;
2725}
2726EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
2727
2728/**
2664 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2729 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2665 * @buffer: The ring buffer 2730 * @buffer: The ring buffer
2666 * @cpu: The per CPU buffer to get the entries from. 2731 * @cpu: The per CPU buffer to get the entries from.
@@ -2804,9 +2869,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
2804 2869
2805 cpu_buffer = iter->cpu_buffer; 2870 cpu_buffer = iter->cpu_buffer;
2806 2871
2807 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2872 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2808 rb_iter_reset(iter); 2873 rb_iter_reset(iter);
2809 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2874 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2810} 2875}
2811EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); 2876EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
2812 2877
@@ -3265,12 +3330,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3265 again: 3330 again:
3266 local_irq_save(flags); 3331 local_irq_save(flags);
3267 if (dolock) 3332 if (dolock)
3268 spin_lock(&cpu_buffer->reader_lock); 3333 raw_spin_lock(&cpu_buffer->reader_lock);
3269 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3334 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3270 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3335 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3271 rb_advance_reader(cpu_buffer); 3336 rb_advance_reader(cpu_buffer);
3272 if (dolock) 3337 if (dolock)
3273 spin_unlock(&cpu_buffer->reader_lock); 3338 raw_spin_unlock(&cpu_buffer->reader_lock);
3274 local_irq_restore(flags); 3339 local_irq_restore(flags);
3275 3340
3276 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3341 if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3295,9 +3360,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3295 unsigned long flags; 3360 unsigned long flags;
3296 3361
3297 again: 3362 again:
3298 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3363 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3299 event = rb_iter_peek(iter, ts); 3364 event = rb_iter_peek(iter, ts);
3300 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3365 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3301 3366
3302 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3367 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3303 goto again; 3368 goto again;
@@ -3337,7 +3402,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3337 cpu_buffer = buffer->buffers[cpu]; 3402 cpu_buffer = buffer->buffers[cpu];
3338 local_irq_save(flags); 3403 local_irq_save(flags);
3339 if (dolock) 3404 if (dolock)
3340 spin_lock(&cpu_buffer->reader_lock); 3405 raw_spin_lock(&cpu_buffer->reader_lock);
3341 3406
3342 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3407 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3343 if (event) { 3408 if (event) {
@@ -3346,7 +3411,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3346 } 3411 }
3347 3412
3348 if (dolock) 3413 if (dolock)
3349 spin_unlock(&cpu_buffer->reader_lock); 3414 raw_spin_unlock(&cpu_buffer->reader_lock);
3350 local_irq_restore(flags); 3415 local_irq_restore(flags);
3351 3416
3352 out: 3417 out:
@@ -3438,11 +3503,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
3438 3503
3439 cpu_buffer = iter->cpu_buffer; 3504 cpu_buffer = iter->cpu_buffer;
3440 3505
3441 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3506 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3442 arch_spin_lock(&cpu_buffer->lock); 3507 arch_spin_lock(&cpu_buffer->lock);
3443 rb_iter_reset(iter); 3508 rb_iter_reset(iter);
3444 arch_spin_unlock(&cpu_buffer->lock); 3509 arch_spin_unlock(&cpu_buffer->lock);
3445 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3510 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3446} 3511}
3447EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3512EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3448 3513
@@ -3477,7 +3542,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3477 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3542 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3478 unsigned long flags; 3543 unsigned long flags;
3479 3544
3480 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3545 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3481 again: 3546 again:
3482 event = rb_iter_peek(iter, ts); 3547 event = rb_iter_peek(iter, ts);
3483 if (!event) 3548 if (!event)
@@ -3488,7 +3553,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3488 3553
3489 rb_advance_iter(iter); 3554 rb_advance_iter(iter);
3490 out: 3555 out:
3491 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3556 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3492 3557
3493 return event; 3558 return event;
3494} 3559}
@@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3527 cpu_buffer->reader_page->read = 0; 3592 cpu_buffer->reader_page->read = 0;
3528 3593
3529 local_set(&cpu_buffer->commit_overrun, 0); 3594 local_set(&cpu_buffer->commit_overrun, 0);
3595 local_set(&cpu_buffer->entries_bytes, 0);
3530 local_set(&cpu_buffer->overrun, 0); 3596 local_set(&cpu_buffer->overrun, 0);
3531 local_set(&cpu_buffer->entries, 0); 3597 local_set(&cpu_buffer->entries, 0);
3532 local_set(&cpu_buffer->committing, 0); 3598 local_set(&cpu_buffer->committing, 0);
3533 local_set(&cpu_buffer->commits, 0); 3599 local_set(&cpu_buffer->commits, 0);
3534 cpu_buffer->read = 0; 3600 cpu_buffer->read = 0;
3601 cpu_buffer->read_bytes = 0;
3535 3602
3536 cpu_buffer->write_stamp = 0; 3603 cpu_buffer->write_stamp = 0;
3537 cpu_buffer->read_stamp = 0; 3604 cpu_buffer->read_stamp = 0;
@@ -3557,7 +3624,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3557 3624
3558 atomic_inc(&cpu_buffer->record_disabled); 3625 atomic_inc(&cpu_buffer->record_disabled);
3559 3626
3560 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3627 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3561 3628
3562 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3629 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3563 goto out; 3630 goto out;
@@ -3569,7 +3636,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3569 arch_spin_unlock(&cpu_buffer->lock); 3636 arch_spin_unlock(&cpu_buffer->lock);
3570 3637
3571 out: 3638 out:
3572 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3639 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3573 3640
3574 atomic_dec(&cpu_buffer->record_disabled); 3641 atomic_dec(&cpu_buffer->record_disabled);
3575} 3642}
@@ -3607,10 +3674,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)
3607 cpu_buffer = buffer->buffers[cpu]; 3674 cpu_buffer = buffer->buffers[cpu];
3608 local_irq_save(flags); 3675 local_irq_save(flags);
3609 if (dolock) 3676 if (dolock)
3610 spin_lock(&cpu_buffer->reader_lock); 3677 raw_spin_lock(&cpu_buffer->reader_lock);
3611 ret = rb_per_cpu_empty(cpu_buffer); 3678 ret = rb_per_cpu_empty(cpu_buffer);
3612 if (dolock) 3679 if (dolock)
3613 spin_unlock(&cpu_buffer->reader_lock); 3680 raw_spin_unlock(&cpu_buffer->reader_lock);
3614 local_irq_restore(flags); 3681 local_irq_restore(flags);
3615 3682
3616 if (!ret) 3683 if (!ret)
@@ -3641,10 +3708,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
3641 cpu_buffer = buffer->buffers[cpu]; 3708 cpu_buffer = buffer->buffers[cpu];
3642 local_irq_save(flags); 3709 local_irq_save(flags);
3643 if (dolock) 3710 if (dolock)
3644 spin_lock(&cpu_buffer->reader_lock); 3711 raw_spin_lock(&cpu_buffer->reader_lock);
3645 ret = rb_per_cpu_empty(cpu_buffer); 3712 ret = rb_per_cpu_empty(cpu_buffer);
3646 if (dolock) 3713 if (dolock)
3647 spin_unlock(&cpu_buffer->reader_lock); 3714 raw_spin_unlock(&cpu_buffer->reader_lock);
3648 local_irq_restore(flags); 3715 local_irq_restore(flags);
3649 3716
3650 return ret; 3717 return ret;
@@ -3841,7 +3908,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3841 if (!bpage) 3908 if (!bpage)
3842 goto out; 3909 goto out;
3843 3910
3844 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3911 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3845 3912
3846 reader = rb_get_reader_page(cpu_buffer); 3913 reader = rb_get_reader_page(cpu_buffer);
3847 if (!reader) 3914 if (!reader)
@@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3918 } else { 3985 } else {
3919 /* update the entry counter */ 3986 /* update the entry counter */
3920 cpu_buffer->read += rb_page_entries(reader); 3987 cpu_buffer->read += rb_page_entries(reader);
3988 cpu_buffer->read_bytes += BUF_PAGE_SIZE;
3921 3989
3922 /* swap the pages */ 3990 /* swap the pages */
3923 rb_init_page(bpage); 3991 rb_init_page(bpage);
@@ -3964,7 +4032,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3964 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); 4032 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
3965 4033
3966 out_unlock: 4034 out_unlock:
3967 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 4035 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3968 4036
3969 out: 4037 out:
3970 return ret; 4038 return ret;
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
new file mode 100644
index 000000000000..4b3b5eaf94d1
--- /dev/null
+++ b/kernel/trace/rpm-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/usb.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/rpm.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int);
18EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle);
19EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend);
20EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e5df02c69b1d..f2bd275bb60f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -341,7 +341,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
342 342
343static int trace_stop_count; 343static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_RAW_SPINLOCK(tracing_start_lock);
345 345
346static void wakeup_work_handler(struct work_struct *work) 346static void wakeup_work_handler(struct work_struct *work)
347{ 347{
@@ -435,6 +435,7 @@ static struct {
435} trace_clocks[] = { 435} trace_clocks[] = {
436 { trace_clock_local, "local" }, 436 { trace_clock_local, "local" },
437 { trace_clock_global, "global" }, 437 { trace_clock_global, "global" },
438 { trace_clock_counter, "counter" },
438}; 439};
439 440
440int trace_clock_id; 441int trace_clock_id;
@@ -960,7 +961,7 @@ void tracing_start(void)
960 if (tracing_disabled) 961 if (tracing_disabled)
961 return; 962 return;
962 963
963 spin_lock_irqsave(&tracing_start_lock, flags); 964 raw_spin_lock_irqsave(&tracing_start_lock, flags);
964 if (--trace_stop_count) { 965 if (--trace_stop_count) {
965 if (trace_stop_count < 0) { 966 if (trace_stop_count < 0) {
966 /* Someone screwed up their debugging */ 967 /* Someone screwed up their debugging */
@@ -985,7 +986,7 @@ void tracing_start(void)
985 986
986 ftrace_start(); 987 ftrace_start();
987 out: 988 out:
988 spin_unlock_irqrestore(&tracing_start_lock, flags); 989 raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
989} 990}
990 991
991/** 992/**
@@ -1000,7 +1001,7 @@ void tracing_stop(void)
1000 unsigned long flags; 1001 unsigned long flags;
1001 1002
1002 ftrace_stop(); 1003 ftrace_stop();
1003 spin_lock_irqsave(&tracing_start_lock, flags); 1004 raw_spin_lock_irqsave(&tracing_start_lock, flags);
1004 if (trace_stop_count++) 1005 if (trace_stop_count++)
1005 goto out; 1006 goto out;
1006 1007
@@ -1018,7 +1019,7 @@ void tracing_stop(void)
1018 arch_spin_unlock(&ftrace_max_lock); 1019 arch_spin_unlock(&ftrace_max_lock);
1019 1020
1020 out: 1021 out:
1021 spin_unlock_irqrestore(&tracing_start_lock, flags); 1022 raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
1022} 1023}
1023 1024
1024void trace_stop_cmdline_recording(void); 1025void trace_stop_cmdline_recording(void);
@@ -2159,6 +2160,14 @@ void trace_default_header(struct seq_file *m)
2159 } 2160 }
2160} 2161}
2161 2162
/*
 * Emit a warning banner into the trace output when ftrace_is_dead()
 * reports a non-zero value; print nothing otherwise.
 */
static void test_ftrace_alive(struct seq_file *m)
{
	if (!ftrace_is_dead())
		return;

	/* constant strings: no format processing needed, so use seq_puts() */
	seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
	seq_puts(m, "#          MAY BE MISSING FUNCTION EVENTS\n");
}
2170
2162static int s_show(struct seq_file *m, void *v) 2171static int s_show(struct seq_file *m, void *v)
2163{ 2172{
2164 struct trace_iterator *iter = v; 2173 struct trace_iterator *iter = v;
@@ -2168,6 +2177,7 @@ static int s_show(struct seq_file *m, void *v)
2168 if (iter->tr) { 2177 if (iter->tr) {
2169 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2178 seq_printf(m, "# tracer: %s\n", iter->trace->name);
2170 seq_puts(m, "#\n"); 2179 seq_puts(m, "#\n");
2180 test_ftrace_alive(m);
2171 } 2181 }
2172 if (iter->trace && iter->trace->print_header) 2182 if (iter->trace && iter->trace->print_header)
2173 iter->trace->print_header(m); 2183 iter->trace->print_header(m);
@@ -2710,9 +2720,9 @@ static const char readme_msg[] =
2710 "# cat /sys/kernel/debug/tracing/trace_options\n" 2720 "# cat /sys/kernel/debug/tracing/trace_options\n"
2711 "noprint-parent nosym-offset nosym-addr noverbose\n" 2721 "noprint-parent nosym-offset nosym-addr noverbose\n"
2712 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 2722 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2713 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" 2723 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n"
2714 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 2724 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2715 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" 2725 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n"
2716; 2726;
2717 2727
2718static ssize_t 2728static ssize_t
@@ -3569,6 +3579,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3569} 3579}
3570 3580
3571static ssize_t 3581static ssize_t
3582tracing_total_entries_read(struct file *filp, char __user *ubuf,
3583 size_t cnt, loff_t *ppos)
3584{
3585 struct trace_array *tr = filp->private_data;
3586 char buf[64];
3587 int r, cpu;
3588 unsigned long size = 0, expanded_size = 0;
3589
3590 mutex_lock(&trace_types_lock);
3591 for_each_tracing_cpu(cpu) {
3592 size += tr->entries >> 10;
3593 if (!ring_buffer_expanded)
3594 expanded_size += trace_buf_size >> 10;
3595 }
3596 if (ring_buffer_expanded)
3597 r = sprintf(buf, "%lu\n", size);
3598 else
3599 r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
3600 mutex_unlock(&trace_types_lock);
3601
3602 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3603}
3604
3605static ssize_t
3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf, 3606tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos) 3607 size_t cnt, loff_t *ppos)
3574{ 3608{
@@ -3594,22 +3628,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
3594 return 0; 3628 return 0;
3595} 3629}
3596 3630
3597static int mark_printk(const char *fmt, ...)
3598{
3599 int ret;
3600 va_list args;
3601 va_start(args, fmt);
3602 ret = trace_vprintk(0, fmt, args);
3603 va_end(args);
3604 return ret;
3605}
3606
3607static ssize_t 3631static ssize_t
3608tracing_mark_write(struct file *filp, const char __user *ubuf, 3632tracing_mark_write(struct file *filp, const char __user *ubuf,
3609 size_t cnt, loff_t *fpos) 3633 size_t cnt, loff_t *fpos)
3610{ 3634{
3611 char *buf; 3635 unsigned long addr = (unsigned long)ubuf;
3612 size_t written; 3636 struct ring_buffer_event *event;
3637 struct ring_buffer *buffer;
3638 struct print_entry *entry;
3639 unsigned long irq_flags;
3640 struct page *pages[2];
3641 int nr_pages = 1;
3642 ssize_t written;
3643 void *page1;
3644 void *page2;
3645 int offset;
3646 int size;
3647 int len;
3648 int ret;
3613 3649
3614 if (tracing_disabled) 3650 if (tracing_disabled)
3615 return -EINVAL; 3651 return -EINVAL;
@@ -3617,28 +3653,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3617 if (cnt > TRACE_BUF_SIZE) 3653 if (cnt > TRACE_BUF_SIZE)
3618 cnt = TRACE_BUF_SIZE; 3654 cnt = TRACE_BUF_SIZE;
3619 3655
3620 buf = kmalloc(cnt + 2, GFP_KERNEL); 3656 /*
3621 if (buf == NULL) 3657 * Userspace is injecting traces into the kernel trace buffer.
3622 return -ENOMEM; 3658 * We want to be as non intrusive as possible.
3659 * To do so, we do not want to allocate any special buffers
3660 * or take any locks, but instead write the userspace data
3661 * straight into the ring buffer.
3662 *
3663 * First we need to pin the userspace buffer into memory,
3664 * which, most likely it is, because it just referenced it.
3665 * But there's no guarantee that it is. By using get_user_pages_fast()
3666 * and kmap_atomic/kunmap_atomic() we can get access to the
3667 * pages directly. We then write the data directly into the
3668 * ring buffer.
3669 */
3670 BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
3623 3671
3624 if (copy_from_user(buf, ubuf, cnt)) { 3672 /* check if we cross pages */
3625 kfree(buf); 3673 if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
3626 return -EFAULT; 3674 nr_pages = 2;
3675
3676 offset = addr & (PAGE_SIZE - 1);
3677 addr &= PAGE_MASK;
3678
3679 ret = get_user_pages_fast(addr, nr_pages, 0, pages);
3680 if (ret < nr_pages) {
3681 while (--ret >= 0)
3682 put_page(pages[ret]);
3683 written = -EFAULT;
3684 goto out;
3685 }
3686
3687 page1 = kmap_atomic(pages[0]);
3688 if (nr_pages == 2)
3689 page2 = kmap_atomic(pages[1]);
3690
3691 local_save_flags(irq_flags);
3692 size = sizeof(*entry) + cnt + 2; /* possible \n added */
3693 buffer = global_trace.buffer;
3694 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
3695 irq_flags, preempt_count());
3696 if (!event) {
3697 /* Ring buffer disabled, return as if not open for write */
3698 written = -EBADF;
3699 goto out_unlock;
3627 } 3700 }
3628 if (buf[cnt-1] != '\n') { 3701
3629 buf[cnt] = '\n'; 3702 entry = ring_buffer_event_data(event);
3630 buf[cnt+1] = '\0'; 3703 entry->ip = _THIS_IP_;
3704
3705 if (nr_pages == 2) {
3706 len = PAGE_SIZE - offset;
3707 memcpy(&entry->buf, page1 + offset, len);
3708 memcpy(&entry->buf[len], page2, cnt - len);
3631 } else 3709 } else
3632 buf[cnt] = '\0'; 3710 memcpy(&entry->buf, page1 + offset, cnt);
3633 3711
3634 written = mark_printk("%s", buf); 3712 if (entry->buf[cnt - 1] != '\n') {
3635 kfree(buf); 3713 entry->buf[cnt] = '\n';
3636 *fpos += written; 3714 entry->buf[cnt + 1] = '\0';
3715 } else
3716 entry->buf[cnt] = '\0';
3717
3718 ring_buffer_unlock_commit(buffer, event);
3637 3719
3638 /* don't tell userspace we wrote more - it might confuse them */ 3720 written = cnt;
3639 if (written > cnt)
3640 written = cnt;
3641 3721
3722 *fpos += written;
3723
3724 out_unlock:
3725 if (nr_pages == 2)
3726 kunmap_atomic(page2);
3727 kunmap_atomic(page1);
3728 while (nr_pages > 0)
3729 put_page(pages[--nr_pages]);
3730 out:
3642 return written; 3731 return written;
3643} 3732}
3644 3733
@@ -3739,6 +3828,12 @@ static const struct file_operations tracing_entries_fops = {
3739 .llseek = generic_file_llseek, 3828 .llseek = generic_file_llseek,
3740}; 3829};
3741 3830
3831static const struct file_operations tracing_total_entries_fops = {
3832 .open = tracing_open_generic,
3833 .read = tracing_total_entries_read,
3834 .llseek = generic_file_llseek,
3835};
3836
3742static const struct file_operations tracing_free_buffer_fops = { 3837static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write, 3838 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release, 3839 .release = tracing_free_buffer_release,
@@ -3808,8 +3903,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3808 if (info->read < PAGE_SIZE) 3903 if (info->read < PAGE_SIZE)
3809 goto read; 3904 goto read;
3810 3905
3811 info->read = 0;
3812
3813 trace_access_lock(info->cpu); 3906 trace_access_lock(info->cpu);
3814 ret = ring_buffer_read_page(info->tr->buffer, 3907 ret = ring_buffer_read_page(info->tr->buffer,
3815 &info->spare, 3908 &info->spare,
@@ -3819,6 +3912,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3819 if (ret < 0) 3912 if (ret < 0)
3820 return 0; 3913 return 0;
3821 3914
3915 info->read = 0;
3916
3822read: 3917read:
3823 size = PAGE_SIZE - info->read; 3918 size = PAGE_SIZE - info->read;
3824 if (size > count) 3919 if (size > count)
@@ -4026,6 +4121,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4026 struct trace_array *tr = &global_trace; 4121 struct trace_array *tr = &global_trace;
4027 struct trace_seq *s; 4122 struct trace_seq *s;
4028 unsigned long cnt; 4123 unsigned long cnt;
4124 unsigned long long t;
4125 unsigned long usec_rem;
4029 4126
4030 s = kmalloc(sizeof(*s), GFP_KERNEL); 4127 s = kmalloc(sizeof(*s), GFP_KERNEL);
4031 if (!s) 4128 if (!s)
@@ -4042,6 +4139,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4042 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 4139 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
4043 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 4140 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4044 4141
4142 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4143 trace_seq_printf(s, "bytes: %ld\n", cnt);
4144
4145 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4146 usec_rem = do_div(t, USEC_PER_SEC);
4147 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem);
4148
4149 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4150 usec_rem = do_div(t, USEC_PER_SEC);
4151 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4152
4045 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4153 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4046 4154
4047 kfree(s); 4155 kfree(s);
@@ -4450,6 +4558,9 @@ static __init int tracer_init_debugfs(void)
4450 trace_create_file("buffer_size_kb", 0644, d_tracer, 4558 trace_create_file("buffer_size_kb", 0644, d_tracer,
4451 &global_trace, &tracing_entries_fops); 4559 &global_trace, &tracing_entries_fops);
4452 4560
4561 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
4562 &global_trace, &tracing_total_entries_fops);
4563
4453 trace_create_file("free_buffer", 0644, d_tracer, 4564 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops); 4565 &global_trace, &tracing_free_buffer_fops);
4455 4566
@@ -4566,6 +4677,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4566 4677
4567 tracing_off(); 4678 tracing_off();
4568 4679
4680 /* Did function tracer already get disabled? */
4681 if (ftrace_is_dead()) {
4682 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
4683 printk("# MAY BE MISSING FUNCTION EVENTS\n");
4684 }
4685
4569 if (disable_tracing) 4686 if (disable_tracing)
4570 ftrace_kill(); 4687 ftrace_kill();
4571 4688
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 616846bcfee5..092e1f8d18dc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -579,11 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task)
579 579
580 return test_tsk_trace_trace(task); 580 return test_tsk_trace_trace(task);
581} 581}
582extern int ftrace_is_dead(void);
582#else 583#else
583static inline int ftrace_trace_task(struct task_struct *task) 584static inline int ftrace_trace_task(struct task_struct *task)
584{ 585{
585 return 1; 586 return 1;
586} 587}
588static inline int ftrace_is_dead(void) { return 0; }
587#endif 589#endif
588 590
589/* 591/*
@@ -761,16 +763,10 @@ struct filter_pred {
761 filter_pred_fn_t fn; 763 filter_pred_fn_t fn;
762 u64 val; 764 u64 val;
763 struct regex regex; 765 struct regex regex;
764 /* 766 unsigned short *ops;
765 * Leaf nodes use field_name, ops is used by AND and OR 767#ifdef CONFIG_FTRACE_STARTUP_TEST
766 * nodes. The field_name is always freed when freeing a pred. 768 struct ftrace_event_field *field;
767 * We can overload field_name for ops and have it freed 769#endif
768 * as well.
769 */
770 union {
771 char *field_name;
772 unsigned short *ops;
773 };
774 int offset; 770 int offset;
775 int not; 771 int not;
776 int op; 772 int op;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 6302747a1398..394783531cbb 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void)
113 113
114 return now; 114 return now;
115} 115}
116
117static atomic64_t trace_counter;
118
119/*
120 * trace_clock_counter(): simply an atomic counter.
121 * Use the trace_counter "counter" for cases where you do not care
122 * about timings, but are interested in strict ordering.
123 */
124u64 notrace trace_clock_counter(void)
125{
126 return atomic64_add_return(1, &trace_counter);
127}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 256764ecccd6..816d3d074979 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -381,6 +381,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
381 return pred; 381 return pred;
382} 382}
383 383
384enum walk_return {
385 WALK_PRED_ABORT,
386 WALK_PRED_PARENT,
387 WALK_PRED_DEFAULT,
388};
389
390typedef int (*filter_pred_walkcb_t) (enum move_type move,
391 struct filter_pred *pred,
392 int *err, void *data);
393
394static int walk_pred_tree(struct filter_pred *preds,
395 struct filter_pred *root,
396 filter_pred_walkcb_t cb, void *data)
397{
398 struct filter_pred *pred = root;
399 enum move_type move = MOVE_DOWN;
400 int done = 0;
401
402 if (!preds)
403 return -EINVAL;
404
405 do {
406 int err = 0, ret;
407
408 ret = cb(move, pred, &err, data);
409 if (ret == WALK_PRED_ABORT)
410 return err;
411 if (ret == WALK_PRED_PARENT)
412 goto get_parent;
413
414 switch (move) {
415 case MOVE_DOWN:
416 if (pred->left != FILTER_PRED_INVALID) {
417 pred = &preds[pred->left];
418 continue;
419 }
420 goto get_parent;
421 case MOVE_UP_FROM_LEFT:
422 pred = &preds[pred->right];
423 move = MOVE_DOWN;
424 continue;
425 case MOVE_UP_FROM_RIGHT:
426 get_parent:
427 if (pred == root)
428 break;
429 pred = get_pred_parent(pred, preds,
430 pred->parent,
431 &move);
432 continue;
433 }
434 done = 1;
435 } while (!done);
436
437 /* We are fine. */
438 return 0;
439}
440
384/* 441/*
385 * A series of AND or ORs were found together. Instead of 442 * A series of AND or ORs were found together. Instead of
386 * climbing up and down the tree branches, an array of the 443 * climbing up and down the tree branches, an array of the
@@ -410,99 +467,91 @@ static int process_ops(struct filter_pred *preds,
410 467
411 for (i = 0; i < op->val; i++) { 468 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]]; 469 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec); 470 if (!WARN_ON_ONCE(!pred->fn))
471 match = pred->fn(pred, rec);
414 if (!!match == type) 472 if (!!match == type)
415 return match; 473 return match;
416 } 474 }
417 return match; 475 return match;
418} 476}
419 477
478struct filter_match_preds_data {
479 struct filter_pred *preds;
480 int match;
481 void *rec;
482};
483
484static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
485 int *err, void *data)
486{
487 struct filter_match_preds_data *d = data;
488
489 *err = 0;
490 switch (move) {
491 case MOVE_DOWN:
492 /* only AND and OR have children */
493 if (pred->left != FILTER_PRED_INVALID) {
494 /* If ops is set, then it was folded. */
495 if (!pred->ops)
496 return WALK_PRED_DEFAULT;
497 /* We can treat folded ops as a leaf node */
498 d->match = process_ops(d->preds, pred, d->rec);
499 } else {
500 if (!WARN_ON_ONCE(!pred->fn))
501 d->match = pred->fn(pred, d->rec);
502 }
503
504 return WALK_PRED_PARENT;
505 case MOVE_UP_FROM_LEFT:
506 /*
507 * Check for short circuits.
508 *
509 * Optimization: !!match == (pred->op == OP_OR)
510 * is the same as:
511 * if ((match && pred->op == OP_OR) ||
512 * (!match && pred->op == OP_AND))
513 */
514 if (!!d->match == (pred->op == OP_OR))
515 return WALK_PRED_PARENT;
516 break;
517 case MOVE_UP_FROM_RIGHT:
518 break;
519 }
520
521 return WALK_PRED_DEFAULT;
522}
523
420/* return 1 if event matches, 0 otherwise (discard) */ 524/* return 1 if event matches, 0 otherwise (discard) */
421int filter_match_preds(struct event_filter *filter, void *rec) 525int filter_match_preds(struct event_filter *filter, void *rec)
422{ 526{
423 int match = -1;
424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds; 527 struct filter_pred *preds;
426 struct filter_pred *pred;
427 struct filter_pred *root; 528 struct filter_pred *root;
428 int n_preds; 529 struct filter_match_preds_data data = {
429 int done = 0; 530 /* match is currently meaningless */
531 .match = -1,
532 .rec = rec,
533 };
534 int n_preds, ret;
430 535
431 /* no filter is considered a match */ 536 /* no filter is considered a match */
432 if (!filter) 537 if (!filter)
433 return 1; 538 return 1;
434 539
435 n_preds = filter->n_preds; 540 n_preds = filter->n_preds;
436
437 if (!n_preds) 541 if (!n_preds)
438 return 1; 542 return 1;
439 543
440 /* 544 /*
441 * n_preds, root and filter->preds are protected with preemption disabled. 545 * n_preds, root and filter->preds are protected with preemption disabled.
442 */ 546 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root); 547 root = rcu_dereference_sched(filter->root);
445 if (!root) 548 if (!root)
446 return 1; 549 return 1;
447 550
448 pred = root; 551 data.preds = preds = rcu_dereference_sched(filter->preds);
449 552 ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
450 /* match is currently meaningless */ 553 WARN_ON(ret);
451 match = -1; 554 return data.match;
452
453 do {
454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
500 continue;
501 }
502 done = 1;
503 } while (!done);
504
505 return match;
506} 555}
507EXPORT_SYMBOL_GPL(filter_match_preds); 556EXPORT_SYMBOL_GPL(filter_match_preds);
508 557
@@ -628,22 +677,6 @@ find_event_field(struct ftrace_event_call *call, char *name)
628 return __find_event_field(head, name); 677 return __find_event_field(head, name);
629} 678}
630 679
631static void filter_free_pred(struct filter_pred *pred)
632{
633 if (!pred)
634 return;
635
636 kfree(pred->field_name);
637 kfree(pred);
638}
639
640static void filter_clear_pred(struct filter_pred *pred)
641{
642 kfree(pred->field_name);
643 pred->field_name = NULL;
644 pred->regex.len = 0;
645}
646
647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 680static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{ 681{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); 682 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
@@ -689,20 +722,13 @@ __pop_pred_stack(struct pred_stack *stack)
689static int filter_set_pred(struct event_filter *filter, 722static int filter_set_pred(struct event_filter *filter,
690 int idx, 723 int idx,
691 struct pred_stack *stack, 724 struct pred_stack *stack,
692 struct filter_pred *src, 725 struct filter_pred *src)
693 filter_pred_fn_t fn)
694{ 726{
695 struct filter_pred *dest = &filter->preds[idx]; 727 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left; 728 struct filter_pred *left;
697 struct filter_pred *right; 729 struct filter_pred *right;
698 730
699 *dest = *src; 731 *dest = *src;
700 if (src->field_name) {
701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
702 if (!dest->field_name)
703 return -ENOMEM;
704 }
705 dest->fn = fn;
706 dest->index = idx; 732 dest->index = idx;
707 733
708 if (dest->op == OP_OR || dest->op == OP_AND) { 734 if (dest->op == OP_OR || dest->op == OP_AND) {
@@ -743,11 +769,7 @@ static int filter_set_pred(struct event_filter *filter,
743 769
744static void __free_preds(struct event_filter *filter) 770static void __free_preds(struct event_filter *filter)
745{ 771{
746 int i;
747
748 if (filter->preds) { 772 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds); 773 kfree(filter->preds);
752 filter->preds = NULL; 774 filter->preds = NULL;
753 } 775 }
@@ -840,23 +862,19 @@ static void filter_free_subsystem_filters(struct event_subsystem *system)
840 } 862 }
841} 863}
842 864
843static int filter_add_pred_fn(struct filter_parse_state *ps, 865static int filter_add_pred(struct filter_parse_state *ps,
844 struct ftrace_event_call *call, 866 struct event_filter *filter,
845 struct event_filter *filter, 867 struct filter_pred *pred,
846 struct filter_pred *pred, 868 struct pred_stack *stack)
847 struct pred_stack *stack,
848 filter_pred_fn_t fn)
849{ 869{
850 int idx, err; 870 int err;
851 871
852 if (WARN_ON(filter->n_preds == filter->a_preds)) { 872 if (WARN_ON(filter->n_preds == filter->a_preds)) {
853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 873 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
854 return -ENOSPC; 874 return -ENOSPC;
855 } 875 }
856 876
857 idx = filter->n_preds; 877 err = filter_set_pred(filter, filter->n_preds, stack, pred);
858 filter_clear_pred(&filter->preds[idx]);
859 err = filter_set_pred(filter, idx, stack, pred, fn);
860 if (err) 878 if (err)
861 return err; 879 return err;
862 880
@@ -937,31 +955,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
937 return fn; 955 return fn;
938} 956}
939 957
940static int filter_add_pred(struct filter_parse_state *ps, 958static int init_pred(struct filter_parse_state *ps,
941 struct ftrace_event_call *call, 959 struct ftrace_event_field *field,
942 struct event_filter *filter, 960 struct filter_pred *pred)
943 struct filter_pred *pred, 961
944 struct pred_stack *stack,
945 bool dry_run)
946{ 962{
947 struct ftrace_event_field *field; 963 filter_pred_fn_t fn = filter_pred_none;
948 filter_pred_fn_t fn;
949 unsigned long long val; 964 unsigned long long val;
950 int ret; 965 int ret;
951 966
952 fn = pred->fn = filter_pred_none;
953
954 if (pred->op == OP_AND)
955 goto add_pred_fn;
956 else if (pred->op == OP_OR)
957 goto add_pred_fn;
958
959 field = find_event_field(call, pred->field_name);
960 if (!field) {
961 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
962 return -EINVAL;
963 }
964
965 pred->offset = field->offset; 967 pred->offset = field->offset;
966 968
967 if (!is_legal_op(field, pred->op)) { 969 if (!is_legal_op(field, pred->op)) {
@@ -1001,9 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
1001 if (pred->op == OP_NE) 1003 if (pred->op == OP_NE)
1002 pred->not = 1; 1004 pred->not = 1;
1003 1005
1004add_pred_fn: 1006 pred->fn = fn;
1005 if (!dry_run)
1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
1007 return 0; 1007 return 0;
1008} 1008}
1009 1009
@@ -1302,39 +1302,37 @@ parse_operand:
1302 return 0; 1302 return 0;
1303} 1303}
1304 1304
1305static struct filter_pred *create_pred(int op, char *operand1, char *operand2) 1305static struct filter_pred *create_pred(struct filter_parse_state *ps,
1306 struct ftrace_event_call *call,
1307 int op, char *operand1, char *operand2)
1306{ 1308{
1307 struct filter_pred *pred; 1309 struct ftrace_event_field *field;
1310 static struct filter_pred pred;
1308 1311
1309 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 1312 memset(&pred, 0, sizeof(pred));
1310 if (!pred) 1313 pred.op = op;
1311 return NULL;
1312 1314
1313 pred->field_name = kstrdup(operand1, GFP_KERNEL); 1315 if (op == OP_AND || op == OP_OR)
1314 if (!pred->field_name) { 1316 return &pred;
1315 kfree(pred); 1317
1318 if (!operand1 || !operand2) {
1319 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1316 return NULL; 1320 return NULL;
1317 } 1321 }
1318 1322
1319 strcpy(pred->regex.pattern, operand2); 1323 field = find_event_field(call, operand1);
1320 pred->regex.len = strlen(pred->regex.pattern); 1324 if (!field) {
1321 1325 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
1322 pred->op = op;
1323
1324 return pred;
1325}
1326
1327static struct filter_pred *create_logical_pred(int op)
1328{
1329 struct filter_pred *pred;
1330
1331 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
1332 if (!pred)
1333 return NULL; 1326 return NULL;
1327 }
1334 1328
1335 pred->op = op; 1329 strcpy(pred.regex.pattern, operand2);
1330 pred.regex.len = strlen(pred.regex.pattern);
1336 1331
1337 return pred; 1332#ifdef CONFIG_FTRACE_STARTUP_TEST
1333 pred.field = field;
1334#endif
1335 return init_pred(ps, field, &pred) ? NULL : &pred;
1338} 1336}
1339 1337
1340static int check_preds(struct filter_parse_state *ps) 1338static int check_preds(struct filter_parse_state *ps)
@@ -1375,6 +1373,23 @@ static int count_preds(struct filter_parse_state *ps)
1375 return n_preds; 1373 return n_preds;
1376} 1374}
1377 1375
1376struct check_pred_data {
1377 int count;
1378 int max;
1379};
1380
1381static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1382 int *err, void *data)
1383{
1384 struct check_pred_data *d = data;
1385
1386 if (WARN_ON(d->count++ > d->max)) {
1387 *err = -EINVAL;
1388 return WALK_PRED_ABORT;
1389 }
1390 return WALK_PRED_DEFAULT;
1391}
1392
1378/* 1393/*
1379 * The tree is walked at filtering of an event. If the tree is not correctly 1394 * The tree is walked at filtering of an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does 1395 * built, it may cause an infinite loop. Check here that the tree does
@@ -1383,107 +1398,76 @@ static int count_preds(struct filter_parse_state *ps)
1383static int check_pred_tree(struct event_filter *filter, 1398static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root) 1399 struct filter_pred *root)
1385{ 1400{
1386 struct filter_pred *preds; 1401 struct check_pred_data data = {
1387 struct filter_pred *pred; 1402 /*
1388 enum move_type move = MOVE_DOWN; 1403 * The max that we can hit a node is three times.
1389 int count = 0; 1404 * Once going down, once coming up from left, and
1390 int done = 0; 1405 * once coming up from right. This is more than enough
1391 int max; 1406 * since leafs are only hit a single time.
1392 1407 */
1393 /* 1408 .max = 3 * filter->n_preds,
1394 * The max that we can hit a node is three times. 1409 .count = 0,
1395 * Once going down, once coming up from left, and 1410 };
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400 1411
1401 preds = filter->preds; 1412 return walk_pred_tree(filter->preds, root,
1402 if (!preds) 1413 check_pred_tree_cb, &data);
1403 return -EINVAL; 1414}
1404 pred = root;
1405 1415
1406 do { 1416static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
1407 if (WARN_ON(count++ > max)) 1417 int *err, void *data)
1408 return -EINVAL; 1418{
1419 int *count = data;
1409 1420
1410 switch (move) { 1421 if ((move == MOVE_DOWN) &&
1411 case MOVE_DOWN: 1422 (pred->left == FILTER_PRED_INVALID))
1412 if (pred->left != FILTER_PRED_INVALID) { 1423 (*count)++;
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435 1424
1436 /* We are fine. */ 1425 return WALK_PRED_DEFAULT;
1437 return 0;
1438} 1426}
1439 1427
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root) 1428static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{ 1429{
1442 struct filter_pred *pred; 1430 int count = 0, ret;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446 1431
1447 pred = root; 1432 ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
1433 WARN_ON(ret);
1434 return count;
1435}
1448 1436
1449 do { 1437struct fold_pred_data {
1450 switch (move) { 1438 struct filter_pred *root;
1451 case MOVE_DOWN: 1439 int count;
1452 if (pred->left != FILTER_PRED_INVALID) { 1440 int children;
1453 pred = &preds[pred->left]; 1441};
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476 1442
1477 return count; 1443static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
1444 int *err, void *data)
1445{
1446 struct fold_pred_data *d = data;
1447 struct filter_pred *root = d->root;
1448
1449 if (move != MOVE_DOWN)
1450 return WALK_PRED_DEFAULT;
1451 if (pred->left != FILTER_PRED_INVALID)
1452 return WALK_PRED_DEFAULT;
1453
1454 if (WARN_ON(d->count == d->children)) {
1455 *err = -EINVAL;
1456 return WALK_PRED_ABORT;
1457 }
1458
1459 pred->index &= ~FILTER_PRED_FOLD;
1460 root->ops[d->count++] = pred->index;
1461 return WALK_PRED_DEFAULT;
1478} 1462}
1479 1463
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root) 1464static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{ 1465{
1482 struct filter_pred *pred; 1466 struct fold_pred_data data = {
1483 enum move_type move = MOVE_DOWN; 1467 .root = root,
1484 int count = 0; 1468 .count = 0,
1469 };
1485 int children; 1470 int children;
1486 int done = 0;
1487 1471
1488 /* No need to keep the fold flag */ 1472 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD; 1473 root->index &= ~FILTER_PRED_FOLD;
@@ -1501,37 +1485,26 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1501 return -ENOMEM; 1485 return -ENOMEM;
1502 1486
1503 root->val = children; 1487 root->val = children;
1488 data.children = children;
1489 return walk_pred_tree(preds, root, fold_pred_cb, &data);
1490}
1504 1491
1505 pred = root; 1492static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1506 do { 1493 int *err, void *data)
1507 switch (move) { 1494{
1508 case MOVE_DOWN: 1495 struct filter_pred *preds = data;
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533 1496
1534 return 0; 1497 if (move != MOVE_DOWN)
1498 return WALK_PRED_DEFAULT;
1499 if (!(pred->index & FILTER_PRED_FOLD))
1500 return WALK_PRED_DEFAULT;
1501
1502 *err = fold_pred(preds, pred);
1503 if (*err)
1504 return WALK_PRED_ABORT;
1505
1506 /* everything below is folded, continue with parent */
1507 return WALK_PRED_PARENT;
1535} 1508}
1536 1509
1537/* 1510/*
@@ -1542,51 +1515,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1542static int fold_pred_tree(struct event_filter *filter, 1515static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root) 1516 struct filter_pred *root)
1544{ 1517{
1545 struct filter_pred *preds; 1518 return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
1546 struct filter_pred *pred; 1519 filter->preds);
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590} 1520}
1591 1521
1592static int replace_preds(struct ftrace_event_call *call, 1522static int replace_preds(struct ftrace_event_call *call,
@@ -1643,27 +1573,17 @@ static int replace_preds(struct ftrace_event_call *call,
1643 goto fail; 1573 goto fail;
1644 } 1574 }
1645 1575
1646 if (elt->op == OP_AND || elt->op == OP_OR) { 1576 pred = create_pred(ps, call, elt->op, operand1, operand2);
1647 pred = create_logical_pred(elt->op); 1577 if (!pred) {
1648 goto add_pred;
1649 }
1650
1651 if (!operand1 || !operand2) {
1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1653 err = -EINVAL; 1578 err = -EINVAL;
1654 goto fail; 1579 goto fail;
1655 } 1580 }
1656 1581
1657 pred = create_pred(elt->op, operand1, operand2); 1582 if (!dry_run) {
1658add_pred: 1583 err = filter_add_pred(ps, filter, pred, &stack);
1659 if (!pred) { 1584 if (err)
1660 err = -ENOMEM; 1585 goto fail;
1661 goto fail;
1662 } 1586 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1664 filter_free_pred(pred);
1665 if (err)
1666 goto fail;
1667 1587
1668 operand1 = operand2 = NULL; 1588 operand1 = operand2 = NULL;
1669 } 1589 }
@@ -1958,17 +1878,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1958 int err; 1878 int err;
1959 struct event_filter *filter; 1879 struct event_filter *filter;
1960 struct filter_parse_state *ps; 1880 struct filter_parse_state *ps;
1961 struct ftrace_event_call *call = NULL; 1881 struct ftrace_event_call *call;
1962 1882
1963 mutex_lock(&event_mutex); 1883 mutex_lock(&event_mutex);
1964 1884
1965 list_for_each_entry(call, &ftrace_events, list) { 1885 call = event->tp_event;
1966 if (call->event.type == event_id)
1967 break;
1968 }
1969 1886
1970 err = -EINVAL; 1887 err = -EINVAL;
1971 if (&call->list == &ftrace_events) 1888 if (!call)
1972 goto out_unlock; 1889 goto out_unlock;
1973 1890
1974 err = -EEXIST; 1891 err = -EEXIST;
@@ -2012,3 +1929,215 @@ out_unlock:
2012 1929
2013#endif /* CONFIG_PERF_EVENTS */ 1930#endif /* CONFIG_PERF_EVENTS */
2014 1931
1932#ifdef CONFIG_FTRACE_STARTUP_TEST
1933
1934#include <linux/types.h>
1935#include <linux/tracepoint.h>
1936
1937#define CREATE_TRACE_POINTS
1938#include "trace_events_filter_test.h"
1939
1940static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
1941 struct event_filter **pfilter)
1942{
1943 struct event_filter *filter;
1944 struct filter_parse_state *ps;
1945 int err = -ENOMEM;
1946
1947 filter = __alloc_filter();
1948 if (!filter)
1949 goto out;
1950
1951 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1952 if (!ps)
1953 goto free_filter;
1954
1955 parse_init(ps, filter_ops, filter_str);
1956 err = filter_parse(ps);
1957 if (err)
1958 goto free_ps;
1959
1960 err = replace_preds(call, filter, ps, filter_str, false);
1961 if (!err)
1962 *pfilter = filter;
1963
1964 free_ps:
1965 filter_opstack_clear(ps);
1966 postfix_clear(ps);
1967 kfree(ps);
1968
1969 free_filter:
1970 if (err)
1971 __free_filter(filter);
1972
1973 out:
1974 return err;
1975}
1976
1977#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
1978{ \
1979 .filter = FILTER, \
1980 .rec = { .a = va, .b = vb, .c = vc, .d = vd, \
1981 .e = ve, .f = vf, .g = vg, .h = vh }, \
1982 .match = m, \
1983 .not_visited = nvisit, \
1984}
1985#define YES 1
1986#define NO 0
1987
/*
 * Table of filter self-test cases, one entry per DATA_REC() line.
 *
 * filter      - filter expression string (the FILTER macro in effect at
 *               the point where the entry is written)
 * rec         - record with fields a..h that the filter is run against
 * match       - value filter_match_preds() is expected to return (YES/NO)
 * not_visited - characters naming the fields whose predicates must NOT be
 *               evaluated for this record; an empty string means no
 *               visit check is set up for the entry
 */
1988static struct test_filter_data_t {
1989 char *filter;
1990 struct ftrace_raw_ftrace_test_filter rec;
1991 int match;
1992 char *not_visited;
1993} test_filter_data[] = {
1994#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \
1995 "e == 1 && f == 1 && g == 1 && h == 1"
1996 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""),
1997 DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"),
1998 DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""),
1999#undef FILTER
2000#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \
2001 "e == 1 || f == 1 || g == 1 || h == 1"
2002 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2003 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2004 DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"),
2005#undef FILTER
2006#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \
2007 "(e == 1 || f == 1) && (g == 1 || h == 1)"
2008 DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"),
2009 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2010 DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"),
2011 DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"),
2012#undef FILTER
2013#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \
2014 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2015 DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"),
2016 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""),
2017 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2018#undef FILTER
2019#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \
2020 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2021 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"),
2022 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2023 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""),
2024#undef FILTER
2025#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \
2026 "(e == 1 || f == 1)) && (g == 1 || h == 1)"
2027 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"),
2028 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2029 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"),
2030#undef FILTER
2031#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \
2032 "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))"
2033 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"),
2034 DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2035 DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""),
2036#undef FILTER
2037#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \
2038 "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))"
2039 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"),
2040 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2041 DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"),
2042};
2043
2044#undef DATA_REC
2045#undef FILTER
2046#undef YES
2047#undef NO
2048
2049#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
2050
2051static int test_pred_visited;
2052
2053static int test_pred_visited_fn(struct filter_pred *pred, void *event)
2054{
2055 struct ftrace_event_field *field = pred->field;
2056
2057 test_pred_visited = 1;
2058 printk(KERN_INFO "\npred visited %s\n", field->name);
2059 return 1;
2060}
2061
2062static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
2063 int *err, void *data)
2064{
2065 char *fields = data;
2066
2067 if ((move == MOVE_DOWN) &&
2068 (pred->left == FILTER_PRED_INVALID)) {
2069 struct ftrace_event_field *field = pred->field;
2070
2071 if (!field) {
2072 WARN(1, "all leafs should have field defined");
2073 return WALK_PRED_DEFAULT;
2074 }
2075 if (!strchr(fields, *field->name))
2076 return WALK_PRED_DEFAULT;
2077
2078 WARN_ON(!pred->fn);
2079 pred->fn = test_pred_visited_fn;
2080 }
2081 return WALK_PRED_DEFAULT;
2082}
2083
2084static __init int ftrace_test_event_filter(void)
2085{
2086 int i;
2087
2088 printk(KERN_INFO "Testing ftrace filter: ");
2089
2090 for (i = 0; i < DATA_CNT; i++) {
2091 struct event_filter *filter = NULL;
2092 struct test_filter_data_t *d = &test_filter_data[i];
2093 int err;
2094
2095 err = test_get_filter(d->filter, &event_ftrace_test_filter,
2096 &filter);
2097 if (err) {
2098 printk(KERN_INFO
2099 "Failed to get filter for '%s', err %d\n",
2100 d->filter, err);
2101 break;
2102 }
2103
2104 /*
2105 * The preemption disabling is not really needed for self
2106 * tests, but the rcu dereference will complain without it.
2107 */
2108 preempt_disable();
2109 if (*d->not_visited)
2110 walk_pred_tree(filter->preds, filter->root,
2111 test_walk_pred_cb,
2112 d->not_visited);
2113
2114 test_pred_visited = 0;
2115 err = filter_match_preds(filter, &d->rec);
2116 preempt_enable();
2117
2118 __free_filter(filter);
2119
2120 if (test_pred_visited) {
2121 printk(KERN_INFO
2122 "Failed, unwanted pred visited for filter %s\n",
2123 d->filter);
2124 break;
2125 }
2126
2127 if (err != d->match) {
2128 printk(KERN_INFO
2129 "Failed to match filter '%s', expected %d\n",
2130 d->filter, d->match);
2131 break;
2132 }
2133 }
2134
2135 if (i == DATA_CNT)
2136 printk(KERN_CONT "OK\n");
2137
2138 return 0;
2139}
2140
2141late_initcall(ftrace_test_event_filter);
2142
2143#endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
new file mode 100644
index 000000000000..bfd4dba0d603
--- /dev/null
+++ b/kernel/trace/trace_events_filter_test.h
@@ -0,0 +1,50 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM test
3
4#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_TEST_H
6
7#include <linux/tracepoint.h>
8
/*
 * ftrace_test_filter: event with eight int arguments a..h. Each argument
 * is stored in the entry field of the same name and TP_printk prints all
 * eight values.
 * NOTE(review): presumably this is the event the filter startup test
 * matches its records against - confirm against trace_events_filter.c.
 */
9TRACE_EVENT(ftrace_test_filter,
10
11 TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h),
12
13 TP_ARGS(a, b, c, d, e, f, g, h),
14
15 TP_STRUCT__entry(
16 __field(int, a)
17 __field(int, b)
18 __field(int, c)
19 __field(int, d)
20 __field(int, e)
21 __field(int, f)
22 __field(int, g)
23 __field(int, h)
24 ),
25
26 TP_fast_assign(
27 __entry->a = a;
28 __entry->b = b;
29 __entry->c = c;
30 __entry->d = d;
31 __entry->e = e;
32 __entry->f = f;
33 __entry->g = g;
34 __entry->h = h;
35 ),
36
37 TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d",
38 __entry->a, __entry->b, __entry->c, __entry->d,
39 __entry->e, __entry->f, __entry->g, __entry->h)
40);
41
42#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */
43
44#undef TRACE_INCLUDE_PATH
45#undef TRACE_INCLUDE_FILE
46#define TRACE_INCLUDE_PATH .
47#define TRACE_INCLUDE_FILE trace_events_filter_test
48
49/* This part must be outside protection */
50#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 667aa8cc0cfc..20dad0d7a163 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly;
23 23
24static DEFINE_PER_CPU(int, tracing_cpu); 24static DEFINE_PER_CPU(int, tracing_cpu);
25 25
26static DEFINE_SPINLOCK(max_trace_lock); 26static DEFINE_RAW_SPINLOCK(max_trace_lock);
27 27
28enum { 28enum {
29 TRACER_IRQS_OFF = (1 << 1), 29 TRACER_IRQS_OFF = (1 << 1),
@@ -321,7 +321,7 @@ check_critical_timing(struct trace_array *tr,
321 if (!report_latency(delta)) 321 if (!report_latency(delta))
322 goto out; 322 goto out;
323 323
324 spin_lock_irqsave(&max_trace_lock, flags); 324 raw_spin_lock_irqsave(&max_trace_lock, flags);
325 325
326 /* check if we are still the max latency */ 326 /* check if we are still the max latency */
327 if (!report_latency(delta)) 327 if (!report_latency(delta))
@@ -344,7 +344,7 @@ check_critical_timing(struct trace_array *tr,
344 max_sequence++; 344 max_sequence++;
345 345
346out_unlock: 346out_unlock:
347 spin_unlock_irqrestore(&max_trace_lock, flags); 347 raw_spin_unlock_irqrestore(&max_trace_lock, flags);
348 348
349out: 349out:
350 data->critical_sequence = max_sequence; 350 data->critical_sequence = max_sequence;
@@ -505,13 +505,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
505#ifdef CONFIG_PREEMPT_TRACER 505#ifdef CONFIG_PREEMPT_TRACER
506void trace_preempt_on(unsigned long a0, unsigned long a1) 506void trace_preempt_on(unsigned long a0, unsigned long a1)
507{ 507{
508 if (preempt_trace()) 508 if (preempt_trace() && !irq_trace())
509 stop_critical_timing(a0, a1); 509 stop_critical_timing(a0, a1);
510} 510}
511 511
512void trace_preempt_off(unsigned long a0, unsigned long a1) 512void trace_preempt_off(unsigned long a0, unsigned long a1)
513{ 513{
514 if (preempt_trace()) 514 if (preempt_trace() && !irq_trace())
515 start_critical_timing(a0, a1); 515 start_critical_timing(a0, a1);
516} 516}
517#endif /* CONFIG_PREEMPT_TRACER */ 517#endif /* CONFIG_PREEMPT_TRACER */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5fb3697bf0e5..00d527c945a4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp)
836} 836}
837 837
838/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
839static void unregister_trace_probe(struct trace_probe *tp) 839static int unregister_trace_probe(struct trace_probe *tp)
840{ 840{
841 /* Enabled event can not be unregistered */
842 if (trace_probe_is_enabled(tp))
843 return -EBUSY;
844
841 __unregister_trace_probe(tp); 845 __unregister_trace_probe(tp);
842 list_del(&tp->list); 846 list_del(&tp->list);
843 unregister_probe_event(tp); 847 unregister_probe_event(tp);
848
849 return 0;
844} 850}
845 851
846/* Register a trace_probe and probe_event */ 852/* Register a trace_probe and probe_event */
@@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp)
854 /* Delete old (same name) event if exist */ 860 /* Delete old (same name) event if exist */
855 old_tp = find_trace_probe(tp->call.name, tp->call.class->system); 861 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
856 if (old_tp) { 862 if (old_tp) {
857 unregister_trace_probe(old_tp); 863 ret = unregister_trace_probe(old_tp);
864 if (ret < 0)
865 goto end;
858 free_trace_probe(old_tp); 866 free_trace_probe(old_tp);
859 } 867 }
860 868
@@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb,
892 mutex_lock(&probe_lock); 900 mutex_lock(&probe_lock);
893 list_for_each_entry(tp, &probe_list, list) { 901 list_for_each_entry(tp, &probe_list, list) {
894 if (trace_probe_within_module(tp, mod)) { 902 if (trace_probe_within_module(tp, mod)) {
903 /* Don't need to check busy - this should have gone. */
895 __unregister_trace_probe(tp); 904 __unregister_trace_probe(tp);
896 ret = __register_trace_probe(tp); 905 ret = __register_trace_probe(tp);
897 if (ret) 906 if (ret)
@@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv)
1205 return -ENOENT; 1214 return -ENOENT;
1206 } 1215 }
1207 /* delete an event */ 1216 /* delete an event */
1208 unregister_trace_probe(tp); 1217 ret = unregister_trace_probe(tp);
1209 free_trace_probe(tp); 1218 if (ret == 0)
1219 free_trace_probe(tp);
1210 mutex_unlock(&probe_lock); 1220 mutex_unlock(&probe_lock);
1211 return 0; 1221 return ret;
1212 } 1222 }
1213 1223
1214 if (argc < 2) { 1224 if (argc < 2) {
@@ -1317,18 +1327,29 @@ error:
1317 return ret; 1327 return ret;
1318} 1328}
1319 1329
1320static void release_all_trace_probes(void) 1330static int release_all_trace_probes(void)
1321{ 1331{
1322 struct trace_probe *tp; 1332 struct trace_probe *tp;
1333 int ret = 0;
1323 1334
1324 mutex_lock(&probe_lock); 1335 mutex_lock(&probe_lock);
1336 /* Ensure no probe is in use. */
1337 list_for_each_entry(tp, &probe_list, list)
1338 if (trace_probe_is_enabled(tp)) {
1339 ret = -EBUSY;
1340 goto end;
1341 }
1325 /* TODO: Use batch unregistration */ 1342 /* TODO: Use batch unregistration */
1326 while (!list_empty(&probe_list)) { 1343 while (!list_empty(&probe_list)) {
1327 tp = list_entry(probe_list.next, struct trace_probe, list); 1344 tp = list_entry(probe_list.next, struct trace_probe, list);
1328 unregister_trace_probe(tp); 1345 unregister_trace_probe(tp);
1329 free_trace_probe(tp); 1346 free_trace_probe(tp);
1330 } 1347 }
1348
1349end:
1331 mutex_unlock(&probe_lock); 1350 mutex_unlock(&probe_lock);
1351
1352 return ret;
1332} 1353}
1333 1354
1334/* Probes listing interfaces */ 1355/* Probes listing interfaces */
@@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = {
1380 1401
1381static int probes_open(struct inode *inode, struct file *file) 1402static int probes_open(struct inode *inode, struct file *file)
1382{ 1403{
1383 if ((file->f_mode & FMODE_WRITE) && 1404 int ret;
1384 (file->f_flags & O_TRUNC)) 1405
1385 release_all_trace_probes(); 1406 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
1407 ret = release_all_trace_probes();
1408 if (ret < 0)
1409 return ret;
1410 }
1386 1411
1387 return seq_open(file, &probes_seq_op); 1412 return seq_open(file, &probes_seq_op);
1388} 1413}
@@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void)
2055 2080
2056 ret = target(1, 2, 3, 4, 5, 6); 2081 ret = target(1, 2, 3, 4, 5, 6);
2057 2082
2083 /* Disable trace points before removing it */
2084 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
2085 if (WARN_ON_ONCE(tp == NULL)) {
2086 pr_warning("error on getting test probe.\n");
2087 warn++;
2088 } else
2089 disable_trace_probe(tp, TP_FLAG_TRACE);
2090
2091 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
2092 if (WARN_ON_ONCE(tp == NULL)) {
2093 pr_warning("error on getting 2nd test probe.\n");
2094 warn++;
2095 } else
2096 disable_trace_probe(tp, TP_FLAG_TRACE);
2097
2058 ret = command_trace_probe("-:testprobe"); 2098 ret = command_trace_probe("-:testprobe");
2059 if (WARN_ON_ONCE(ret)) { 2099 if (WARN_ON_ONCE(ret)) {
2060 pr_warning("error on deleting a probe.\n"); 2100 pr_warning("error on deleting a probe.\n");
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 1f06468a10d7..6fd4ffd042f9 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
59 continue; 59 continue;
60 } 60 }
61 61
62 fmt = NULL;
62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); 63 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
63 if (tb_fmt) 64 if (tb_fmt) {
64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); 65 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) { 66 if (fmt) {
66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 67 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
67 strcpy(fmt, *iter); 68 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt; 69 tb_fmt->fmt = fmt;
69 *iter = tb_fmt->fmt; 70 } else
70 } else { 71 kfree(tb_fmt);
71 kfree(tb_fmt);
72 *iter = NULL;
73 } 72 }
73 *iter = fmt;
74
74 } 75 }
75 mutex_unlock(&btrace_mutex); 76 mutex_unlock(&btrace_mutex);
76} 77}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index b219f1449c54..db110b8ae030 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
34static const int tracepoint_debug; 34static const int tracepoint_debug;
35 35
36/* 36/*
37 * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the 37 * Tracepoints mutex protects the builtin and module tracepoints and the hash
38 * builtin and module tracepoints and the hash table. 38 * table, as well as the local module list.
39 */ 39 */
40static DEFINE_MUTEX(tracepoints_mutex); 40static DEFINE_MUTEX(tracepoints_mutex);
41 41
42#ifdef CONFIG_MODULES
43/* Local list of struct module */
44static LIST_HEAD(tracepoint_module_list);
45#endif /* CONFIG_MODULES */
46
42/* 47/*
43 * Tracepoint hash table, containing the active tracepoints. 48 * Tracepoint hash table, containing the active tracepoints.
44 * Protected by tracepoints_mutex. 49 * Protected by tracepoints_mutex.
@@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem)
292 * @end: end of the range 297 * @end: end of the range
293 * 298 *
294 * Updates the probe callback corresponding to a range of tracepoints. 299 * Updates the probe callback corresponding to a range of tracepoints.
300 * Called with tracepoints_mutex held.
295 */ 301 */
296void tracepoint_update_probe_range(struct tracepoint * const *begin, 302static void tracepoint_update_probe_range(struct tracepoint * const *begin,
297 struct tracepoint * const *end) 303 struct tracepoint * const *end)
298{ 304{
299 struct tracepoint * const *iter; 305 struct tracepoint * const *iter;
300 struct tracepoint_entry *mark_entry; 306 struct tracepoint_entry *mark_entry;
@@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
302 if (!begin) 308 if (!begin)
303 return; 309 return;
304 310
305 mutex_lock(&tracepoints_mutex);
306 for (iter = begin; iter < end; iter++) { 311 for (iter = begin; iter < end; iter++) {
307 mark_entry = get_tracepoint((*iter)->name); 312 mark_entry = get_tracepoint((*iter)->name);
308 if (mark_entry) { 313 if (mark_entry) {
@@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
312 disable_tracepoint(*iter); 317 disable_tracepoint(*iter);
313 } 318 }
314 } 319 }
315 mutex_unlock(&tracepoints_mutex);
316} 320}
317 321
/*
 * Re-apply the probe state to the tracepoints of every module recorded in
 * tracepoint_module_list. Does nothing when CONFIG_MODULES is not set.
 * This function takes no lock itself.
 * NOTE(review): tracepoint_update_probe_range() is documented as needing
 * tracepoints_mutex held, so callers presumably hold it - confirm.
 */
void module_update_tracepoints(void)
{
#ifdef CONFIG_MODULES
	struct tp_module *entry;

	list_for_each_entry(entry, &tracepoint_module_list, list) {
		struct tracepoint * const *first = entry->tracepoints_ptrs;

		tracepoint_update_probe_range(first,
					      first + entry->num_tracepoints);
	}
#endif /* CONFIG_MODULES */
}
336
337
318/* 338/*
319 * Update probes, removing the faulty probes. 339 * Update probes, removing the faulty probes.
340 * Called with tracepoints_mutex held.
320 */ 341 */
321static void tracepoint_update_probes(void) 342static void tracepoint_update_probes(void)
322{ 343{
@@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data)
359 380
360 mutex_lock(&tracepoints_mutex); 381 mutex_lock(&tracepoints_mutex);
361 old = tracepoint_add_probe(name, probe, data); 382 old = tracepoint_add_probe(name, probe, data);
362 mutex_unlock(&tracepoints_mutex); 383 if (IS_ERR(old)) {
363 if (IS_ERR(old)) 384 mutex_unlock(&tracepoints_mutex);
364 return PTR_ERR(old); 385 return PTR_ERR(old);
365 386 }
366 tracepoint_update_probes(); /* may update entry */ 387 tracepoint_update_probes(); /* may update entry */
388 mutex_unlock(&tracepoints_mutex);
367 release_probes(old); 389 release_probes(old);
368 return 0; 390 return 0;
369} 391}
@@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data)
402 424
403 mutex_lock(&tracepoints_mutex); 425 mutex_lock(&tracepoints_mutex);
404 old = tracepoint_remove_probe(name, probe, data); 426 old = tracepoint_remove_probe(name, probe, data);
405 mutex_unlock(&tracepoints_mutex); 427 if (IS_ERR(old)) {
406 if (IS_ERR(old)) 428 mutex_unlock(&tracepoints_mutex);
407 return PTR_ERR(old); 429 return PTR_ERR(old);
408 430 }
409 tracepoint_update_probes(); /* may update entry */ 431 tracepoint_update_probes(); /* may update entry */
432 mutex_unlock(&tracepoints_mutex);
410 release_probes(old); 433 release_probes(old);
411 return 0; 434 return 0;
412} 435}
@@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void)
489 if (!list_empty(&old_probes)) 512 if (!list_empty(&old_probes))
490 list_replace_init(&old_probes, &release_probes); 513 list_replace_init(&old_probes, &release_probes);
491 need_update = 0; 514 need_update = 0;
492 mutex_unlock(&tracepoints_mutex);
493
494 tracepoint_update_probes(); 515 tracepoint_update_probes();
516 mutex_unlock(&tracepoints_mutex);
495 list_for_each_entry_safe(pos, next, &release_probes, u.list) { 517 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
496 list_del(&pos->u.list); 518 list_del(&pos->u.list);
497 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); 519 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
@@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
509 * Will return the first tracepoint in the range if the input tracepoint is 531 * Will return the first tracepoint in the range if the input tracepoint is
510 * NULL. 532 * NULL.
511 */ 533 */
512int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, 534static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
513 struct tracepoint * const *begin, struct tracepoint * const *end) 535 struct tracepoint * const *begin, struct tracepoint * const *end)
514{ 536{
515 if (!*tracepoint && begin != end) { 537 if (!*tracepoint && begin != end) {
@@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
520 return 1; 542 return 1;
521 return 0; 543 return 0;
522} 544}
523EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
524 545
546#ifdef CONFIG_MODULES
525static void tracepoint_get_iter(struct tracepoint_iter *iter) 547static void tracepoint_get_iter(struct tracepoint_iter *iter)
526{ 548{
527 int found = 0; 549 int found = 0;
550 struct tp_module *iter_mod;
528 551
529 /* Core kernel tracepoints */ 552 /* Core kernel tracepoints */
530 if (!iter->module) { 553 if (!iter->module) {
@@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
534 if (found) 557 if (found)
535 goto end; 558 goto end;
536 } 559 }
537 /* tracepoints in modules. */ 560 /* Tracepoints in modules */
538 found = module_get_iter_tracepoints(iter); 561 mutex_lock(&tracepoints_mutex);
562 list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
563 /*
564 * Sorted module list
565 */
566 if (iter_mod < iter->module)
567 continue;
568 else if (iter_mod > iter->module)
569 iter->tracepoint = NULL;
570 found = tracepoint_get_iter_range(&iter->tracepoint,
571 iter_mod->tracepoints_ptrs,
572 iter_mod->tracepoints_ptrs
573 + iter_mod->num_tracepoints);
574 if (found) {
575 iter->module = iter_mod;
576 break;
577 }
578 }
579 mutex_unlock(&tracepoints_mutex);
539end: 580end:
540 if (!found) 581 if (!found)
541 tracepoint_iter_reset(iter); 582 tracepoint_iter_reset(iter);
542} 583}
584#else /* CONFIG_MODULES */
585static void tracepoint_get_iter(struct tracepoint_iter *iter)
586{
587 int found = 0;
588
589 /* Core kernel tracepoints */
590 found = tracepoint_get_iter_range(&iter->tracepoint,
591 __start___tracepoints_ptrs,
592 __stop___tracepoints_ptrs);
593 if (!found)
594 tracepoint_iter_reset(iter);
595}
596#endif /* CONFIG_MODULES */
543 597
544void tracepoint_iter_start(struct tracepoint_iter *iter) 598void tracepoint_iter_start(struct tracepoint_iter *iter)
545{ 599{
@@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
566 620
567void tracepoint_iter_reset(struct tracepoint_iter *iter) 621void tracepoint_iter_reset(struct tracepoint_iter *iter)
568{ 622{
623#ifdef CONFIG_MODULES
569 iter->module = NULL; 624 iter->module = NULL;
625#endif /* CONFIG_MODULES */
570 iter->tracepoint = NULL; 626 iter->tracepoint = NULL;
571} 627}
572EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 628EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
573 629
574#ifdef CONFIG_MODULES 630#ifdef CONFIG_MODULES
631static int tracepoint_module_coming(struct module *mod)
632{
633 struct tp_module *tp_mod, *iter;
634 int ret = 0;
635
636 /*
637 * We skip modules that tain the kernel, especially those with different
638 * module header (for forced load), to make sure we don't cause a crash.
639 */
640 if (mod->taints)
641 return 0;
642 mutex_lock(&tracepoints_mutex);
643 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
644 if (!tp_mod) {
645 ret = -ENOMEM;
646 goto end;
647 }
648 tp_mod->num_tracepoints = mod->num_tracepoints;
649 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
650
651 /*
652 * tracepoint_module_list is kept sorted by struct module pointer
653 * address for iteration on tracepoints from a seq_file that can release
654 * the mutex between calls.
655 */
656 list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
657 BUG_ON(iter == tp_mod); /* Should never be in the list twice */
658 if (iter < tp_mod) {
659 /* We belong to the location right after iter. */
660 list_add(&tp_mod->list, &iter->list);
661 goto module_added;
662 }
663 }
664 /* We belong to the beginning of the list */
665 list_add(&tp_mod->list, &tracepoint_module_list);
666module_added:
667 tracepoint_update_probe_range(mod->tracepoints_ptrs,
668 mod->tracepoints_ptrs + mod->num_tracepoints);
669end:
670 mutex_unlock(&tracepoints_mutex);
671 return ret;
672}
673
674static int tracepoint_module_going(struct module *mod)
675{
676 struct tp_module *pos;
677
678 mutex_lock(&tracepoints_mutex);
679 tracepoint_update_probe_range(mod->tracepoints_ptrs,
680 mod->tracepoints_ptrs + mod->num_tracepoints);
681 list_for_each_entry(pos, &tracepoint_module_list, list) {
682 if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
683 list_del(&pos->list);
684 kfree(pos);
685 break;
686 }
687 }
688 /*
689 * In the case of modules that were tainted at "coming", we'll simply
690 * walk through the list without finding it. We cannot use the "tainted"
691 * flag on "going", in case a module taints the kernel only after being
692 * loaded.
693 */
694 mutex_unlock(&tracepoints_mutex);
695 return 0;
696}
575 697
576int tracepoint_module_notify(struct notifier_block *self, 698int tracepoint_module_notify(struct notifier_block *self,
577 unsigned long val, void *data) 699 unsigned long val, void *data)
578{ 700{
579 struct module *mod = data; 701 struct module *mod = data;
702 int ret = 0;
580 703
581 switch (val) { 704 switch (val) {
582 case MODULE_STATE_COMING: 705 case MODULE_STATE_COMING:
706 ret = tracepoint_module_coming(mod);
707 break;
708 case MODULE_STATE_LIVE:
709 break;
583 case MODULE_STATE_GOING: 710 case MODULE_STATE_GOING:
584 tracepoint_update_probe_range(mod->tracepoints_ptrs, 711 ret = tracepoint_module_going(mod);
585 mod->tracepoints_ptrs + mod->num_tracepoints);
586 break; 712 break;
587 } 713 }
588 return 0; 714 return ret;
589} 715}
590 716
591struct notifier_block tracepoint_module_nb = { 717struct notifier_block tracepoint_module_nb = {
@@ -598,7 +724,6 @@ static int init_tracepoints(void)
598 return register_module_notifier(&tracepoint_module_nb); 724 return register_module_notifier(&tracepoint_module_nb);
599} 725}
600__initcall(init_tracepoints); 726__initcall(init_tracepoints);
601
602#endif /* CONFIG_MODULES */ 727#endif /* CONFIG_MODULES */
603 728
604#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS 729#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 24dc60d9fa1f..5bbfac85866e 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -78,6 +78,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
78 78
79#define KB 1024 79#define KB 1024
80#define MB (1024*KB) 80#define MB (1024*KB)
81#define KB_MASK (~(KB-1))
81/* 82/*
82 * fill in extended accounting fields 83 * fill in extended accounting fields
83 */ 84 */
@@ -95,14 +96,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
95 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; 96 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
96 mmput(mm); 97 mmput(mm);
97 } 98 }
98 stats->read_char = p->ioac.rchar; 99 stats->read_char = p->ioac.rchar & KB_MASK;
99 stats->write_char = p->ioac.wchar; 100 stats->write_char = p->ioac.wchar & KB_MASK;
100 stats->read_syscalls = p->ioac.syscr; 101 stats->read_syscalls = p->ioac.syscr & KB_MASK;
101 stats->write_syscalls = p->ioac.syscw; 102 stats->write_syscalls = p->ioac.syscw & KB_MASK;
102#ifdef CONFIG_TASK_IO_ACCOUNTING 103#ifdef CONFIG_TASK_IO_ACCOUNTING
103 stats->read_bytes = p->ioac.read_bytes; 104 stats->read_bytes = p->ioac.read_bytes & KB_MASK;
104 stats->write_bytes = p->ioac.write_bytes; 105 stats->write_bytes = p->ioac.write_bytes & KB_MASK;
105 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; 106 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK;
106#else 107#else
107 stats->read_bytes = 0; 108 stats->read_bytes = 0;
108 stats->write_bytes = 0; 109 stats->write_bytes = 0;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 36491cd5b7d4..d680381b0e9c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -321,7 +321,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
321 */ 321 */
322static int watchdog(void *unused) 322static int watchdog(void *unused)
323{ 323{
324 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 324 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
326 326
327 sched_setscheduler(current, SCHED_FIFO, &param); 327 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -350,7 +350,8 @@ static int watchdog(void *unused)
350 set_current_state(TASK_INTERRUPTIBLE); 350 set_current_state(TASK_INTERRUPTIBLE);
351 } 351 }
352 __set_current_state(TASK_RUNNING); 352 __set_current_state(TASK_RUNNING);
353 353 param.sched_priority = 0;
354 sched_setscheduler(current, SCHED_NORMAL, &param);
354 return 0; 355 return 0;
355} 356}
356 357
@@ -438,7 +439,7 @@ static int watchdog_enable(int cpu)
438 439
439 /* create the watchdog thread */ 440 /* create the watchdog thread */
440 if (!p) { 441 if (!p) {
441 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 442 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
442 if (IS_ERR(p)) { 443 if (IS_ERR(p)) {
443 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 444 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
444 if (!err) { 445 if (!err) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 25fb1b0e53fa..1783aabc6128 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2412,8 +2412,13 @@ reflush:
2412 2412
2413 for_each_cwq_cpu(cpu, wq) { 2413 for_each_cwq_cpu(cpu, wq) {
2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2415 bool drained;
2415 2416
2416 if (!cwq->nr_active && list_empty(&cwq->delayed_works)) 2417 spin_lock_irq(&cwq->gcwq->lock);
2418 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2419 spin_unlock_irq(&cwq->gcwq->lock);
2420
2421 if (drained)
2417 continue; 2422 continue;
2418 2423
2419 if (++flush_cnt == 10 || 2424 if (++flush_cnt == 10 ||