diff options
Diffstat (limited to 'kernel')
84 files changed, 4872 insertions, 2056 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 988cb3da7031..e898c5b9d02c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | |||
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ | 12 | notifier.o ksysfs.o sched_clock.o cred.o \ |
13 | async.o range.o | 13 | async.o range.o |
14 | obj-y += groups.o | 14 | obj-y += groups.o |
15 | 15 | ||
diff --git a/kernel/async.c b/kernel/async.c index d5fe7af0de2e..4c2843c0043e 100644 --- a/kernel/async.c +++ b/kernel/async.c | |||
@@ -120,7 +120,7 @@ static void async_run_entry_fn(struct work_struct *work) | |||
120 | struct async_entry *entry = | 120 | struct async_entry *entry = |
121 | container_of(work, struct async_entry, work); | 121 | container_of(work, struct async_entry, work); |
122 | unsigned long flags; | 122 | unsigned long flags; |
123 | ktime_t calltime, delta, rettime; | 123 | ktime_t uninitialized_var(calltime), delta, rettime; |
124 | 124 | ||
125 | /* 1) move self to the running queue */ | 125 | /* 1) move self to the running queue */ |
126 | spin_lock_irqsave(&async_lock, flags); | 126 | spin_lock_irqsave(&async_lock, flags); |
@@ -269,7 +269,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); | |||
269 | void async_synchronize_cookie_domain(async_cookie_t cookie, | 269 | void async_synchronize_cookie_domain(async_cookie_t cookie, |
270 | struct list_head *running) | 270 | struct list_head *running) |
271 | { | 271 | { |
272 | ktime_t starttime, delta, endtime; | 272 | ktime_t uninitialized_var(starttime), delta, endtime; |
273 | 273 | ||
274 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 274 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
275 | printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); | 275 | printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1d2b6ceea95d..453100a4159d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -265,7 +265,7 @@ list_for_each_entry(_root, &roots, root_list) | |||
265 | /* the list of cgroups eligible for automatic release. Protected by | 265 | /* the list of cgroups eligible for automatic release. Protected by |
266 | * release_list_lock */ | 266 | * release_list_lock */ |
267 | static LIST_HEAD(release_list); | 267 | static LIST_HEAD(release_list); |
268 | static DEFINE_SPINLOCK(release_list_lock); | 268 | static DEFINE_RAW_SPINLOCK(release_list_lock); |
269 | static void cgroup_release_agent(struct work_struct *work); | 269 | static void cgroup_release_agent(struct work_struct *work); |
270 | static DECLARE_WORK(release_agent_work, cgroup_release_agent); | 270 | static DECLARE_WORK(release_agent_work, cgroup_release_agent); |
271 | static void check_for_release(struct cgroup *cgrp); | 271 | static void check_for_release(struct cgroup *cgrp); |
@@ -4014,11 +4014,11 @@ again: | |||
4014 | finish_wait(&cgroup_rmdir_waitq, &wait); | 4014 | finish_wait(&cgroup_rmdir_waitq, &wait); |
4015 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 4015 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
4016 | 4016 | ||
4017 | spin_lock(&release_list_lock); | 4017 | raw_spin_lock(&release_list_lock); |
4018 | set_bit(CGRP_REMOVED, &cgrp->flags); | 4018 | set_bit(CGRP_REMOVED, &cgrp->flags); |
4019 | if (!list_empty(&cgrp->release_list)) | 4019 | if (!list_empty(&cgrp->release_list)) |
4020 | list_del_init(&cgrp->release_list); | 4020 | list_del_init(&cgrp->release_list); |
4021 | spin_unlock(&release_list_lock); | 4021 | raw_spin_unlock(&release_list_lock); |
4022 | 4022 | ||
4023 | cgroup_lock_hierarchy(cgrp->root); | 4023 | cgroup_lock_hierarchy(cgrp->root); |
4024 | /* delete this cgroup from parent->children */ | 4024 | /* delete this cgroup from parent->children */ |
@@ -4671,13 +4671,13 @@ static void check_for_release(struct cgroup *cgrp) | |||
4671 | * already queued for a userspace notification, queue | 4671 | * already queued for a userspace notification, queue |
4672 | * it now */ | 4672 | * it now */ |
4673 | int need_schedule_work = 0; | 4673 | int need_schedule_work = 0; |
4674 | spin_lock(&release_list_lock); | 4674 | raw_spin_lock(&release_list_lock); |
4675 | if (!cgroup_is_removed(cgrp) && | 4675 | if (!cgroup_is_removed(cgrp) && |
4676 | list_empty(&cgrp->release_list)) { | 4676 | list_empty(&cgrp->release_list)) { |
4677 | list_add(&cgrp->release_list, &release_list); | 4677 | list_add(&cgrp->release_list, &release_list); |
4678 | need_schedule_work = 1; | 4678 | need_schedule_work = 1; |
4679 | } | 4679 | } |
4680 | spin_unlock(&release_list_lock); | 4680 | raw_spin_unlock(&release_list_lock); |
4681 | if (need_schedule_work) | 4681 | if (need_schedule_work) |
4682 | schedule_work(&release_agent_work); | 4682 | schedule_work(&release_agent_work); |
4683 | } | 4683 | } |
@@ -4729,7 +4729,7 @@ static void cgroup_release_agent(struct work_struct *work) | |||
4729 | { | 4729 | { |
4730 | BUG_ON(work != &release_agent_work); | 4730 | BUG_ON(work != &release_agent_work); |
4731 | mutex_lock(&cgroup_mutex); | 4731 | mutex_lock(&cgroup_mutex); |
4732 | spin_lock(&release_list_lock); | 4732 | raw_spin_lock(&release_list_lock); |
4733 | while (!list_empty(&release_list)) { | 4733 | while (!list_empty(&release_list)) { |
4734 | char *argv[3], *envp[3]; | 4734 | char *argv[3], *envp[3]; |
4735 | int i; | 4735 | int i; |
@@ -4738,7 +4738,7 @@ static void cgroup_release_agent(struct work_struct *work) | |||
4738 | struct cgroup, | 4738 | struct cgroup, |
4739 | release_list); | 4739 | release_list); |
4740 | list_del_init(&cgrp->release_list); | 4740 | list_del_init(&cgrp->release_list); |
4741 | spin_unlock(&release_list_lock); | 4741 | raw_spin_unlock(&release_list_lock); |
4742 | pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); | 4742 | pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
4743 | if (!pathbuf) | 4743 | if (!pathbuf) |
4744 | goto continue_free; | 4744 | goto continue_free; |
@@ -4768,9 +4768,9 @@ static void cgroup_release_agent(struct work_struct *work) | |||
4768 | continue_free: | 4768 | continue_free: |
4769 | kfree(pathbuf); | 4769 | kfree(pathbuf); |
4770 | kfree(agentbuf); | 4770 | kfree(agentbuf); |
4771 | spin_lock(&release_list_lock); | 4771 | raw_spin_lock(&release_list_lock); |
4772 | } | 4772 | } |
4773 | spin_unlock(&release_list_lock); | 4773 | raw_spin_unlock(&release_list_lock); |
4774 | mutex_unlock(&cgroup_mutex); | 4774 | mutex_unlock(&cgroup_mutex); |
4775 | } | 4775 | } |
4776 | 4776 | ||
diff --git a/kernel/cred.c b/kernel/cred.c index 8ef31f53c44c..bb55d052d858 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -644,6 +644,9 @@ void __init cred_init(void) | |||
644 | */ | 644 | */ |
645 | struct cred *prepare_kernel_cred(struct task_struct *daemon) | 645 | struct cred *prepare_kernel_cred(struct task_struct *daemon) |
646 | { | 646 | { |
647 | #ifdef CONFIG_KEYS | ||
648 | struct thread_group_cred *tgcred; | ||
649 | #endif | ||
647 | const struct cred *old; | 650 | const struct cred *old; |
648 | struct cred *new; | 651 | struct cred *new; |
649 | 652 | ||
@@ -651,6 +654,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
651 | if (!new) | 654 | if (!new) |
652 | return NULL; | 655 | return NULL; |
653 | 656 | ||
657 | #ifdef CONFIG_KEYS | ||
658 | tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); | ||
659 | if (!tgcred) { | ||
660 | kmem_cache_free(cred_jar, new); | ||
661 | return NULL; | ||
662 | } | ||
663 | #endif | ||
664 | |||
654 | kdebug("prepare_kernel_cred() alloc %p", new); | 665 | kdebug("prepare_kernel_cred() alloc %p", new); |
655 | 666 | ||
656 | if (daemon) | 667 | if (daemon) |
@@ -667,8 +678,11 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
667 | get_group_info(new->group_info); | 678 | get_group_info(new->group_info); |
668 | 679 | ||
669 | #ifdef CONFIG_KEYS | 680 | #ifdef CONFIG_KEYS |
670 | atomic_inc(&init_tgcred.usage); | 681 | atomic_set(&tgcred->usage, 1); |
671 | new->tgcred = &init_tgcred; | 682 | spin_lock_init(&tgcred->lock); |
683 | tgcred->process_keyring = NULL; | ||
684 | tgcred->session_keyring = NULL; | ||
685 | new->tgcred = tgcred; | ||
672 | new->request_key_auth = NULL; | 686 | new->request_key_auth = NULL; |
673 | new->thread_keyring = NULL; | 687 | new->thread_keyring = NULL; |
674 | new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; | 688 | new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; |
diff --git a/kernel/events/core.c b/kernel/events/core.c index fbe38f2e8edb..d1a1bee35228 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
30 | #include <linux/rculist.h> | 30 | #include <linux/rculist.h> |
31 | #include <linux/uaccess.h> | 31 | #include <linux/uaccess.h> |
32 | #include <linux/suspend.h> | ||
32 | #include <linux/syscalls.h> | 33 | #include <linux/syscalls.h> |
33 | #include <linux/anon_inodes.h> | 34 | #include <linux/anon_inodes.h> |
34 | #include <linux/kernel_stat.h> | 35 | #include <linux/kernel_stat.h> |
@@ -6852,7 +6853,7 @@ static void __cpuinit perf_event_init_cpu(int cpu) | |||
6852 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); | 6853 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
6853 | 6854 | ||
6854 | mutex_lock(&swhash->hlist_mutex); | 6855 | mutex_lock(&swhash->hlist_mutex); |
6855 | if (swhash->hlist_refcount > 0) { | 6856 | if (swhash->hlist_refcount > 0 && !swhash->swevent_hlist) { |
6856 | struct swevent_hlist *hlist; | 6857 | struct swevent_hlist *hlist; |
6857 | 6858 | ||
6858 | hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); | 6859 | hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); |
@@ -6941,7 +6942,14 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
6941 | { | 6942 | { |
6942 | unsigned int cpu = (long)hcpu; | 6943 | unsigned int cpu = (long)hcpu; |
6943 | 6944 | ||
6944 | switch (action & ~CPU_TASKS_FROZEN) { | 6945 | /* |
6946 | * Ignore suspend/resume action, the perf_pm_notifier will | ||
6947 | * take care of that. | ||
6948 | */ | ||
6949 | if (action & CPU_TASKS_FROZEN) | ||
6950 | return NOTIFY_OK; | ||
6951 | |||
6952 | switch (action) { | ||
6945 | 6953 | ||
6946 | case CPU_UP_PREPARE: | 6954 | case CPU_UP_PREPARE: |
6947 | case CPU_DOWN_FAILED: | 6955 | case CPU_DOWN_FAILED: |
@@ -6960,6 +6968,90 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
6960 | return NOTIFY_OK; | 6968 | return NOTIFY_OK; |
6961 | } | 6969 | } |
6962 | 6970 | ||
6971 | static void perf_pm_resume_cpu(void *unused) | ||
6972 | { | ||
6973 | struct perf_cpu_context *cpuctx; | ||
6974 | struct perf_event_context *ctx; | ||
6975 | struct pmu *pmu; | ||
6976 | int idx; | ||
6977 | |||
6978 | idx = srcu_read_lock(&pmus_srcu); | ||
6979 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
6980 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
6981 | ctx = cpuctx->task_ctx; | ||
6982 | |||
6983 | perf_ctx_lock(cpuctx, ctx); | ||
6984 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
6985 | |||
6986 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | ||
6987 | if (ctx) | ||
6988 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); | ||
6989 | |||
6990 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
6991 | perf_ctx_unlock(cpuctx, ctx); | ||
6992 | } | ||
6993 | srcu_read_unlock(&pmus_srcu, idx); | ||
6994 | } | ||
6995 | |||
6996 | static void perf_pm_suspend_cpu(void *unused) | ||
6997 | { | ||
6998 | struct perf_cpu_context *cpuctx; | ||
6999 | struct perf_event_context *ctx; | ||
7000 | struct pmu *pmu; | ||
7001 | int idx; | ||
7002 | |||
7003 | idx = srcu_read_lock(&pmus_srcu); | ||
7004 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
7005 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
7006 | ctx = cpuctx->task_ctx; | ||
7007 | |||
7008 | perf_ctx_lock(cpuctx, ctx); | ||
7009 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
7010 | |||
7011 | perf_event_sched_in(cpuctx, ctx, current); | ||
7012 | |||
7013 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
7014 | perf_ctx_unlock(cpuctx, ctx); | ||
7015 | } | ||
7016 | srcu_read_unlock(&pmus_srcu, idx); | ||
7017 | } | ||
7018 | |||
7019 | static int perf_resume(void) | ||
7020 | { | ||
7021 | get_online_cpus(); | ||
7022 | smp_call_function(perf_pm_resume_cpu, NULL, 1); | ||
7023 | put_online_cpus(); | ||
7024 | |||
7025 | return NOTIFY_OK; | ||
7026 | } | ||
7027 | |||
7028 | static int perf_suspend(void) | ||
7029 | { | ||
7030 | get_online_cpus(); | ||
7031 | smp_call_function(perf_pm_suspend_cpu, NULL, 1); | ||
7032 | put_online_cpus(); | ||
7033 | |||
7034 | return NOTIFY_OK; | ||
7035 | } | ||
7036 | |||
7037 | static int perf_pm(struct notifier_block *self, unsigned long action, void *ptr) | ||
7038 | { | ||
7039 | switch (action) { | ||
7040 | case PM_POST_HIBERNATION: | ||
7041 | case PM_POST_SUSPEND: | ||
7042 | return perf_resume(); | ||
7043 | case PM_HIBERNATION_PREPARE: | ||
7044 | case PM_SUSPEND_PREPARE: | ||
7045 | return perf_suspend(); | ||
7046 | default: | ||
7047 | return NOTIFY_DONE; | ||
7048 | } | ||
7049 | } | ||
7050 | |||
7051 | static struct notifier_block perf_pm_notifier = { | ||
7052 | .notifier_call = perf_pm, | ||
7053 | }; | ||
7054 | |||
6963 | void __init perf_event_init(void) | 7055 | void __init perf_event_init(void) |
6964 | { | 7056 | { |
6965 | int ret; | 7057 | int ret; |
@@ -6974,6 +7066,7 @@ void __init perf_event_init(void) | |||
6974 | perf_tp_register(); | 7066 | perf_tp_register(); |
6975 | perf_cpu_notifier(perf_cpu_notify); | 7067 | perf_cpu_notifier(perf_cpu_notify); |
6976 | register_reboot_notifier(&perf_reboot_notifier); | 7068 | register_reboot_notifier(&perf_reboot_notifier); |
7069 | register_pm_notifier(&perf_pm_notifier); | ||
6977 | 7070 | ||
6978 | ret = init_hw_breakpoint(); | 7071 | ret = init_hw_breakpoint(); |
6979 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); | 7072 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); |
diff --git a/kernel/freezer.c b/kernel/freezer.c index 7b01de98bb6a..66a594e8ad2f 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -67,7 +67,7 @@ static void fake_signal_wake_up(struct task_struct *p) | |||
67 | unsigned long flags; | 67 | unsigned long flags; |
68 | 68 | ||
69 | spin_lock_irqsave(&p->sighand->siglock, flags); | 69 | spin_lock_irqsave(&p->sighand->siglock, flags); |
70 | signal_wake_up(p, 0); | 70 | signal_wake_up(p, 1); |
71 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 71 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
72 | } | 72 | } |
73 | 73 | ||
diff --git a/kernel/futex.c b/kernel/futex.c index 11cbe052b2e8..1511dff0cfd6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -854,7 +854,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
854 | { | 854 | { |
855 | struct task_struct *new_owner; | 855 | struct task_struct *new_owner; |
856 | struct futex_pi_state *pi_state = this->pi_state; | 856 | struct futex_pi_state *pi_state = this->pi_state; |
857 | u32 curval, newval; | 857 | u32 uninitialized_var(curval), newval; |
858 | 858 | ||
859 | if (!pi_state) | 859 | if (!pi_state) |
860 | return -EINVAL; | 860 | return -EINVAL; |
@@ -916,7 +916,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
916 | 916 | ||
917 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | 917 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) |
918 | { | 918 | { |
919 | u32 oldval; | 919 | u32 uninitialized_var(oldval); |
920 | 920 | ||
921 | /* | 921 | /* |
922 | * There is no waiter, so we unlock the futex. The owner died | 922 | * There is no waiter, so we unlock the futex. The owner died |
@@ -1576,7 +1576,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
1576 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; | 1576 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; |
1577 | struct futex_pi_state *pi_state = q->pi_state; | 1577 | struct futex_pi_state *pi_state = q->pi_state; |
1578 | struct task_struct *oldowner = pi_state->owner; | 1578 | struct task_struct *oldowner = pi_state->owner; |
1579 | u32 uval, curval, newval; | 1579 | u32 uval, uninitialized_var(curval), newval; |
1580 | int ret; | 1580 | int ret; |
1581 | 1581 | ||
1582 | /* Owner died? */ | 1582 | /* Owner died? */ |
@@ -1793,7 +1793,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
1793 | * | 1793 | * |
1794 | * Returns: | 1794 | * Returns: |
1795 | * 0 - uaddr contains val and hb has been locked | 1795 | * 0 - uaddr contains val and hb has been locked |
1796 | * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked | 1796 | * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked |
1797 | */ | 1797 | */ |
1798 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, | 1798 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, |
1799 | struct futex_q *q, struct futex_hash_bucket **hb) | 1799 | struct futex_q *q, struct futex_hash_bucket **hb) |
@@ -2481,7 +2481,7 @@ err_unlock: | |||
2481 | */ | 2481 | */ |
2482 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) | 2482 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) |
2483 | { | 2483 | { |
2484 | u32 uval, nval, mval; | 2484 | u32 uval, uninitialized_var(nval), mval; |
2485 | 2485 | ||
2486 | retry: | 2486 | retry: |
2487 | if (get_user(uval, uaddr)) | 2487 | if (get_user(uval, uaddr)) |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d5828da3fd38..b57a3776de44 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -29,7 +29,11 @@ void irq_domain_add(struct irq_domain *domain) | |||
29 | */ | 29 | */ |
30 | for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { | 30 | for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { |
31 | d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); | 31 | d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); |
32 | if (d || d->domain) { | 32 | if (!d) { |
33 | WARN(1, "error: assigning domain to non existant irq_desc"); | ||
34 | return; | ||
35 | } | ||
36 | if (d->domain) { | ||
33 | /* things are broken; just report, don't clean up */ | 37 | /* things are broken; just report, don't clean up */ |
34 | WARN(1, "error: irq_desc already assigned to a domain"); | 38 | WARN(1, "error: irq_desc already assigned to a domain"); |
35 | return; | 39 | return; |
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f76fc00c9877..15e53b1766a6 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/irq.h> | 9 | #include <linux/irq.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | #include <linux/syscore_ops.h> | ||
12 | 13 | ||
13 | #include "internals.h" | 14 | #include "internals.h" |
14 | 15 | ||
@@ -39,25 +40,58 @@ void suspend_device_irqs(void) | |||
39 | } | 40 | } |
40 | EXPORT_SYMBOL_GPL(suspend_device_irqs); | 41 | EXPORT_SYMBOL_GPL(suspend_device_irqs); |
41 | 42 | ||
42 | /** | 43 | static void resume_irqs(bool want_early) |
43 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() | ||
44 | * | ||
45 | * Enable all interrupt lines previously disabled by suspend_device_irqs() that | ||
46 | * have the IRQS_SUSPENDED flag set. | ||
47 | */ | ||
48 | void resume_device_irqs(void) | ||
49 | { | 44 | { |
50 | struct irq_desc *desc; | 45 | struct irq_desc *desc; |
51 | int irq; | 46 | int irq; |
52 | 47 | ||
53 | for_each_irq_desc(irq, desc) { | 48 | for_each_irq_desc(irq, desc) { |
54 | unsigned long flags; | 49 | unsigned long flags; |
50 | bool is_early = desc->action && | ||
51 | desc->action->flags & IRQF_EARLY_RESUME; | ||
52 | |||
53 | if (is_early != want_early) | ||
54 | continue; | ||
55 | 55 | ||
56 | raw_spin_lock_irqsave(&desc->lock, flags); | 56 | raw_spin_lock_irqsave(&desc->lock, flags); |
57 | __enable_irq(desc, irq, true); | 57 | __enable_irq(desc, irq, true); |
58 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 58 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
59 | } | 59 | } |
60 | } | 60 | } |
61 | |||
62 | /** | ||
63 | * irq_pm_syscore_resume - enable interrupt lines early | ||
64 | * | ||
65 | * Enable all interrupt lines with %IRQF_EARLY_RESUME set. | ||
66 | */ | ||
67 | static void irq_pm_syscore_resume(void) | ||
68 | { | ||
69 | resume_irqs(true); | ||
70 | } | ||
71 | |||
72 | static struct syscore_ops irq_pm_syscore_ops = { | ||
73 | .resume = irq_pm_syscore_resume, | ||
74 | }; | ||
75 | |||
76 | static int __init irq_pm_init_ops(void) | ||
77 | { | ||
78 | register_syscore_ops(&irq_pm_syscore_ops); | ||
79 | return 0; | ||
80 | } | ||
81 | |||
82 | device_initcall(irq_pm_init_ops); | ||
83 | |||
84 | /** | ||
85 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() | ||
86 | * | ||
87 | * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously | ||
88 | * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag | ||
89 | * set as well as those with %IRQF_FORCE_RESUME. | ||
90 | */ | ||
91 | void resume_device_irqs(void) | ||
92 | { | ||
93 | resume_irqs(false); | ||
94 | } | ||
61 | EXPORT_SYMBOL_GPL(resume_device_irqs); | 95 | EXPORT_SYMBOL_GPL(resume_device_irqs); |
62 | 96 | ||
63 | /** | 97 | /** |
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index c58fa7da8aef..0e2cde4f380b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
@@ -17,54 +17,34 @@ | |||
17 | * claimed NULL, 3 -> {pending} : claimed to be enqueued | 17 | * claimed NULL, 3 -> {pending} : claimed to be enqueued |
18 | * pending next, 3 -> {busy} : queued, pending callback | 18 | * pending next, 3 -> {busy} : queued, pending callback |
19 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed | 19 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed |
20 | * | ||
21 | * We use the lower two bits of the next pointer to keep PENDING and BUSY | ||
22 | * flags. | ||
23 | */ | 20 | */ |
24 | 21 | ||
25 | #define IRQ_WORK_PENDING 1UL | 22 | #define IRQ_WORK_PENDING 1UL |
26 | #define IRQ_WORK_BUSY 2UL | 23 | #define IRQ_WORK_BUSY 2UL |
27 | #define IRQ_WORK_FLAGS 3UL | 24 | #define IRQ_WORK_FLAGS 3UL |
28 | 25 | ||
29 | static inline bool irq_work_is_set(struct irq_work *entry, int flags) | 26 | static DEFINE_PER_CPU(struct llist_head, irq_work_list); |
30 | { | ||
31 | return (unsigned long)entry->next & flags; | ||
32 | } | ||
33 | |||
34 | static inline struct irq_work *irq_work_next(struct irq_work *entry) | ||
35 | { | ||
36 | unsigned long next = (unsigned long)entry->next; | ||
37 | next &= ~IRQ_WORK_FLAGS; | ||
38 | return (struct irq_work *)next; | ||
39 | } | ||
40 | |||
41 | static inline struct irq_work *next_flags(struct irq_work *entry, int flags) | ||
42 | { | ||
43 | unsigned long next = (unsigned long)entry; | ||
44 | next |= flags; | ||
45 | return (struct irq_work *)next; | ||
46 | } | ||
47 | |||
48 | static DEFINE_PER_CPU(struct irq_work *, irq_work_list); | ||
49 | 27 | ||
50 | /* | 28 | /* |
51 | * Claim the entry so that no one else will poke at it. | 29 | * Claim the entry so that no one else will poke at it. |
52 | */ | 30 | */ |
53 | static bool irq_work_claim(struct irq_work *entry) | 31 | static bool irq_work_claim(struct irq_work *work) |
54 | { | 32 | { |
55 | struct irq_work *next, *nflags; | 33 | unsigned long flags, nflags; |
56 | 34 | ||
57 | do { | 35 | for (;;) { |
58 | next = entry->next; | 36 | flags = work->flags; |
59 | if ((unsigned long)next & IRQ_WORK_PENDING) | 37 | if (flags & IRQ_WORK_PENDING) |
60 | return false; | 38 | return false; |
61 | nflags = next_flags(next, IRQ_WORK_FLAGS); | 39 | nflags = flags | IRQ_WORK_FLAGS; |
62 | } while (cmpxchg(&entry->next, next, nflags) != next); | 40 | if (cmpxchg(&work->flags, flags, nflags) == flags) |
41 | break; | ||
42 | cpu_relax(); | ||
43 | } | ||
63 | 44 | ||
64 | return true; | 45 | return true; |
65 | } | 46 | } |
66 | 47 | ||
67 | |||
68 | void __weak arch_irq_work_raise(void) | 48 | void __weak arch_irq_work_raise(void) |
69 | { | 49 | { |
70 | /* | 50 | /* |
@@ -75,20 +55,15 @@ void __weak arch_irq_work_raise(void) | |||
75 | /* | 55 | /* |
76 | * Queue the entry and raise the IPI if needed. | 56 | * Queue the entry and raise the IPI if needed. |
77 | */ | 57 | */ |
78 | static void __irq_work_queue(struct irq_work *entry) | 58 | static void __irq_work_queue(struct irq_work *work) |
79 | { | 59 | { |
80 | struct irq_work *next; | 60 | bool empty; |
81 | 61 | ||
82 | preempt_disable(); | 62 | preempt_disable(); |
83 | 63 | ||
84 | do { | 64 | empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); |
85 | next = __this_cpu_read(irq_work_list); | ||
86 | /* Can assign non-atomic because we keep the flags set. */ | ||
87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); | ||
88 | } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next); | ||
89 | |||
90 | /* The list was empty, raise self-interrupt to start processing. */ | 65 | /* The list was empty, raise self-interrupt to start processing. */ |
91 | if (!irq_work_next(entry)) | 66 | if (empty) |
92 | arch_irq_work_raise(); | 67 | arch_irq_work_raise(); |
93 | 68 | ||
94 | preempt_enable(); | 69 | preempt_enable(); |
@@ -100,16 +75,16 @@ static void __irq_work_queue(struct irq_work *entry) | |||
100 | * | 75 | * |
101 | * Can be re-enqueued while the callback is still in progress. | 76 | * Can be re-enqueued while the callback is still in progress. |
102 | */ | 77 | */ |
103 | bool irq_work_queue(struct irq_work *entry) | 78 | bool irq_work_queue(struct irq_work *work) |
104 | { | 79 | { |
105 | if (!irq_work_claim(entry)) { | 80 | if (!irq_work_claim(work)) { |
106 | /* | 81 | /* |
107 | * Already enqueued, can't do! | 82 | * Already enqueued, can't do! |
108 | */ | 83 | */ |
109 | return false; | 84 | return false; |
110 | } | 85 | } |
111 | 86 | ||
112 | __irq_work_queue(entry); | 87 | __irq_work_queue(work); |
113 | return true; | 88 | return true; |
114 | } | 89 | } |
115 | EXPORT_SYMBOL_GPL(irq_work_queue); | 90 | EXPORT_SYMBOL_GPL(irq_work_queue); |
@@ -120,34 +95,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue); | |||
120 | */ | 95 | */ |
121 | void irq_work_run(void) | 96 | void irq_work_run(void) |
122 | { | 97 | { |
123 | struct irq_work *list; | 98 | struct irq_work *work; |
99 | struct llist_head *this_list; | ||
100 | struct llist_node *llnode; | ||
124 | 101 | ||
125 | if (this_cpu_read(irq_work_list) == NULL) | 102 | this_list = &__get_cpu_var(irq_work_list); |
103 | if (llist_empty(this_list)) | ||
126 | return; | 104 | return; |
127 | 105 | ||
128 | BUG_ON(!in_irq()); | 106 | BUG_ON(!in_irq()); |
129 | BUG_ON(!irqs_disabled()); | 107 | BUG_ON(!irqs_disabled()); |
130 | 108 | ||
131 | list = this_cpu_xchg(irq_work_list, NULL); | 109 | llnode = llist_del_all(this_list); |
132 | 110 | while (llnode != NULL) { | |
133 | while (list != NULL) { | 111 | work = llist_entry(llnode, struct irq_work, llnode); |
134 | struct irq_work *entry = list; | ||
135 | 112 | ||
136 | list = irq_work_next(list); | 113 | llnode = llist_next(llnode); |
137 | 114 | ||
138 | /* | 115 | /* |
139 | * Clear the PENDING bit, after this point the @entry | 116 | * Clear the PENDING bit, after this point the @work |
140 | * can be re-used. | 117 | * can be re-used. |
141 | */ | 118 | */ |
142 | entry->next = next_flags(NULL, IRQ_WORK_BUSY); | 119 | work->flags = IRQ_WORK_BUSY; |
143 | entry->func(entry); | 120 | work->func(work); |
144 | /* | 121 | /* |
145 | * Clear the BUSY bit and return to the free state if | 122 | * Clear the BUSY bit and return to the free state if |
146 | * no-one else claimed it meanwhile. | 123 | * no-one else claimed it meanwhile. |
147 | */ | 124 | */ |
148 | (void)cmpxchg(&entry->next, | 125 | (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); |
149 | next_flags(NULL, IRQ_WORK_BUSY), | ||
150 | NULL); | ||
151 | } | 126 | } |
152 | } | 127 | } |
153 | EXPORT_SYMBOL_GPL(irq_work_run); | 128 | EXPORT_SYMBOL_GPL(irq_work_run); |
@@ -156,11 +131,11 @@ EXPORT_SYMBOL_GPL(irq_work_run); | |||
156 | * Synchronize against the irq_work @entry, ensures the entry is not | 131 | * Synchronize against the irq_work @work, ensures it is not |
157 | * currently in use. | 132 | * currently in use. |
158 | */ | 133 | */ |
159 | void irq_work_sync(struct irq_work *entry) | 134 | void irq_work_sync(struct irq_work *work) |
160 | { | 135 | { |
161 | WARN_ON_ONCE(irqs_disabled()); | 136 | WARN_ON_ONCE(irqs_disabled()); |
162 | 137 | ||
163 | while (irq_work_is_set(entry, IRQ_WORK_BUSY)) | 138 | while (work->flags & IRQ_WORK_BUSY) |
164 | cpu_relax(); | 139 | cpu_relax(); |
165 | } | 140 | } |
166 | EXPORT_SYMBOL_GPL(irq_work_sync); | 141 | EXPORT_SYMBOL_GPL(irq_work_sync); |
diff --git a/kernel/kmod.c b/kernel/kmod.c index ddc7644c1305..a4bea97c75b6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...) | |||
114 | atomic_inc(&kmod_concurrent); | 114 | atomic_inc(&kmod_concurrent); |
115 | if (atomic_read(&kmod_concurrent) > max_modprobes) { | 115 | if (atomic_read(&kmod_concurrent) > max_modprobes) { |
116 | /* We may be blaming an innocent here, but unlikely */ | 116 | /* We may be blaming an innocent here, but unlikely */ |
117 | if (kmod_loop_msg++ < 5) | 117 | if (kmod_loop_msg < 5) { |
118 | printk(KERN_ERR | 118 | printk(KERN_ERR |
119 | "request_module: runaway loop modprobe %s\n", | 119 | "request_module: runaway loop modprobe %s\n", |
120 | module_name); | 120 | module_name); |
121 | kmod_loop_msg++; | ||
122 | } | ||
121 | atomic_dec(&kmod_concurrent); | 123 | atomic_dec(&kmod_concurrent); |
122 | return -ENOMEM; | 124 | return -ENOMEM; |
123 | } | 125 | } |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b30fd54eb985..2f193d0ba7f2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -78,10 +78,10 @@ static bool kprobes_all_disarmed; | |||
78 | static DEFINE_MUTEX(kprobe_mutex); | 78 | static DEFINE_MUTEX(kprobe_mutex); |
79 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 79 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
80 | static struct { | 80 | static struct { |
81 | spinlock_t lock ____cacheline_aligned_in_smp; | 81 | raw_spinlock_t lock ____cacheline_aligned_in_smp; |
82 | } kretprobe_table_locks[KPROBE_TABLE_SIZE]; | 82 | } kretprobe_table_locks[KPROBE_TABLE_SIZE]; |
83 | 83 | ||
84 | static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) | 84 | static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) |
85 | { | 85 | { |
86 | return &(kretprobe_table_locks[hash].lock); | 86 | return &(kretprobe_table_locks[hash].lock); |
87 | } | 87 | } |
@@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, | |||
1013 | hlist_del(&ri->hlist); | 1013 | hlist_del(&ri->hlist); |
1014 | INIT_HLIST_NODE(&ri->hlist); | 1014 | INIT_HLIST_NODE(&ri->hlist); |
1015 | if (likely(rp)) { | 1015 | if (likely(rp)) { |
1016 | spin_lock(&rp->lock); | 1016 | raw_spin_lock(&rp->lock); |
1017 | hlist_add_head(&ri->hlist, &rp->free_instances); | 1017 | hlist_add_head(&ri->hlist, &rp->free_instances); |
1018 | spin_unlock(&rp->lock); | 1018 | raw_spin_unlock(&rp->lock); |
1019 | } else | 1019 | } else |
1020 | /* Unregistering */ | 1020 | /* Unregistering */ |
1021 | hlist_add_head(&ri->hlist, head); | 1021 | hlist_add_head(&ri->hlist, head); |
@@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, | |||
1026 | __acquires(hlist_lock) | 1026 | __acquires(hlist_lock) |
1027 | { | 1027 | { |
1028 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 1028 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
1029 | spinlock_t *hlist_lock; | 1029 | raw_spinlock_t *hlist_lock; |
1030 | 1030 | ||
1031 | *head = &kretprobe_inst_table[hash]; | 1031 | *head = &kretprobe_inst_table[hash]; |
1032 | hlist_lock = kretprobe_table_lock_ptr(hash); | 1032 | hlist_lock = kretprobe_table_lock_ptr(hash); |
1033 | spin_lock_irqsave(hlist_lock, *flags); | 1033 | raw_spin_lock_irqsave(hlist_lock, *flags); |
1034 | } | 1034 | } |
1035 | 1035 | ||
1036 | static void __kprobes kretprobe_table_lock(unsigned long hash, | 1036 | static void __kprobes kretprobe_table_lock(unsigned long hash, |
1037 | unsigned long *flags) | 1037 | unsigned long *flags) |
1038 | __acquires(hlist_lock) | 1038 | __acquires(hlist_lock) |
1039 | { | 1039 | { |
1040 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 1040 | raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
1041 | spin_lock_irqsave(hlist_lock, *flags); | 1041 | raw_spin_lock_irqsave(hlist_lock, *flags); |
1042 | } | 1042 | } |
1043 | 1043 | ||
1044 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | 1044 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, |
@@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | |||
1046 | __releases(hlist_lock) | 1046 | __releases(hlist_lock) |
1047 | { | 1047 | { |
1048 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 1048 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
1049 | spinlock_t *hlist_lock; | 1049 | raw_spinlock_t *hlist_lock; |
1050 | 1050 | ||
1051 | hlist_lock = kretprobe_table_lock_ptr(hash); | 1051 | hlist_lock = kretprobe_table_lock_ptr(hash); |
1052 | spin_unlock_irqrestore(hlist_lock, *flags); | 1052 | raw_spin_unlock_irqrestore(hlist_lock, *flags); |
1053 | } | 1053 | } |
1054 | 1054 | ||
1055 | static void __kprobes kretprobe_table_unlock(unsigned long hash, | 1055 | static void __kprobes kretprobe_table_unlock(unsigned long hash, |
1056 | unsigned long *flags) | 1056 | unsigned long *flags) |
1057 | __releases(hlist_lock) | 1057 | __releases(hlist_lock) |
1058 | { | 1058 | { |
1059 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 1059 | raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
1060 | spin_unlock_irqrestore(hlist_lock, *flags); | 1060 | raw_spin_unlock_irqrestore(hlist_lock, *flags); |
1061 | } | 1061 | } |
1062 | 1062 | ||
1063 | /* | 1063 | /* |
@@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
1663 | 1663 | ||
1664 | /*TODO: consider to only swap the RA after the last pre_handler fired */ | 1664 | /*TODO: consider to only swap the RA after the last pre_handler fired */ |
1665 | hash = hash_ptr(current, KPROBE_HASH_BITS); | 1665 | hash = hash_ptr(current, KPROBE_HASH_BITS); |
1666 | spin_lock_irqsave(&rp->lock, flags); | 1666 | raw_spin_lock_irqsave(&rp->lock, flags); |
1667 | if (!hlist_empty(&rp->free_instances)) { | 1667 | if (!hlist_empty(&rp->free_instances)) { |
1668 | ri = hlist_entry(rp->free_instances.first, | 1668 | ri = hlist_entry(rp->free_instances.first, |
1669 | struct kretprobe_instance, hlist); | 1669 | struct kretprobe_instance, hlist); |
1670 | hlist_del(&ri->hlist); | 1670 | hlist_del(&ri->hlist); |
1671 | spin_unlock_irqrestore(&rp->lock, flags); | 1671 | raw_spin_unlock_irqrestore(&rp->lock, flags); |
1672 | 1672 | ||
1673 | ri->rp = rp; | 1673 | ri->rp = rp; |
1674 | ri->task = current; | 1674 | ri->task = current; |
@@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
1685 | kretprobe_table_unlock(hash, &flags); | 1685 | kretprobe_table_unlock(hash, &flags); |
1686 | } else { | 1686 | } else { |
1687 | rp->nmissed++; | 1687 | rp->nmissed++; |
1688 | spin_unlock_irqrestore(&rp->lock, flags); | 1688 | raw_spin_unlock_irqrestore(&rp->lock, flags); |
1689 | } | 1689 | } |
1690 | return 0; | 1690 | return 0; |
1691 | } | 1691 | } |
@@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
1721 | rp->maxactive = num_possible_cpus(); | 1721 | rp->maxactive = num_possible_cpus(); |
1722 | #endif | 1722 | #endif |
1723 | } | 1723 | } |
1724 | spin_lock_init(&rp->lock); | 1724 | raw_spin_lock_init(&rp->lock); |
1725 | INIT_HLIST_HEAD(&rp->free_instances); | 1725 | INIT_HLIST_HEAD(&rp->free_instances); |
1726 | for (i = 0; i < rp->maxactive; i++) { | 1726 | for (i = 0; i < rp->maxactive; i++) { |
1727 | inst = kmalloc(sizeof(struct kretprobe_instance) + | 1727 | inst = kmalloc(sizeof(struct kretprobe_instance) + |
@@ -1959,7 +1959,7 @@ static int __init init_kprobes(void) | |||
1959 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1959 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1960 | INIT_HLIST_HEAD(&kprobe_table[i]); | 1960 | INIT_HLIST_HEAD(&kprobe_table[i]); |
1961 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); | 1961 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); |
1962 | spin_lock_init(&(kretprobe_table_locks[i].lock)); | 1962 | raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); |
1963 | } | 1963 | } |
1964 | 1964 | ||
1965 | /* | 1965 | /* |
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 376066e10413..4ac8ebfcab59 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -58,7 +58,7 @@ | |||
58 | #include <linux/list.h> | 58 | #include <linux/list.h> |
59 | #include <linux/stacktrace.h> | 59 | #include <linux/stacktrace.h> |
60 | 60 | ||
61 | static DEFINE_SPINLOCK(latency_lock); | 61 | static DEFINE_RAW_SPINLOCK(latency_lock); |
62 | 62 | ||
63 | #define MAXLR 128 | 63 | #define MAXLR 128 |
64 | static struct latency_record latency_record[MAXLR]; | 64 | static struct latency_record latency_record[MAXLR]; |
@@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p) | |||
72 | if (!latencytop_enabled) | 72 | if (!latencytop_enabled) |
73 | return; | 73 | return; |
74 | 74 | ||
75 | spin_lock_irqsave(&latency_lock, flags); | 75 | raw_spin_lock_irqsave(&latency_lock, flags); |
76 | memset(&p->latency_record, 0, sizeof(p->latency_record)); | 76 | memset(&p->latency_record, 0, sizeof(p->latency_record)); |
77 | p->latency_record_count = 0; | 77 | p->latency_record_count = 0; |
78 | spin_unlock_irqrestore(&latency_lock, flags); | 78 | raw_spin_unlock_irqrestore(&latency_lock, flags); |
79 | } | 79 | } |
80 | 80 | ||
81 | static void clear_global_latency_tracing(void) | 81 | static void clear_global_latency_tracing(void) |
82 | { | 82 | { |
83 | unsigned long flags; | 83 | unsigned long flags; |
84 | 84 | ||
85 | spin_lock_irqsave(&latency_lock, flags); | 85 | raw_spin_lock_irqsave(&latency_lock, flags); |
86 | memset(&latency_record, 0, sizeof(latency_record)); | 86 | memset(&latency_record, 0, sizeof(latency_record)); |
87 | spin_unlock_irqrestore(&latency_lock, flags); | 87 | raw_spin_unlock_irqrestore(&latency_lock, flags); |
88 | } | 88 | } |
89 | 89 | ||
90 | static void __sched | 90 | static void __sched |
@@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
190 | lat.max = usecs; | 190 | lat.max = usecs; |
191 | store_stacktrace(tsk, &lat); | 191 | store_stacktrace(tsk, &lat); |
192 | 192 | ||
193 | spin_lock_irqsave(&latency_lock, flags); | 193 | raw_spin_lock_irqsave(&latency_lock, flags); |
194 | 194 | ||
195 | account_global_scheduler_latency(tsk, &lat); | 195 | account_global_scheduler_latency(tsk, &lat); |
196 | 196 | ||
@@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
231 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); | 231 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); |
232 | 232 | ||
233 | out_unlock: | 233 | out_unlock: |
234 | spin_unlock_irqrestore(&latency_lock, flags); | 234 | raw_spin_unlock_irqrestore(&latency_lock, flags); |
235 | } | 235 | } |
236 | 236 | ||
237 | static int lstats_show(struct seq_file *m, void *v) | 237 | static int lstats_show(struct seq_file *m, void *v) |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 91d67ce3a8d5..e69434b070da 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -96,8 +96,13 @@ static int graph_lock(void) | |||
96 | 96 | ||
97 | static inline int graph_unlock(void) | 97 | static inline int graph_unlock(void) |
98 | { | 98 | { |
99 | if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) | 99 | if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { |
100 | /* | ||
101 | * The lockdep graph lock isn't locked while we expect it to | ||
102 | * be, we're confused now, bye! | ||
103 | */ | ||
100 | return DEBUG_LOCKS_WARN_ON(1); | 104 | return DEBUG_LOCKS_WARN_ON(1); |
105 | } | ||
101 | 106 | ||
102 | current->lockdep_recursion--; | 107 | current->lockdep_recursion--; |
103 | arch_spin_unlock(&lockdep_lock); | 108 | arch_spin_unlock(&lockdep_lock); |
@@ -134,6 +139,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; | |||
134 | static inline struct lock_class *hlock_class(struct held_lock *hlock) | 139 | static inline struct lock_class *hlock_class(struct held_lock *hlock) |
135 | { | 140 | { |
136 | if (!hlock->class_idx) { | 141 | if (!hlock->class_idx) { |
142 | /* | ||
143 | * Someone passed in garbage, we give up. | ||
144 | */ | ||
137 | DEBUG_LOCKS_WARN_ON(1); | 145 | DEBUG_LOCKS_WARN_ON(1); |
138 | return NULL; | 146 | return NULL; |
139 | } | 147 | } |
@@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
687 | */ | 695 | */ |
688 | list_for_each_entry(class, hash_head, hash_entry) { | 696 | list_for_each_entry(class, hash_head, hash_entry) { |
689 | if (class->key == key) { | 697 | if (class->key == key) { |
698 | /* | ||
699 | * Huh! same key, different name? Did someone trample | ||
700 | * on some memory? We're most confused. | ||
701 | */ | ||
690 | WARN_ON_ONCE(class->name != lock->name); | 702 | WARN_ON_ONCE(class->name != lock->name); |
691 | return class; | 703 | return class; |
692 | } | 704 | } |
@@ -800,6 +812,10 @@ out_unlock_set: | |||
800 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) | 812 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) |
801 | lock->class_cache[subclass] = class; | 813 | lock->class_cache[subclass] = class; |
802 | 814 | ||
815 | /* | ||
816 | * Hash collision, did we smoke some? We found a class with a matching | ||
817 | * hash but the subclass -- which is hashed in -- didn't match. | ||
818 | */ | ||
803 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) | 819 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) |
804 | return NULL; | 820 | return NULL; |
805 | 821 | ||
@@ -926,7 +942,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, | |||
926 | unsigned long nr; | 942 | unsigned long nr; |
927 | 943 | ||
928 | nr = lock - list_entries; | 944 | nr = lock - list_entries; |
929 | WARN_ON(nr >= nr_list_entries); | 945 | WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ |
930 | lock->parent = parent; | 946 | lock->parent = parent; |
931 | lock->class->dep_gen_id = lockdep_dependency_gen_id; | 947 | lock->class->dep_gen_id = lockdep_dependency_gen_id; |
932 | } | 948 | } |
@@ -936,7 +952,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock) | |||
936 | unsigned long nr; | 952 | unsigned long nr; |
937 | 953 | ||
938 | nr = lock - list_entries; | 954 | nr = lock - list_entries; |
939 | WARN_ON(nr >= nr_list_entries); | 955 | WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ |
940 | return lock->class->dep_gen_id == lockdep_dependency_gen_id; | 956 | return lock->class->dep_gen_id == lockdep_dependency_gen_id; |
941 | } | 957 | } |
942 | 958 | ||
@@ -1129,10 +1145,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, | |||
1129 | if (debug_locks_silent) | 1145 | if (debug_locks_silent) |
1130 | return 0; | 1146 | return 0; |
1131 | 1147 | ||
1132 | printk("\n=======================================================\n"); | 1148 | printk("\n"); |
1133 | printk( "[ INFO: possible circular locking dependency detected ]\n"); | 1149 | printk("======================================================\n"); |
1150 | printk("[ INFO: possible circular locking dependency detected ]\n"); | ||
1134 | print_kernel_version(); | 1151 | print_kernel_version(); |
1135 | printk( "-------------------------------------------------------\n"); | 1152 | printk("-------------------------------------------------------\n"); |
1136 | printk("%s/%d is trying to acquire lock:\n", | 1153 | printk("%s/%d is trying to acquire lock:\n", |
1137 | curr->comm, task_pid_nr(curr)); | 1154 | curr->comm, task_pid_nr(curr)); |
1138 | print_lock(check_src); | 1155 | print_lock(check_src); |
@@ -1196,6 +1213,9 @@ static noinline int print_bfs_bug(int ret) | |||
1196 | if (!debug_locks_off_graph_unlock()) | 1213 | if (!debug_locks_off_graph_unlock()) |
1197 | return 0; | 1214 | return 0; |
1198 | 1215 | ||
1216 | /* | ||
1217 | * Breadth-first-search failed, graph got corrupted? | ||
1218 | */ | ||
1199 | WARN(1, "lockdep bfs error:%d\n", ret); | 1219 | WARN(1, "lockdep bfs error:%d\n", ret); |
1200 | 1220 | ||
1201 | return 0; | 1221 | return 0; |
@@ -1463,11 +1483,12 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1463 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1483 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
1464 | return 0; | 1484 | return 0; |
1465 | 1485 | ||
1466 | printk("\n======================================================\n"); | 1486 | printk("\n"); |
1467 | printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", | 1487 | printk("======================================================\n"); |
1488 | printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", | ||
1468 | irqclass, irqclass); | 1489 | irqclass, irqclass); |
1469 | print_kernel_version(); | 1490 | print_kernel_version(); |
1470 | printk( "------------------------------------------------------\n"); | 1491 | printk("------------------------------------------------------\n"); |
1471 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", | 1492 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", |
1472 | curr->comm, task_pid_nr(curr), | 1493 | curr->comm, task_pid_nr(curr), |
1473 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, | 1494 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, |
@@ -1692,10 +1713,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
1692 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1713 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
1693 | return 0; | 1714 | return 0; |
1694 | 1715 | ||
1695 | printk("\n=============================================\n"); | 1716 | printk("\n"); |
1696 | printk( "[ INFO: possible recursive locking detected ]\n"); | 1717 | printk("=============================================\n"); |
1718 | printk("[ INFO: possible recursive locking detected ]\n"); | ||
1697 | print_kernel_version(); | 1719 | print_kernel_version(); |
1698 | printk( "---------------------------------------------\n"); | 1720 | printk("---------------------------------------------\n"); |
1699 | printk("%s/%d is trying to acquire lock:\n", | 1721 | printk("%s/%d is trying to acquire lock:\n", |
1700 | curr->comm, task_pid_nr(curr)); | 1722 | curr->comm, task_pid_nr(curr)); |
1701 | print_lock(next); | 1723 | print_lock(next); |
@@ -1944,6 +1966,11 @@ out_bug: | |||
1944 | if (!debug_locks_off_graph_unlock()) | 1966 | if (!debug_locks_off_graph_unlock()) |
1945 | return 0; | 1967 | return 0; |
1946 | 1968 | ||
1969 | /* | ||
1970 | * Clearly we all shouldn't be here, but since we made it we | ||
1971 | * can reliably say we messed up our state. See the above two | ||
1972 | * gotos for reasons why we could possibly end up here. | ||
1973 | */ | ||
1947 | WARN_ON(1); | 1974 | WARN_ON(1); |
1948 | 1975 | ||
1949 | return 0; | 1976 | return 0; |
@@ -1975,6 +2002,11 @@ static inline int lookup_chain_cache(struct task_struct *curr, | |||
1975 | struct held_lock *hlock_curr, *hlock_next; | 2002 | struct held_lock *hlock_curr, *hlock_next; |
1976 | int i, j; | 2003 | int i, j; |
1977 | 2004 | ||
2005 | /* | ||
2006 | * We might need to take the graph lock, ensure we've got IRQs | ||
2007 | * disabled to make this an IRQ-safe lock.. for recursion reasons | ||
2008 | * lockdep won't complain about its own locking errors. | ||
2009 | */ | ||
1978 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2010 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
1979 | return 0; | 2011 | return 0; |
1980 | /* | 2012 | /* |
@@ -2126,6 +2158,10 @@ static void check_chain_key(struct task_struct *curr) | |||
2126 | hlock = curr->held_locks + i; | 2158 | hlock = curr->held_locks + i; |
2127 | if (chain_key != hlock->prev_chain_key) { | 2159 | if (chain_key != hlock->prev_chain_key) { |
2128 | debug_locks_off(); | 2160 | debug_locks_off(); |
2161 | /* | ||
2162 | * We got mighty confused, our chain keys don't match | ||
2163 | * with what we expect, someone trample on our task state? | ||
2164 | */ | ||
2129 | WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", | 2165 | WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", |
2130 | curr->lockdep_depth, i, | 2166 | curr->lockdep_depth, i, |
2131 | (unsigned long long)chain_key, | 2167 | (unsigned long long)chain_key, |
@@ -2133,6 +2169,9 @@ static void check_chain_key(struct task_struct *curr) | |||
2133 | return; | 2169 | return; |
2134 | } | 2170 | } |
2135 | id = hlock->class_idx - 1; | 2171 | id = hlock->class_idx - 1; |
2172 | /* | ||
2173 | * Whoops ran out of static storage again? | ||
2174 | */ | ||
2136 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) | 2175 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) |
2137 | return; | 2176 | return; |
2138 | 2177 | ||
@@ -2144,6 +2183,10 @@ static void check_chain_key(struct task_struct *curr) | |||
2144 | } | 2183 | } |
2145 | if (chain_key != curr->curr_chain_key) { | 2184 | if (chain_key != curr->curr_chain_key) { |
2146 | debug_locks_off(); | 2185 | debug_locks_off(); |
2186 | /* | ||
2187 | * More smoking hash instead of calculating it, damn see these | ||
2188 | * numbers float.. I bet that a pink elephant stepped on my memory. | ||
2189 | */ | ||
2147 | WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", | 2190 | WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", |
2148 | curr->lockdep_depth, i, | 2191 | curr->lockdep_depth, i, |
2149 | (unsigned long long)chain_key, | 2192 | (unsigned long long)chain_key, |
@@ -2177,10 +2220,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
2177 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2220 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
2178 | return 0; | 2221 | return 0; |
2179 | 2222 | ||
2180 | printk("\n=================================\n"); | 2223 | printk("\n"); |
2181 | printk( "[ INFO: inconsistent lock state ]\n"); | 2224 | printk("=================================\n"); |
2225 | printk("[ INFO: inconsistent lock state ]\n"); | ||
2182 | print_kernel_version(); | 2226 | print_kernel_version(); |
2183 | printk( "---------------------------------\n"); | 2227 | printk("---------------------------------\n"); |
2184 | 2228 | ||
2185 | printk("inconsistent {%s} -> {%s} usage.\n", | 2229 | printk("inconsistent {%s} -> {%s} usage.\n", |
2186 | usage_str[prev_bit], usage_str[new_bit]); | 2230 | usage_str[prev_bit], usage_str[new_bit]); |
@@ -2241,10 +2285,11 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2241 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2285 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
2242 | return 0; | 2286 | return 0; |
2243 | 2287 | ||
2244 | printk("\n=========================================================\n"); | 2288 | printk("\n"); |
2245 | printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); | 2289 | printk("=========================================================\n"); |
2290 | printk("[ INFO: possible irq lock inversion dependency detected ]\n"); | ||
2246 | print_kernel_version(); | 2291 | print_kernel_version(); |
2247 | printk( "---------------------------------------------------------\n"); | 2292 | printk("---------------------------------------------------------\n"); |
2248 | printk("%s/%d just changed the state of lock:\n", | 2293 | printk("%s/%d just changed the state of lock:\n", |
2249 | curr->comm, task_pid_nr(curr)); | 2294 | curr->comm, task_pid_nr(curr)); |
2250 | print_lock(this); | 2295 | print_lock(this); |
@@ -2525,12 +2570,24 @@ void trace_hardirqs_on_caller(unsigned long ip) | |||
2525 | return; | 2570 | return; |
2526 | } | 2571 | } |
2527 | 2572 | ||
2573 | /* | ||
2574 | * We're enabling irqs and according to our state above irqs weren't | ||
2575 | * already enabled, yet we find the hardware thinks they are in fact | ||
2576 | * enabled.. someone messed up their IRQ state tracing. | ||
2577 | */ | ||
2528 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2578 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2529 | return; | 2579 | return; |
2530 | 2580 | ||
2581 | /* | ||
2582 | * See the fine text that goes along with this variable definition. | ||
2583 | */ | ||
2531 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) | 2584 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) |
2532 | return; | 2585 | return; |
2533 | 2586 | ||
2587 | /* | ||
2588 | * Can't allow enabling interrupts while in an interrupt handler, | ||
2589 | * that's generally bad form and such. Recursion, limited stack etc.. | ||
2590 | */ | ||
2534 | if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) | 2591 | if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) |
2535 | return; | 2592 | return; |
2536 | 2593 | ||
@@ -2558,6 +2615,10 @@ void trace_hardirqs_off_caller(unsigned long ip) | |||
2558 | if (unlikely(!debug_locks || current->lockdep_recursion)) | 2615 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
2559 | return; | 2616 | return; |
2560 | 2617 | ||
2618 | /* | ||
2619 | * So we're supposed to get called after you mask local IRQs, but for | ||
2620 | * some reason the hardware doesn't quite think you did a proper job. | ||
2621 | */ | ||
2561 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2622 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2562 | return; | 2623 | return; |
2563 | 2624 | ||
@@ -2590,6 +2651,10 @@ void trace_softirqs_on(unsigned long ip) | |||
2590 | if (unlikely(!debug_locks || current->lockdep_recursion)) | 2651 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
2591 | return; | 2652 | return; |
2592 | 2653 | ||
2654 | /* | ||
2655 | * We fancy IRQs being disabled here, see softirq.c, avoids | ||
2656 | * funny state and nesting things. | ||
2657 | */ | ||
2593 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2658 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2594 | return; | 2659 | return; |
2595 | 2660 | ||
@@ -2626,6 +2691,9 @@ void trace_softirqs_off(unsigned long ip) | |||
2626 | if (unlikely(!debug_locks || current->lockdep_recursion)) | 2691 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
2627 | return; | 2692 | return; |
2628 | 2693 | ||
2694 | /* | ||
2695 | * We fancy IRQs being disabled here, see softirq.c | ||
2696 | */ | ||
2629 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2697 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2630 | return; | 2698 | return; |
2631 | 2699 | ||
@@ -2637,6 +2705,9 @@ void trace_softirqs_off(unsigned long ip) | |||
2637 | curr->softirq_disable_ip = ip; | 2705 | curr->softirq_disable_ip = ip; |
2638 | curr->softirq_disable_event = ++curr->irq_events; | 2706 | curr->softirq_disable_event = ++curr->irq_events; |
2639 | debug_atomic_inc(softirqs_off_events); | 2707 | debug_atomic_inc(softirqs_off_events); |
2708 | /* | ||
2709 | * Whoops, we wanted softirqs off, so why aren't they? | ||
2710 | */ | ||
2640 | DEBUG_LOCKS_WARN_ON(!softirq_count()); | 2711 | DEBUG_LOCKS_WARN_ON(!softirq_count()); |
2641 | } else | 2712 | } else |
2642 | debug_atomic_inc(redundant_softirqs_off); | 2713 | debug_atomic_inc(redundant_softirqs_off); |
@@ -2661,6 +2732,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) | |||
2661 | if (!(gfp_mask & __GFP_FS)) | 2732 | if (!(gfp_mask & __GFP_FS)) |
2662 | return; | 2733 | return; |
2663 | 2734 | ||
2735 | /* | ||
2736 | * Oi! Can't be having __GFP_FS allocations with IRQs disabled. | ||
2737 | */ | ||
2664 | if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) | 2738 | if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) |
2665 | return; | 2739 | return; |
2666 | 2740 | ||
@@ -2773,13 +2847,13 @@ static int separate_irq_context(struct task_struct *curr, | |||
2773 | return 0; | 2847 | return 0; |
2774 | } | 2848 | } |
2775 | 2849 | ||
2776 | #else | 2850 | #else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ |
2777 | 2851 | ||
2778 | static inline | 2852 | static inline |
2779 | int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | 2853 | int mark_lock_irq(struct task_struct *curr, struct held_lock *this, |
2780 | enum lock_usage_bit new_bit) | 2854 | enum lock_usage_bit new_bit) |
2781 | { | 2855 | { |
2782 | WARN_ON(1); | 2856 | WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAGS */ |
2783 | return 1; | 2857 | return 1; |
2784 | } | 2858 | } |
2785 | 2859 | ||
@@ -2799,7 +2873,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask) | |||
2799 | { | 2873 | { |
2800 | } | 2874 | } |
2801 | 2875 | ||
2802 | #endif | 2876 | #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ |
2803 | 2877 | ||
2804 | /* | 2878 | /* |
2805 | * Mark a lock with a usage bit, and validate the state transition: | 2879 | * Mark a lock with a usage bit, and validate the state transition: |
@@ -2880,6 +2954,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
2880 | lock->cpu = raw_smp_processor_id(); | 2954 | lock->cpu = raw_smp_processor_id(); |
2881 | #endif | 2955 | #endif |
2882 | 2956 | ||
2957 | /* | ||
2958 | * Can't be having no nameless bastards around this place! | ||
2959 | */ | ||
2883 | if (DEBUG_LOCKS_WARN_ON(!name)) { | 2960 | if (DEBUG_LOCKS_WARN_ON(!name)) { |
2884 | lock->name = "NULL"; | 2961 | lock->name = "NULL"; |
2885 | return; | 2962 | return; |
@@ -2887,6 +2964,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
2887 | 2964 | ||
2888 | lock->name = name; | 2965 | lock->name = name; |
2889 | 2966 | ||
2967 | /* | ||
2968 | * No key, no joy, we need to hash something. | ||
2969 | */ | ||
2890 | if (DEBUG_LOCKS_WARN_ON(!key)) | 2970 | if (DEBUG_LOCKS_WARN_ON(!key)) |
2891 | return; | 2971 | return; |
2892 | /* | 2972 | /* |
@@ -2894,6 +2974,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
2894 | */ | 2974 | */ |
2895 | if (!static_obj(key)) { | 2975 | if (!static_obj(key)) { |
2896 | printk("BUG: key %p not in .data!\n", key); | 2976 | printk("BUG: key %p not in .data!\n", key); |
2977 | /* | ||
2978 | * What it says above ^^^^^, I suggest you read it. | ||
2979 | */ | ||
2897 | DEBUG_LOCKS_WARN_ON(1); | 2980 | DEBUG_LOCKS_WARN_ON(1); |
2898 | return; | 2981 | return; |
2899 | } | 2982 | } |
@@ -2932,6 +3015,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2932 | if (unlikely(!debug_locks)) | 3015 | if (unlikely(!debug_locks)) |
2933 | return 0; | 3016 | return 0; |
2934 | 3017 | ||
3018 | /* | ||
3019 | * Lockdep should run with IRQs disabled, otherwise we could | ||
3020 | * get an interrupt which would want to take locks, which would | ||
3021 | * end up in lockdep and have you got a head-ache already? | ||
3022 | */ | ||
2935 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 3023 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2936 | return 0; | 3024 | return 0; |
2937 | 3025 | ||
@@ -2963,6 +3051,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2963 | * dependency checks are done) | 3051 | * dependency checks are done) |
2964 | */ | 3052 | */ |
2965 | depth = curr->lockdep_depth; | 3053 | depth = curr->lockdep_depth; |
3054 | /* | ||
3055 | * Ran out of static storage for our per-task lock stack again have we? | ||
3056 | */ | ||
2966 | if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) | 3057 | if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) |
2967 | return 0; | 3058 | return 0; |
2968 | 3059 | ||
@@ -2981,6 +3072,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2981 | } | 3072 | } |
2982 | 3073 | ||
2983 | hlock = curr->held_locks + depth; | 3074 | hlock = curr->held_locks + depth; |
3075 | /* | ||
3076 | * Plain impossible, we just registered it and checked it weren't no | ||
3077 | * NULL like.. I bet this mushroom I ate was good! | ||
3078 | */ | ||
2984 | if (DEBUG_LOCKS_WARN_ON(!class)) | 3079 | if (DEBUG_LOCKS_WARN_ON(!class)) |
2985 | return 0; | 3080 | return 0; |
2986 | hlock->class_idx = class_idx; | 3081 | hlock->class_idx = class_idx; |
@@ -3015,11 +3110,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3015 | * the hash, not class->key. | 3110 | * the hash, not class->key. |
3016 | */ | 3111 | */ |
3017 | id = class - lock_classes; | 3112 | id = class - lock_classes; |
3113 | /* | ||
3114 | * Whoops, we did it again.. ran straight out of our static allocation. | ||
3115 | */ | ||
3018 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) | 3116 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) |
3019 | return 0; | 3117 | return 0; |
3020 | 3118 | ||
3021 | chain_key = curr->curr_chain_key; | 3119 | chain_key = curr->curr_chain_key; |
3022 | if (!depth) { | 3120 | if (!depth) { |
3121 | /* | ||
3122 | * How can we have a chain hash when we ain't got no keys?! | ||
3123 | */ | ||
3023 | if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) | 3124 | if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) |
3024 | return 0; | 3125 | return 0; |
3025 | chain_head = 1; | 3126 | chain_head = 1; |
@@ -3065,9 +3166,10 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3065 | if (debug_locks_silent) | 3166 | if (debug_locks_silent) |
3066 | return 0; | 3167 | return 0; |
3067 | 3168 | ||
3068 | printk("\n=====================================\n"); | 3169 | printk("\n"); |
3069 | printk( "[ BUG: bad unlock balance detected! ]\n"); | 3170 | printk("=====================================\n"); |
3070 | printk( "-------------------------------------\n"); | 3171 | printk("[ BUG: bad unlock balance detected! ]\n"); |
3172 | printk("-------------------------------------\n"); | ||
3071 | printk("%s/%d is trying to release lock (", | 3173 | printk("%s/%d is trying to release lock (", |
3072 | curr->comm, task_pid_nr(curr)); | 3174 | curr->comm, task_pid_nr(curr)); |
3073 | print_lockdep_cache(lock); | 3175 | print_lockdep_cache(lock); |
@@ -3091,6 +3193,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, | |||
3091 | { | 3193 | { |
3092 | if (unlikely(!debug_locks)) | 3194 | if (unlikely(!debug_locks)) |
3093 | return 0; | 3195 | return 0; |
3196 | /* | ||
3197 | * Lockdep should run with IRQs disabled, recursion, head-ache, etc.. | ||
3198 | */ | ||
3094 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 3199 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
3095 | return 0; | 3200 | return 0; |
3096 | 3201 | ||
@@ -3120,6 +3225,11 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) | |||
3120 | if (!class) | 3225 | if (!class) |
3121 | return 0; | 3226 | return 0; |
3122 | 3227 | ||
3228 | /* | ||
3229 | * References, but not a lock we're actually ref-counting? | ||
3230 | * State got messed up, follow the sites that change ->references | ||
3231 | * and try to make sense of it. | ||
3232 | */ | ||
3123 | if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) | 3233 | if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) |
3124 | return 0; | 3234 | return 0; |
3125 | 3235 | ||
@@ -3142,6 +3252,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, | |||
3142 | int i; | 3252 | int i; |
3143 | 3253 | ||
3144 | depth = curr->lockdep_depth; | 3254 | depth = curr->lockdep_depth; |
3255 | /* | ||
3256 | * This function is about (re)setting the class of a held lock, | ||
3257 | * yet we're not actually holding any locks. Naughty user! | ||
3258 | */ | ||
3145 | if (DEBUG_LOCKS_WARN_ON(!depth)) | 3259 | if (DEBUG_LOCKS_WARN_ON(!depth)) |
3146 | return 0; | 3260 | return 0; |
3147 | 3261 | ||
@@ -3177,6 +3291,10 @@ found_it: | |||
3177 | return 0; | 3291 | return 0; |
3178 | } | 3292 | } |
3179 | 3293 | ||
3294 | /* | ||
3295 | * I took it apart and put it back together again, except now I have | ||
3296 | * these 'spare' parts.. where shall I put them. | ||
3297 | */ | ||
3180 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) | 3298 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) |
3181 | return 0; | 3299 | return 0; |
3182 | return 1; | 3300 | return 1; |
@@ -3201,6 +3319,10 @@ lock_release_non_nested(struct task_struct *curr, | |||
3201 | * of held locks: | 3319 | * of held locks: |
3202 | */ | 3320 | */ |
3203 | depth = curr->lockdep_depth; | 3321 | depth = curr->lockdep_depth; |
3322 | /* | ||
3323 | * So we're all set to release this lock.. wait what lock? We don't | ||
3324 | * own any locks, you've been drinking again? | ||
3325 | */ | ||
3204 | if (DEBUG_LOCKS_WARN_ON(!depth)) | 3326 | if (DEBUG_LOCKS_WARN_ON(!depth)) |
3205 | return 0; | 3327 | return 0; |
3206 | 3328 | ||
@@ -3253,6 +3375,10 @@ found_it: | |||
3253 | return 0; | 3375 | return 0; |
3254 | } | 3376 | } |
3255 | 3377 | ||
3378 | /* | ||
3379 | * We had N bottles of beer on the wall, we drank one, but now | ||
3380 | * there's not N-1 bottles of beer left on the wall... | ||
3381 | */ | ||
3256 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) | 3382 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) |
3257 | return 0; | 3383 | return 0; |
3258 | return 1; | 3384 | return 1; |
@@ -3283,6 +3409,9 @@ static int lock_release_nested(struct task_struct *curr, | |||
3283 | return lock_release_non_nested(curr, lock, ip); | 3409 | return lock_release_non_nested(curr, lock, ip); |
3284 | curr->lockdep_depth--; | 3410 | curr->lockdep_depth--; |
3285 | 3411 | ||
3412 | /* | ||
3413 | * No more locks, but somehow we've got hash left over, who left it? | ||
3414 | */ | ||
3286 | if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) | 3415 | if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) |
3287 | return 0; | 3416 | return 0; |
3288 | 3417 | ||
@@ -3365,10 +3494,13 @@ static void check_flags(unsigned long flags) | |||
3365 | * check if not in hardirq contexts: | 3494 | * check if not in hardirq contexts: |
3366 | */ | 3495 | */ |
3367 | if (!hardirq_count()) { | 3496 | if (!hardirq_count()) { |
3368 | if (softirq_count()) | 3497 | if (softirq_count()) { |
3498 | /* like the above, but with softirqs */ | ||
3369 | DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); | 3499 | DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); |
3370 | else | 3500 | } else { |
3501 | /* lick the above, does it taste good? */ | ||
3371 | DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); | 3502 | DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); |
3503 | } | ||
3372 | } | 3504 | } |
3373 | 3505 | ||
3374 | if (!debug_locks) | 3506 | if (!debug_locks) |
@@ -3478,9 +3610,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3478 | if (debug_locks_silent) | 3610 | if (debug_locks_silent) |
3479 | return 0; | 3611 | return 0; |
3480 | 3612 | ||
3481 | printk("\n=================================\n"); | 3613 | printk("\n"); |
3482 | printk( "[ BUG: bad contention detected! ]\n"); | 3614 | printk("=================================\n"); |
3483 | printk( "---------------------------------\n"); | 3615 | printk("[ BUG: bad contention detected! ]\n"); |
3616 | printk("---------------------------------\n"); | ||
3484 | printk("%s/%d is trying to contend lock (", | 3617 | printk("%s/%d is trying to contend lock (", |
3485 | curr->comm, task_pid_nr(curr)); | 3618 | curr->comm, task_pid_nr(curr)); |
3486 | print_lockdep_cache(lock); | 3619 | print_lockdep_cache(lock); |
@@ -3506,6 +3639,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) | |||
3506 | int i, contention_point, contending_point; | 3639 | int i, contention_point, contending_point; |
3507 | 3640 | ||
3508 | depth = curr->lockdep_depth; | 3641 | depth = curr->lockdep_depth; |
3642 | /* | ||
3643 | * Whee, we contended on this lock, except it seems we're not | ||
3644 | * actually trying to acquire anything much at all.. | ||
3645 | */ | ||
3509 | if (DEBUG_LOCKS_WARN_ON(!depth)) | 3646 | if (DEBUG_LOCKS_WARN_ON(!depth)) |
3510 | return; | 3647 | return; |
3511 | 3648 | ||
@@ -3555,6 +3692,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) | |||
3555 | int i, cpu; | 3692 | int i, cpu; |
3556 | 3693 | ||
3557 | depth = curr->lockdep_depth; | 3694 | depth = curr->lockdep_depth; |
3695 | /* | ||
3696 | * Yay, we acquired ownership of this lock we didn't try to | ||
3697 | * acquire, how the heck did that happen? | ||
3698 | */ | ||
3558 | if (DEBUG_LOCKS_WARN_ON(!depth)) | 3699 | if (DEBUG_LOCKS_WARN_ON(!depth)) |
3559 | return; | 3700 | return; |
3560 | 3701 | ||
@@ -3759,8 +3900,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
3759 | match |= class == lock->class_cache[j]; | 3900 | match |= class == lock->class_cache[j]; |
3760 | 3901 | ||
3761 | if (unlikely(match)) { | 3902 | if (unlikely(match)) { |
3762 | if (debug_locks_off_graph_unlock()) | 3903 | if (debug_locks_off_graph_unlock()) { |
3904 | /* | ||
3905 | * We all just reset everything, how did it match? | ||
3906 | */ | ||
3763 | WARN_ON(1); | 3907 | WARN_ON(1); |
3908 | } | ||
3764 | goto out_restore; | 3909 | goto out_restore; |
3765 | } | 3910 | } |
3766 | } | 3911 | } |
@@ -3839,9 +3984,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
3839 | if (debug_locks_silent) | 3984 | if (debug_locks_silent) |
3840 | return; | 3985 | return; |
3841 | 3986 | ||
3842 | printk("\n=========================\n"); | 3987 | printk("\n"); |
3843 | printk( "[ BUG: held lock freed! ]\n"); | 3988 | printk("=========================\n"); |
3844 | printk( "-------------------------\n"); | 3989 | printk("[ BUG: held lock freed! ]\n"); |
3990 | printk("-------------------------\n"); | ||
3845 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | 3991 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", |
3846 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); | 3992 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); |
3847 | print_lock(hlock); | 3993 | print_lock(hlock); |
@@ -3895,9 +4041,10 @@ static void print_held_locks_bug(struct task_struct *curr) | |||
3895 | if (debug_locks_silent) | 4041 | if (debug_locks_silent) |
3896 | return; | 4042 | return; |
3897 | 4043 | ||
3898 | printk("\n=====================================\n"); | 4044 | printk("\n"); |
3899 | printk( "[ BUG: lock held at task exit time! ]\n"); | 4045 | printk("=====================================\n"); |
3900 | printk( "-------------------------------------\n"); | 4046 | printk("[ BUG: lock held at task exit time! ]\n"); |
4047 | printk("-------------------------------------\n"); | ||
3901 | printk("%s/%d is exiting with locks still held!\n", | 4048 | printk("%s/%d is exiting with locks still held!\n", |
3902 | curr->comm, task_pid_nr(curr)); | 4049 | curr->comm, task_pid_nr(curr)); |
3903 | lockdep_print_held_locks(curr); | 4050 | lockdep_print_held_locks(curr); |
@@ -3991,16 +4138,17 @@ void lockdep_sys_exit(void) | |||
3991 | if (unlikely(curr->lockdep_depth)) { | 4138 | if (unlikely(curr->lockdep_depth)) { |
3992 | if (!debug_locks_off()) | 4139 | if (!debug_locks_off()) |
3993 | return; | 4140 | return; |
3994 | printk("\n================================================\n"); | 4141 | printk("\n"); |
3995 | printk( "[ BUG: lock held when returning to user space! ]\n"); | 4142 | printk("================================================\n"); |
3996 | printk( "------------------------------------------------\n"); | 4143 | printk("[ BUG: lock held when returning to user space! ]\n"); |
4144 | printk("------------------------------------------------\n"); | ||
3997 | printk("%s/%d is leaving the kernel with locks still held!\n", | 4145 | printk("%s/%d is leaving the kernel with locks still held!\n", |
3998 | curr->comm, curr->pid); | 4146 | curr->comm, curr->pid); |
3999 | lockdep_print_held_locks(curr); | 4147 | lockdep_print_held_locks(curr); |
4000 | } | 4148 | } |
4001 | } | 4149 | } |
4002 | 4150 | ||
4003 | void lockdep_rcu_dereference(const char *file, const int line) | 4151 | void lockdep_rcu_suspicious(const char *file, const int line, const char *s) |
4004 | { | 4152 | { |
4005 | struct task_struct *curr = current; | 4153 | struct task_struct *curr = current; |
4006 | 4154 | ||
@@ -4009,15 +4157,15 @@ void lockdep_rcu_dereference(const char *file, const int line) | |||
4009 | return; | 4157 | return; |
4010 | #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ | 4158 | #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ |
4011 | /* Note: the following can be executed concurrently, so be careful. */ | 4159 | /* Note: the following can be executed concurrently, so be careful. */ |
4012 | printk("\n===================================================\n"); | 4160 | printk("\n"); |
4013 | printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); | 4161 | printk("===============================\n"); |
4014 | printk( "---------------------------------------------------\n"); | 4162 | printk("[ INFO: suspicious RCU usage. ]\n"); |
4015 | printk("%s:%d invoked rcu_dereference_check() without protection!\n", | 4163 | printk("-------------------------------\n"); |
4016 | file, line); | 4164 | printk("%s:%d %s!\n", file, line, s); |
4017 | printk("\nother info that might help us debug this:\n\n"); | 4165 | printk("\nother info that might help us debug this:\n\n"); |
4018 | printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); | 4166 | printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); |
4019 | lockdep_print_held_locks(curr); | 4167 | lockdep_print_held_locks(curr); |
4020 | printk("\nstack backtrace:\n"); | 4168 | printk("\nstack backtrace:\n"); |
4021 | dump_stack(); | 4169 | dump_stack(); |
4022 | } | 4170 | } |
4023 | EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); | 4171 | EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); |
diff --git a/kernel/module.c b/kernel/module.c index 04379f92f843..93342d992f34 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -3487,50 +3487,3 @@ void module_layout(struct module *mod, | |||
3487 | } | 3487 | } |
3488 | EXPORT_SYMBOL(module_layout); | 3488 | EXPORT_SYMBOL(module_layout); |
3489 | #endif | 3489 | #endif |
3490 | |||
3491 | #ifdef CONFIG_TRACEPOINTS | ||
3492 | void module_update_tracepoints(void) | ||
3493 | { | ||
3494 | struct module *mod; | ||
3495 | |||
3496 | mutex_lock(&module_mutex); | ||
3497 | list_for_each_entry(mod, &modules, list) | ||
3498 | if (!mod->taints) | ||
3499 | tracepoint_update_probe_range(mod->tracepoints_ptrs, | ||
3500 | mod->tracepoints_ptrs + mod->num_tracepoints); | ||
3501 | mutex_unlock(&module_mutex); | ||
3502 | } | ||
3503 | |||
3504 | /* | ||
3505 | * Returns 0 if current not found. | ||
3506 | * Returns 1 if current found. | ||
3507 | */ | ||
3508 | int module_get_iter_tracepoints(struct tracepoint_iter *iter) | ||
3509 | { | ||
3510 | struct module *iter_mod; | ||
3511 | int found = 0; | ||
3512 | |||
3513 | mutex_lock(&module_mutex); | ||
3514 | list_for_each_entry(iter_mod, &modules, list) { | ||
3515 | if (!iter_mod->taints) { | ||
3516 | /* | ||
3517 | * Sorted module list | ||
3518 | */ | ||
3519 | if (iter_mod < iter->module) | ||
3520 | continue; | ||
3521 | else if (iter_mod > iter->module) | ||
3522 | iter->tracepoint = NULL; | ||
3523 | found = tracepoint_get_iter_range(&iter->tracepoint, | ||
3524 | iter_mod->tracepoints_ptrs, | ||
3525 | iter_mod->tracepoints_ptrs | ||
3526 | + iter_mod->num_tracepoints); | ||
3527 | if (found) { | ||
3528 | iter->module = iter_mod; | ||
3529 | break; | ||
3530 | } | ||
3531 | } | ||
3532 | } | ||
3533 | mutex_unlock(&module_mutex); | ||
3534 | return found; | ||
3535 | } | ||
3536 | #endif | ||
diff --git a/kernel/params.c b/kernel/params.c index 22df3e0d142a..821788947e40 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -67,20 +67,27 @@ static void maybe_kfree_parameter(void *param) | |||
67 | } | 67 | } |
68 | } | 68 | } |
69 | 69 | ||
70 | static inline char dash2underscore(char c) | 70 | static char dash2underscore(char c) |
71 | { | 71 | { |
72 | if (c == '-') | 72 | if (c == '-') |
73 | return '_'; | 73 | return '_'; |
74 | return c; | 74 | return c; |
75 | } | 75 | } |
76 | 76 | ||
77 | static inline int parameq(const char *input, const char *paramname) | 77 | bool parameqn(const char *a, const char *b, size_t n) |
78 | { | 78 | { |
79 | unsigned int i; | 79 | size_t i; |
80 | for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) | 80 | |
81 | if (input[i] == '\0') | 81 | for (i = 0; i < n; i++) { |
82 | return 1; | 82 | if (dash2underscore(a[i]) != dash2underscore(b[i])) |
83 | return 0; | 83 | return false; |
84 | } | ||
85 | return true; | ||
86 | } | ||
87 | |||
88 | bool parameq(const char *a, const char *b) | ||
89 | { | ||
90 | return parameqn(a, b, strlen(a)+1); | ||
84 | } | 91 | } |
85 | 92 | ||
86 | static int parse_one(char *param, | 93 | static int parse_one(char *param, |
diff --git a/kernel/pid.c b/kernel/pid.c index e432057f3b21..8cafe7e72ad2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -418,7 +418,9 @@ EXPORT_SYMBOL(pid_task); | |||
418 | */ | 418 | */ |
419 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | 419 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) |
420 | { | 420 | { |
421 | rcu_lockdep_assert(rcu_read_lock_held()); | 421 | rcu_lockdep_assert(rcu_read_lock_held(), |
422 | "find_task_by_pid_ns() needs rcu_read_lock()" | ||
423 | " protection"); | ||
422 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); | 424 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); |
423 | } | 425 | } |
424 | 426 | ||
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 58f405b581e7..e7cb76dc18f5 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
250 | do { | 250 | do { |
251 | times->utime = cputime_add(times->utime, t->utime); | 251 | times->utime = cputime_add(times->utime, t->utime); |
252 | times->stime = cputime_add(times->stime, t->stime); | 252 | times->stime = cputime_add(times->stime, t->stime); |
253 | times->sum_exec_runtime += t->se.sum_exec_runtime; | 253 | times->sum_exec_runtime += task_sched_runtime(t); |
254 | } while_each_thread(tsk, t); | 254 | } while_each_thread(tsk, t); |
255 | out: | 255 | out: |
256 | rcu_read_unlock(); | 256 | rcu_read_unlock(); |
@@ -274,9 +274,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | |||
274 | struct task_cputime sum; | 274 | struct task_cputime sum; |
275 | unsigned long flags; | 275 | unsigned long flags; |
276 | 276 | ||
277 | spin_lock_irqsave(&cputimer->lock, flags); | ||
278 | if (!cputimer->running) { | 277 | if (!cputimer->running) { |
279 | cputimer->running = 1; | ||
280 | /* | 278 | /* |
281 | * The POSIX timer interface allows for absolute time expiry | 279 | * The POSIX timer interface allows for absolute time expiry |
282 | * values through the TIMER_ABSTIME flag, therefore we have | 280 | * values through the TIMER_ABSTIME flag, therefore we have |
@@ -284,10 +282,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | |||
284 | * it. | 282 | * it. |
285 | */ | 283 | */ |
286 | thread_group_cputime(tsk, &sum); | 284 | thread_group_cputime(tsk, &sum); |
285 | raw_spin_lock_irqsave(&cputimer->lock, flags); | ||
286 | cputimer->running = 1; | ||
287 | update_gt_cputime(&cputimer->cputime, &sum); | 287 | update_gt_cputime(&cputimer->cputime, &sum); |
288 | } | 288 | } else |
289 | raw_spin_lock_irqsave(&cputimer->lock, flags); | ||
289 | *times = cputimer->cputime; | 290 | *times = cputimer->cputime; |
290 | spin_unlock_irqrestore(&cputimer->lock, flags); | 291 | raw_spin_unlock_irqrestore(&cputimer->lock, flags); |
291 | } | 292 | } |
292 | 293 | ||
293 | /* | 294 | /* |
@@ -312,7 +313,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
312 | cpu->cpu = cputime.utime; | 313 | cpu->cpu = cputime.utime; |
313 | break; | 314 | break; |
314 | case CPUCLOCK_SCHED: | 315 | case CPUCLOCK_SCHED: |
315 | cpu->sched = thread_group_sched_runtime(p); | 316 | thread_group_cputime(p, &cputime); |
317 | cpu->sched = cputime.sum_exec_runtime; | ||
316 | break; | 318 | break; |
317 | } | 319 | } |
318 | return 0; | 320 | return 0; |
@@ -997,9 +999,9 @@ static void stop_process_timers(struct signal_struct *sig) | |||
997 | struct thread_group_cputimer *cputimer = &sig->cputimer; | 999 | struct thread_group_cputimer *cputimer = &sig->cputimer; |
998 | unsigned long flags; | 1000 | unsigned long flags; |
999 | 1001 | ||
1000 | spin_lock_irqsave(&cputimer->lock, flags); | 1002 | raw_spin_lock_irqsave(&cputimer->lock, flags); |
1001 | cputimer->running = 0; | 1003 | cputimer->running = 0; |
1002 | spin_unlock_irqrestore(&cputimer->lock, flags); | 1004 | raw_spin_unlock_irqrestore(&cputimer->lock, flags); |
1003 | } | 1005 | } |
1004 | 1006 | ||
1005 | static u32 onecputick; | 1007 | static u32 onecputick; |
@@ -1289,9 +1291,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk) | |||
1289 | if (sig->cputimer.running) { | 1291 | if (sig->cputimer.running) { |
1290 | struct task_cputime group_sample; | 1292 | struct task_cputime group_sample; |
1291 | 1293 | ||
1292 | spin_lock(&sig->cputimer.lock); | 1294 | raw_spin_lock(&sig->cputimer.lock); |
1293 | group_sample = sig->cputimer.cputime; | 1295 | group_sample = sig->cputimer.cputime; |
1294 | spin_unlock(&sig->cputimer.lock); | 1296 | raw_spin_unlock(&sig->cputimer.lock); |
1295 | 1297 | ||
1296 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) | 1298 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) |
1297 | return 1; | 1299 | return 1; |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 80a85971cf64..deb5461e3216 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -27,6 +27,7 @@ config HIBERNATION | |||
27 | select HIBERNATE_CALLBACKS | 27 | select HIBERNATE_CALLBACKS |
28 | select LZO_COMPRESS | 28 | select LZO_COMPRESS |
29 | select LZO_DECOMPRESS | 29 | select LZO_DECOMPRESS |
30 | select CRC32 | ||
30 | ---help--- | 31 | ---help--- |
31 | Enable the suspend to disk (STD) functionality, which is usually | 32 | Enable the suspend to disk (STD) functionality, which is usually |
32 | called "hibernation" in user interfaces. STD checkpoints the | 33 | called "hibernation" in user interfaces. STD checkpoints the |
@@ -65,6 +66,9 @@ config HIBERNATION | |||
65 | 66 | ||
66 | For more information take a look at <file:Documentation/power/swsusp.txt>. | 67 | For more information take a look at <file:Documentation/power/swsusp.txt>. |
67 | 68 | ||
69 | config ARCH_SAVE_PAGE_KEYS | ||
70 | bool | ||
71 | |||
68 | config PM_STD_PARTITION | 72 | config PM_STD_PARTITION |
69 | string "Default resume partition" | 73 | string "Default resume partition" |
70 | depends on HIBERNATION | 74 | depends on HIBERNATION |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c5ebc6a90643..07e0e28ffba7 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -1,8 +1,8 @@ | |||
1 | 1 | ||
2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG | 2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG |
3 | 3 | ||
4 | obj-$(CONFIG_PM) += main.o | 4 | obj-$(CONFIG_PM) += main.o qos.o |
5 | obj-$(CONFIG_PM_SLEEP) += console.o | 5 | obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o |
6 | obj-$(CONFIG_FREEZER) += process.o | 6 | obj-$(CONFIG_FREEZER) += process.o |
7 | obj-$(CONFIG_SUSPEND) += suspend.o | 7 | obj-$(CONFIG_SUSPEND) += suspend.o |
8 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 8 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
diff --git a/kernel/power/console.c b/kernel/power/console.c index 218e5af90156..b1dc456474b5 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * drivers/power/process.c - Functions for saving/restoring console. | 2 | * Functions for saving/restoring console. |
3 | * | 3 | * |
4 | * Originally from swsusp. | 4 | * Originally from swsusp. |
5 | */ | 5 | */ |
@@ -10,7 +10,6 @@ | |||
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include "power.h" | 11 | #include "power.h" |
12 | 12 | ||
13 | #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) | ||
14 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | 13 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) |
15 | 14 | ||
16 | static int orig_fgconsole, orig_kmsg; | 15 | static int orig_fgconsole, orig_kmsg; |
@@ -32,4 +31,3 @@ void pm_restore_console(void) | |||
32 | vt_kmsg_redirect(orig_kmsg); | 31 | vt_kmsg_redirect(orig_kmsg); |
33 | } | 32 | } |
34 | } | 33 | } |
35 | #endif | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8f7b1db1ece1..1c53f7fad5f7 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/reboot.h> | 14 | #include <linux/reboot.h> |
15 | #include <linux/string.h> | 15 | #include <linux/string.h> |
16 | #include <linux/device.h> | 16 | #include <linux/device.h> |
17 | #include <linux/async.h> | ||
17 | #include <linux/kmod.h> | 18 | #include <linux/kmod.h> |
18 | #include <linux/delay.h> | 19 | #include <linux/delay.h> |
19 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
@@ -29,12 +30,14 @@ | |||
29 | #include "power.h" | 30 | #include "power.h" |
30 | 31 | ||
31 | 32 | ||
32 | static int nocompress = 0; | 33 | static int nocompress; |
33 | static int noresume = 0; | 34 | static int noresume; |
35 | static int resume_wait; | ||
36 | static int resume_delay; | ||
34 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 37 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
35 | dev_t swsusp_resume_device; | 38 | dev_t swsusp_resume_device; |
36 | sector_t swsusp_resume_block; | 39 | sector_t swsusp_resume_block; |
37 | int in_suspend __nosavedata = 0; | 40 | int in_suspend __nosavedata; |
38 | 41 | ||
39 | enum { | 42 | enum { |
40 | HIBERNATION_INVALID, | 43 | HIBERNATION_INVALID, |
@@ -334,13 +337,17 @@ int hibernation_snapshot(int platform_mode) | |||
334 | if (error) | 337 | if (error) |
335 | goto Close; | 338 | goto Close; |
336 | 339 | ||
337 | error = dpm_prepare(PMSG_FREEZE); | ||
338 | if (error) | ||
339 | goto Complete_devices; | ||
340 | |||
341 | /* Preallocate image memory before shutting down devices. */ | 340 | /* Preallocate image memory before shutting down devices. */ |
342 | error = hibernate_preallocate_memory(); | 341 | error = hibernate_preallocate_memory(); |
343 | if (error) | 342 | if (error) |
343 | goto Close; | ||
344 | |||
345 | error = freeze_kernel_threads(); | ||
346 | if (error) | ||
347 | goto Close; | ||
348 | |||
349 | error = dpm_prepare(PMSG_FREEZE); | ||
350 | if (error) | ||
344 | goto Complete_devices; | 351 | goto Complete_devices; |
345 | 352 | ||
346 | suspend_console(); | 353 | suspend_console(); |
@@ -463,7 +470,7 @@ static int resume_target_kernel(bool platform_mode) | |||
463 | * @platform_mode: If set, use platform driver to prepare for the transition. | 470 | * @platform_mode: If set, use platform driver to prepare for the transition. |
464 | * | 471 | * |
465 | * This routine must be called with pm_mutex held. If it is successful, control | 472 | * This routine must be called with pm_mutex held. If it is successful, control |
466 | * reappears in the restored target kernel in hibernation_snaphot(). | 473 | * reappears in the restored target kernel in hibernation_snapshot(). |
467 | */ | 474 | */ |
468 | int hibernation_restore(int platform_mode) | 475 | int hibernation_restore(int platform_mode) |
469 | { | 476 | { |
@@ -650,6 +657,9 @@ int hibernate(void) | |||
650 | flags |= SF_PLATFORM_MODE; | 657 | flags |= SF_PLATFORM_MODE; |
651 | if (nocompress) | 658 | if (nocompress) |
652 | flags |= SF_NOCOMPRESS_MODE; | 659 | flags |= SF_NOCOMPRESS_MODE; |
660 | else | ||
661 | flags |= SF_CRC32_MODE; | ||
662 | |||
653 | pr_debug("PM: writing image.\n"); | 663 | pr_debug("PM: writing image.\n"); |
654 | error = swsusp_write(flags); | 664 | error = swsusp_write(flags); |
655 | swsusp_free(); | 665 | swsusp_free(); |
@@ -724,6 +734,12 @@ static int software_resume(void) | |||
724 | 734 | ||
725 | pr_debug("PM: Checking hibernation image partition %s\n", resume_file); | 735 | pr_debug("PM: Checking hibernation image partition %s\n", resume_file); |
726 | 736 | ||
737 | if (resume_delay) { | ||
738 | printk(KERN_INFO "Waiting %dsec before reading resume device...\n", | ||
739 | resume_delay); | ||
740 | ssleep(resume_delay); | ||
741 | } | ||
742 | |||
727 | /* Check if the device is there */ | 743 | /* Check if the device is there */ |
728 | swsusp_resume_device = name_to_dev_t(resume_file); | 744 | swsusp_resume_device = name_to_dev_t(resume_file); |
729 | if (!swsusp_resume_device) { | 745 | if (!swsusp_resume_device) { |
@@ -732,6 +748,13 @@ static int software_resume(void) | |||
732 | * to wait for this to finish. | 748 | * to wait for this to finish. |
733 | */ | 749 | */ |
734 | wait_for_device_probe(); | 750 | wait_for_device_probe(); |
751 | |||
752 | if (resume_wait) { | ||
753 | while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) | ||
754 | msleep(10); | ||
755 | async_synchronize_full(); | ||
756 | } | ||
757 | |||
735 | /* | 758 | /* |
736 | * We can't depend on SCSI devices being available after loading | 759 | * We can't depend on SCSI devices being available after loading |
737 | * one of their modules until scsi_complete_async_scans() is | 760 | * one of their modules until scsi_complete_async_scans() is |
@@ -1060,7 +1083,21 @@ static int __init noresume_setup(char *str) | |||
1060 | return 1; | 1083 | return 1; |
1061 | } | 1084 | } |
1062 | 1085 | ||
1086 | static int __init resumewait_setup(char *str) | ||
1087 | { | ||
1088 | resume_wait = 1; | ||
1089 | return 1; | ||
1090 | } | ||
1091 | |||
1092 | static int __init resumedelay_setup(char *str) | ||
1093 | { | ||
1094 | resume_delay = simple_strtoul(str, NULL, 0); | ||
1095 | return 1; | ||
1096 | } | ||
1097 | |||
1063 | __setup("noresume", noresume_setup); | 1098 | __setup("noresume", noresume_setup); |
1064 | __setup("resume_offset=", resume_offset_setup); | 1099 | __setup("resume_offset=", resume_offset_setup); |
1065 | __setup("resume=", resume_setup); | 1100 | __setup("resume=", resume_setup); |
1066 | __setup("hibernate=", hibernate_setup); | 1101 | __setup("hibernate=", hibernate_setup); |
1102 | __setup("resumewait", resumewait_setup); | ||
1103 | __setup("resumedelay=", resumedelay_setup); | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index 6c601f871964..a52e88425a31 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -12,6 +12,8 @@ | |||
12 | #include <linux/string.h> | 12 | #include <linux/string.h> |
13 | #include <linux/resume-trace.h> | 13 | #include <linux/resume-trace.h> |
14 | #include <linux/workqueue.h> | 14 | #include <linux/workqueue.h> |
15 | #include <linux/debugfs.h> | ||
16 | #include <linux/seq_file.h> | ||
15 | 17 | ||
16 | #include "power.h" | 18 | #include "power.h" |
17 | 19 | ||
@@ -131,6 +133,101 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
131 | power_attr(pm_test); | 133 | power_attr(pm_test); |
132 | #endif /* CONFIG_PM_DEBUG */ | 134 | #endif /* CONFIG_PM_DEBUG */ |
133 | 135 | ||
136 | #ifdef CONFIG_DEBUG_FS | ||
137 | static char *suspend_step_name(enum suspend_stat_step step) | ||
138 | { | ||
139 | switch (step) { | ||
140 | case SUSPEND_FREEZE: | ||
141 | return "freeze"; | ||
142 | case SUSPEND_PREPARE: | ||
143 | return "prepare"; | ||
144 | case SUSPEND_SUSPEND: | ||
145 | return "suspend"; | ||
146 | case SUSPEND_SUSPEND_NOIRQ: | ||
147 | return "suspend_noirq"; | ||
148 | case SUSPEND_RESUME_NOIRQ: | ||
149 | return "resume_noirq"; | ||
150 | case SUSPEND_RESUME: | ||
151 | return "resume"; | ||
152 | default: | ||
153 | return ""; | ||
154 | } | ||
155 | } | ||
156 | |||
157 | static int suspend_stats_show(struct seq_file *s, void *unused) | ||
158 | { | ||
159 | int i, index, last_dev, last_errno, last_step; | ||
160 | |||
161 | last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; | ||
162 | last_dev %= REC_FAILED_NUM; | ||
163 | last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; | ||
164 | last_errno %= REC_FAILED_NUM; | ||
165 | last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; | ||
166 | last_step %= REC_FAILED_NUM; | ||
167 | seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" | ||
168 | "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", | ||
169 | "success", suspend_stats.success, | ||
170 | "fail", suspend_stats.fail, | ||
171 | "failed_freeze", suspend_stats.failed_freeze, | ||
172 | "failed_prepare", suspend_stats.failed_prepare, | ||
173 | "failed_suspend", suspend_stats.failed_suspend, | ||
174 | "failed_suspend_noirq", | ||
175 | suspend_stats.failed_suspend_noirq, | ||
176 | "failed_resume", suspend_stats.failed_resume, | ||
177 | "failed_resume_noirq", | ||
178 | suspend_stats.failed_resume_noirq); | ||
179 | seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", | ||
180 | suspend_stats.failed_devs[last_dev]); | ||
181 | for (i = 1; i < REC_FAILED_NUM; i++) { | ||
182 | index = last_dev + REC_FAILED_NUM - i; | ||
183 | index %= REC_FAILED_NUM; | ||
184 | seq_printf(s, "\t\t\t%-s\n", | ||
185 | suspend_stats.failed_devs[index]); | ||
186 | } | ||
187 | seq_printf(s, " last_failed_errno:\t%-d\n", | ||
188 | suspend_stats.errno[last_errno]); | ||
189 | for (i = 1; i < REC_FAILED_NUM; i++) { | ||
190 | index = last_errno + REC_FAILED_NUM - i; | ||
191 | index %= REC_FAILED_NUM; | ||
192 | seq_printf(s, "\t\t\t%-d\n", | ||
193 | suspend_stats.errno[index]); | ||
194 | } | ||
195 | seq_printf(s, " last_failed_step:\t%-s\n", | ||
196 | suspend_step_name( | ||
197 | suspend_stats.failed_steps[last_step])); | ||
198 | for (i = 1; i < REC_FAILED_NUM; i++) { | ||
199 | index = last_step + REC_FAILED_NUM - i; | ||
200 | index %= REC_FAILED_NUM; | ||
201 | seq_printf(s, "\t\t\t%-s\n", | ||
202 | suspend_step_name( | ||
203 | suspend_stats.failed_steps[index])); | ||
204 | } | ||
205 | |||
206 | return 0; | ||
207 | } | ||
208 | |||
209 | static int suspend_stats_open(struct inode *inode, struct file *file) | ||
210 | { | ||
211 | return single_open(file, suspend_stats_show, NULL); | ||
212 | } | ||
213 | |||
214 | static const struct file_operations suspend_stats_operations = { | ||
215 | .open = suspend_stats_open, | ||
216 | .read = seq_read, | ||
217 | .llseek = seq_lseek, | ||
218 | .release = single_release, | ||
219 | }; | ||
220 | |||
221 | static int __init pm_debugfs_init(void) | ||
222 | { | ||
223 | debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, | ||
224 | NULL, NULL, &suspend_stats_operations); | ||
225 | return 0; | ||
226 | } | ||
227 | |||
228 | late_initcall(pm_debugfs_init); | ||
229 | #endif /* CONFIG_DEBUG_FS */ | ||
230 | |||
134 | #endif /* CONFIG_PM_SLEEP */ | 231 | #endif /* CONFIG_PM_SLEEP */ |
135 | 232 | ||
136 | struct kobject *power_kobj; | 233 | struct kobject *power_kobj; |
@@ -194,6 +291,11 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
194 | } | 291 | } |
195 | if (state < PM_SUSPEND_MAX && *s) | 292 | if (state < PM_SUSPEND_MAX && *s) |
196 | error = enter_state(state); | 293 | error = enter_state(state); |
294 | if (error) { | ||
295 | suspend_stats.fail++; | ||
296 | dpm_save_failed_errno(error); | ||
297 | } else | ||
298 | suspend_stats.success++; | ||
197 | #endif | 299 | #endif |
198 | 300 | ||
199 | Exit: | 301 | Exit: |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 9a00a0a26280..23a2db1ec442 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -146,6 +146,7 @@ extern int swsusp_swap_in_use(void); | |||
146 | */ | 146 | */ |
147 | #define SF_PLATFORM_MODE 1 | 147 | #define SF_PLATFORM_MODE 1 |
148 | #define SF_NOCOMPRESS_MODE 2 | 148 | #define SF_NOCOMPRESS_MODE 2 |
149 | #define SF_CRC32_MODE 4 | ||
149 | 150 | ||
150 | /* kernel/power/hibernate.c */ | 151 | /* kernel/power/hibernate.c */ |
151 | extern int swsusp_check(void); | 152 | extern int swsusp_check(void); |
@@ -228,7 +229,8 @@ extern int pm_test_level; | |||
228 | #ifdef CONFIG_SUSPEND_FREEZER | 229 | #ifdef CONFIG_SUSPEND_FREEZER |
229 | static inline int suspend_freeze_processes(void) | 230 | static inline int suspend_freeze_processes(void) |
230 | { | 231 | { |
231 | return freeze_processes(); | 232 | int error = freeze_processes(); |
233 | return error ? : freeze_kernel_threads(); | ||
232 | } | 234 | } |
233 | 235 | ||
234 | static inline void suspend_thaw_processes(void) | 236 | static inline void suspend_thaw_processes(void) |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 0cf3a27a6c9d..addbbe5531bc 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -135,7 +135,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
135 | } | 135 | } |
136 | 136 | ||
137 | /** | 137 | /** |
138 | * freeze_processes - tell processes to enter the refrigerator | 138 | * freeze_processes - Signal user space processes to enter the refrigerator. |
139 | */ | 139 | */ |
140 | int freeze_processes(void) | 140 | int freeze_processes(void) |
141 | { | 141 | { |
@@ -143,20 +143,30 @@ int freeze_processes(void) | |||
143 | 143 | ||
144 | printk("Freezing user space processes ... "); | 144 | printk("Freezing user space processes ... "); |
145 | error = try_to_freeze_tasks(true); | 145 | error = try_to_freeze_tasks(true); |
146 | if (error) | 146 | if (!error) { |
147 | goto Exit; | 147 | printk("done."); |
148 | printk("done.\n"); | 148 | oom_killer_disable(); |
149 | } | ||
150 | printk("\n"); | ||
151 | BUG_ON(in_atomic()); | ||
152 | |||
153 | return error; | ||
154 | } | ||
155 | |||
156 | /** | ||
157 | * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. | ||
158 | */ | ||
159 | int freeze_kernel_threads(void) | ||
160 | { | ||
161 | int error; | ||
149 | 162 | ||
150 | printk("Freezing remaining freezable tasks ... "); | 163 | printk("Freezing remaining freezable tasks ... "); |
151 | error = try_to_freeze_tasks(false); | 164 | error = try_to_freeze_tasks(false); |
152 | if (error) | 165 | if (!error) |
153 | goto Exit; | 166 | printk("done."); |
154 | printk("done."); | ||
155 | 167 | ||
156 | oom_killer_disable(); | ||
157 | Exit: | ||
158 | BUG_ON(in_atomic()); | ||
159 | printk("\n"); | 168 | printk("\n"); |
169 | BUG_ON(in_atomic()); | ||
160 | 170 | ||
161 | return error; | 171 | return error; |
162 | } | 172 | } |
diff --git a/kernel/pm_qos_params.c b/kernel/power/qos.c index 37f05d0f0793..1c1797dd1d1d 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/power/qos.c | |||
@@ -29,7 +29,7 @@ | |||
29 | 29 | ||
30 | /*#define DEBUG*/ | 30 | /*#define DEBUG*/ |
31 | 31 | ||
32 | #include <linux/pm_qos_params.h> | 32 | #include <linux/pm_qos.h> |
33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
34 | #include <linux/spinlock.h> | 34 | #include <linux/spinlock.h> |
35 | #include <linux/slab.h> | 35 | #include <linux/slab.h> |
@@ -45,62 +45,57 @@ | |||
45 | #include <linux/uaccess.h> | 45 | #include <linux/uaccess.h> |
46 | 46 | ||
47 | /* | 47 | /* |
48 | * locking rule: all changes to requests or notifiers lists | 48 | * locking rule: all changes to constraints or notifiers lists |
49 | * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock | 49 | * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock |
50 | * held, taken with _irqsave. One lock to rule them all | 50 | * held, taken with _irqsave. One lock to rule them all |
51 | */ | 51 | */ |
52 | enum pm_qos_type { | ||
53 | PM_QOS_MAX, /* return the largest value */ | ||
54 | PM_QOS_MIN /* return the smallest value */ | ||
55 | }; | ||
56 | |||
57 | /* | ||
58 | * Note: The lockless read path depends on the CPU accessing | ||
59 | * target_value atomically. Atomic access is only guaranteed on all CPU | ||
60 | * types linux supports for 32 bit quantites | ||
61 | */ | ||
62 | struct pm_qos_object { | 52 | struct pm_qos_object { |
63 | struct plist_head requests; | 53 | struct pm_qos_constraints *constraints; |
64 | struct blocking_notifier_head *notifiers; | ||
65 | struct miscdevice pm_qos_power_miscdev; | 54 | struct miscdevice pm_qos_power_miscdev; |
66 | char *name; | 55 | char *name; |
67 | s32 target_value; /* Do not change to 64 bit */ | ||
68 | s32 default_value; | ||
69 | enum pm_qos_type type; | ||
70 | }; | 56 | }; |
71 | 57 | ||
72 | static DEFINE_SPINLOCK(pm_qos_lock); | 58 | static DEFINE_SPINLOCK(pm_qos_lock); |
73 | 59 | ||
74 | static struct pm_qos_object null_pm_qos; | 60 | static struct pm_qos_object null_pm_qos; |
61 | |||
75 | static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); | 62 | static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); |
76 | static struct pm_qos_object cpu_dma_pm_qos = { | 63 | static struct pm_qos_constraints cpu_dma_constraints = { |
77 | .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), | 64 | .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), |
78 | .notifiers = &cpu_dma_lat_notifier, | ||
79 | .name = "cpu_dma_latency", | ||
80 | .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, | 65 | .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, |
81 | .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, | 66 | .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, |
82 | .type = PM_QOS_MIN, | 67 | .type = PM_QOS_MIN, |
68 | .notifiers = &cpu_dma_lat_notifier, | ||
69 | }; | ||
70 | static struct pm_qos_object cpu_dma_pm_qos = { | ||
71 | .constraints = &cpu_dma_constraints, | ||
83 | }; | 72 | }; |
84 | 73 | ||
85 | static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); | 74 | static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); |
86 | static struct pm_qos_object network_lat_pm_qos = { | 75 | static struct pm_qos_constraints network_lat_constraints = { |
87 | .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), | 76 | .list = PLIST_HEAD_INIT(network_lat_constraints.list), |
88 | .notifiers = &network_lat_notifier, | ||
89 | .name = "network_latency", | ||
90 | .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, | 77 | .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, |
91 | .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, | 78 | .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, |
92 | .type = PM_QOS_MIN | 79 | .type = PM_QOS_MIN, |
80 | .notifiers = &network_lat_notifier, | ||
81 | }; | ||
82 | static struct pm_qos_object network_lat_pm_qos = { | ||
83 | .constraints = &network_lat_constraints, | ||
84 | .name = "network_latency", | ||
93 | }; | 85 | }; |
94 | 86 | ||
95 | 87 | ||
96 | static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); | 88 | static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); |
97 | static struct pm_qos_object network_throughput_pm_qos = { | 89 | static struct pm_qos_constraints network_tput_constraints = { |
98 | .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), | 90 | .list = PLIST_HEAD_INIT(network_tput_constraints.list), |
99 | .notifiers = &network_throughput_notifier, | ||
100 | .name = "network_throughput", | ||
101 | .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, | 91 | .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, |
102 | .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, | 92 | .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, |
103 | .type = PM_QOS_MAX, | 93 | .type = PM_QOS_MAX, |
94 | .notifiers = &network_throughput_notifier, | ||
95 | }; | ||
96 | static struct pm_qos_object network_throughput_pm_qos = { | ||
97 | .constraints = &network_tput_constraints, | ||
98 | .name = "network_throughput", | ||
104 | }; | 99 | }; |
105 | 100 | ||
106 | 101 | ||
@@ -127,17 +122,17 @@ static const struct file_operations pm_qos_power_fops = { | |||
127 | }; | 122 | }; |
128 | 123 | ||
129 | /* unlocked internal variant */ | 124 | /* unlocked internal variant */ |
130 | static inline int pm_qos_get_value(struct pm_qos_object *o) | 125 | static inline int pm_qos_get_value(struct pm_qos_constraints *c) |
131 | { | 126 | { |
132 | if (plist_head_empty(&o->requests)) | 127 | if (plist_head_empty(&c->list)) |
133 | return o->default_value; | 128 | return c->default_value; |
134 | 129 | ||
135 | switch (o->type) { | 130 | switch (c->type) { |
136 | case PM_QOS_MIN: | 131 | case PM_QOS_MIN: |
137 | return plist_first(&o->requests)->prio; | 132 | return plist_first(&c->list)->prio; |
138 | 133 | ||
139 | case PM_QOS_MAX: | 134 | case PM_QOS_MAX: |
140 | return plist_last(&o->requests)->prio; | 135 | return plist_last(&c->list)->prio; |
141 | 136 | ||
142 | default: | 137 | default: |
143 | /* runtime check for not using enum */ | 138 | /* runtime check for not using enum */ |
@@ -145,69 +140,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) | |||
145 | } | 140 | } |
146 | } | 141 | } |
147 | 142 | ||
148 | static inline s32 pm_qos_read_value(struct pm_qos_object *o) | 143 | s32 pm_qos_read_value(struct pm_qos_constraints *c) |
149 | { | 144 | { |
150 | return o->target_value; | 145 | return c->target_value; |
151 | } | 146 | } |
152 | 147 | ||
153 | static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) | 148 | static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) |
154 | { | 149 | { |
155 | o->target_value = value; | 150 | c->target_value = value; |
156 | } | 151 | } |
157 | 152 | ||
158 | static void update_target(struct pm_qos_object *o, struct plist_node *node, | 153 | /** |
159 | int del, int value) | 154 | * pm_qos_update_target - manages the constraints list and calls the notifiers |
155 | * if needed | ||
156 | * @c: constraints data struct | ||
157 | * @node: request to add to the list, to update or to remove | ||
158 | * @action: action to take on the constraints list | ||
159 | * @value: value of the request to add or update | ||
160 | * | ||
161 | * This function returns 1 if the aggregated constraint value has changed, 0 | ||
162 | * otherwise. | ||
163 | */ | ||
164 | int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, | ||
165 | enum pm_qos_req_action action, int value) | ||
160 | { | 166 | { |
161 | unsigned long flags; | 167 | unsigned long flags; |
162 | int prev_value, curr_value; | 168 | int prev_value, curr_value, new_value; |
163 | 169 | ||
164 | spin_lock_irqsave(&pm_qos_lock, flags); | 170 | spin_lock_irqsave(&pm_qos_lock, flags); |
165 | prev_value = pm_qos_get_value(o); | 171 | prev_value = pm_qos_get_value(c); |
166 | /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ | 172 | if (value == PM_QOS_DEFAULT_VALUE) |
167 | if (value != PM_QOS_DEFAULT_VALUE) { | 173 | new_value = c->default_value; |
174 | else | ||
175 | new_value = value; | ||
176 | |||
177 | switch (action) { | ||
178 | case PM_QOS_REMOVE_REQ: | ||
179 | plist_del(node, &c->list); | ||
180 | break; | ||
181 | case PM_QOS_UPDATE_REQ: | ||
168 | /* | 182 | /* |
169 | * to change the list, we atomically remove, reinit | 183 | * to change the list, we atomically remove, reinit |
170 | * with new value and add, then see if the extremal | 184 | * with new value and add, then see if the extremal |
171 | * changed | 185 | * changed |
172 | */ | 186 | */ |
173 | plist_del(node, &o->requests); | 187 | plist_del(node, &c->list); |
174 | plist_node_init(node, value); | 188 | case PM_QOS_ADD_REQ: |
175 | plist_add(node, &o->requests); | 189 | plist_node_init(node, new_value); |
176 | } else if (del) { | 190 | plist_add(node, &c->list); |
177 | plist_del(node, &o->requests); | 191 | break; |
178 | } else { | 192 | default: |
179 | plist_add(node, &o->requests); | 193 | /* no action */ |
194 | ; | ||
180 | } | 195 | } |
181 | curr_value = pm_qos_get_value(o); | 196 | |
182 | pm_qos_set_value(o, curr_value); | 197 | curr_value = pm_qos_get_value(c); |
198 | pm_qos_set_value(c, curr_value); | ||
199 | |||
183 | spin_unlock_irqrestore(&pm_qos_lock, flags); | 200 | spin_unlock_irqrestore(&pm_qos_lock, flags); |
184 | 201 | ||
185 | if (prev_value != curr_value) | 202 | if (prev_value != curr_value) { |
186 | blocking_notifier_call_chain(o->notifiers, | 203 | blocking_notifier_call_chain(c->notifiers, |
187 | (unsigned long)curr_value, | 204 | (unsigned long)curr_value, |
188 | NULL); | 205 | NULL); |
189 | } | 206 | return 1; |
190 | 207 | } else { | |
191 | static int register_pm_qos_misc(struct pm_qos_object *qos) | 208 | return 0; |
192 | { | ||
193 | qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; | ||
194 | qos->pm_qos_power_miscdev.name = qos->name; | ||
195 | qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; | ||
196 | |||
197 | return misc_register(&qos->pm_qos_power_miscdev); | ||
198 | } | ||
199 | |||
200 | static int find_pm_qos_object_by_minor(int minor) | ||
201 | { | ||
202 | int pm_qos_class; | ||
203 | |||
204 | for (pm_qos_class = 0; | ||
205 | pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { | ||
206 | if (minor == | ||
207 | pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) | ||
208 | return pm_qos_class; | ||
209 | } | 209 | } |
210 | return -1; | ||
211 | } | 210 | } |
212 | 211 | ||
213 | /** | 212 | /** |
@@ -218,11 +217,11 @@ static int find_pm_qos_object_by_minor(int minor) | |||
218 | */ | 217 | */ |
219 | int pm_qos_request(int pm_qos_class) | 218 | int pm_qos_request(int pm_qos_class) |
220 | { | 219 | { |
221 | return pm_qos_read_value(pm_qos_array[pm_qos_class]); | 220 | return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints); |
222 | } | 221 | } |
223 | EXPORT_SYMBOL_GPL(pm_qos_request); | 222 | EXPORT_SYMBOL_GPL(pm_qos_request); |
224 | 223 | ||
225 | int pm_qos_request_active(struct pm_qos_request_list *req) | 224 | int pm_qos_request_active(struct pm_qos_request *req) |
226 | { | 225 | { |
227 | return req->pm_qos_class != 0; | 226 | return req->pm_qos_class != 0; |
228 | } | 227 | } |
@@ -230,40 +229,36 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active); | |||
230 | 229 | ||
231 | /** | 230 | /** |
232 | * pm_qos_add_request - inserts new qos request into the list | 231 | * pm_qos_add_request - inserts new qos request into the list |
233 | * @dep: pointer to a preallocated handle | 232 | * @req: pointer to a preallocated handle |
234 | * @pm_qos_class: identifies which list of qos request to use | 233 | * @pm_qos_class: identifies which list of qos request to use |
235 | * @value: defines the qos request | 234 | * @value: defines the qos request |
236 | * | 235 | * |
237 | * This function inserts a new entry in the pm_qos_class list of requested qos | 236 | * This function inserts a new entry in the pm_qos_class list of requested qos |
238 | * performance characteristics. It recomputes the aggregate QoS expectations | 237 | * performance characteristics. It recomputes the aggregate QoS expectations |
239 | * for the pm_qos_class of parameters and initializes the pm_qos_request_list | 238 | * for the pm_qos_class of parameters and initializes the pm_qos_request |
240 | * handle. Caller needs to save this handle for later use in updates and | 239 | * handle. Caller needs to save this handle for later use in updates and |
241 | * removal. | 240 | * removal. |
242 | */ | 241 | */ |
243 | 242 | ||
244 | void pm_qos_add_request(struct pm_qos_request_list *dep, | 243 | void pm_qos_add_request(struct pm_qos_request *req, |
245 | int pm_qos_class, s32 value) | 244 | int pm_qos_class, s32 value) |
246 | { | 245 | { |
247 | struct pm_qos_object *o = pm_qos_array[pm_qos_class]; | 246 | if (!req) /*guard against callers passing in null */ |
248 | int new_value; | 247 | return; |
249 | 248 | ||
250 | if (pm_qos_request_active(dep)) { | 249 | if (pm_qos_request_active(req)) { |
251 | WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); | 250 | WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); |
252 | return; | 251 | return; |
253 | } | 252 | } |
254 | if (value == PM_QOS_DEFAULT_VALUE) | 253 | req->pm_qos_class = pm_qos_class; |
255 | new_value = o->default_value; | 254 | pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, |
256 | else | 255 | &req->node, PM_QOS_ADD_REQ, value); |
257 | new_value = value; | ||
258 | plist_node_init(&dep->list, new_value); | ||
259 | dep->pm_qos_class = pm_qos_class; | ||
260 | update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); | ||
261 | } | 256 | } |
262 | EXPORT_SYMBOL_GPL(pm_qos_add_request); | 257 | EXPORT_SYMBOL_GPL(pm_qos_add_request); |
263 | 258 | ||
264 | /** | 259 | /** |
265 | * pm_qos_update_request - modifies an existing qos request | 260 | * pm_qos_update_request - modifies an existing qos request |
266 | * @pm_qos_req : handle to list element holding a pm_qos request to use | 261 | * @req : handle to list element holding a pm_qos request to use |
267 | * @value: defines the qos request | 262 | * @value: defines the qos request |
268 | * | 263 | * |
269 | * Updates an existing qos request for the pm_qos_class of parameters along | 264 | * Updates an existing qos request for the pm_qos_class of parameters along |
@@ -271,56 +266,47 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request); | |||
271 | * | 266 | * |
272 | * Attempts are made to make this code callable on hot code paths. | 267 | * Attempts are made to make this code callable on hot code paths. |
273 | */ | 268 | */ |
274 | void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, | 269 | void pm_qos_update_request(struct pm_qos_request *req, |
275 | s32 new_value) | 270 | s32 new_value) |
276 | { | 271 | { |
277 | s32 temp; | 272 | if (!req) /*guard against callers passing in null */ |
278 | struct pm_qos_object *o; | ||
279 | |||
280 | if (!pm_qos_req) /*guard against callers passing in null */ | ||
281 | return; | 273 | return; |
282 | 274 | ||
283 | if (!pm_qos_request_active(pm_qos_req)) { | 275 | if (!pm_qos_request_active(req)) { |
284 | WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); | 276 | WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); |
285 | return; | 277 | return; |
286 | } | 278 | } |
287 | 279 | ||
288 | o = pm_qos_array[pm_qos_req->pm_qos_class]; | 280 | if (new_value != req->node.prio) |
289 | 281 | pm_qos_update_target( | |
290 | if (new_value == PM_QOS_DEFAULT_VALUE) | 282 | pm_qos_array[req->pm_qos_class]->constraints, |
291 | temp = o->default_value; | 283 | &req->node, PM_QOS_UPDATE_REQ, new_value); |
292 | else | ||
293 | temp = new_value; | ||
294 | |||
295 | if (temp != pm_qos_req->list.prio) | ||
296 | update_target(o, &pm_qos_req->list, 0, temp); | ||
297 | } | 284 | } |
298 | EXPORT_SYMBOL_GPL(pm_qos_update_request); | 285 | EXPORT_SYMBOL_GPL(pm_qos_update_request); |
299 | 286 | ||
300 | /** | 287 | /** |
301 | * pm_qos_remove_request - modifies an existing qos request | 288 | * pm_qos_remove_request - modifies an existing qos request |
302 | * @pm_qos_req: handle to request list element | 289 | * @req: handle to request list element |
303 | * | 290 | * |
304 | * Will remove pm qos request from the list of requests and | 291 | * Will remove pm qos request from the list of constraints and |
305 | * recompute the current target value for the pm_qos_class. Call this | 292 | * recompute the current target value for the pm_qos_class. Call this |
306 | * on slow code paths. | 293 | * on slow code paths. |
307 | */ | 294 | */ |
308 | void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) | 295 | void pm_qos_remove_request(struct pm_qos_request *req) |
309 | { | 296 | { |
310 | struct pm_qos_object *o; | 297 | if (!req) /*guard against callers passing in null */ |
311 | |||
312 | if (pm_qos_req == NULL) | ||
313 | return; | 298 | return; |
314 | /* silent return to keep pcm code cleaner */ | 299 | /* silent return to keep pcm code cleaner */ |
315 | 300 | ||
316 | if (!pm_qos_request_active(pm_qos_req)) { | 301 | if (!pm_qos_request_active(req)) { |
317 | WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); | 302 | WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); |
318 | return; | 303 | return; |
319 | } | 304 | } |
320 | 305 | ||
321 | o = pm_qos_array[pm_qos_req->pm_qos_class]; | 306 | pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, |
322 | update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); | 307 | &req->node, PM_QOS_REMOVE_REQ, |
323 | memset(pm_qos_req, 0, sizeof(*pm_qos_req)); | 308 | PM_QOS_DEFAULT_VALUE); |
309 | memset(req, 0, sizeof(*req)); | ||
324 | } | 310 | } |
325 | EXPORT_SYMBOL_GPL(pm_qos_remove_request); | 311 | EXPORT_SYMBOL_GPL(pm_qos_remove_request); |
326 | 312 | ||
@@ -337,7 +323,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) | |||
337 | int retval; | 323 | int retval; |
338 | 324 | ||
339 | retval = blocking_notifier_chain_register( | 325 | retval = blocking_notifier_chain_register( |
340 | pm_qos_array[pm_qos_class]->notifiers, notifier); | 326 | pm_qos_array[pm_qos_class]->constraints->notifiers, |
327 | notifier); | ||
341 | 328 | ||
342 | return retval; | 329 | return retval; |
343 | } | 330 | } |
@@ -356,19 +343,43 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) | |||
356 | int retval; | 343 | int retval; |
357 | 344 | ||
358 | retval = blocking_notifier_chain_unregister( | 345 | retval = blocking_notifier_chain_unregister( |
359 | pm_qos_array[pm_qos_class]->notifiers, notifier); | 346 | pm_qos_array[pm_qos_class]->constraints->notifiers, |
347 | notifier); | ||
360 | 348 | ||
361 | return retval; | 349 | return retval; |
362 | } | 350 | } |
363 | EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); | 351 | EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); |
364 | 352 | ||
353 | /* User space interface to PM QoS classes via misc devices */ | ||
354 | static int register_pm_qos_misc(struct pm_qos_object *qos) | ||
355 | { | ||
356 | qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; | ||
357 | qos->pm_qos_power_miscdev.name = qos->name; | ||
358 | qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; | ||
359 | |||
360 | return misc_register(&qos->pm_qos_power_miscdev); | ||
361 | } | ||
362 | |||
363 | static int find_pm_qos_object_by_minor(int minor) | ||
364 | { | ||
365 | int pm_qos_class; | ||
366 | |||
367 | for (pm_qos_class = 0; | ||
368 | pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { | ||
369 | if (minor == | ||
370 | pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) | ||
371 | return pm_qos_class; | ||
372 | } | ||
373 | return -1; | ||
374 | } | ||
375 | |||
365 | static int pm_qos_power_open(struct inode *inode, struct file *filp) | 376 | static int pm_qos_power_open(struct inode *inode, struct file *filp) |
366 | { | 377 | { |
367 | long pm_qos_class; | 378 | long pm_qos_class; |
368 | 379 | ||
369 | pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); | 380 | pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); |
370 | if (pm_qos_class >= 0) { | 381 | if (pm_qos_class >= 0) { |
371 | struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); | 382 | struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); |
372 | if (!req) | 383 | if (!req) |
373 | return -ENOMEM; | 384 | return -ENOMEM; |
374 | 385 | ||
@@ -383,7 +394,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) | |||
383 | 394 | ||
384 | static int pm_qos_power_release(struct inode *inode, struct file *filp) | 395 | static int pm_qos_power_release(struct inode *inode, struct file *filp) |
385 | { | 396 | { |
386 | struct pm_qos_request_list *req; | 397 | struct pm_qos_request *req; |
387 | 398 | ||
388 | req = filp->private_data; | 399 | req = filp->private_data; |
389 | pm_qos_remove_request(req); | 400 | pm_qos_remove_request(req); |
@@ -398,17 +409,15 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, | |||
398 | { | 409 | { |
399 | s32 value; | 410 | s32 value; |
400 | unsigned long flags; | 411 | unsigned long flags; |
401 | struct pm_qos_object *o; | 412 | struct pm_qos_request *req = filp->private_data; |
402 | struct pm_qos_request_list *pm_qos_req = filp->private_data; | ||
403 | 413 | ||
404 | if (!pm_qos_req) | 414 | if (!req) |
405 | return -EINVAL; | 415 | return -EINVAL; |
406 | if (!pm_qos_request_active(pm_qos_req)) | 416 | if (!pm_qos_request_active(req)) |
407 | return -EINVAL; | 417 | return -EINVAL; |
408 | 418 | ||
409 | o = pm_qos_array[pm_qos_req->pm_qos_class]; | ||
410 | spin_lock_irqsave(&pm_qos_lock, flags); | 419 | spin_lock_irqsave(&pm_qos_lock, flags); |
411 | value = pm_qos_get_value(o); | 420 | value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); |
412 | spin_unlock_irqrestore(&pm_qos_lock, flags); | 421 | spin_unlock_irqrestore(&pm_qos_lock, flags); |
413 | 422 | ||
414 | return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); | 423 | return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); |
@@ -418,7 +427,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
418 | size_t count, loff_t *f_pos) | 427 | size_t count, loff_t *f_pos) |
419 | { | 428 | { |
420 | s32 value; | 429 | s32 value; |
421 | struct pm_qos_request_list *pm_qos_req; | 430 | struct pm_qos_request *req; |
422 | 431 | ||
423 | if (count == sizeof(s32)) { | 432 | if (count == sizeof(s32)) { |
424 | if (copy_from_user(&value, buf, sizeof(s32))) | 433 | if (copy_from_user(&value, buf, sizeof(s32))) |
@@ -449,8 +458,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
449 | return -EINVAL; | 458 | return -EINVAL; |
450 | } | 459 | } |
451 | 460 | ||
452 | pm_qos_req = filp->private_data; | 461 | req = filp->private_data; |
453 | pm_qos_update_request(pm_qos_req, value); | 462 | pm_qos_update_request(req, value); |
454 | 463 | ||
455 | return count; | 464 | return count; |
456 | } | 465 | } |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 06efa54f93d6..cbe2c1441392 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1339,6 +1339,9 @@ int hibernate_preallocate_memory(void) | |||
1339 | count += highmem; | 1339 | count += highmem; |
1340 | count -= totalreserve_pages; | 1340 | count -= totalreserve_pages; |
1341 | 1341 | ||
1342 | /* Add number of pages required for page keys (s390 only). */ | ||
1343 | size += page_key_additional_pages(saveable); | ||
1344 | |||
1342 | /* Compute the maximum number of saveable pages to leave in memory. */ | 1345 | /* Compute the maximum number of saveable pages to leave in memory. */ |
1343 | max_size = (count - (size + PAGES_FOR_IO)) / 2 | 1346 | max_size = (count - (size + PAGES_FOR_IO)) / 2 |
1344 | - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); | 1347 | - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); |
@@ -1662,6 +1665,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
1662 | buf[j] = memory_bm_next_pfn(bm); | 1665 | buf[j] = memory_bm_next_pfn(bm); |
1663 | if (unlikely(buf[j] == BM_END_OF_MAP)) | 1666 | if (unlikely(buf[j] == BM_END_OF_MAP)) |
1664 | break; | 1667 | break; |
1668 | /* Save page key for data page (s390 only). */ | ||
1669 | page_key_read(buf + j); | ||
1665 | } | 1670 | } |
1666 | } | 1671 | } |
1667 | 1672 | ||
@@ -1821,6 +1826,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
1821 | if (unlikely(buf[j] == BM_END_OF_MAP)) | 1826 | if (unlikely(buf[j] == BM_END_OF_MAP)) |
1822 | break; | 1827 | break; |
1823 | 1828 | ||
1829 | /* Extract and buffer page key for data page (s390 only). */ | ||
1830 | page_key_memorize(buf + j); | ||
1831 | |||
1824 | if (memory_bm_pfn_present(bm, buf[j])) | 1832 | if (memory_bm_pfn_present(bm, buf[j])) |
1825 | memory_bm_set_bit(bm, buf[j]); | 1833 | memory_bm_set_bit(bm, buf[j]); |
1826 | else | 1834 | else |
@@ -2223,6 +2231,11 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
2223 | if (error) | 2231 | if (error) |
2224 | return error; | 2232 | return error; |
2225 | 2233 | ||
2234 | /* Allocate buffer for page keys. */ | ||
2235 | error = page_key_alloc(nr_copy_pages); | ||
2236 | if (error) | ||
2237 | return error; | ||
2238 | |||
2226 | } else if (handle->cur <= nr_meta_pages + 1) { | 2239 | } else if (handle->cur <= nr_meta_pages + 1) { |
2227 | error = unpack_orig_pfns(buffer, &copy_bm); | 2240 | error = unpack_orig_pfns(buffer, &copy_bm); |
2228 | if (error) | 2241 | if (error) |
@@ -2243,6 +2256,8 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
2243 | } | 2256 | } |
2244 | } else { | 2257 | } else { |
2245 | copy_last_highmem_page(); | 2258 | copy_last_highmem_page(); |
2259 | /* Restore page key for data page (s390 only). */ | ||
2260 | page_key_write(handle->buffer); | ||
2246 | handle->buffer = get_buffer(&orig_bm, &ca); | 2261 | handle->buffer = get_buffer(&orig_bm, &ca); |
2247 | if (IS_ERR(handle->buffer)) | 2262 | if (IS_ERR(handle->buffer)) |
2248 | return PTR_ERR(handle->buffer); | 2263 | return PTR_ERR(handle->buffer); |
@@ -2264,6 +2279,9 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
2264 | void snapshot_write_finalize(struct snapshot_handle *handle) | 2279 | void snapshot_write_finalize(struct snapshot_handle *handle) |
2265 | { | 2280 | { |
2266 | copy_last_highmem_page(); | 2281 | copy_last_highmem_page(); |
2282 | /* Restore page key for data page (s390 only). */ | ||
2283 | page_key_write(handle->buffer); | ||
2284 | page_key_free(); | ||
2267 | /* Free only if we have loaded the image entirely */ | 2285 | /* Free only if we have loaded the image entirely */ |
2268 | if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { | 2286 | if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { |
2269 | memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); | 2287 | memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b6b71ad2208f..fdd4263b995d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -104,7 +104,10 @@ static int suspend_prepare(void) | |||
104 | goto Finish; | 104 | goto Finish; |
105 | 105 | ||
106 | error = suspend_freeze_processes(); | 106 | error = suspend_freeze_processes(); |
107 | if (!error) | 107 | if (error) { |
108 | suspend_stats.failed_freeze++; | ||
109 | dpm_save_failed_step(SUSPEND_FREEZE); | ||
110 | } else | ||
108 | return 0; | 111 | return 0; |
109 | 112 | ||
110 | suspend_thaw_processes(); | 113 | suspend_thaw_processes(); |
@@ -315,8 +318,16 @@ int enter_state(suspend_state_t state) | |||
315 | */ | 318 | */ |
316 | int pm_suspend(suspend_state_t state) | 319 | int pm_suspend(suspend_state_t state) |
317 | { | 320 | { |
318 | if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) | 321 | int ret; |
319 | return enter_state(state); | 322 | if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { |
323 | ret = enter_state(state); | ||
324 | if (ret) { | ||
325 | suspend_stats.fail++; | ||
326 | dpm_save_failed_errno(ret); | ||
327 | } else | ||
328 | suspend_stats.success++; | ||
329 | return ret; | ||
330 | } | ||
320 | return -EINVAL; | 331 | return -EINVAL; |
321 | } | 332 | } |
322 | EXPORT_SYMBOL(pm_suspend); | 333 | EXPORT_SYMBOL(pm_suspend); |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7c97c3a0eee3..11a594c4ba25 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -27,6 +27,10 @@ | |||
27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
28 | #include <linux/lzo.h> | 28 | #include <linux/lzo.h> |
29 | #include <linux/vmalloc.h> | 29 | #include <linux/vmalloc.h> |
30 | #include <linux/cpumask.h> | ||
31 | #include <linux/atomic.h> | ||
32 | #include <linux/kthread.h> | ||
33 | #include <linux/crc32.h> | ||
30 | 34 | ||
31 | #include "power.h" | 35 | #include "power.h" |
32 | 36 | ||
@@ -43,8 +47,7 @@ | |||
43 | * allocated and populated one at a time, so we only need one memory | 47 | * allocated and populated one at a time, so we only need one memory |
44 | * page to set up the entire structure. | 48 | * page to set up the entire structure. |
45 | * | 49 | * |
46 | * During resume we also only need to use one swap_map_page structure | 50 | * During resume we pick up all swap_map_page structures into a list. |
47 | * at a time. | ||
48 | */ | 51 | */ |
49 | 52 | ||
50 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) | 53 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) |
@@ -54,6 +57,11 @@ struct swap_map_page { | |||
54 | sector_t next_swap; | 57 | sector_t next_swap; |
55 | }; | 58 | }; |
56 | 59 | ||
60 | struct swap_map_page_list { | ||
61 | struct swap_map_page *map; | ||
62 | struct swap_map_page_list *next; | ||
63 | }; | ||
64 | |||
57 | /** | 65 | /** |
58 | * The swap_map_handle structure is used for handling swap in | 66 | * The swap_map_handle structure is used for handling swap in |
59 | * a file-alike way | 67 | * a file-alike way |
@@ -61,13 +69,18 @@ struct swap_map_page { | |||
61 | 69 | ||
62 | struct swap_map_handle { | 70 | struct swap_map_handle { |
63 | struct swap_map_page *cur; | 71 | struct swap_map_page *cur; |
72 | struct swap_map_page_list *maps; | ||
64 | sector_t cur_swap; | 73 | sector_t cur_swap; |
65 | sector_t first_sector; | 74 | sector_t first_sector; |
66 | unsigned int k; | 75 | unsigned int k; |
76 | unsigned long nr_free_pages, written; | ||
77 | u32 crc32; | ||
67 | }; | 78 | }; |
68 | 79 | ||
69 | struct swsusp_header { | 80 | struct swsusp_header { |
70 | char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; | 81 | char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - |
82 | sizeof(u32)]; | ||
83 | u32 crc32; | ||
71 | sector_t image; | 84 | sector_t image; |
72 | unsigned int flags; /* Flags to pass to the "boot" kernel */ | 85 | unsigned int flags; /* Flags to pass to the "boot" kernel */ |
73 | char orig_sig[10]; | 86 | char orig_sig[10]; |
@@ -199,6 +212,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) | |||
199 | memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); | 212 | memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); |
200 | swsusp_header->image = handle->first_sector; | 213 | swsusp_header->image = handle->first_sector; |
201 | swsusp_header->flags = flags; | 214 | swsusp_header->flags = flags; |
215 | if (flags & SF_CRC32_MODE) | ||
216 | swsusp_header->crc32 = handle->crc32; | ||
202 | error = hib_bio_write_page(swsusp_resume_block, | 217 | error = hib_bio_write_page(swsusp_resume_block, |
203 | swsusp_header, NULL); | 218 | swsusp_header, NULL); |
204 | } else { | 219 | } else { |
@@ -245,6 +260,7 @@ static int swsusp_swap_check(void) | |||
245 | static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | 260 | static int write_page(void *buf, sector_t offset, struct bio **bio_chain) |
246 | { | 261 | { |
247 | void *src; | 262 | void *src; |
263 | int ret; | ||
248 | 264 | ||
249 | if (!offset) | 265 | if (!offset) |
250 | return -ENOSPC; | 266 | return -ENOSPC; |
@@ -254,9 +270,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | |||
254 | if (src) { | 270 | if (src) { |
255 | copy_page(src, buf); | 271 | copy_page(src, buf); |
256 | } else { | 272 | } else { |
257 | WARN_ON_ONCE(1); | 273 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ |
258 | bio_chain = NULL; /* Go synchronous */ | 274 | if (ret) |
259 | src = buf; | 275 | return ret; |
276 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | ||
277 | if (src) { | ||
278 | copy_page(src, buf); | ||
279 | } else { | ||
280 | WARN_ON_ONCE(1); | ||
281 | bio_chain = NULL; /* Go synchronous */ | ||
282 | src = buf; | ||
283 | } | ||
260 | } | 284 | } |
261 | } else { | 285 | } else { |
262 | src = buf; | 286 | src = buf; |
@@ -293,6 +317,8 @@ static int get_swap_writer(struct swap_map_handle *handle) | |||
293 | goto err_rel; | 317 | goto err_rel; |
294 | } | 318 | } |
295 | handle->k = 0; | 319 | handle->k = 0; |
320 | handle->nr_free_pages = nr_free_pages() >> 1; | ||
321 | handle->written = 0; | ||
296 | handle->first_sector = handle->cur_swap; | 322 | handle->first_sector = handle->cur_swap; |
297 | return 0; | 323 | return 0; |
298 | err_rel: | 324 | err_rel: |
@@ -316,20 +342,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
316 | return error; | 342 | return error; |
317 | handle->cur->entries[handle->k++] = offset; | 343 | handle->cur->entries[handle->k++] = offset; |
318 | if (handle->k >= MAP_PAGE_ENTRIES) { | 344 | if (handle->k >= MAP_PAGE_ENTRIES) { |
319 | error = hib_wait_on_bio_chain(bio_chain); | ||
320 | if (error) | ||
321 | goto out; | ||
322 | offset = alloc_swapdev_block(root_swap); | 345 | offset = alloc_swapdev_block(root_swap); |
323 | if (!offset) | 346 | if (!offset) |
324 | return -ENOSPC; | 347 | return -ENOSPC; |
325 | handle->cur->next_swap = offset; | 348 | handle->cur->next_swap = offset; |
326 | error = write_page(handle->cur, handle->cur_swap, NULL); | 349 | error = write_page(handle->cur, handle->cur_swap, bio_chain); |
327 | if (error) | 350 | if (error) |
328 | goto out; | 351 | goto out; |
329 | clear_page(handle->cur); | 352 | clear_page(handle->cur); |
330 | handle->cur_swap = offset; | 353 | handle->cur_swap = offset; |
331 | handle->k = 0; | 354 | handle->k = 0; |
332 | } | 355 | } |
356 | if (bio_chain && ++handle->written > handle->nr_free_pages) { | ||
357 | error = hib_wait_on_bio_chain(bio_chain); | ||
358 | if (error) | ||
359 | goto out; | ||
360 | handle->written = 0; | ||
361 | } | ||
333 | out: | 362 | out: |
334 | return error; | 363 | return error; |
335 | } | 364 | } |
@@ -372,6 +401,13 @@ static int swap_writer_finish(struct swap_map_handle *handle, | |||
372 | LZO_HEADER, PAGE_SIZE) | 401 | LZO_HEADER, PAGE_SIZE) |
373 | #define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) | 402 | #define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) |
374 | 403 | ||
404 | /* Maximum number of threads for compression/decompression. */ | ||
405 | #define LZO_THREADS 3 | ||
406 | |||
407 | /* Maximum number of pages for read buffering. */ | ||
408 | #define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) | ||
409 | |||
410 | |||
375 | /** | 411 | /** |
376 | * save_image - save the suspend image data | 412 | * save_image - save the suspend image data |
377 | */ | 413 | */ |
@@ -419,6 +455,92 @@ static int save_image(struct swap_map_handle *handle, | |||
419 | return ret; | 455 | return ret; |
420 | } | 456 | } |
421 | 457 | ||
458 | /** | ||
459 | * Structure used for CRC32. | ||
460 | */ | ||
461 | struct crc_data { | ||
462 | struct task_struct *thr; /* thread */ | ||
463 | atomic_t ready; /* ready to start flag */ | ||
464 | atomic_t stop; /* ready to stop flag */ | ||
465 | unsigned run_threads; /* nr current threads */ | ||
466 | wait_queue_head_t go; /* start crc update */ | ||
467 | wait_queue_head_t done; /* crc update done */ | ||
468 | u32 *crc32; /* points to handle's crc32 */ | ||
469 | size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ | ||
470 | unsigned char *unc[LZO_THREADS]; /* uncompressed data */ | ||
471 | }; | ||
472 | |||
473 | /** | ||
474 | * CRC32 update function that runs in its own thread. | ||
475 | */ | ||
476 | static int crc32_threadfn(void *data) | ||
477 | { | ||
478 | struct crc_data *d = data; | ||
479 | unsigned i; | ||
480 | |||
481 | while (1) { | ||
482 | wait_event(d->go, atomic_read(&d->ready) || | ||
483 | kthread_should_stop()); | ||
484 | if (kthread_should_stop()) { | ||
485 | d->thr = NULL; | ||
486 | atomic_set(&d->stop, 1); | ||
487 | wake_up(&d->done); | ||
488 | break; | ||
489 | } | ||
490 | atomic_set(&d->ready, 0); | ||
491 | |||
492 | for (i = 0; i < d->run_threads; i++) | ||
493 | *d->crc32 = crc32_le(*d->crc32, | ||
494 | d->unc[i], *d->unc_len[i]); | ||
495 | atomic_set(&d->stop, 1); | ||
496 | wake_up(&d->done); | ||
497 | } | ||
498 | return 0; | ||
499 | } | ||
500 | /** | ||
501 | * Structure used for LZO data compression. | ||
502 | */ | ||
503 | struct cmp_data { | ||
504 | struct task_struct *thr; /* thread */ | ||
505 | atomic_t ready; /* ready to start flag */ | ||
506 | atomic_t stop; /* ready to stop flag */ | ||
507 | int ret; /* return code */ | ||
508 | wait_queue_head_t go; /* start compression */ | ||
509 | wait_queue_head_t done; /* compression done */ | ||
510 | size_t unc_len; /* uncompressed length */ | ||
511 | size_t cmp_len; /* compressed length */ | ||
512 | unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ | ||
513 | unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ | ||
514 | unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ | ||
515 | }; | ||
516 | |||
517 | /** | ||
518 | * Compression function that runs in its own thread. | ||
519 | */ | ||
520 | static int lzo_compress_threadfn(void *data) | ||
521 | { | ||
522 | struct cmp_data *d = data; | ||
523 | |||
524 | while (1) { | ||
525 | wait_event(d->go, atomic_read(&d->ready) || | ||
526 | kthread_should_stop()); | ||
527 | if (kthread_should_stop()) { | ||
528 | d->thr = NULL; | ||
529 | d->ret = -1; | ||
530 | atomic_set(&d->stop, 1); | ||
531 | wake_up(&d->done); | ||
532 | break; | ||
533 | } | ||
534 | atomic_set(&d->ready, 0); | ||
535 | |||
536 | d->ret = lzo1x_1_compress(d->unc, d->unc_len, | ||
537 | d->cmp + LZO_HEADER, &d->cmp_len, | ||
538 | d->wrk); | ||
539 | atomic_set(&d->stop, 1); | ||
540 | wake_up(&d->done); | ||
541 | } | ||
542 | return 0; | ||
543 | } | ||
422 | 544 | ||
423 | /** | 545 | /** |
424 | * save_image_lzo - Save the suspend image data compressed with LZO. | 546 | * save_image_lzo - Save the suspend image data compressed with LZO. |
@@ -437,42 +559,93 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
437 | struct bio *bio; | 559 | struct bio *bio; |
438 | struct timeval start; | 560 | struct timeval start; |
439 | struct timeval stop; | 561 | struct timeval stop; |
440 | size_t off, unc_len, cmp_len; | 562 | size_t off; |
441 | unsigned char *unc, *cmp, *wrk, *page; | 563 | unsigned thr, run_threads, nr_threads; |
564 | unsigned char *page = NULL; | ||
565 | struct cmp_data *data = NULL; | ||
566 | struct crc_data *crc = NULL; | ||
567 | |||
568 | /* | ||
569 | * We'll limit the number of threads for compression to limit memory | ||
570 | * footprint. | ||
571 | */ | ||
572 | nr_threads = num_online_cpus() - 1; | ||
573 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); | ||
442 | 574 | ||
443 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 575 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); |
444 | if (!page) { | 576 | if (!page) { |
445 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 577 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); |
446 | return -ENOMEM; | 578 | ret = -ENOMEM; |
579 | goto out_clean; | ||
447 | } | 580 | } |
448 | 581 | ||
449 | wrk = vmalloc(LZO1X_1_MEM_COMPRESS); | 582 | data = vmalloc(sizeof(*data) * nr_threads); |
450 | if (!wrk) { | 583 | if (!data) { |
451 | printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); | 584 | printk(KERN_ERR "PM: Failed to allocate LZO data\n"); |
452 | free_page((unsigned long)page); | 585 | ret = -ENOMEM; |
453 | return -ENOMEM; | 586 | goto out_clean; |
454 | } | 587 | } |
588 | for (thr = 0; thr < nr_threads; thr++) | ||
589 | memset(&data[thr], 0, offsetof(struct cmp_data, go)); | ||
455 | 590 | ||
456 | unc = vmalloc(LZO_UNC_SIZE); | 591 | crc = kmalloc(sizeof(*crc), GFP_KERNEL); |
457 | if (!unc) { | 592 | if (!crc) { |
458 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | 593 | printk(KERN_ERR "PM: Failed to allocate crc\n"); |
459 | vfree(wrk); | 594 | ret = -ENOMEM; |
460 | free_page((unsigned long)page); | 595 | goto out_clean; |
461 | return -ENOMEM; | 596 | } |
597 | memset(crc, 0, offsetof(struct crc_data, go)); | ||
598 | |||
599 | /* | ||
600 | * Start the compression threads. | ||
601 | */ | ||
602 | for (thr = 0; thr < nr_threads; thr++) { | ||
603 | init_waitqueue_head(&data[thr].go); | ||
604 | init_waitqueue_head(&data[thr].done); | ||
605 | |||
606 | data[thr].thr = kthread_run(lzo_compress_threadfn, | ||
607 | &data[thr], | ||
608 | "image_compress/%u", thr); | ||
609 | if (IS_ERR(data[thr].thr)) { | ||
610 | data[thr].thr = NULL; | ||
611 | printk(KERN_ERR | ||
612 | "PM: Cannot start compression threads\n"); | ||
613 | ret = -ENOMEM; | ||
614 | goto out_clean; | ||
615 | } | ||
462 | } | 616 | } |
463 | 617 | ||
464 | cmp = vmalloc(LZO_CMP_SIZE); | 618 | /* |
465 | if (!cmp) { | 619 | * Adjust number of free pages after all allocations have been done. |
466 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | 620 | * We don't want to run out of pages when writing. |
467 | vfree(unc); | 621 | */ |
468 | vfree(wrk); | 622 | handle->nr_free_pages = nr_free_pages() >> 1; |
469 | free_page((unsigned long)page); | 623 | |
470 | return -ENOMEM; | 624 | /* |
625 | * Start the CRC32 thread. | ||
626 | */ | ||
627 | init_waitqueue_head(&crc->go); | ||
628 | init_waitqueue_head(&crc->done); | ||
629 | |||
630 | handle->crc32 = 0; | ||
631 | crc->crc32 = &handle->crc32; | ||
632 | for (thr = 0; thr < nr_threads; thr++) { | ||
633 | crc->unc[thr] = data[thr].unc; | ||
634 | crc->unc_len[thr] = &data[thr].unc_len; | ||
635 | } | ||
636 | |||
637 | crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); | ||
638 | if (IS_ERR(crc->thr)) { | ||
639 | crc->thr = NULL; | ||
640 | printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); | ||
641 | ret = -ENOMEM; | ||
642 | goto out_clean; | ||
471 | } | 643 | } |
472 | 644 | ||
473 | printk(KERN_INFO | 645 | printk(KERN_INFO |
646 | "PM: Using %u thread(s) for compression.\n" | ||
474 | "PM: Compressing and saving image data (%u pages) ... ", | 647 | "PM: Compressing and saving image data (%u pages) ... ", |
475 | nr_to_write); | 648 | nr_threads, nr_to_write); |
476 | m = nr_to_write / 100; | 649 | m = nr_to_write / 100; |
477 | if (!m) | 650 | if (!m) |
478 | m = 1; | 651 | m = 1; |
@@ -480,55 +653,83 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
480 | bio = NULL; | 653 | bio = NULL; |
481 | do_gettimeofday(&start); | 654 | do_gettimeofday(&start); |
482 | for (;;) { | 655 | for (;;) { |
483 | for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { | 656 | for (thr = 0; thr < nr_threads; thr++) { |
484 | ret = snapshot_read_next(snapshot); | 657 | for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { |
485 | if (ret < 0) | 658 | ret = snapshot_read_next(snapshot); |
486 | goto out_finish; | 659 | if (ret < 0) |
487 | 660 | goto out_finish; | |
488 | if (!ret) | 661 | |
662 | if (!ret) | ||
663 | break; | ||
664 | |||
665 | memcpy(data[thr].unc + off, | ||
666 | data_of(*snapshot), PAGE_SIZE); | ||
667 | |||
668 | if (!(nr_pages % m)) | ||
669 | printk(KERN_CONT "\b\b\b\b%3d%%", | ||
670 | nr_pages / m); | ||
671 | nr_pages++; | ||
672 | } | ||
673 | if (!off) | ||
489 | break; | 674 | break; |
490 | 675 | ||
491 | memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); | 676 | data[thr].unc_len = off; |
492 | 677 | ||
493 | if (!(nr_pages % m)) | 678 | atomic_set(&data[thr].ready, 1); |
494 | printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); | 679 | wake_up(&data[thr].go); |
495 | nr_pages++; | ||
496 | } | 680 | } |
497 | 681 | ||
498 | if (!off) | 682 | if (!thr) |
499 | break; | 683 | break; |
500 | 684 | ||
501 | unc_len = off; | 685 | crc->run_threads = thr; |
502 | ret = lzo1x_1_compress(unc, unc_len, | 686 | atomic_set(&crc->ready, 1); |
503 | cmp + LZO_HEADER, &cmp_len, wrk); | 687 | wake_up(&crc->go); |
504 | if (ret < 0) { | ||
505 | printk(KERN_ERR "PM: LZO compression failed\n"); | ||
506 | break; | ||
507 | } | ||
508 | 688 | ||
509 | if (unlikely(!cmp_len || | 689 | for (run_threads = thr, thr = 0; thr < run_threads; thr++) { |
510 | cmp_len > lzo1x_worst_compress(unc_len))) { | 690 | wait_event(data[thr].done, |
511 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | 691 | atomic_read(&data[thr].stop)); |
512 | ret = -1; | 692 | atomic_set(&data[thr].stop, 0); |
513 | break; | ||
514 | } | ||
515 | 693 | ||
516 | *(size_t *)cmp = cmp_len; | 694 | ret = data[thr].ret; |
517 | 695 | ||
518 | /* | 696 | if (ret < 0) { |
519 | * Given we are writing one page at a time to disk, we copy | 697 | printk(KERN_ERR "PM: LZO compression failed\n"); |
520 | * that much from the buffer, although the last bit will likely | 698 | goto out_finish; |
521 | * be smaller than full page. This is OK - we saved the length | 699 | } |
522 | * of the compressed data, so any garbage at the end will be | ||
523 | * discarded when we read it. | ||
524 | */ | ||
525 | for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { | ||
526 | memcpy(page, cmp + off, PAGE_SIZE); | ||
527 | 700 | ||
528 | ret = swap_write_page(handle, page, &bio); | 701 | if (unlikely(!data[thr].cmp_len || |
529 | if (ret) | 702 | data[thr].cmp_len > |
703 | lzo1x_worst_compress(data[thr].unc_len))) { | ||
704 | printk(KERN_ERR | ||
705 | "PM: Invalid LZO compressed length\n"); | ||
706 | ret = -1; | ||
530 | goto out_finish; | 707 | goto out_finish; |
708 | } | ||
709 | |||
710 | *(size_t *)data[thr].cmp = data[thr].cmp_len; | ||
711 | |||
712 | /* | ||
713 | * Given we are writing one page at a time to disk, we | ||
714 | * copy that much from the buffer, although the last | ||
715 | * bit will likely be smaller than full page. This is | ||
716 | * OK - we saved the length of the compressed data, so | ||
717 | * any garbage at the end will be discarded when we | ||
718 | * read it. | ||
719 | */ | ||
720 | for (off = 0; | ||
721 | off < LZO_HEADER + data[thr].cmp_len; | ||
722 | off += PAGE_SIZE) { | ||
723 | memcpy(page, data[thr].cmp + off, PAGE_SIZE); | ||
724 | |||
725 | ret = swap_write_page(handle, page, &bio); | ||
726 | if (ret) | ||
727 | goto out_finish; | ||
728 | } | ||
531 | } | 729 | } |
730 | |||
731 | wait_event(crc->done, atomic_read(&crc->stop)); | ||
732 | atomic_set(&crc->stop, 0); | ||
532 | } | 733 | } |
533 | 734 | ||
534 | out_finish: | 735 | out_finish: |
@@ -536,16 +737,25 @@ out_finish: | |||
536 | do_gettimeofday(&stop); | 737 | do_gettimeofday(&stop); |
537 | if (!ret) | 738 | if (!ret) |
538 | ret = err2; | 739 | ret = err2; |
539 | if (!ret) | 740 | if (!ret) { |
540 | printk(KERN_CONT "\b\b\b\bdone\n"); | 741 | printk(KERN_CONT "\b\b\b\bdone\n"); |
541 | else | 742 | } else { |
542 | printk(KERN_CONT "\n"); | 743 | printk(KERN_CONT "\n"); |
744 | } | ||
543 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); | 745 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); |
544 | 746 | out_clean: | |
545 | vfree(cmp); | 747 | if (crc) { |
546 | vfree(unc); | 748 | if (crc->thr) |
547 | vfree(wrk); | 749 | kthread_stop(crc->thr); |
548 | free_page((unsigned long)page); | 750 | kfree(crc); |
751 | } | ||
752 | if (data) { | ||
753 | for (thr = 0; thr < nr_threads; thr++) | ||
754 | if (data[thr].thr) | ||
755 | kthread_stop(data[thr].thr); | ||
756 | vfree(data); | ||
757 | } | ||
758 | if (page) free_page((unsigned long)page); | ||
549 | 759 | ||
550 | return ret; | 760 | return ret; |
551 | } | 761 | } |
@@ -625,8 +835,15 @@ out_finish: | |||
625 | 835 | ||
626 | static void release_swap_reader(struct swap_map_handle *handle) | 836 | static void release_swap_reader(struct swap_map_handle *handle) |
627 | { | 837 | { |
628 | if (handle->cur) | 838 | struct swap_map_page_list *tmp; |
629 | free_page((unsigned long)handle->cur); | 839 | |
840 | while (handle->maps) { | ||
841 | if (handle->maps->map) | ||
842 | free_page((unsigned long)handle->maps->map); | ||
843 | tmp = handle->maps; | ||
844 | handle->maps = handle->maps->next; | ||
845 | kfree(tmp); | ||
846 | } | ||
630 | handle->cur = NULL; | 847 | handle->cur = NULL; |
631 | } | 848 | } |
632 | 849 | ||
@@ -634,22 +851,46 @@ static int get_swap_reader(struct swap_map_handle *handle, | |||
634 | unsigned int *flags_p) | 851 | unsigned int *flags_p) |
635 | { | 852 | { |
636 | int error; | 853 | int error; |
854 | struct swap_map_page_list *tmp, *last; | ||
855 | sector_t offset; | ||
637 | 856 | ||
638 | *flags_p = swsusp_header->flags; | 857 | *flags_p = swsusp_header->flags; |
639 | 858 | ||
640 | if (!swsusp_header->image) /* how can this happen? */ | 859 | if (!swsusp_header->image) /* how can this happen? */ |
641 | return -EINVAL; | 860 | return -EINVAL; |
642 | 861 | ||
643 | handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); | 862 | handle->cur = NULL; |
644 | if (!handle->cur) | 863 | last = handle->maps = NULL; |
645 | return -ENOMEM; | 864 | offset = swsusp_header->image; |
865 | while (offset) { | ||
866 | tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); | ||
867 | if (!tmp) { | ||
868 | release_swap_reader(handle); | ||
869 | return -ENOMEM; | ||
870 | } | ||
871 | memset(tmp, 0, sizeof(*tmp)); | ||
872 | if (!handle->maps) | ||
873 | handle->maps = tmp; | ||
874 | if (last) | ||
875 | last->next = tmp; | ||
876 | last = tmp; | ||
877 | |||
878 | tmp->map = (struct swap_map_page *) | ||
879 | __get_free_page(__GFP_WAIT | __GFP_HIGH); | ||
880 | if (!tmp->map) { | ||
881 | release_swap_reader(handle); | ||
882 | return -ENOMEM; | ||
883 | } | ||
646 | 884 | ||
647 | error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); | 885 | error = hib_bio_read_page(offset, tmp->map, NULL); |
648 | if (error) { | 886 | if (error) { |
649 | release_swap_reader(handle); | 887 | release_swap_reader(handle); |
650 | return error; | 888 | return error; |
889 | } | ||
890 | offset = tmp->map->next_swap; | ||
651 | } | 891 | } |
652 | handle->k = 0; | 892 | handle->k = 0; |
893 | handle->cur = handle->maps->map; | ||
653 | return 0; | 894 | return 0; |
654 | } | 895 | } |
655 | 896 | ||
@@ -658,6 +899,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, | |||
658 | { | 899 | { |
659 | sector_t offset; | 900 | sector_t offset; |
660 | int error; | 901 | int error; |
902 | struct swap_map_page_list *tmp; | ||
661 | 903 | ||
662 | if (!handle->cur) | 904 | if (!handle->cur) |
663 | return -EINVAL; | 905 | return -EINVAL; |
@@ -668,13 +910,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, | |||
668 | if (error) | 910 | if (error) |
669 | return error; | 911 | return error; |
670 | if (++handle->k >= MAP_PAGE_ENTRIES) { | 912 | if (++handle->k >= MAP_PAGE_ENTRIES) { |
671 | error = hib_wait_on_bio_chain(bio_chain); | ||
672 | handle->k = 0; | 913 | handle->k = 0; |
673 | offset = handle->cur->next_swap; | 914 | free_page((unsigned long)handle->maps->map); |
674 | if (!offset) | 915 | tmp = handle->maps; |
916 | handle->maps = handle->maps->next; | ||
917 | kfree(tmp); | ||
918 | if (!handle->maps) | ||
675 | release_swap_reader(handle); | 919 | release_swap_reader(handle); |
676 | else if (!error) | 920 | else |
677 | error = hib_bio_read_page(offset, handle->cur, NULL); | 921 | handle->cur = handle->maps->map; |
678 | } | 922 | } |
679 | return error; | 923 | return error; |
680 | } | 924 | } |
@@ -697,7 +941,7 @@ static int load_image(struct swap_map_handle *handle, | |||
697 | unsigned int nr_to_read) | 941 | unsigned int nr_to_read) |
698 | { | 942 | { |
699 | unsigned int m; | 943 | unsigned int m; |
700 | int error = 0; | 944 | int ret = 0; |
701 | struct timeval start; | 945 | struct timeval start; |
702 | struct timeval stop; | 946 | struct timeval stop; |
703 | struct bio *bio; | 947 | struct bio *bio; |
@@ -713,15 +957,15 @@ static int load_image(struct swap_map_handle *handle, | |||
713 | bio = NULL; | 957 | bio = NULL; |
714 | do_gettimeofday(&start); | 958 | do_gettimeofday(&start); |
715 | for ( ; ; ) { | 959 | for ( ; ; ) { |
716 | error = snapshot_write_next(snapshot); | 960 | ret = snapshot_write_next(snapshot); |
717 | if (error <= 0) | 961 | if (ret <= 0) |
718 | break; | 962 | break; |
719 | error = swap_read_page(handle, data_of(*snapshot), &bio); | 963 | ret = swap_read_page(handle, data_of(*snapshot), &bio); |
720 | if (error) | 964 | if (ret) |
721 | break; | 965 | break; |
722 | if (snapshot->sync_read) | 966 | if (snapshot->sync_read) |
723 | error = hib_wait_on_bio_chain(&bio); | 967 | ret = hib_wait_on_bio_chain(&bio); |
724 | if (error) | 968 | if (ret) |
725 | break; | 969 | break; |
726 | if (!(nr_pages % m)) | 970 | if (!(nr_pages % m)) |
727 | printk("\b\b\b\b%3d%%", nr_pages / m); | 971 | printk("\b\b\b\b%3d%%", nr_pages / m); |
@@ -729,17 +973,61 @@ static int load_image(struct swap_map_handle *handle, | |||
729 | } | 973 | } |
730 | err2 = hib_wait_on_bio_chain(&bio); | 974 | err2 = hib_wait_on_bio_chain(&bio); |
731 | do_gettimeofday(&stop); | 975 | do_gettimeofday(&stop); |
732 | if (!error) | 976 | if (!ret) |
733 | error = err2; | 977 | ret = err2; |
734 | if (!error) { | 978 | if (!ret) { |
735 | printk("\b\b\b\bdone\n"); | 979 | printk("\b\b\b\bdone\n"); |
736 | snapshot_write_finalize(snapshot); | 980 | snapshot_write_finalize(snapshot); |
737 | if (!snapshot_image_loaded(snapshot)) | 981 | if (!snapshot_image_loaded(snapshot)) |
738 | error = -ENODATA; | 982 | ret = -ENODATA; |
739 | } else | 983 | } else |
740 | printk("\n"); | 984 | printk("\n"); |
741 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | 985 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); |
742 | return error; | 986 | return ret; |
987 | } | ||
988 | |||
989 | /** | ||
990 | * Structure used for LZO data decompression. | ||
991 | */ | ||
992 | struct dec_data { | ||
993 | struct task_struct *thr; /* thread */ | ||
994 | atomic_t ready; /* ready to start flag */ | ||
995 | atomic_t stop; /* ready to stop flag */ | ||
996 | int ret; /* return code */ | ||
997 | wait_queue_head_t go; /* start decompression */ | ||
998 | wait_queue_head_t done; /* decompression done */ | ||
999 | size_t unc_len; /* uncompressed length */ | ||
1000 | size_t cmp_len; /* compressed length */ | ||
1001 | unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ | ||
1002 | unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ | ||
1003 | }; | ||
1004 | |||
1005 | /** | ||
1006 | * Decompression function that runs in its own thread. | ||
1007 | */ | ||
1008 | static int lzo_decompress_threadfn(void *data) | ||
1009 | { | ||
1010 | struct dec_data *d = data; | ||
1011 | |||
1012 | while (1) { | ||
1013 | wait_event(d->go, atomic_read(&d->ready) || | ||
1014 | kthread_should_stop()); | ||
1015 | if (kthread_should_stop()) { | ||
1016 | d->thr = NULL; | ||
1017 | d->ret = -1; | ||
1018 | atomic_set(&d->stop, 1); | ||
1019 | wake_up(&d->done); | ||
1020 | break; | ||
1021 | } | ||
1022 | atomic_set(&d->ready, 0); | ||
1023 | |||
1024 | d->unc_len = LZO_UNC_SIZE; | ||
1025 | d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, | ||
1026 | d->unc, &d->unc_len); | ||
1027 | atomic_set(&d->stop, 1); | ||
1028 | wake_up(&d->done); | ||
1029 | } | ||
1030 | return 0; | ||
743 | } | 1031 | } |
744 | 1032 | ||
745 | /** | 1033 | /** |
@@ -753,50 +1041,120 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
753 | unsigned int nr_to_read) | 1041 | unsigned int nr_to_read) |
754 | { | 1042 | { |
755 | unsigned int m; | 1043 | unsigned int m; |
756 | int error = 0; | 1044 | int ret = 0; |
1045 | int eof = 0; | ||
757 | struct bio *bio; | 1046 | struct bio *bio; |
758 | struct timeval start; | 1047 | struct timeval start; |
759 | struct timeval stop; | 1048 | struct timeval stop; |
760 | unsigned nr_pages; | 1049 | unsigned nr_pages; |
761 | size_t i, off, unc_len, cmp_len; | 1050 | size_t off; |
762 | unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; | 1051 | unsigned i, thr, run_threads, nr_threads; |
763 | 1052 | unsigned ring = 0, pg = 0, ring_size = 0, | |
764 | for (i = 0; i < LZO_CMP_PAGES; i++) { | 1053 | have = 0, want, need, asked = 0; |
765 | page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 1054 | unsigned long read_pages; |
766 | if (!page[i]) { | 1055 | unsigned char **page = NULL; |
767 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 1056 | struct dec_data *data = NULL; |
1057 | struct crc_data *crc = NULL; | ||
1058 | |||
1059 | /* | ||
1060 | * We'll limit the number of threads for decompression to limit memory | ||
1061 | * footprint. | ||
1062 | */ | ||
1063 | nr_threads = num_online_cpus() - 1; | ||
1064 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); | ||
1065 | |||
1066 | page = vmalloc(sizeof(*page) * LZO_READ_PAGES); | ||
1067 | if (!page) { | ||
1068 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | ||
1069 | ret = -ENOMEM; | ||
1070 | goto out_clean; | ||
1071 | } | ||
768 | 1072 | ||
769 | while (i) | 1073 | data = vmalloc(sizeof(*data) * nr_threads); |
770 | free_page((unsigned long)page[--i]); | 1074 | if (!data) { |
1075 | printk(KERN_ERR "PM: Failed to allocate LZO data\n"); | ||
1076 | ret = -ENOMEM; | ||
1077 | goto out_clean; | ||
1078 | } | ||
1079 | for (thr = 0; thr < nr_threads; thr++) | ||
1080 | memset(&data[thr], 0, offsetof(struct dec_data, go)); | ||
771 | 1081 | ||
772 | return -ENOMEM; | 1082 | crc = kmalloc(sizeof(*crc), GFP_KERNEL); |
1083 | if (!crc) { | ||
1084 | printk(KERN_ERR "PM: Failed to allocate crc\n"); | ||
1085 | ret = -ENOMEM; | ||
1086 | goto out_clean; | ||
1087 | } | ||
1088 | memset(crc, 0, offsetof(struct crc_data, go)); | ||
1089 | |||
1090 | /* | ||
1091 | * Start the decompression threads. | ||
1092 | */ | ||
1093 | for (thr = 0; thr < nr_threads; thr++) { | ||
1094 | init_waitqueue_head(&data[thr].go); | ||
1095 | init_waitqueue_head(&data[thr].done); | ||
1096 | |||
1097 | data[thr].thr = kthread_run(lzo_decompress_threadfn, | ||
1098 | &data[thr], | ||
1099 | "image_decompress/%u", thr); | ||
1100 | if (IS_ERR(data[thr].thr)) { | ||
1101 | data[thr].thr = NULL; | ||
1102 | printk(KERN_ERR | ||
1103 | "PM: Cannot start decompression threads\n"); | ||
1104 | ret = -ENOMEM; | ||
1105 | goto out_clean; | ||
773 | } | 1106 | } |
774 | } | 1107 | } |
775 | 1108 | ||
776 | unc = vmalloc(LZO_UNC_SIZE); | 1109 | /* |
777 | if (!unc) { | 1110 | * Start the CRC32 thread. |
778 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | 1111 | */ |
779 | 1112 | init_waitqueue_head(&crc->go); | |
780 | for (i = 0; i < LZO_CMP_PAGES; i++) | 1113 | init_waitqueue_head(&crc->done); |
781 | free_page((unsigned long)page[i]); | 1114 | |
782 | 1115 | handle->crc32 = 0; | |
783 | return -ENOMEM; | 1116 | crc->crc32 = &handle->crc32; |
1117 | for (thr = 0; thr < nr_threads; thr++) { | ||
1118 | crc->unc[thr] = data[thr].unc; | ||
1119 | crc->unc_len[thr] = &data[thr].unc_len; | ||
784 | } | 1120 | } |
785 | 1121 | ||
786 | cmp = vmalloc(LZO_CMP_SIZE); | 1122 | crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); |
787 | if (!cmp) { | 1123 | if (IS_ERR(crc->thr)) { |
788 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | 1124 | crc->thr = NULL; |
1125 | printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); | ||
1126 | ret = -ENOMEM; | ||
1127 | goto out_clean; | ||
1128 | } | ||
789 | 1129 | ||
790 | vfree(unc); | 1130 | /* |
791 | for (i = 0; i < LZO_CMP_PAGES; i++) | 1131 | * Adjust number of pages for read buffering, in case we are short. |
792 | free_page((unsigned long)page[i]); | 1132 | */ |
1133 | read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; | ||
1134 | read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); | ||
793 | 1135 | ||
794 | return -ENOMEM; | 1136 | for (i = 0; i < read_pages; i++) { |
1137 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? | ||
1138 | __GFP_WAIT | __GFP_HIGH : | ||
1139 | __GFP_WAIT); | ||
1140 | if (!page[i]) { | ||
1141 | if (i < LZO_CMP_PAGES) { | ||
1142 | ring_size = i; | ||
1143 | printk(KERN_ERR | ||
1144 | "PM: Failed to allocate LZO pages\n"); | ||
1145 | ret = -ENOMEM; | ||
1146 | goto out_clean; | ||
1147 | } else { | ||
1148 | break; | ||
1149 | } | ||
1150 | } | ||
795 | } | 1151 | } |
1152 | want = ring_size = i; | ||
796 | 1153 | ||
797 | printk(KERN_INFO | 1154 | printk(KERN_INFO |
1155 | "PM: Using %u thread(s) for decompression.\n" | ||
798 | "PM: Loading and decompressing image data (%u pages) ... ", | 1156 | "PM: Loading and decompressing image data (%u pages) ... ", |
799 | nr_to_read); | 1157 | nr_threads, nr_to_read); |
800 | m = nr_to_read / 100; | 1158 | m = nr_to_read / 100; |
801 | if (!m) | 1159 | if (!m) |
802 | m = 1; | 1160 | m = 1; |
@@ -804,85 +1162,189 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
804 | bio = NULL; | 1162 | bio = NULL; |
805 | do_gettimeofday(&start); | 1163 | do_gettimeofday(&start); |
806 | 1164 | ||
807 | error = snapshot_write_next(snapshot); | 1165 | ret = snapshot_write_next(snapshot); |
808 | if (error <= 0) | 1166 | if (ret <= 0) |
809 | goto out_finish; | 1167 | goto out_finish; |
810 | 1168 | ||
811 | for (;;) { | 1169 | for(;;) { |
812 | error = swap_read_page(handle, page[0], NULL); /* sync */ | 1170 | for (i = 0; !eof && i < want; i++) { |
813 | if (error) | 1171 | ret = swap_read_page(handle, page[ring], &bio); |
814 | break; | 1172 | if (ret) { |
815 | 1173 | /* | |
816 | cmp_len = *(size_t *)page[0]; | 1174 | * On real read error, finish. On end of data, |
817 | if (unlikely(!cmp_len || | 1175 | * set EOF flag and just exit the read loop. |
818 | cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { | 1176 | */ |
819 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | 1177 | if (handle->cur && |
820 | error = -1; | 1178 | handle->cur->entries[handle->k]) { |
821 | break; | 1179 | goto out_finish; |
1180 | } else { | ||
1181 | eof = 1; | ||
1182 | break; | ||
1183 | } | ||
1184 | } | ||
1185 | if (++ring >= ring_size) | ||
1186 | ring = 0; | ||
822 | } | 1187 | } |
1188 | asked += i; | ||
1189 | want -= i; | ||
823 | 1190 | ||
824 | for (off = PAGE_SIZE, i = 1; | 1191 | /* |
825 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { | 1192 | * We are out of data, wait for some more. |
826 | error = swap_read_page(handle, page[i], &bio); | 1193 | */ |
827 | if (error) | 1194 | if (!have) { |
1195 | if (!asked) | ||
1196 | break; | ||
1197 | |||
1198 | ret = hib_wait_on_bio_chain(&bio); | ||
1199 | if (ret) | ||
828 | goto out_finish; | 1200 | goto out_finish; |
1201 | have += asked; | ||
1202 | asked = 0; | ||
1203 | if (eof) | ||
1204 | eof = 2; | ||
829 | } | 1205 | } |
830 | 1206 | ||
831 | error = hib_wait_on_bio_chain(&bio); /* need all data now */ | 1207 | if (crc->run_threads) { |
832 | if (error) | 1208 | wait_event(crc->done, atomic_read(&crc->stop)); |
833 | goto out_finish; | 1209 | atomic_set(&crc->stop, 0); |
834 | 1210 | crc->run_threads = 0; | |
835 | for (off = 0, i = 0; | ||
836 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { | ||
837 | memcpy(cmp + off, page[i], PAGE_SIZE); | ||
838 | } | 1211 | } |
839 | 1212 | ||
840 | unc_len = LZO_UNC_SIZE; | 1213 | for (thr = 0; have && thr < nr_threads; thr++) { |
841 | error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, | 1214 | data[thr].cmp_len = *(size_t *)page[pg]; |
842 | unc, &unc_len); | 1215 | if (unlikely(!data[thr].cmp_len || |
843 | if (error < 0) { | 1216 | data[thr].cmp_len > |
844 | printk(KERN_ERR "PM: LZO decompression failed\n"); | 1217 | lzo1x_worst_compress(LZO_UNC_SIZE))) { |
845 | break; | 1218 | printk(KERN_ERR |
1219 | "PM: Invalid LZO compressed length\n"); | ||
1220 | ret = -1; | ||
1221 | goto out_finish; | ||
1222 | } | ||
1223 | |||
1224 | need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, | ||
1225 | PAGE_SIZE); | ||
1226 | if (need > have) { | ||
1227 | if (eof > 1) { | ||
1228 | ret = -1; | ||
1229 | goto out_finish; | ||
1230 | } | ||
1231 | break; | ||
1232 | } | ||
1233 | |||
1234 | for (off = 0; | ||
1235 | off < LZO_HEADER + data[thr].cmp_len; | ||
1236 | off += PAGE_SIZE) { | ||
1237 | memcpy(data[thr].cmp + off, | ||
1238 | page[pg], PAGE_SIZE); | ||
1239 | have--; | ||
1240 | want++; | ||
1241 | if (++pg >= ring_size) | ||
1242 | pg = 0; | ||
1243 | } | ||
1244 | |||
1245 | atomic_set(&data[thr].ready, 1); | ||
1246 | wake_up(&data[thr].go); | ||
846 | } | 1247 | } |
847 | 1248 | ||
848 | if (unlikely(!unc_len || | 1249 | /* |
849 | unc_len > LZO_UNC_SIZE || | 1250 | * Wait for more data while we are decompressing. |
850 | unc_len & (PAGE_SIZE - 1))) { | 1251 | */ |
851 | printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); | 1252 | if (have < LZO_CMP_PAGES && asked) { |
852 | error = -1; | 1253 | ret = hib_wait_on_bio_chain(&bio); |
853 | break; | 1254 | if (ret) |
1255 | goto out_finish; | ||
1256 | have += asked; | ||
1257 | asked = 0; | ||
1258 | if (eof) | ||
1259 | eof = 2; | ||
854 | } | 1260 | } |
855 | 1261 | ||
856 | for (off = 0; off < unc_len; off += PAGE_SIZE) { | 1262 | for (run_threads = thr, thr = 0; thr < run_threads; thr++) { |
857 | memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); | 1263 | wait_event(data[thr].done, |
1264 | atomic_read(&data[thr].stop)); | ||
1265 | atomic_set(&data[thr].stop, 0); | ||
1266 | |||
1267 | ret = data[thr].ret; | ||
858 | 1268 | ||
859 | if (!(nr_pages % m)) | 1269 | if (ret < 0) { |
860 | printk("\b\b\b\b%3d%%", nr_pages / m); | 1270 | printk(KERN_ERR |
861 | nr_pages++; | 1271 | "PM: LZO decompression failed\n"); |
1272 | goto out_finish; | ||
1273 | } | ||
862 | 1274 | ||
863 | error = snapshot_write_next(snapshot); | 1275 | if (unlikely(!data[thr].unc_len || |
864 | if (error <= 0) | 1276 | data[thr].unc_len > LZO_UNC_SIZE || |
1277 | data[thr].unc_len & (PAGE_SIZE - 1))) { | ||
1278 | printk(KERN_ERR | ||
1279 | "PM: Invalid LZO uncompressed length\n"); | ||
1280 | ret = -1; | ||
865 | goto out_finish; | 1281 | goto out_finish; |
1282 | } | ||
1283 | |||
1284 | for (off = 0; | ||
1285 | off < data[thr].unc_len; off += PAGE_SIZE) { | ||
1286 | memcpy(data_of(*snapshot), | ||
1287 | data[thr].unc + off, PAGE_SIZE); | ||
1288 | |||
1289 | if (!(nr_pages % m)) | ||
1290 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
1291 | nr_pages++; | ||
1292 | |||
1293 | ret = snapshot_write_next(snapshot); | ||
1294 | if (ret <= 0) { | ||
1295 | crc->run_threads = thr + 1; | ||
1296 | atomic_set(&crc->ready, 1); | ||
1297 | wake_up(&crc->go); | ||
1298 | goto out_finish; | ||
1299 | } | ||
1300 | } | ||
866 | } | 1301 | } |
1302 | |||
1303 | crc->run_threads = thr; | ||
1304 | atomic_set(&crc->ready, 1); | ||
1305 | wake_up(&crc->go); | ||
867 | } | 1306 | } |
868 | 1307 | ||
869 | out_finish: | 1308 | out_finish: |
1309 | if (crc->run_threads) { | ||
1310 | wait_event(crc->done, atomic_read(&crc->stop)); | ||
1311 | atomic_set(&crc->stop, 0); | ||
1312 | } | ||
870 | do_gettimeofday(&stop); | 1313 | do_gettimeofday(&stop); |
871 | if (!error) { | 1314 | if (!ret) { |
872 | printk("\b\b\b\bdone\n"); | 1315 | printk("\b\b\b\bdone\n"); |
873 | snapshot_write_finalize(snapshot); | 1316 | snapshot_write_finalize(snapshot); |
874 | if (!snapshot_image_loaded(snapshot)) | 1317 | if (!snapshot_image_loaded(snapshot)) |
875 | error = -ENODATA; | 1318 | ret = -ENODATA; |
1319 | if (!ret) { | ||
1320 | if (swsusp_header->flags & SF_CRC32_MODE) { | ||
1321 | if(handle->crc32 != swsusp_header->crc32) { | ||
1322 | printk(KERN_ERR | ||
1323 | "PM: Invalid image CRC32!\n"); | ||
1324 | ret = -ENODATA; | ||
1325 | } | ||
1326 | } | ||
1327 | } | ||
876 | } else | 1328 | } else |
877 | printk("\n"); | 1329 | printk("\n"); |
878 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | 1330 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); |
879 | 1331 | out_clean: | |
880 | vfree(cmp); | 1332 | for (i = 0; i < ring_size; i++) |
881 | vfree(unc); | ||
882 | for (i = 0; i < LZO_CMP_PAGES; i++) | ||
883 | free_page((unsigned long)page[i]); | 1333 | free_page((unsigned long)page[i]); |
1334 | if (crc) { | ||
1335 | if (crc->thr) | ||
1336 | kthread_stop(crc->thr); | ||
1337 | kfree(crc); | ||
1338 | } | ||
1339 | if (data) { | ||
1340 | for (thr = 0; thr < nr_threads; thr++) | ||
1341 | if (data[thr].thr) | ||
1342 | kthread_stop(data[thr].thr); | ||
1343 | vfree(data); | ||
1344 | } | ||
1345 | if (page) vfree(page); | ||
884 | 1346 | ||
885 | return error; | 1347 | return ret; |
886 | } | 1348 | } |
887 | 1349 | ||
888 | /** | 1350 | /** |
diff --git a/kernel/printk.c b/kernel/printk.c index 28a40d8171b8..b7da18391c38 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -100,7 +100,7 @@ static int console_locked, console_suspended; | |||
100 | * It is also used in interesting ways to provide interlocking in | 100 | * It is also used in interesting ways to provide interlocking in |
101 | * console_unlock();. | 101 | * console_unlock();. |
102 | */ | 102 | */ |
103 | static DEFINE_SPINLOCK(logbuf_lock); | 103 | static DEFINE_RAW_SPINLOCK(logbuf_lock); |
104 | 104 | ||
105 | #define LOG_BUF_MASK (log_buf_len-1) | 105 | #define LOG_BUF_MASK (log_buf_len-1) |
106 | #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) | 106 | #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) |
@@ -212,7 +212,7 @@ void __init setup_log_buf(int early) | |||
212 | return; | 212 | return; |
213 | } | 213 | } |
214 | 214 | ||
215 | spin_lock_irqsave(&logbuf_lock, flags); | 215 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
216 | log_buf_len = new_log_buf_len; | 216 | log_buf_len = new_log_buf_len; |
217 | log_buf = new_log_buf; | 217 | log_buf = new_log_buf; |
218 | new_log_buf_len = 0; | 218 | new_log_buf_len = 0; |
@@ -230,7 +230,7 @@ void __init setup_log_buf(int early) | |||
230 | log_start -= offset; | 230 | log_start -= offset; |
231 | con_start -= offset; | 231 | con_start -= offset; |
232 | log_end -= offset; | 232 | log_end -= offset; |
233 | spin_unlock_irqrestore(&logbuf_lock, flags); | 233 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
234 | 234 | ||
235 | pr_info("log_buf_len: %d\n", log_buf_len); | 235 | pr_info("log_buf_len: %d\n", log_buf_len); |
236 | pr_info("early log buf free: %d(%d%%)\n", | 236 | pr_info("early log buf free: %d(%d%%)\n", |
@@ -365,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
365 | if (error) | 365 | if (error) |
366 | goto out; | 366 | goto out; |
367 | i = 0; | 367 | i = 0; |
368 | spin_lock_irq(&logbuf_lock); | 368 | raw_spin_lock_irq(&logbuf_lock); |
369 | while (!error && (log_start != log_end) && i < len) { | 369 | while (!error && (log_start != log_end) && i < len) { |
370 | c = LOG_BUF(log_start); | 370 | c = LOG_BUF(log_start); |
371 | log_start++; | 371 | log_start++; |
372 | spin_unlock_irq(&logbuf_lock); | 372 | raw_spin_unlock_irq(&logbuf_lock); |
373 | error = __put_user(c,buf); | 373 | error = __put_user(c,buf); |
374 | buf++; | 374 | buf++; |
375 | i++; | 375 | i++; |
376 | cond_resched(); | 376 | cond_resched(); |
377 | spin_lock_irq(&logbuf_lock); | 377 | raw_spin_lock_irq(&logbuf_lock); |
378 | } | 378 | } |
379 | spin_unlock_irq(&logbuf_lock); | 379 | raw_spin_unlock_irq(&logbuf_lock); |
380 | if (!error) | 380 | if (!error) |
381 | error = i; | 381 | error = i; |
382 | break; | 382 | break; |
@@ -399,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
399 | count = len; | 399 | count = len; |
400 | if (count > log_buf_len) | 400 | if (count > log_buf_len) |
401 | count = log_buf_len; | 401 | count = log_buf_len; |
402 | spin_lock_irq(&logbuf_lock); | 402 | raw_spin_lock_irq(&logbuf_lock); |
403 | if (count > logged_chars) | 403 | if (count > logged_chars) |
404 | count = logged_chars; | 404 | count = logged_chars; |
405 | if (do_clear) | 405 | if (do_clear) |
@@ -416,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
416 | if (j + log_buf_len < log_end) | 416 | if (j + log_buf_len < log_end) |
417 | break; | 417 | break; |
418 | c = LOG_BUF(j); | 418 | c = LOG_BUF(j); |
419 | spin_unlock_irq(&logbuf_lock); | 419 | raw_spin_unlock_irq(&logbuf_lock); |
420 | error = __put_user(c,&buf[count-1-i]); | 420 | error = __put_user(c,&buf[count-1-i]); |
421 | cond_resched(); | 421 | cond_resched(); |
422 | spin_lock_irq(&logbuf_lock); | 422 | raw_spin_lock_irq(&logbuf_lock); |
423 | } | 423 | } |
424 | spin_unlock_irq(&logbuf_lock); | 424 | raw_spin_unlock_irq(&logbuf_lock); |
425 | if (error) | 425 | if (error) |
426 | break; | 426 | break; |
427 | error = i; | 427 | error = i; |
@@ -689,7 +689,7 @@ static void zap_locks(void) | |||
689 | oops_timestamp = jiffies; | 689 | oops_timestamp = jiffies; |
690 | 690 | ||
691 | /* If a crash is occurring, make sure we can't deadlock */ | 691 | /* If a crash is occurring, make sure we can't deadlock */ |
692 | spin_lock_init(&logbuf_lock); | 692 | raw_spin_lock_init(&logbuf_lock); |
693 | /* And make sure that we print immediately */ | 693 | /* And make sure that we print immediately */ |
694 | sema_init(&console_sem, 1); | 694 | sema_init(&console_sem, 1); |
695 | } | 695 | } |
@@ -802,9 +802,9 @@ static int console_trylock_for_printk(unsigned int cpu) | |||
802 | } | 802 | } |
803 | } | 803 | } |
804 | printk_cpu = UINT_MAX; | 804 | printk_cpu = UINT_MAX; |
805 | spin_unlock(&logbuf_lock); | ||
806 | if (wake) | 805 | if (wake) |
807 | up(&console_sem); | 806 | up(&console_sem); |
807 | raw_spin_unlock(&logbuf_lock); | ||
808 | return retval; | 808 | return retval; |
809 | } | 809 | } |
810 | static const char recursion_bug_msg [] = | 810 | static const char recursion_bug_msg [] = |
@@ -864,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
864 | } | 864 | } |
865 | 865 | ||
866 | lockdep_off(); | 866 | lockdep_off(); |
867 | spin_lock(&logbuf_lock); | 867 | raw_spin_lock(&logbuf_lock); |
868 | printk_cpu = this_cpu; | 868 | printk_cpu = this_cpu; |
869 | 869 | ||
870 | if (recursion_bug) { | 870 | if (recursion_bug) { |
@@ -1257,14 +1257,14 @@ void console_unlock(void) | |||
1257 | 1257 | ||
1258 | again: | 1258 | again: |
1259 | for ( ; ; ) { | 1259 | for ( ; ; ) { |
1260 | spin_lock_irqsave(&logbuf_lock, flags); | 1260 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1261 | wake_klogd |= log_start - log_end; | 1261 | wake_klogd |= log_start - log_end; |
1262 | if (con_start == log_end) | 1262 | if (con_start == log_end) |
1263 | break; /* Nothing to print */ | 1263 | break; /* Nothing to print */ |
1264 | _con_start = con_start; | 1264 | _con_start = con_start; |
1265 | _log_end = log_end; | 1265 | _log_end = log_end; |
1266 | con_start = log_end; /* Flush */ | 1266 | con_start = log_end; /* Flush */ |
1267 | spin_unlock(&logbuf_lock); | 1267 | raw_spin_unlock(&logbuf_lock); |
1268 | stop_critical_timings(); /* don't trace print latency */ | 1268 | stop_critical_timings(); /* don't trace print latency */ |
1269 | call_console_drivers(_con_start, _log_end); | 1269 | call_console_drivers(_con_start, _log_end); |
1270 | start_critical_timings(); | 1270 | start_critical_timings(); |
@@ -1276,7 +1276,7 @@ again: | |||
1276 | if (unlikely(exclusive_console)) | 1276 | if (unlikely(exclusive_console)) |
1277 | exclusive_console = NULL; | 1277 | exclusive_console = NULL; |
1278 | 1278 | ||
1279 | spin_unlock(&logbuf_lock); | 1279 | raw_spin_unlock(&logbuf_lock); |
1280 | 1280 | ||
1281 | up(&console_sem); | 1281 | up(&console_sem); |
1282 | 1282 | ||
@@ -1286,13 +1286,13 @@ again: | |||
1286 | * there's a new owner and the console_unlock() from them will do the | 1286 | * there's a new owner and the console_unlock() from them will do the |
1287 | * flush, no worries. | 1287 | * flush, no worries. |
1288 | */ | 1288 | */ |
1289 | spin_lock(&logbuf_lock); | 1289 | raw_spin_lock(&logbuf_lock); |
1290 | if (con_start != log_end) | 1290 | if (con_start != log_end) |
1291 | retry = 1; | 1291 | retry = 1; |
1292 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1293 | if (retry && console_trylock()) | 1292 | if (retry && console_trylock()) |
1294 | goto again; | 1293 | goto again; |
1295 | 1294 | ||
1295 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1296 | if (wake_klogd) | 1296 | if (wake_klogd) |
1297 | wake_up_klogd(); | 1297 | wake_up_klogd(); |
1298 | } | 1298 | } |
@@ -1522,9 +1522,9 @@ void register_console(struct console *newcon) | |||
1522 | * console_unlock(); will print out the buffered messages | 1522 | * console_unlock(); will print out the buffered messages |
1523 | * for us. | 1523 | * for us. |
1524 | */ | 1524 | */ |
1525 | spin_lock_irqsave(&logbuf_lock, flags); | 1525 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1526 | con_start = log_start; | 1526 | con_start = log_start; |
1527 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1527 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1528 | /* | 1528 | /* |
1529 | * We're about to replay the log buffer. Only do this to the | 1529 | * We're about to replay the log buffer. Only do this to the |
1530 | * just-registered console to avoid excessive message spam to | 1530 | * just-registered console to avoid excessive message spam to |
@@ -1731,10 +1731,10 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1731 | /* Theoretically, the log could move on after we do this, but | 1731 | /* Theoretically, the log could move on after we do this, but |
1732 | there's not a lot we can do about that. The new messages | 1732 | there's not a lot we can do about that. The new messages |
1733 | will overwrite the start of what we dump. */ | 1733 | will overwrite the start of what we dump. */ |
1734 | spin_lock_irqsave(&logbuf_lock, flags); | 1734 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1735 | end = log_end & LOG_BUF_MASK; | 1735 | end = log_end & LOG_BUF_MASK; |
1736 | chars = logged_chars; | 1736 | chars = logged_chars; |
1737 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1737 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1738 | 1738 | ||
1739 | if (chars > end) { | 1739 | if (chars > end) { |
1740 | s1 = log_buf + log_buf_len - chars + end; | 1740 | s1 = log_buf + log_buf_len - chars + end; |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 9de3ecfd20f9..a70d2a5d8c7b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -744,20 +744,17 @@ int ptrace_request(struct task_struct *child, long request, | |||
744 | break; | 744 | break; |
745 | 745 | ||
746 | si = child->last_siginfo; | 746 | si = child->last_siginfo; |
747 | if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) | 747 | if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) { |
748 | break; | 748 | child->jobctl |= JOBCTL_LISTENING; |
749 | 749 | /* | |
750 | child->jobctl |= JOBCTL_LISTENING; | 750 | * If NOTIFY is set, it means event happened between |
751 | 751 | * start of this trap and now. Trigger re-trap. | |
752 | /* | 752 | */ |
753 | * If NOTIFY is set, it means event happened between start | 753 | if (child->jobctl & JOBCTL_TRAP_NOTIFY) |
754 | * of this trap and now. Trigger re-trap immediately. | 754 | signal_wake_up(child, true); |
755 | */ | 755 | ret = 0; |
756 | if (child->jobctl & JOBCTL_TRAP_NOTIFY) | 756 | } |
757 | signal_wake_up(child, true); | ||
758 | |||
759 | unlock_task_sighand(child, &flags); | 757 | unlock_task_sighand(child, &flags); |
760 | ret = 0; | ||
761 | break; | 758 | break; |
762 | 759 | ||
763 | case PTRACE_DETACH: /* detach a process that was attached. */ | 760 | case PTRACE_DETACH: /* detach a process that was attached. */ |
diff --git a/kernel/rcu.h b/kernel/rcu.h new file mode 100644 index 000000000000..f600868d550d --- /dev/null +++ b/kernel/rcu.h | |||
@@ -0,0 +1,85 @@ | |||
1 | /* | ||
2 | * Read-Copy Update definitions shared among RCU implementations. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2011 | ||
19 | * | ||
20 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
21 | */ | ||
22 | |||
23 | #ifndef __LINUX_RCU_H | ||
24 | #define __LINUX_RCU_H | ||
25 | |||
26 | #ifdef CONFIG_RCU_TRACE | ||
27 | #define RCU_TRACE(stmt) stmt | ||
28 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
29 | #define RCU_TRACE(stmt) | ||
30 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
31 | |||
32 | /* | ||
33 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally | ||
34 | * by call_rcu() and rcu callback execution, and are therefore not part of the | ||
35 | * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. | ||
36 | */ | ||
37 | |||
38 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
39 | # define STATE_RCU_HEAD_READY 0 | ||
40 | # define STATE_RCU_HEAD_QUEUED 1 | ||
41 | |||
42 | extern struct debug_obj_descr rcuhead_debug_descr; | ||
43 | |||
44 | static inline void debug_rcu_head_queue(struct rcu_head *head) | ||
45 | { | ||
46 | WARN_ON_ONCE((unsigned long)head & 0x3); | ||
47 | debug_object_activate(head, &rcuhead_debug_descr); | ||
48 | debug_object_active_state(head, &rcuhead_debug_descr, | ||
49 | STATE_RCU_HEAD_READY, | ||
50 | STATE_RCU_HEAD_QUEUED); | ||
51 | } | ||
52 | |||
53 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | ||
54 | { | ||
55 | debug_object_active_state(head, &rcuhead_debug_descr, | ||
56 | STATE_RCU_HEAD_QUEUED, | ||
57 | STATE_RCU_HEAD_READY); | ||
58 | debug_object_deactivate(head, &rcuhead_debug_descr); | ||
59 | } | ||
60 | #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
61 | static inline void debug_rcu_head_queue(struct rcu_head *head) | ||
62 | { | ||
63 | } | ||
64 | |||
65 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | ||
66 | { | ||
67 | } | ||
68 | #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
69 | |||
70 | extern void kfree(const void *); | ||
71 | |||
72 | static inline void __rcu_reclaim(char *rn, struct rcu_head *head) | ||
73 | { | ||
74 | unsigned long offset = (unsigned long)head->func; | ||
75 | |||
76 | if (__is_kfree_rcu_offset(offset)) { | ||
77 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); | ||
78 | kfree((void *)head - offset); | ||
79 | } else { | ||
80 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); | ||
81 | head->func(head); | ||
82 | } | ||
83 | } | ||
84 | |||
85 | #endif /* __LINUX_RCU_H */ | ||
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ddddb320be61..ca0d23b6b3e8 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -46,6 +46,11 @@ | |||
46 | #include <linux/module.h> | 46 | #include <linux/module.h> |
47 | #include <linux/hardirq.h> | 47 | #include <linux/hardirq.h> |
48 | 48 | ||
49 | #define CREATE_TRACE_POINTS | ||
50 | #include <trace/events/rcu.h> | ||
51 | |||
52 | #include "rcu.h" | ||
53 | |||
49 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
50 | static struct lock_class_key rcu_lock_key; | 55 | static struct lock_class_key rcu_lock_key; |
51 | struct lockdep_map rcu_lock_map = | 56 | struct lockdep_map rcu_lock_map = |
@@ -94,11 +99,16 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | |||
94 | 99 | ||
95 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 100 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
96 | 101 | ||
102 | struct rcu_synchronize { | ||
103 | struct rcu_head head; | ||
104 | struct completion completion; | ||
105 | }; | ||
106 | |||
97 | /* | 107 | /* |
98 | * Awaken the corresponding synchronize_rcu() instance now that a | 108 | * Awaken the corresponding synchronize_rcu() instance now that a |
99 | * grace period has elapsed. | 109 | * grace period has elapsed. |
100 | */ | 110 | */ |
101 | void wakeme_after_rcu(struct rcu_head *head) | 111 | static void wakeme_after_rcu(struct rcu_head *head) |
102 | { | 112 | { |
103 | struct rcu_synchronize *rcu; | 113 | struct rcu_synchronize *rcu; |
104 | 114 | ||
@@ -106,6 +116,20 @@ void wakeme_after_rcu(struct rcu_head *head) | |||
106 | complete(&rcu->completion); | 116 | complete(&rcu->completion); |
107 | } | 117 | } |
108 | 118 | ||
119 | void wait_rcu_gp(call_rcu_func_t crf) | ||
120 | { | ||
121 | struct rcu_synchronize rcu; | ||
122 | |||
123 | init_rcu_head_on_stack(&rcu.head); | ||
124 | init_completion(&rcu.completion); | ||
125 | /* Will wake me after RCU finished. */ | ||
126 | crf(&rcu.head, wakeme_after_rcu); | ||
127 | /* Wait for it. */ | ||
128 | wait_for_completion(&rcu.completion); | ||
129 | destroy_rcu_head_on_stack(&rcu.head); | ||
130 | } | ||
131 | EXPORT_SYMBOL_GPL(wait_rcu_gp); | ||
132 | |||
109 | #ifdef CONFIG_PROVE_RCU | 133 | #ifdef CONFIG_PROVE_RCU |
110 | /* | 134 | /* |
111 | * wrapper function to avoid #include problems. | 135 | * wrapper function to avoid #include problems. |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 7bbac7d0f5ab..da775c87f27f 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -37,16 +37,17 @@ | |||
37 | #include <linux/cpu.h> | 37 | #include <linux/cpu.h> |
38 | #include <linux/prefetch.h> | 38 | #include <linux/prefetch.h> |
39 | 39 | ||
40 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ | 40 | #ifdef CONFIG_RCU_TRACE |
41 | static struct task_struct *rcu_kthread_task; | 41 | #include <trace/events/rcu.h> |
42 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); | 42 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
43 | static unsigned long have_rcu_kthread_work; | 43 | |
44 | #include "rcu.h" | ||
44 | 45 | ||
45 | /* Forward declarations for rcutiny_plugin.h. */ | 46 | /* Forward declarations for rcutiny_plugin.h. */ |
46 | struct rcu_ctrlblk; | 47 | struct rcu_ctrlblk; |
47 | static void invoke_rcu_kthread(void); | 48 | static void invoke_rcu_callbacks(void); |
48 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 49 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); |
49 | static int rcu_kthread(void *arg); | 50 | static void rcu_process_callbacks(struct softirq_action *unused); |
50 | static void __call_rcu(struct rcu_head *head, | 51 | static void __call_rcu(struct rcu_head *head, |
51 | void (*func)(struct rcu_head *rcu), | 52 | void (*func)(struct rcu_head *rcu), |
52 | struct rcu_ctrlblk *rcp); | 53 | struct rcu_ctrlblk *rcp); |
@@ -96,16 +97,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | |||
96 | } | 97 | } |
97 | 98 | ||
98 | /* | 99 | /* |
99 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
100 | * or to boost readers. | ||
101 | */ | ||
102 | static void invoke_rcu_kthread(void) | ||
103 | { | ||
104 | have_rcu_kthread_work = 1; | ||
105 | wake_up(&rcu_kthread_wq); | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we | 100 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we |
110 | * are at it, given that any rcu quiescent state is also an rcu_bh | 101 | * are at it, given that any rcu quiescent state is also an rcu_bh |
111 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. | 102 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. |
@@ -117,7 +108,7 @@ void rcu_sched_qs(int cpu) | |||
117 | local_irq_save(flags); | 108 | local_irq_save(flags); |
118 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + | 109 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + |
119 | rcu_qsctr_help(&rcu_bh_ctrlblk)) | 110 | rcu_qsctr_help(&rcu_bh_ctrlblk)) |
120 | invoke_rcu_kthread(); | 111 | invoke_rcu_callbacks(); |
121 | local_irq_restore(flags); | 112 | local_irq_restore(flags); |
122 | } | 113 | } |
123 | 114 | ||
@@ -130,7 +121,7 @@ void rcu_bh_qs(int cpu) | |||
130 | 121 | ||
131 | local_irq_save(flags); | 122 | local_irq_save(flags); |
132 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) | 123 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) |
133 | invoke_rcu_kthread(); | 124 | invoke_rcu_callbacks(); |
134 | local_irq_restore(flags); | 125 | local_irq_restore(flags); |
135 | } | 126 | } |
136 | 127 | ||
@@ -154,18 +145,23 @@ void rcu_check_callbacks(int cpu, int user) | |||
154 | * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure | 145 | * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure |
155 | * whose grace period has elapsed. | 146 | * whose grace period has elapsed. |
156 | */ | 147 | */ |
157 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) | 148 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) |
158 | { | 149 | { |
150 | char *rn = NULL; | ||
159 | struct rcu_head *next, *list; | 151 | struct rcu_head *next, *list; |
160 | unsigned long flags; | 152 | unsigned long flags; |
161 | RCU_TRACE(int cb_count = 0); | 153 | RCU_TRACE(int cb_count = 0); |
162 | 154 | ||
163 | /* If no RCU callbacks ready to invoke, just return. */ | 155 | /* If no RCU callbacks ready to invoke, just return. */ |
164 | if (&rcp->rcucblist == rcp->donetail) | 156 | if (&rcp->rcucblist == rcp->donetail) { |
157 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); | ||
158 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); | ||
165 | return; | 159 | return; |
160 | } | ||
166 | 161 | ||
167 | /* Move the ready-to-invoke callbacks to a local list. */ | 162 | /* Move the ready-to-invoke callbacks to a local list. */ |
168 | local_irq_save(flags); | 163 | local_irq_save(flags); |
164 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); | ||
169 | list = rcp->rcucblist; | 165 | list = rcp->rcucblist; |
170 | rcp->rcucblist = *rcp->donetail; | 166 | rcp->rcucblist = *rcp->donetail; |
171 | *rcp->donetail = NULL; | 167 | *rcp->donetail = NULL; |
@@ -176,49 +172,26 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
176 | local_irq_restore(flags); | 172 | local_irq_restore(flags); |
177 | 173 | ||
178 | /* Invoke the callbacks on the local list. */ | 174 | /* Invoke the callbacks on the local list. */ |
175 | RCU_TRACE(rn = rcp->name); | ||
179 | while (list) { | 176 | while (list) { |
180 | next = list->next; | 177 | next = list->next; |
181 | prefetch(next); | 178 | prefetch(next); |
182 | debug_rcu_head_unqueue(list); | 179 | debug_rcu_head_unqueue(list); |
183 | local_bh_disable(); | 180 | local_bh_disable(); |
184 | __rcu_reclaim(list); | 181 | __rcu_reclaim(rn, list); |
185 | local_bh_enable(); | 182 | local_bh_enable(); |
186 | list = next; | 183 | list = next; |
187 | RCU_TRACE(cb_count++); | 184 | RCU_TRACE(cb_count++); |
188 | } | 185 | } |
189 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | 186 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); |
187 | RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); | ||
190 | } | 188 | } |
191 | 189 | ||
192 | /* | 190 | static void rcu_process_callbacks(struct softirq_action *unused) |
193 | * This kthread invokes RCU callbacks whose grace periods have | ||
194 | * elapsed. It is awakened as needed, and takes the place of the | ||
195 | * RCU_SOFTIRQ that was used previously for this purpose. | ||
196 | * This is a kthread, but it is never stopped, at least not until | ||
197 | * the system goes down. | ||
198 | */ | ||
199 | static int rcu_kthread(void *arg) | ||
200 | { | 191 | { |
201 | unsigned long work; | 192 | __rcu_process_callbacks(&rcu_sched_ctrlblk); |
202 | unsigned long morework; | 193 | __rcu_process_callbacks(&rcu_bh_ctrlblk); |
203 | unsigned long flags; | 194 | rcu_preempt_process_callbacks(); |
204 | |||
205 | for (;;) { | ||
206 | wait_event_interruptible(rcu_kthread_wq, | ||
207 | have_rcu_kthread_work != 0); | ||
208 | morework = rcu_boost(); | ||
209 | local_irq_save(flags); | ||
210 | work = have_rcu_kthread_work; | ||
211 | have_rcu_kthread_work = morework; | ||
212 | local_irq_restore(flags); | ||
213 | if (work) { | ||
214 | rcu_process_callbacks(&rcu_sched_ctrlblk); | ||
215 | rcu_process_callbacks(&rcu_bh_ctrlblk); | ||
216 | rcu_preempt_process_callbacks(); | ||
217 | } | ||
218 | schedule_timeout_interruptible(1); /* Leave CPU for others. */ | ||
219 | } | ||
220 | |||
221 | return 0; /* Not reached, but needed to shut gcc up. */ | ||
222 | } | 195 | } |
223 | 196 | ||
224 | /* | 197 | /* |
@@ -280,45 +253,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
280 | __call_rcu(head, func, &rcu_bh_ctrlblk); | 253 | __call_rcu(head, func, &rcu_bh_ctrlblk); |
281 | } | 254 | } |
282 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 255 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
283 | |||
284 | void rcu_barrier_bh(void) | ||
285 | { | ||
286 | struct rcu_synchronize rcu; | ||
287 | |||
288 | init_rcu_head_on_stack(&rcu.head); | ||
289 | init_completion(&rcu.completion); | ||
290 | /* Will wake me after RCU finished. */ | ||
291 | call_rcu_bh(&rcu.head, wakeme_after_rcu); | ||
292 | /* Wait for it. */ | ||
293 | wait_for_completion(&rcu.completion); | ||
294 | destroy_rcu_head_on_stack(&rcu.head); | ||
295 | } | ||
296 | EXPORT_SYMBOL_GPL(rcu_barrier_bh); | ||
297 | |||
298 | void rcu_barrier_sched(void) | ||
299 | { | ||
300 | struct rcu_synchronize rcu; | ||
301 | |||
302 | init_rcu_head_on_stack(&rcu.head); | ||
303 | init_completion(&rcu.completion); | ||
304 | /* Will wake me after RCU finished. */ | ||
305 | call_rcu_sched(&rcu.head, wakeme_after_rcu); | ||
306 | /* Wait for it. */ | ||
307 | wait_for_completion(&rcu.completion); | ||
308 | destroy_rcu_head_on_stack(&rcu.head); | ||
309 | } | ||
310 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | ||
311 | |||
312 | /* | ||
313 | * Spawn the kthread that invokes RCU callbacks. | ||
314 | */ | ||
315 | static int __init rcu_spawn_kthreads(void) | ||
316 | { | ||
317 | struct sched_param sp; | ||
318 | |||
319 | rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); | ||
320 | sp.sched_priority = RCU_BOOST_PRIO; | ||
321 | sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); | ||
322 | return 0; | ||
323 | } | ||
324 | early_initcall(rcu_spawn_kthreads); | ||
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f259c676195f..02aa7139861c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -26,29 +26,26 @@ | |||
26 | #include <linux/debugfs.h> | 26 | #include <linux/debugfs.h> |
27 | #include <linux/seq_file.h> | 27 | #include <linux/seq_file.h> |
28 | 28 | ||
29 | #ifdef CONFIG_RCU_TRACE | ||
30 | #define RCU_TRACE(stmt) stmt | ||
31 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
32 | #define RCU_TRACE(stmt) | ||
33 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
34 | |||
35 | /* Global control variables for rcupdate callback mechanism. */ | 29 | /* Global control variables for rcupdate callback mechanism. */ |
36 | struct rcu_ctrlblk { | 30 | struct rcu_ctrlblk { |
37 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | 31 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ |
38 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | 32 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ |
39 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | 33 | struct rcu_head **curtail; /* ->next pointer of last CB. */ |
40 | RCU_TRACE(long qlen); /* Number of pending CBs. */ | 34 | RCU_TRACE(long qlen); /* Number of pending CBs. */ |
35 | RCU_TRACE(char *name); /* Name of RCU type. */ | ||
41 | }; | 36 | }; |
42 | 37 | ||
43 | /* Definition for rcupdate control block. */ | 38 | /* Definition for rcupdate control block. */ |
44 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | 39 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { |
45 | .donetail = &rcu_sched_ctrlblk.rcucblist, | 40 | .donetail = &rcu_sched_ctrlblk.rcucblist, |
46 | .curtail = &rcu_sched_ctrlblk.rcucblist, | 41 | .curtail = &rcu_sched_ctrlblk.rcucblist, |
42 | RCU_TRACE(.name = "rcu_sched") | ||
47 | }; | 43 | }; |
48 | 44 | ||
49 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | 45 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { |
50 | .donetail = &rcu_bh_ctrlblk.rcucblist, | 46 | .donetail = &rcu_bh_ctrlblk.rcucblist, |
51 | .curtail = &rcu_bh_ctrlblk.rcucblist, | 47 | .curtail = &rcu_bh_ctrlblk.rcucblist, |
48 | RCU_TRACE(.name = "rcu_bh") | ||
52 | }; | 49 | }; |
53 | 50 | ||
54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
@@ -131,6 +128,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | |||
131 | .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, | 128 | .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, |
132 | .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, | 129 | .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, |
133 | .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), | 130 | .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), |
131 | RCU_TRACE(.rcb.name = "rcu_preempt") | ||
134 | }; | 132 | }; |
135 | 133 | ||
136 | static int rcu_preempted_readers_exp(void); | 134 | static int rcu_preempted_readers_exp(void); |
@@ -247,6 +245,13 @@ static void show_tiny_preempt_stats(struct seq_file *m) | |||
247 | 245 | ||
248 | #include "rtmutex_common.h" | 246 | #include "rtmutex_common.h" |
249 | 247 | ||
248 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
249 | |||
250 | /* Controls for rcu_kthread() kthread. */ | ||
251 | static struct task_struct *rcu_kthread_task; | ||
252 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); | ||
253 | static unsigned long have_rcu_kthread_work; | ||
254 | |||
250 | /* | 255 | /* |
251 | * Carry out RCU priority boosting on the task indicated by ->boost_tasks, | 256 | * Carry out RCU priority boosting on the task indicated by ->boost_tasks, |
252 | * and advance ->boost_tasks to the next task in the ->blkd_tasks list. | 257 | * and advance ->boost_tasks to the next task in the ->blkd_tasks list. |
@@ -334,7 +339,7 @@ static int rcu_initiate_boost(void) | |||
334 | if (rcu_preempt_ctrlblk.exp_tasks == NULL) | 339 | if (rcu_preempt_ctrlblk.exp_tasks == NULL) |
335 | rcu_preempt_ctrlblk.boost_tasks = | 340 | rcu_preempt_ctrlblk.boost_tasks = |
336 | rcu_preempt_ctrlblk.gp_tasks; | 341 | rcu_preempt_ctrlblk.gp_tasks; |
337 | invoke_rcu_kthread(); | 342 | invoke_rcu_callbacks(); |
338 | } else | 343 | } else |
339 | RCU_TRACE(rcu_initiate_boost_trace()); | 344 | RCU_TRACE(rcu_initiate_boost_trace()); |
340 | return 1; | 345 | return 1; |
@@ -353,14 +358,6 @@ static void rcu_preempt_boost_start_gp(void) | |||
353 | #else /* #ifdef CONFIG_RCU_BOOST */ | 358 | #else /* #ifdef CONFIG_RCU_BOOST */ |
354 | 359 | ||
355 | /* | 360 | /* |
356 | * If there is no RCU priority boosting, we don't boost. | ||
357 | */ | ||
358 | static int rcu_boost(void) | ||
359 | { | ||
360 | return 0; | ||
361 | } | ||
362 | |||
363 | /* | ||
364 | * If there is no RCU priority boosting, we don't initiate boosting, | 361 | * If there is no RCU priority boosting, we don't initiate boosting, |
365 | * but we do indicate whether there are blocked readers blocking the | 362 | * but we do indicate whether there are blocked readers blocking the |
366 | * current grace period. | 363 | * current grace period. |
@@ -427,7 +424,7 @@ static void rcu_preempt_cpu_qs(void) | |||
427 | 424 | ||
428 | /* If there are done callbacks, cause them to be invoked. */ | 425 | /* If there are done callbacks, cause them to be invoked. */ |
429 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) | 426 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) |
430 | invoke_rcu_kthread(); | 427 | invoke_rcu_callbacks(); |
431 | } | 428 | } |
432 | 429 | ||
433 | /* | 430 | /* |
@@ -648,7 +645,7 @@ static void rcu_preempt_check_callbacks(void) | |||
648 | rcu_preempt_cpu_qs(); | 645 | rcu_preempt_cpu_qs(); |
649 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != | 646 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != |
650 | rcu_preempt_ctrlblk.rcb.donetail) | 647 | rcu_preempt_ctrlblk.rcb.donetail) |
651 | invoke_rcu_kthread(); | 648 | invoke_rcu_callbacks(); |
652 | if (rcu_preempt_gp_in_progress() && | 649 | if (rcu_preempt_gp_in_progress() && |
653 | rcu_cpu_blocking_cur_gp() && | 650 | rcu_cpu_blocking_cur_gp() && |
654 | rcu_preempt_running_reader()) | 651 | rcu_preempt_running_reader()) |
@@ -674,7 +671,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | |||
674 | */ | 671 | */ |
675 | static void rcu_preempt_process_callbacks(void) | 672 | static void rcu_preempt_process_callbacks(void) |
676 | { | 673 | { |
677 | rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | 674 | __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); |
678 | } | 675 | } |
679 | 676 | ||
680 | /* | 677 | /* |
@@ -697,20 +694,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
697 | } | 694 | } |
698 | EXPORT_SYMBOL_GPL(call_rcu); | 695 | EXPORT_SYMBOL_GPL(call_rcu); |
699 | 696 | ||
700 | void rcu_barrier(void) | ||
701 | { | ||
702 | struct rcu_synchronize rcu; | ||
703 | |||
704 | init_rcu_head_on_stack(&rcu.head); | ||
705 | init_completion(&rcu.completion); | ||
706 | /* Will wake me after RCU finished. */ | ||
707 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
708 | /* Wait for it. */ | ||
709 | wait_for_completion(&rcu.completion); | ||
710 | destroy_rcu_head_on_stack(&rcu.head); | ||
711 | } | ||
712 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
713 | |||
714 | /* | 697 | /* |
715 | * synchronize_rcu - wait until a grace period has elapsed. | 698 | * synchronize_rcu - wait until a grace period has elapsed. |
716 | * | 699 | * |
@@ -864,15 +847,6 @@ static void show_tiny_preempt_stats(struct seq_file *m) | |||
864 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 847 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
865 | 848 | ||
866 | /* | 849 | /* |
867 | * Because preemptible RCU does not exist, it is never necessary to | ||
868 | * boost preempted RCU readers. | ||
869 | */ | ||
870 | static int rcu_boost(void) | ||
871 | { | ||
872 | return 0; | ||
873 | } | ||
874 | |||
875 | /* | ||
876 | * Because preemptible RCU does not exist, it never has any callbacks | 850 | * Because preemptible RCU does not exist, it never has any callbacks |
877 | * to check. | 851 | * to check. |
878 | */ | 852 | */ |
@@ -898,6 +872,78 @@ static void rcu_preempt_process_callbacks(void) | |||
898 | 872 | ||
899 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ | 873 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ |
900 | 874 | ||
875 | #ifdef CONFIG_RCU_BOOST | ||
876 | |||
877 | /* | ||
878 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
879 | * or to boost readers. | ||
880 | */ | ||
881 | static void invoke_rcu_callbacks(void) | ||
882 | { | ||
883 | have_rcu_kthread_work = 1; | ||
884 | wake_up(&rcu_kthread_wq); | ||
885 | } | ||
886 | |||
887 | /* | ||
888 | * This kthread invokes RCU callbacks whose grace periods have | ||
889 | * elapsed. It is awakened as needed, and takes the place of the | ||
890 | * RCU_SOFTIRQ that is used for this purpose when boosting is disabled. | ||
891 | * This is a kthread, but it is never stopped, at least not until | ||
892 | * the system goes down. | ||
893 | */ | ||
894 | static int rcu_kthread(void *arg) | ||
895 | { | ||
896 | unsigned long work; | ||
897 | unsigned long morework; | ||
898 | unsigned long flags; | ||
899 | |||
900 | for (;;) { | ||
901 | wait_event_interruptible(rcu_kthread_wq, | ||
902 | have_rcu_kthread_work != 0); | ||
903 | morework = rcu_boost(); | ||
904 | local_irq_save(flags); | ||
905 | work = have_rcu_kthread_work; | ||
906 | have_rcu_kthread_work = morework; | ||
907 | local_irq_restore(flags); | ||
908 | if (work) | ||
909 | rcu_process_callbacks(NULL); | ||
910 | schedule_timeout_interruptible(1); /* Leave CPU for others. */ | ||
911 | } | ||
912 | |||
913 | return 0; /* Not reached, but needed to shut gcc up. */ | ||
914 | } | ||
915 | |||
916 | /* | ||
917 | * Spawn the kthread that invokes RCU callbacks. | ||
918 | */ | ||
919 | static int __init rcu_spawn_kthreads(void) | ||
920 | { | ||
921 | struct sched_param sp; | ||
922 | |||
923 | rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); | ||
924 | sp.sched_priority = RCU_BOOST_PRIO; | ||
925 | sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); | ||
926 | return 0; | ||
927 | } | ||
928 | early_initcall(rcu_spawn_kthreads); | ||
929 | |||
930 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
931 | |||
932 | /* | ||
933 | * Start up softirq processing of callbacks. | ||
934 | */ | ||
935 | void invoke_rcu_callbacks(void) | ||
936 | { | ||
937 | raise_softirq(RCU_SOFTIRQ); | ||
938 | } | ||
939 | |||
940 | void rcu_init(void) | ||
941 | { | ||
942 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | ||
943 | } | ||
944 | |||
945 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
946 | |||
901 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 947 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
902 | #include <linux/kernel_stat.h> | 948 | #include <linux/kernel_stat.h> |
903 | 949 | ||
@@ -913,12 +959,6 @@ void __init rcu_scheduler_starting(void) | |||
913 | 959 | ||
914 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 960 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
915 | 961 | ||
916 | #ifdef CONFIG_RCU_BOOST | ||
917 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
918 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
919 | #define RCU_BOOST_PRIO 1 | ||
920 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
921 | |||
922 | #ifdef CONFIG_RCU_TRACE | 962 | #ifdef CONFIG_RCU_TRACE |
923 | 963 | ||
924 | #ifdef CONFIG_RCU_BOOST | 964 | #ifdef CONFIG_RCU_BOOST |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 98f51b13bb7e..764825c2685c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -73,7 +73,7 @@ module_param(nreaders, int, 0444); | |||
73 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 73 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
74 | module_param(nfakewriters, int, 0444); | 74 | module_param(nfakewriters, int, 0444); |
75 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | 75 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); |
76 | module_param(stat_interval, int, 0444); | 76 | module_param(stat_interval, int, 0644); |
77 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | 77 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); |
78 | module_param(verbose, bool, 0444); | 78 | module_param(verbose, bool, 0444); |
79 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | 79 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); |
@@ -480,30 +480,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | |||
480 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); | 480 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); |
481 | } | 481 | } |
482 | 482 | ||
483 | struct rcu_bh_torture_synchronize { | ||
484 | struct rcu_head head; | ||
485 | struct completion completion; | ||
486 | }; | ||
487 | |||
488 | static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) | ||
489 | { | ||
490 | struct rcu_bh_torture_synchronize *rcu; | ||
491 | |||
492 | rcu = container_of(head, struct rcu_bh_torture_synchronize, head); | ||
493 | complete(&rcu->completion); | ||
494 | } | ||
495 | |||
496 | static void rcu_bh_torture_synchronize(void) | ||
497 | { | ||
498 | struct rcu_bh_torture_synchronize rcu; | ||
499 | |||
500 | init_rcu_head_on_stack(&rcu.head); | ||
501 | init_completion(&rcu.completion); | ||
502 | call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); | ||
503 | wait_for_completion(&rcu.completion); | ||
504 | destroy_rcu_head_on_stack(&rcu.head); | ||
505 | } | ||
506 | |||
507 | static struct rcu_torture_ops rcu_bh_ops = { | 483 | static struct rcu_torture_ops rcu_bh_ops = { |
508 | .init = NULL, | 484 | .init = NULL, |
509 | .cleanup = NULL, | 485 | .cleanup = NULL, |
@@ -512,7 +488,7 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
512 | .readunlock = rcu_bh_torture_read_unlock, | 488 | .readunlock = rcu_bh_torture_read_unlock, |
513 | .completed = rcu_bh_torture_completed, | 489 | .completed = rcu_bh_torture_completed, |
514 | .deferred_free = rcu_bh_torture_deferred_free, | 490 | .deferred_free = rcu_bh_torture_deferred_free, |
515 | .sync = rcu_bh_torture_synchronize, | 491 | .sync = synchronize_rcu_bh, |
516 | .cb_barrier = rcu_barrier_bh, | 492 | .cb_barrier = rcu_barrier_bh, |
517 | .fqs = rcu_bh_force_quiescent_state, | 493 | .fqs = rcu_bh_force_quiescent_state, |
518 | .stats = NULL, | 494 | .stats = NULL, |
@@ -528,7 +504,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
528 | .readunlock = rcu_bh_torture_read_unlock, | 504 | .readunlock = rcu_bh_torture_read_unlock, |
529 | .completed = rcu_bh_torture_completed, | 505 | .completed = rcu_bh_torture_completed, |
530 | .deferred_free = rcu_sync_torture_deferred_free, | 506 | .deferred_free = rcu_sync_torture_deferred_free, |
531 | .sync = rcu_bh_torture_synchronize, | 507 | .sync = synchronize_rcu_bh, |
532 | .cb_barrier = NULL, | 508 | .cb_barrier = NULL, |
533 | .fqs = rcu_bh_force_quiescent_state, | 509 | .fqs = rcu_bh_force_quiescent_state, |
534 | .stats = NULL, | 510 | .stats = NULL, |
@@ -536,6 +512,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
536 | .name = "rcu_bh_sync" | 512 | .name = "rcu_bh_sync" |
537 | }; | 513 | }; |
538 | 514 | ||
515 | static struct rcu_torture_ops rcu_bh_expedited_ops = { | ||
516 | .init = rcu_sync_torture_init, | ||
517 | .cleanup = NULL, | ||
518 | .readlock = rcu_bh_torture_read_lock, | ||
519 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
520 | .readunlock = rcu_bh_torture_read_unlock, | ||
521 | .completed = rcu_bh_torture_completed, | ||
522 | .deferred_free = rcu_sync_torture_deferred_free, | ||
523 | .sync = synchronize_rcu_bh_expedited, | ||
524 | .cb_barrier = NULL, | ||
525 | .fqs = rcu_bh_force_quiescent_state, | ||
526 | .stats = NULL, | ||
527 | .irq_capable = 1, | ||
528 | .name = "rcu_bh_expedited" | ||
529 | }; | ||
530 | |||
539 | /* | 531 | /* |
540 | * Definitions for srcu torture testing. | 532 | * Definitions for srcu torture testing. |
541 | */ | 533 | */ |
@@ -659,11 +651,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) | |||
659 | call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); | 651 | call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); |
660 | } | 652 | } |
661 | 653 | ||
662 | static void sched_torture_synchronize(void) | ||
663 | { | ||
664 | synchronize_sched(); | ||
665 | } | ||
666 | |||
667 | static struct rcu_torture_ops sched_ops = { | 654 | static struct rcu_torture_ops sched_ops = { |
668 | .init = rcu_sync_torture_init, | 655 | .init = rcu_sync_torture_init, |
669 | .cleanup = NULL, | 656 | .cleanup = NULL, |
@@ -672,7 +659,7 @@ static struct rcu_torture_ops sched_ops = { | |||
672 | .readunlock = sched_torture_read_unlock, | 659 | .readunlock = sched_torture_read_unlock, |
673 | .completed = rcu_no_completed, | 660 | .completed = rcu_no_completed, |
674 | .deferred_free = rcu_sched_torture_deferred_free, | 661 | .deferred_free = rcu_sched_torture_deferred_free, |
675 | .sync = sched_torture_synchronize, | 662 | .sync = synchronize_sched, |
676 | .cb_barrier = rcu_barrier_sched, | 663 | .cb_barrier = rcu_barrier_sched, |
677 | .fqs = rcu_sched_force_quiescent_state, | 664 | .fqs = rcu_sched_force_quiescent_state, |
678 | .stats = NULL, | 665 | .stats = NULL, |
@@ -688,7 +675,7 @@ static struct rcu_torture_ops sched_sync_ops = { | |||
688 | .readunlock = sched_torture_read_unlock, | 675 | .readunlock = sched_torture_read_unlock, |
689 | .completed = rcu_no_completed, | 676 | .completed = rcu_no_completed, |
690 | .deferred_free = rcu_sync_torture_deferred_free, | 677 | .deferred_free = rcu_sync_torture_deferred_free, |
691 | .sync = sched_torture_synchronize, | 678 | .sync = synchronize_sched, |
692 | .cb_barrier = NULL, | 679 | .cb_barrier = NULL, |
693 | .fqs = rcu_sched_force_quiescent_state, | 680 | .fqs = rcu_sched_force_quiescent_state, |
694 | .stats = NULL, | 681 | .stats = NULL, |
@@ -754,7 +741,7 @@ static int rcu_torture_boost(void *arg) | |||
754 | do { | 741 | do { |
755 | /* Wait for the next test interval. */ | 742 | /* Wait for the next test interval. */ |
756 | oldstarttime = boost_starttime; | 743 | oldstarttime = boost_starttime; |
757 | while (jiffies - oldstarttime > ULONG_MAX / 2) { | 744 | while (ULONG_CMP_LT(jiffies, oldstarttime)) { |
758 | schedule_timeout_uninterruptible(1); | 745 | schedule_timeout_uninterruptible(1); |
759 | rcu_stutter_wait("rcu_torture_boost"); | 746 | rcu_stutter_wait("rcu_torture_boost"); |
760 | if (kthread_should_stop() || | 747 | if (kthread_should_stop() || |
@@ -765,7 +752,7 @@ static int rcu_torture_boost(void *arg) | |||
765 | /* Do one boost-test interval. */ | 752 | /* Do one boost-test interval. */ |
766 | endtime = oldstarttime + test_boost_duration * HZ; | 753 | endtime = oldstarttime + test_boost_duration * HZ; |
767 | call_rcu_time = jiffies; | 754 | call_rcu_time = jiffies; |
768 | while (jiffies - endtime > ULONG_MAX / 2) { | 755 | while (ULONG_CMP_LT(jiffies, endtime)) { |
769 | /* If we don't have a callback in flight, post one. */ | 756 | /* If we don't have a callback in flight, post one. */ |
770 | if (!rbi.inflight) { | 757 | if (!rbi.inflight) { |
771 | smp_mb(); /* RCU core before ->inflight = 1. */ | 758 | smp_mb(); /* RCU core before ->inflight = 1. */ |
@@ -792,7 +779,8 @@ static int rcu_torture_boost(void *arg) | |||
792 | * interval. Besides, we are running at RT priority, | 779 | * interval. Besides, we are running at RT priority, |
793 | * so delays should be relatively rare. | 780 | * so delays should be relatively rare. |
794 | */ | 781 | */ |
795 | while (oldstarttime == boost_starttime) { | 782 | while (oldstarttime == boost_starttime && |
783 | !kthread_should_stop()) { | ||
796 | if (mutex_trylock(&boost_mutex)) { | 784 | if (mutex_trylock(&boost_mutex)) { |
797 | boost_starttime = jiffies + | 785 | boost_starttime = jiffies + |
798 | test_boost_interval * HZ; | 786 | test_boost_interval * HZ; |
@@ -809,11 +797,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); | |||
809 | 797 | ||
810 | /* Clean up and exit. */ | 798 | /* Clean up and exit. */ |
811 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | 799 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); |
812 | destroy_rcu_head_on_stack(&rbi.rcu); | ||
813 | rcutorture_shutdown_absorb("rcu_torture_boost"); | 800 | rcutorture_shutdown_absorb("rcu_torture_boost"); |
814 | while (!kthread_should_stop() || rbi.inflight) | 801 | while (!kthread_should_stop() || rbi.inflight) |
815 | schedule_timeout_uninterruptible(1); | 802 | schedule_timeout_uninterruptible(1); |
816 | smp_mb(); /* order accesses to ->inflight before stack-frame death. */ | 803 | smp_mb(); /* order accesses to ->inflight before stack-frame death. */ |
804 | destroy_rcu_head_on_stack(&rbi.rcu); | ||
817 | return 0; | 805 | return 0; |
818 | } | 806 | } |
819 | 807 | ||
@@ -831,11 +819,13 @@ rcu_torture_fqs(void *arg) | |||
831 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); | 819 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); |
832 | do { | 820 | do { |
833 | fqs_resume_time = jiffies + fqs_stutter * HZ; | 821 | fqs_resume_time = jiffies + fqs_stutter * HZ; |
834 | while (jiffies - fqs_resume_time > LONG_MAX) { | 822 | while (ULONG_CMP_LT(jiffies, fqs_resume_time) && |
823 | !kthread_should_stop()) { | ||
835 | schedule_timeout_interruptible(1); | 824 | schedule_timeout_interruptible(1); |
836 | } | 825 | } |
837 | fqs_burst_remaining = fqs_duration; | 826 | fqs_burst_remaining = fqs_duration; |
838 | while (fqs_burst_remaining > 0) { | 827 | while (fqs_burst_remaining > 0 && |
828 | !kthread_should_stop()) { | ||
839 | cur_ops->fqs(); | 829 | cur_ops->fqs(); |
840 | udelay(fqs_holdoff); | 830 | udelay(fqs_holdoff); |
841 | fqs_burst_remaining -= fqs_holdoff; | 831 | fqs_burst_remaining -= fqs_holdoff; |
@@ -1280,8 +1270,9 @@ static int rcutorture_booster_init(int cpu) | |||
1280 | /* Don't allow time recalculation while creating a new task. */ | 1270 | /* Don't allow time recalculation while creating a new task. */ |
1281 | mutex_lock(&boost_mutex); | 1271 | mutex_lock(&boost_mutex); |
1282 | VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); | 1272 | VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); |
1283 | boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, | 1273 | boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, |
1284 | "rcu_torture_boost"); | 1274 | cpu_to_node(cpu), |
1275 | "rcu_torture_boost"); | ||
1285 | if (IS_ERR(boost_tasks[cpu])) { | 1276 | if (IS_ERR(boost_tasks[cpu])) { |
1286 | retval = PTR_ERR(boost_tasks[cpu]); | 1277 | retval = PTR_ERR(boost_tasks[cpu]); |
1287 | VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); | 1278 | VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); |
@@ -1424,7 +1415,7 @@ rcu_torture_init(void) | |||
1424 | int firsterr = 0; | 1415 | int firsterr = 0; |
1425 | static struct rcu_torture_ops *torture_ops[] = | 1416 | static struct rcu_torture_ops *torture_ops[] = |
1426 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1417 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1427 | &rcu_bh_ops, &rcu_bh_sync_ops, | 1418 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1428 | &srcu_ops, &srcu_expedited_ops, | 1419 | &srcu_ops, &srcu_expedited_ops, |
1429 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1420 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1430 | 1421 | ||
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ba06207b1dd3..e234eb92a177 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -52,13 +52,16 @@ | |||
52 | #include <linux/prefetch.h> | 52 | #include <linux/prefetch.h> |
53 | 53 | ||
54 | #include "rcutree.h" | 54 | #include "rcutree.h" |
55 | #include <trace/events/rcu.h> | ||
56 | |||
57 | #include "rcu.h" | ||
55 | 58 | ||
56 | /* Data structures. */ | 59 | /* Data structures. */ |
57 | 60 | ||
58 | static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | 61 | static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; |
59 | 62 | ||
60 | #define RCU_STATE_INITIALIZER(structname) { \ | 63 | #define RCU_STATE_INITIALIZER(structname) { \ |
61 | .level = { &structname.node[0] }, \ | 64 | .level = { &structname##_state.node[0] }, \ |
62 | .levelcnt = { \ | 65 | .levelcnt = { \ |
63 | NUM_RCU_LVL_0, /* root of hierarchy. */ \ | 66 | NUM_RCU_LVL_0, /* root of hierarchy. */ \ |
64 | NUM_RCU_LVL_1, \ | 67 | NUM_RCU_LVL_1, \ |
@@ -69,17 +72,17 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
69 | .signaled = RCU_GP_IDLE, \ | 72 | .signaled = RCU_GP_IDLE, \ |
70 | .gpnum = -300, \ | 73 | .gpnum = -300, \ |
71 | .completed = -300, \ | 74 | .completed = -300, \ |
72 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ | 75 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ |
73 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ | 76 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ |
74 | .n_force_qs = 0, \ | 77 | .n_force_qs = 0, \ |
75 | .n_force_qs_ngp = 0, \ | 78 | .n_force_qs_ngp = 0, \ |
76 | .name = #structname, \ | 79 | .name = #structname, \ |
77 | } | 80 | } |
78 | 81 | ||
79 | struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); | 82 | struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); |
80 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | 83 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); |
81 | 84 | ||
82 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); | 85 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); |
83 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 86 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
84 | 87 | ||
85 | static struct rcu_state *rcu_state; | 88 | static struct rcu_state *rcu_state; |
@@ -128,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | |||
128 | static void invoke_rcu_core(void); | 131 | static void invoke_rcu_core(void); |
129 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 132 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
130 | 133 | ||
131 | #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ | ||
132 | |||
133 | /* | 134 | /* |
134 | * Track the rcutorture test sequence number and the update version | 135 | * Track the rcutorture test sequence number and the update version |
135 | * number within a given test. The rcutorture_testseq is incremented | 136 | * number within a given test. The rcutorture_testseq is incremented |
@@ -156,33 +157,41 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) | |||
156 | * Note a quiescent state. Because we do not need to know | 157 | * Note a quiescent state. Because we do not need to know |
157 | * how many quiescent states passed, just if there was at least | 158 | * how many quiescent states passed, just if there was at least |
158 | * one since the start of the grace period, this just sets a flag. | 159 | * one since the start of the grace period, this just sets a flag. |
160 | * The caller must have disabled preemption. | ||
159 | */ | 161 | */ |
160 | void rcu_sched_qs(int cpu) | 162 | void rcu_sched_qs(int cpu) |
161 | { | 163 | { |
162 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); | 164 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); |
163 | 165 | ||
164 | rdp->passed_quiesc_completed = rdp->gpnum - 1; | 166 | rdp->passed_quiesce_gpnum = rdp->gpnum; |
165 | barrier(); | 167 | barrier(); |
166 | rdp->passed_quiesc = 1; | 168 | if (rdp->passed_quiesce == 0) |
169 | trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); | ||
170 | rdp->passed_quiesce = 1; | ||
167 | } | 171 | } |
168 | 172 | ||
169 | void rcu_bh_qs(int cpu) | 173 | void rcu_bh_qs(int cpu) |
170 | { | 174 | { |
171 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | 175 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); |
172 | 176 | ||
173 | rdp->passed_quiesc_completed = rdp->gpnum - 1; | 177 | rdp->passed_quiesce_gpnum = rdp->gpnum; |
174 | barrier(); | 178 | barrier(); |
175 | rdp->passed_quiesc = 1; | 179 | if (rdp->passed_quiesce == 0) |
180 | trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); | ||
181 | rdp->passed_quiesce = 1; | ||
176 | } | 182 | } |
177 | 183 | ||
178 | /* | 184 | /* |
179 | * Note a context switch. This is a quiescent state for RCU-sched, | 185 | * Note a context switch. This is a quiescent state for RCU-sched, |
180 | * and requires special handling for preemptible RCU. | 186 | * and requires special handling for preemptible RCU. |
187 | * The caller must have disabled preemption. | ||
181 | */ | 188 | */ |
182 | void rcu_note_context_switch(int cpu) | 189 | void rcu_note_context_switch(int cpu) |
183 | { | 190 | { |
191 | trace_rcu_utilization("Start context switch"); | ||
184 | rcu_sched_qs(cpu); | 192 | rcu_sched_qs(cpu); |
185 | rcu_preempt_note_context_switch(cpu); | 193 | rcu_preempt_note_context_switch(cpu); |
194 | trace_rcu_utilization("End context switch"); | ||
186 | } | 195 | } |
187 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 196 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
188 | 197 | ||
@@ -193,7 +202,7 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | |||
193 | }; | 202 | }; |
194 | #endif /* #ifdef CONFIG_NO_HZ */ | 203 | #endif /* #ifdef CONFIG_NO_HZ */ |
195 | 204 | ||
196 | static int blimit = 10; /* Maximum callbacks per softirq. */ | 205 | static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
197 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ | 206 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ |
198 | static int qlowmark = 100; /* Once only this many pending, use blimit. */ | 207 | static int qlowmark = 100; /* Once only this many pending, use blimit. */ |
199 | 208 | ||
@@ -314,6 +323,7 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) | |||
314 | * trust its state not to change because interrupts are disabled. | 323 | * trust its state not to change because interrupts are disabled. |
315 | */ | 324 | */ |
316 | if (cpu_is_offline(rdp->cpu)) { | 325 | if (cpu_is_offline(rdp->cpu)) { |
326 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); | ||
317 | rdp->offline_fqs++; | 327 | rdp->offline_fqs++; |
318 | return 1; | 328 | return 1; |
319 | } | 329 | } |
@@ -354,19 +364,13 @@ void rcu_enter_nohz(void) | |||
354 | local_irq_restore(flags); | 364 | local_irq_restore(flags); |
355 | return; | 365 | return; |
356 | } | 366 | } |
367 | trace_rcu_dyntick("Start"); | ||
357 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 368 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
358 | smp_mb__before_atomic_inc(); /* See above. */ | 369 | smp_mb__before_atomic_inc(); /* See above. */ |
359 | atomic_inc(&rdtp->dynticks); | 370 | atomic_inc(&rdtp->dynticks); |
360 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | 371 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ |
361 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 372 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
362 | local_irq_restore(flags); | 373 | local_irq_restore(flags); |
363 | |||
364 | /* If the interrupt queued a callback, get out of dyntick mode. */ | ||
365 | if (in_irq() && | ||
366 | (__get_cpu_var(rcu_sched_data).nxtlist || | ||
367 | __get_cpu_var(rcu_bh_data).nxtlist || | ||
368 | rcu_preempt_needs_cpu(smp_processor_id()))) | ||
369 | set_need_resched(); | ||
370 | } | 374 | } |
371 | 375 | ||
372 | /* | 376 | /* |
@@ -391,6 +395,7 @@ void rcu_exit_nohz(void) | |||
391 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 395 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ |
392 | smp_mb__after_atomic_inc(); /* See above. */ | 396 | smp_mb__after_atomic_inc(); /* See above. */ |
393 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 397 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
398 | trace_rcu_dyntick("End"); | ||
394 | local_irq_restore(flags); | 399 | local_irq_restore(flags); |
395 | } | 400 | } |
396 | 401 | ||
@@ -481,11 +486,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
481 | */ | 486 | */ |
482 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | 487 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) |
483 | { | 488 | { |
484 | unsigned long curr; | 489 | unsigned int curr; |
485 | unsigned long snap; | 490 | unsigned int snap; |
486 | 491 | ||
487 | curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); | 492 | curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); |
488 | snap = (unsigned long)rdp->dynticks_snap; | 493 | snap = (unsigned int)rdp->dynticks_snap; |
489 | 494 | ||
490 | /* | 495 | /* |
491 | * If the CPU passed through or entered a dynticks idle phase with | 496 | * If the CPU passed through or entered a dynticks idle phase with |
@@ -495,7 +500,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
495 | * read-side critical section that started before the beginning | 500 | * read-side critical section that started before the beginning |
496 | * of the current RCU grace period. | 501 | * of the current RCU grace period. |
497 | */ | 502 | */ |
498 | if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { | 503 | if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { |
504 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); | ||
499 | rdp->dynticks_fqs++; | 505 | rdp->dynticks_fqs++; |
500 | return 1; | 506 | return 1; |
501 | } | 507 | } |
@@ -537,6 +543,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
537 | int cpu; | 543 | int cpu; |
538 | long delta; | 544 | long delta; |
539 | unsigned long flags; | 545 | unsigned long flags; |
546 | int ndetected; | ||
540 | struct rcu_node *rnp = rcu_get_root(rsp); | 547 | struct rcu_node *rnp = rcu_get_root(rsp); |
541 | 548 | ||
542 | /* Only let one CPU complain about others per time interval. */ | 549 | /* Only let one CPU complain about others per time interval. */ |
@@ -553,7 +560,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
553 | * Now rat on any tasks that got kicked up to the root rcu_node | 560 | * Now rat on any tasks that got kicked up to the root rcu_node |
554 | * due to CPU offlining. | 561 | * due to CPU offlining. |
555 | */ | 562 | */ |
556 | rcu_print_task_stall(rnp); | 563 | ndetected = rcu_print_task_stall(rnp); |
557 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 564 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
558 | 565 | ||
559 | /* | 566 | /* |
@@ -565,17 +572,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
565 | rsp->name); | 572 | rsp->name); |
566 | rcu_for_each_leaf_node(rsp, rnp) { | 573 | rcu_for_each_leaf_node(rsp, rnp) { |
567 | raw_spin_lock_irqsave(&rnp->lock, flags); | 574 | raw_spin_lock_irqsave(&rnp->lock, flags); |
568 | rcu_print_task_stall(rnp); | 575 | ndetected += rcu_print_task_stall(rnp); |
569 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 576 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
570 | if (rnp->qsmask == 0) | 577 | if (rnp->qsmask == 0) |
571 | continue; | 578 | continue; |
572 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 579 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) |
573 | if (rnp->qsmask & (1UL << cpu)) | 580 | if (rnp->qsmask & (1UL << cpu)) { |
574 | printk(" %d", rnp->grplo + cpu); | 581 | printk(" %d", rnp->grplo + cpu); |
582 | ndetected++; | ||
583 | } | ||
575 | } | 584 | } |
576 | printk("} (detected by %d, t=%ld jiffies)\n", | 585 | printk("} (detected by %d, t=%ld jiffies)\n", |
577 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); | 586 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); |
578 | trigger_all_cpu_backtrace(); | 587 | if (ndetected == 0) |
588 | printk(KERN_ERR "INFO: Stall ended before state dump start\n"); | ||
589 | else if (!trigger_all_cpu_backtrace()) | ||
590 | dump_stack(); | ||
579 | 591 | ||
580 | /* If so configured, complain about tasks blocking the grace period. */ | 592 | /* If so configured, complain about tasks blocking the grace period. */ |
581 | 593 | ||
@@ -596,7 +608,8 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
596 | */ | 608 | */ |
597 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", | 609 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", |
598 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); | 610 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); |
599 | trigger_all_cpu_backtrace(); | 611 | if (!trigger_all_cpu_backtrace()) |
612 | dump_stack(); | ||
600 | 613 | ||
601 | raw_spin_lock_irqsave(&rnp->lock, flags); | 614 | raw_spin_lock_irqsave(&rnp->lock, flags); |
602 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) | 615 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) |
@@ -678,9 +691,10 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct | |||
678 | * go looking for one. | 691 | * go looking for one. |
679 | */ | 692 | */ |
680 | rdp->gpnum = rnp->gpnum; | 693 | rdp->gpnum = rnp->gpnum; |
694 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); | ||
681 | if (rnp->qsmask & rdp->grpmask) { | 695 | if (rnp->qsmask & rdp->grpmask) { |
682 | rdp->qs_pending = 1; | 696 | rdp->qs_pending = 1; |
683 | rdp->passed_quiesc = 0; | 697 | rdp->passed_quiesce = 0; |
684 | } else | 698 | } else |
685 | rdp->qs_pending = 0; | 699 | rdp->qs_pending = 0; |
686 | } | 700 | } |
@@ -741,6 +755,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
741 | 755 | ||
742 | /* Remember that we saw this grace-period completion. */ | 756 | /* Remember that we saw this grace-period completion. */ |
743 | rdp->completed = rnp->completed; | 757 | rdp->completed = rnp->completed; |
758 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); | ||
744 | 759 | ||
745 | /* | 760 | /* |
746 | * If we were in an extended quiescent state, we may have | 761 | * If we were in an extended quiescent state, we may have |
@@ -826,31 +841,31 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
826 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 841 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
827 | struct rcu_node *rnp = rcu_get_root(rsp); | 842 | struct rcu_node *rnp = rcu_get_root(rsp); |
828 | 843 | ||
829 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { | 844 | if (!rcu_scheduler_fully_active || |
830 | if (cpu_needs_another_gp(rsp, rdp)) | 845 | !cpu_needs_another_gp(rsp, rdp)) { |
831 | rsp->fqs_need_gp = 1; | 846 | /* |
832 | if (rnp->completed == rsp->completed) { | 847 | * Either the scheduler hasn't yet spawned the first |
833 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 848 | * non-idle task or this CPU does not need another |
834 | return; | 849 | * grace period. Either way, don't start a new grace |
835 | } | 850 | * period. |
836 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 851 | */ |
852 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
853 | return; | ||
854 | } | ||
837 | 855 | ||
856 | if (rsp->fqs_active) { | ||
838 | /* | 857 | /* |
839 | * Propagate new ->completed value to rcu_node structures | 858 | * This CPU needs a grace period, but force_quiescent_state() |
840 | * so that other CPUs don't have to wait until the start | 859 | * is running. Tell it to start one on this CPU's behalf. |
841 | * of the next grace period to process their callbacks. | ||
842 | */ | 860 | */ |
843 | rcu_for_each_node_breadth_first(rsp, rnp) { | 861 | rsp->fqs_need_gp = 1; |
844 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 862 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
845 | rnp->completed = rsp->completed; | ||
846 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
847 | } | ||
848 | local_irq_restore(flags); | ||
849 | return; | 863 | return; |
850 | } | 864 | } |
851 | 865 | ||
852 | /* Advance to a new grace period and initialize state. */ | 866 | /* Advance to a new grace period and initialize state. */ |
853 | rsp->gpnum++; | 867 | rsp->gpnum++; |
868 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); | ||
854 | WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); | 869 | WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); |
855 | rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ | 870 | rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ |
856 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | 871 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; |
@@ -865,6 +880,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
865 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | 880 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ |
866 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 881 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
867 | rcu_preempt_boost_start_gp(rnp); | 882 | rcu_preempt_boost_start_gp(rnp); |
883 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | ||
884 | rnp->level, rnp->grplo, | ||
885 | rnp->grphi, rnp->qsmask); | ||
868 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 886 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
869 | return; | 887 | return; |
870 | } | 888 | } |
@@ -901,6 +919,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
901 | if (rnp == rdp->mynode) | 919 | if (rnp == rdp->mynode) |
902 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 920 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
903 | rcu_preempt_boost_start_gp(rnp); | 921 | rcu_preempt_boost_start_gp(rnp); |
922 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | ||
923 | rnp->level, rnp->grplo, | ||
924 | rnp->grphi, rnp->qsmask); | ||
904 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 925 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
905 | } | 926 | } |
906 | 927 | ||
@@ -922,6 +943,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
922 | __releases(rcu_get_root(rsp)->lock) | 943 | __releases(rcu_get_root(rsp)->lock) |
923 | { | 944 | { |
924 | unsigned long gp_duration; | 945 | unsigned long gp_duration; |
946 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
947 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
925 | 948 | ||
926 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 949 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
927 | 950 | ||
@@ -933,7 +956,41 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
933 | gp_duration = jiffies - rsp->gp_start; | 956 | gp_duration = jiffies - rsp->gp_start; |
934 | if (gp_duration > rsp->gp_max) | 957 | if (gp_duration > rsp->gp_max) |
935 | rsp->gp_max = gp_duration; | 958 | rsp->gp_max = gp_duration; |
936 | rsp->completed = rsp->gpnum; | 959 | |
960 | /* | ||
961 | * We know the grace period is complete, but to everyone else | ||
962 | * it appears to still be ongoing. But it is also the case | ||
963 | * that to everyone else it looks like there is nothing that | ||
964 | * they can do to advance the grace period. It is therefore | ||
965 | * safe for us to drop the lock in order to mark the grace | ||
966 | * period as completed in all of the rcu_node structures. | ||
967 | * | ||
968 | * But if this CPU needs another grace period, it will take | ||
969 | * care of this while initializing the next grace period. | ||
970 | * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL | ||
971 | * because the callbacks have not yet been advanced: Those | ||
972 | * callbacks are waiting on the grace period that just now | ||
973 | * completed. | ||
974 | */ | ||
975 | if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { | ||
976 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
977 | |||
978 | /* | ||
979 | * Propagate new ->completed value to rcu_node structures | ||
980 | * so that other CPUs don't have to wait until the start | ||
981 | * of the next grace period to process their callbacks. | ||
982 | */ | ||
983 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
984 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
985 | rnp->completed = rsp->gpnum; | ||
986 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
987 | } | ||
988 | rnp = rcu_get_root(rsp); | ||
989 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
990 | } | ||
991 | |||
992 | rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ | ||
993 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | ||
937 | rsp->signaled = RCU_GP_IDLE; | 994 | rsp->signaled = RCU_GP_IDLE; |
938 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ | 995 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ |
939 | } | 996 | } |
@@ -962,6 +1019,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
962 | return; | 1019 | return; |
963 | } | 1020 | } |
964 | rnp->qsmask &= ~mask; | 1021 | rnp->qsmask &= ~mask; |
1022 | trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, | ||
1023 | mask, rnp->qsmask, rnp->level, | ||
1024 | rnp->grplo, rnp->grphi, | ||
1025 | !!rnp->gp_tasks); | ||
965 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | 1026 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
966 | 1027 | ||
967 | /* Other bits still set at this level, so done. */ | 1028 | /* Other bits still set at this level, so done. */ |
@@ -1000,7 +1061,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
1000 | * based on quiescent states detected in an earlier grace period! | 1061 | * based on quiescent states detected in an earlier grace period! |
1001 | */ | 1062 | */ |
1002 | static void | 1063 | static void |
1003 | rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) | 1064 | rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) |
1004 | { | 1065 | { |
1005 | unsigned long flags; | 1066 | unsigned long flags; |
1006 | unsigned long mask; | 1067 | unsigned long mask; |
@@ -1008,17 +1069,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las | |||
1008 | 1069 | ||
1009 | rnp = rdp->mynode; | 1070 | rnp = rdp->mynode; |
1010 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1071 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1011 | if (lastcomp != rnp->completed) { | 1072 | if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { |
1012 | 1073 | ||
1013 | /* | 1074 | /* |
1014 | * Someone beat us to it for this grace period, so leave. | 1075 | * The grace period in which this quiescent state was |
1015 | * The race with GP start is resolved by the fact that we | 1076 | * recorded has ended, so don't report it upwards. |
1016 | * hold the leaf rcu_node lock, so that the per-CPU bits | 1077 | * We will instead need a new quiescent state that lies |
1017 | * cannot yet be initialized -- so we would simply find our | 1078 | * within the current grace period. |
1018 | * CPU's bit already cleared in rcu_report_qs_rnp() if this | ||
1019 | * race occurred. | ||
1020 | */ | 1079 | */ |
1021 | rdp->passed_quiesc = 0; /* try again later! */ | 1080 | rdp->passed_quiesce = 0; /* need qs for new gp. */ |
1022 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1081 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1023 | return; | 1082 | return; |
1024 | } | 1083 | } |
@@ -1062,14 +1121,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1062 | * Was there a quiescent state since the beginning of the grace | 1121 | * Was there a quiescent state since the beginning of the grace |
1063 | * period? If no, then exit and wait for the next call. | 1122 | * period? If no, then exit and wait for the next call. |
1064 | */ | 1123 | */ |
1065 | if (!rdp->passed_quiesc) | 1124 | if (!rdp->passed_quiesce) |
1066 | return; | 1125 | return; |
1067 | 1126 | ||
1068 | /* | 1127 | /* |
1069 | * Tell RCU we are done (but rcu_report_qs_rdp() will be the | 1128 | * Tell RCU we are done (but rcu_report_qs_rdp() will be the |
1070 | * judge of that). | 1129 | * judge of that). |
1071 | */ | 1130 | */ |
1072 | rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); | 1131 | rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); |
1073 | } | 1132 | } |
1074 | 1133 | ||
1075 | #ifdef CONFIG_HOTPLUG_CPU | 1134 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -1130,11 +1189,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1130 | if (rnp->qsmaskinit != 0) { | 1189 | if (rnp->qsmaskinit != 0) { |
1131 | if (rnp != rdp->mynode) | 1190 | if (rnp != rdp->mynode) |
1132 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1191 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1192 | else | ||
1193 | trace_rcu_grace_period(rsp->name, | ||
1194 | rnp->gpnum + 1 - | ||
1195 | !!(rnp->qsmask & mask), | ||
1196 | "cpuofl"); | ||
1133 | break; | 1197 | break; |
1134 | } | 1198 | } |
1135 | if (rnp == rdp->mynode) | 1199 | if (rnp == rdp->mynode) { |
1200 | trace_rcu_grace_period(rsp->name, | ||
1201 | rnp->gpnum + 1 - | ||
1202 | !!(rnp->qsmask & mask), | ||
1203 | "cpuofl"); | ||
1136 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); | 1204 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); |
1137 | else | 1205 | } else |
1138 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1206 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1139 | mask = rnp->grpmask; | 1207 | mask = rnp->grpmask; |
1140 | rnp = rnp->parent; | 1208 | rnp = rnp->parent; |
@@ -1190,17 +1258,22 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1190 | { | 1258 | { |
1191 | unsigned long flags; | 1259 | unsigned long flags; |
1192 | struct rcu_head *next, *list, **tail; | 1260 | struct rcu_head *next, *list, **tail; |
1193 | int count; | 1261 | int bl, count; |
1194 | 1262 | ||
1195 | /* If no callbacks are ready, just return.*/ | 1263 | /* If no callbacks are ready, just return.*/ |
1196 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) | 1264 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
1265 | trace_rcu_batch_start(rsp->name, 0, 0); | ||
1266 | trace_rcu_batch_end(rsp->name, 0); | ||
1197 | return; | 1267 | return; |
1268 | } | ||
1198 | 1269 | ||
1199 | /* | 1270 | /* |
1200 | * Extract the list of ready callbacks, disabling to prevent | 1271 | * Extract the list of ready callbacks, disabling to prevent |
1201 | * races with call_rcu() from interrupt handlers. | 1272 | * races with call_rcu() from interrupt handlers. |
1202 | */ | 1273 | */ |
1203 | local_irq_save(flags); | 1274 | local_irq_save(flags); |
1275 | bl = rdp->blimit; | ||
1276 | trace_rcu_batch_start(rsp->name, rdp->qlen, bl); | ||
1204 | list = rdp->nxtlist; | 1277 | list = rdp->nxtlist; |
1205 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | 1278 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; |
1206 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | 1279 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
@@ -1216,13 +1289,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1216 | next = list->next; | 1289 | next = list->next; |
1217 | prefetch(next); | 1290 | prefetch(next); |
1218 | debug_rcu_head_unqueue(list); | 1291 | debug_rcu_head_unqueue(list); |
1219 | __rcu_reclaim(list); | 1292 | __rcu_reclaim(rsp->name, list); |
1220 | list = next; | 1293 | list = next; |
1221 | if (++count >= rdp->blimit) | 1294 | if (++count >= bl) |
1222 | break; | 1295 | break; |
1223 | } | 1296 | } |
1224 | 1297 | ||
1225 | local_irq_save(flags); | 1298 | local_irq_save(flags); |
1299 | trace_rcu_batch_end(rsp->name, count); | ||
1226 | 1300 | ||
1227 | /* Update count, and requeue any remaining callbacks. */ | 1301 | /* Update count, and requeue any remaining callbacks. */ |
1228 | rdp->qlen -= count; | 1302 | rdp->qlen -= count; |
@@ -1250,7 +1324,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1250 | 1324 | ||
1251 | local_irq_restore(flags); | 1325 | local_irq_restore(flags); |
1252 | 1326 | ||
1253 | /* Re-raise the RCU softirq if there are callbacks remaining. */ | 1327 | /* Re-invoke RCU core processing if there are callbacks remaining. */ |
1254 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 1328 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1255 | invoke_rcu_core(); | 1329 | invoke_rcu_core(); |
1256 | } | 1330 | } |
@@ -1258,7 +1332,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1258 | /* | 1332 | /* |
1259 | * Check to see if this CPU is in a non-context-switch quiescent state | 1333 | * Check to see if this CPU is in a non-context-switch quiescent state |
1260 | * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). | 1334 | * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). |
1261 | * Also schedule the RCU softirq handler. | 1335 | * Also schedule RCU core processing. |
1262 | * | 1336 | * |
1263 | * This function must be called with hardirqs disabled. It is normally | 1337 | * This function must be called with hardirqs disabled. It is normally |
1264 | * invoked from the scheduling-clock interrupt. If rcu_pending returns | 1338 | * invoked from the scheduling-clock interrupt. If rcu_pending returns |
@@ -1266,6 +1340,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1266 | */ | 1340 | */ |
1267 | void rcu_check_callbacks(int cpu, int user) | 1341 | void rcu_check_callbacks(int cpu, int user) |
1268 | { | 1342 | { |
1343 | trace_rcu_utilization("Start scheduler-tick"); | ||
1269 | if (user || | 1344 | if (user || |
1270 | (idle_cpu(cpu) && rcu_scheduler_active && | 1345 | (idle_cpu(cpu) && rcu_scheduler_active && |
1271 | !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | 1346 | !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { |
@@ -1299,6 +1374,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
1299 | rcu_preempt_check_callbacks(cpu); | 1374 | rcu_preempt_check_callbacks(cpu); |
1300 | if (rcu_pending(cpu)) | 1375 | if (rcu_pending(cpu)) |
1301 | invoke_rcu_core(); | 1376 | invoke_rcu_core(); |
1377 | trace_rcu_utilization("End scheduler-tick"); | ||
1302 | } | 1378 | } |
1303 | 1379 | ||
1304 | #ifdef CONFIG_SMP | 1380 | #ifdef CONFIG_SMP |
@@ -1360,10 +1436,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1360 | unsigned long flags; | 1436 | unsigned long flags; |
1361 | struct rcu_node *rnp = rcu_get_root(rsp); | 1437 | struct rcu_node *rnp = rcu_get_root(rsp); |
1362 | 1438 | ||
1363 | if (!rcu_gp_in_progress(rsp)) | 1439 | trace_rcu_utilization("Start fqs"); |
1440 | if (!rcu_gp_in_progress(rsp)) { | ||
1441 | trace_rcu_utilization("End fqs"); | ||
1364 | return; /* No grace period in progress, nothing to force. */ | 1442 | return; /* No grace period in progress, nothing to force. */ |
1443 | } | ||
1365 | if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { | 1444 | if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { |
1366 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ | 1445 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ |
1446 | trace_rcu_utilization("End fqs"); | ||
1367 | return; /* Someone else is already on the job. */ | 1447 | return; /* Someone else is already on the job. */ |
1368 | } | 1448 | } |
1369 | if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) | 1449 | if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) |
@@ -1412,11 +1492,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1412 | raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ | 1492 | raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ |
1413 | rsp->fqs_need_gp = 0; | 1493 | rsp->fqs_need_gp = 0; |
1414 | rcu_start_gp(rsp, flags); /* releases rnp->lock */ | 1494 | rcu_start_gp(rsp, flags); /* releases rnp->lock */ |
1495 | trace_rcu_utilization("End fqs"); | ||
1415 | return; | 1496 | return; |
1416 | } | 1497 | } |
1417 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 1498 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
1418 | unlock_fqs_ret: | 1499 | unlock_fqs_ret: |
1419 | raw_spin_unlock_irqrestore(&rsp->fqslock, flags); | 1500 | raw_spin_unlock_irqrestore(&rsp->fqslock, flags); |
1501 | trace_rcu_utilization("End fqs"); | ||
1420 | } | 1502 | } |
1421 | 1503 | ||
1422 | #else /* #ifdef CONFIG_SMP */ | 1504 | #else /* #ifdef CONFIG_SMP */ |
@@ -1429,9 +1511,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1429 | #endif /* #else #ifdef CONFIG_SMP */ | 1511 | #endif /* #else #ifdef CONFIG_SMP */ |
1430 | 1512 | ||
1431 | /* | 1513 | /* |
1432 | * This does the RCU processing work from softirq context for the | 1514 | * This does the RCU core processing work for the specified rcu_state |
1433 | * specified rcu_state and rcu_data structures. This may be called | 1515 | * and rcu_data structures. This may be called only from the CPU to |
1434 | * only from the CPU to whom the rdp belongs. | 1516 | * whom the rdp belongs. |
1435 | */ | 1517 | */ |
1436 | static void | 1518 | static void |
1437 | __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | 1519 | __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) |
@@ -1468,24 +1550,24 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1468 | } | 1550 | } |
1469 | 1551 | ||
1470 | /* | 1552 | /* |
1471 | * Do softirq processing for the current CPU. | 1553 | * Do RCU core processing for the current CPU. |
1472 | */ | 1554 | */ |
1473 | static void rcu_process_callbacks(struct softirq_action *unused) | 1555 | static void rcu_process_callbacks(struct softirq_action *unused) |
1474 | { | 1556 | { |
1557 | trace_rcu_utilization("Start RCU core"); | ||
1475 | __rcu_process_callbacks(&rcu_sched_state, | 1558 | __rcu_process_callbacks(&rcu_sched_state, |
1476 | &__get_cpu_var(rcu_sched_data)); | 1559 | &__get_cpu_var(rcu_sched_data)); |
1477 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | 1560 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); |
1478 | rcu_preempt_process_callbacks(); | 1561 | rcu_preempt_process_callbacks(); |
1479 | 1562 | trace_rcu_utilization("End RCU core"); | |
1480 | /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ | ||
1481 | rcu_needs_cpu_flush(); | ||
1482 | } | 1563 | } |
1483 | 1564 | ||
1484 | /* | 1565 | /* |
1485 | * Wake up the current CPU's kthread. This replaces raise_softirq() | 1566 | * Schedule RCU callback invocation. If the specified type of RCU |
1486 | * in earlier versions of RCU. Note that because we are running on | 1567 | * does not support RCU priority boosting, just do a direct call, |
1487 | * the current CPU with interrupts disabled, the rcu_cpu_kthread_task | 1568 | * otherwise wake up the per-CPU kernel kthread. Note that because we |
1488 | * cannot disappear out from under us. | 1569 | * are running on the current CPU with interrupts disabled, the |
1570 | * rcu_cpu_kthread_task cannot disappear out from under us. | ||
1489 | */ | 1571 | */ |
1490 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | 1572 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) |
1491 | { | 1573 | { |
@@ -1530,6 +1612,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1530 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | 1612 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; |
1531 | rdp->qlen++; | 1613 | rdp->qlen++; |
1532 | 1614 | ||
1615 | if (__is_kfree_rcu_offset((unsigned long)func)) | ||
1616 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | ||
1617 | rdp->qlen); | ||
1618 | else | ||
1619 | trace_rcu_callback(rsp->name, head, rdp->qlen); | ||
1620 | |||
1533 | /* If interrupts were disabled, don't dive into RCU core. */ | 1621 | /* If interrupts were disabled, don't dive into RCU core. */ |
1534 | if (irqs_disabled_flags(flags)) { | 1622 | if (irqs_disabled_flags(flags)) { |
1535 | local_irq_restore(flags); | 1623 | local_irq_restore(flags); |
@@ -1613,18 +1701,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
1613 | */ | 1701 | */ |
1614 | void synchronize_sched(void) | 1702 | void synchronize_sched(void) |
1615 | { | 1703 | { |
1616 | struct rcu_synchronize rcu; | ||
1617 | |||
1618 | if (rcu_blocking_is_gp()) | 1704 | if (rcu_blocking_is_gp()) |
1619 | return; | 1705 | return; |
1620 | 1706 | wait_rcu_gp(call_rcu_sched); | |
1621 | init_rcu_head_on_stack(&rcu.head); | ||
1622 | init_completion(&rcu.completion); | ||
1623 | /* Will wake me after RCU finished. */ | ||
1624 | call_rcu_sched(&rcu.head, wakeme_after_rcu); | ||
1625 | /* Wait for it. */ | ||
1626 | wait_for_completion(&rcu.completion); | ||
1627 | destroy_rcu_head_on_stack(&rcu.head); | ||
1628 | } | 1707 | } |
1629 | EXPORT_SYMBOL_GPL(synchronize_sched); | 1708 | EXPORT_SYMBOL_GPL(synchronize_sched); |
1630 | 1709 | ||
@@ -1639,18 +1718,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched); | |||
1639 | */ | 1718 | */ |
1640 | void synchronize_rcu_bh(void) | 1719 | void synchronize_rcu_bh(void) |
1641 | { | 1720 | { |
1642 | struct rcu_synchronize rcu; | ||
1643 | |||
1644 | if (rcu_blocking_is_gp()) | 1721 | if (rcu_blocking_is_gp()) |
1645 | return; | 1722 | return; |
1646 | 1723 | wait_rcu_gp(call_rcu_bh); | |
1647 | init_rcu_head_on_stack(&rcu.head); | ||
1648 | init_completion(&rcu.completion); | ||
1649 | /* Will wake me after RCU finished. */ | ||
1650 | call_rcu_bh(&rcu.head, wakeme_after_rcu); | ||
1651 | /* Wait for it. */ | ||
1652 | wait_for_completion(&rcu.completion); | ||
1653 | destroy_rcu_head_on_stack(&rcu.head); | ||
1654 | } | 1724 | } |
1655 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); | 1725 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); |
1656 | 1726 | ||
@@ -1671,7 +1741,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1671 | check_cpu_stall(rsp, rdp); | 1741 | check_cpu_stall(rsp, rdp); |
1672 | 1742 | ||
1673 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 1743 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
1674 | if (rdp->qs_pending && !rdp->passed_quiesc) { | 1744 | if (rcu_scheduler_fully_active && |
1745 | rdp->qs_pending && !rdp->passed_quiesce) { | ||
1675 | 1746 | ||
1676 | /* | 1747 | /* |
1677 | * If force_quiescent_state() coming soon and this CPU | 1748 | * If force_quiescent_state() coming soon and this CPU |
@@ -1683,7 +1754,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1683 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, | 1754 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, |
1684 | jiffies)) | 1755 | jiffies)) |
1685 | set_need_resched(); | 1756 | set_need_resched(); |
1686 | } else if (rdp->qs_pending && rdp->passed_quiesc) { | 1757 | } else if (rdp->qs_pending && rdp->passed_quiesce) { |
1687 | rdp->n_rp_report_qs++; | 1758 | rdp->n_rp_report_qs++; |
1688 | return 1; | 1759 | return 1; |
1689 | } | 1760 | } |
@@ -1846,6 +1917,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1846 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 1917 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
1847 | #endif /* #ifdef CONFIG_NO_HZ */ | 1918 | #endif /* #ifdef CONFIG_NO_HZ */ |
1848 | rdp->cpu = cpu; | 1919 | rdp->cpu = cpu; |
1920 | rdp->rsp = rsp; | ||
1849 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1921 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1850 | } | 1922 | } |
1851 | 1923 | ||
@@ -1865,8 +1937,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
1865 | 1937 | ||
1866 | /* Set up local state, ensuring consistent view of global state. */ | 1938 | /* Set up local state, ensuring consistent view of global state. */ |
1867 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1939 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1868 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ | ||
1869 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ | ||
1870 | rdp->beenonline = 1; /* We have now been online. */ | 1940 | rdp->beenonline = 1; /* We have now been online. */ |
1871 | rdp->preemptible = preemptible; | 1941 | rdp->preemptible = preemptible; |
1872 | rdp->qlen_last_fqs_check = 0; | 1942 | rdp->qlen_last_fqs_check = 0; |
@@ -1891,9 +1961,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
1891 | rnp->qsmaskinit |= mask; | 1961 | rnp->qsmaskinit |= mask; |
1892 | mask = rnp->grpmask; | 1962 | mask = rnp->grpmask; |
1893 | if (rnp == rdp->mynode) { | 1963 | if (rnp == rdp->mynode) { |
1894 | rdp->gpnum = rnp->completed; /* if GP in progress... */ | 1964 | /* |
1965 | * If there is a grace period in progress, we will | ||
1966 | * set up to wait for it next time we run the | ||
1967 | * RCU core code. | ||
1968 | */ | ||
1969 | rdp->gpnum = rnp->completed; | ||
1895 | rdp->completed = rnp->completed; | 1970 | rdp->completed = rnp->completed; |
1896 | rdp->passed_quiesc_completed = rnp->completed - 1; | 1971 | rdp->passed_quiesce = 0; |
1972 | rdp->qs_pending = 0; | ||
1973 | rdp->passed_quiesce_gpnum = rnp->gpnum - 1; | ||
1974 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); | ||
1897 | } | 1975 | } |
1898 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | 1976 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
1899 | rnp = rnp->parent; | 1977 | rnp = rnp->parent; |
@@ -1919,6 +1997,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
1919 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 1997 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); |
1920 | struct rcu_node *rnp = rdp->mynode; | 1998 | struct rcu_node *rnp = rdp->mynode; |
1921 | 1999 | ||
2000 | trace_rcu_utilization("Start CPU hotplug"); | ||
1922 | switch (action) { | 2001 | switch (action) { |
1923 | case CPU_UP_PREPARE: | 2002 | case CPU_UP_PREPARE: |
1924 | case CPU_UP_PREPARE_FROZEN: | 2003 | case CPU_UP_PREPARE_FROZEN: |
@@ -1954,6 +2033,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
1954 | default: | 2033 | default: |
1955 | break; | 2034 | break; |
1956 | } | 2035 | } |
2036 | trace_rcu_utilization("End CPU hotplug"); | ||
1957 | return NOTIFY_OK; | 2037 | return NOTIFY_OK; |
1958 | } | 2038 | } |
1959 | 2039 | ||
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 01b2ccda26fb..849ce9ec51fe 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -230,9 +230,9 @@ struct rcu_data { | |||
230 | /* in order to detect GP end. */ | 230 | /* in order to detect GP end. */ |
231 | unsigned long gpnum; /* Highest gp number that this CPU */ | 231 | unsigned long gpnum; /* Highest gp number that this CPU */ |
232 | /* is aware of having started. */ | 232 | /* is aware of having started. */ |
233 | unsigned long passed_quiesc_completed; | 233 | unsigned long passed_quiesce_gpnum; |
234 | /* Value of completed at time of qs. */ | 234 | /* gpnum at time of quiescent state. */ |
235 | bool passed_quiesc; /* User-mode/idle loop etc. */ | 235 | bool passed_quiesce; /* User-mode/idle loop etc. */ |
236 | bool qs_pending; /* Core waits for quiesc state. */ | 236 | bool qs_pending; /* Core waits for quiesc state. */ |
237 | bool beenonline; /* CPU online at least once. */ | 237 | bool beenonline; /* CPU online at least once. */ |
238 | bool preemptible; /* Preemptible RCU? */ | 238 | bool preemptible; /* Preemptible RCU? */ |
@@ -299,6 +299,7 @@ struct rcu_data { | |||
299 | unsigned long n_rp_need_nothing; | 299 | unsigned long n_rp_need_nothing; |
300 | 300 | ||
301 | int cpu; | 301 | int cpu; |
302 | struct rcu_state *rsp; | ||
302 | }; | 303 | }; |
303 | 304 | ||
304 | /* Values for signaled field in struct rcu_state. */ | 305 | /* Values for signaled field in struct rcu_state. */ |
@@ -417,6 +418,13 @@ extern struct rcu_state rcu_preempt_state; | |||
417 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | 418 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); |
418 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 419 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
419 | 420 | ||
421 | #ifdef CONFIG_RCU_BOOST | ||
422 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
423 | DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); | ||
424 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
425 | DECLARE_PER_CPU(char, rcu_cpu_has_work); | ||
426 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
427 | |||
420 | #ifndef RCU_TREE_NONCORE | 428 | #ifndef RCU_TREE_NONCORE |
421 | 429 | ||
422 | /* Forward declarations for rcutree_plugin.h */ | 430 | /* Forward declarations for rcutree_plugin.h */ |
@@ -430,7 +438,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | |||
430 | static void rcu_stop_cpu_kthread(int cpu); | 438 | static void rcu_stop_cpu_kthread(int cpu); |
431 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 439 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
432 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 440 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
433 | static void rcu_print_task_stall(struct rcu_node *rnp); | 441 | static int rcu_print_task_stall(struct rcu_node *rnp); |
434 | static void rcu_preempt_stall_reset(void); | 442 | static void rcu_preempt_stall_reset(void); |
435 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 443 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
436 | #ifdef CONFIG_HOTPLUG_CPU | 444 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -450,7 +458,6 @@ static int rcu_preempt_needs_cpu(int cpu); | |||
450 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | 458 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); |
451 | static void rcu_preempt_send_cbs_to_online(void); | 459 | static void rcu_preempt_send_cbs_to_online(void); |
452 | static void __init __rcu_init_preempt(void); | 460 | static void __init __rcu_init_preempt(void); |
453 | static void rcu_needs_cpu_flush(void); | ||
454 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 461 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
455 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 462 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
456 | static void invoke_rcu_callbacks_kthread(void); | 463 | static void invoke_rcu_callbacks_kthread(void); |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8aafbb80b8b0..4b9b9f8a4184 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -27,6 +27,14 @@ | |||
27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
28 | #include <linux/stop_machine.h> | 28 | #include <linux/stop_machine.h> |
29 | 29 | ||
30 | #define RCU_KTHREAD_PRIO 1 | ||
31 | |||
32 | #ifdef CONFIG_RCU_BOOST | ||
33 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
34 | #else | ||
35 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO | ||
36 | #endif | ||
37 | |||
30 | /* | 38 | /* |
31 | * Check the RCU kernel configuration parameters and print informative | 39 | * Check the RCU kernel configuration parameters and print informative |
32 | * messages about anything out of the ordinary. If you like #ifdef, you | 40 | * messages about anything out of the ordinary. If you like #ifdef, you |
@@ -64,7 +72,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
64 | 72 | ||
65 | #ifdef CONFIG_TREE_PREEMPT_RCU | 73 | #ifdef CONFIG_TREE_PREEMPT_RCU |
66 | 74 | ||
67 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); | 75 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); |
68 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | 76 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); |
69 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 77 | static struct rcu_state *rcu_state = &rcu_preempt_state; |
70 | 78 | ||
@@ -122,9 +130,11 @@ static void rcu_preempt_qs(int cpu) | |||
122 | { | 130 | { |
123 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 131 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); |
124 | 132 | ||
125 | rdp->passed_quiesc_completed = rdp->gpnum - 1; | 133 | rdp->passed_quiesce_gpnum = rdp->gpnum; |
126 | barrier(); | 134 | barrier(); |
127 | rdp->passed_quiesc = 1; | 135 | if (rdp->passed_quiesce == 0) |
136 | trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); | ||
137 | rdp->passed_quiesce = 1; | ||
128 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | 138 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; |
129 | } | 139 | } |
130 | 140 | ||
@@ -190,6 +200,11 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
190 | if (rnp->qsmask & rdp->grpmask) | 200 | if (rnp->qsmask & rdp->grpmask) |
191 | rnp->gp_tasks = &t->rcu_node_entry; | 201 | rnp->gp_tasks = &t->rcu_node_entry; |
192 | } | 202 | } |
203 | trace_rcu_preempt_task(rdp->rsp->name, | ||
204 | t->pid, | ||
205 | (rnp->qsmask & rdp->grpmask) | ||
206 | ? rnp->gpnum | ||
207 | : rnp->gpnum + 1); | ||
193 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 208 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
194 | } else if (t->rcu_read_lock_nesting < 0 && | 209 | } else if (t->rcu_read_lock_nesting < 0 && |
195 | t->rcu_read_unlock_special) { | 210 | t->rcu_read_unlock_special) { |
@@ -299,6 +314,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
299 | int empty_exp; | 314 | int empty_exp; |
300 | unsigned long flags; | 315 | unsigned long flags; |
301 | struct list_head *np; | 316 | struct list_head *np; |
317 | #ifdef CONFIG_RCU_BOOST | ||
318 | struct rt_mutex *rbmp = NULL; | ||
319 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
302 | struct rcu_node *rnp; | 320 | struct rcu_node *rnp; |
303 | int special; | 321 | int special; |
304 | 322 | ||
@@ -344,6 +362,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
344 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 362 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
345 | np = rcu_next_node_entry(t, rnp); | 363 | np = rcu_next_node_entry(t, rnp); |
346 | list_del_init(&t->rcu_node_entry); | 364 | list_del_init(&t->rcu_node_entry); |
365 | t->rcu_blocked_node = NULL; | ||
366 | trace_rcu_unlock_preempted_task("rcu_preempt", | ||
367 | rnp->gpnum, t->pid); | ||
347 | if (&t->rcu_node_entry == rnp->gp_tasks) | 368 | if (&t->rcu_node_entry == rnp->gp_tasks) |
348 | rnp->gp_tasks = np; | 369 | rnp->gp_tasks = np; |
349 | if (&t->rcu_node_entry == rnp->exp_tasks) | 370 | if (&t->rcu_node_entry == rnp->exp_tasks) |
@@ -351,30 +372,34 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
351 | #ifdef CONFIG_RCU_BOOST | 372 | #ifdef CONFIG_RCU_BOOST |
352 | if (&t->rcu_node_entry == rnp->boost_tasks) | 373 | if (&t->rcu_node_entry == rnp->boost_tasks) |
353 | rnp->boost_tasks = np; | 374 | rnp->boost_tasks = np; |
354 | /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ | 375 | /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ |
355 | if (t->rcu_boosted) { | 376 | if (t->rcu_boost_mutex) { |
356 | special |= RCU_READ_UNLOCK_BOOSTED; | 377 | rbmp = t->rcu_boost_mutex; |
357 | t->rcu_boosted = 0; | 378 | t->rcu_boost_mutex = NULL; |
358 | } | 379 | } |
359 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 380 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
360 | t->rcu_blocked_node = NULL; | ||
361 | 381 | ||
362 | /* | 382 | /* |
363 | * If this was the last task on the current list, and if | 383 | * If this was the last task on the current list, and if |
364 | * we aren't waiting on any CPUs, report the quiescent state. | 384 | * we aren't waiting on any CPUs, report the quiescent state. |
365 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. | 385 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. |
366 | */ | 386 | */ |
367 | if (empty) | 387 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { |
368 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 388 | trace_rcu_quiescent_state_report("preempt_rcu", |
369 | else | 389 | rnp->gpnum, |
390 | 0, rnp->qsmask, | ||
391 | rnp->level, | ||
392 | rnp->grplo, | ||
393 | rnp->grphi, | ||
394 | !!rnp->gp_tasks); | ||
370 | rcu_report_unblock_qs_rnp(rnp, flags); | 395 | rcu_report_unblock_qs_rnp(rnp, flags); |
396 | } else | ||
397 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
371 | 398 | ||
372 | #ifdef CONFIG_RCU_BOOST | 399 | #ifdef CONFIG_RCU_BOOST |
373 | /* Unboost if we were boosted. */ | 400 | /* Unboost if we were boosted. */ |
374 | if (special & RCU_READ_UNLOCK_BOOSTED) { | 401 | if (rbmp) |
375 | rt_mutex_unlock(t->rcu_boost_mutex); | 402 | rt_mutex_unlock(rbmp); |
376 | t->rcu_boost_mutex = NULL; | ||
377 | } | ||
378 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 403 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
379 | 404 | ||
380 | /* | 405 | /* |
@@ -399,10 +424,10 @@ void __rcu_read_unlock(void) | |||
399 | { | 424 | { |
400 | struct task_struct *t = current; | 425 | struct task_struct *t = current; |
401 | 426 | ||
402 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ | ||
403 | if (t->rcu_read_lock_nesting != 1) | 427 | if (t->rcu_read_lock_nesting != 1) |
404 | --t->rcu_read_lock_nesting; | 428 | --t->rcu_read_lock_nesting; |
405 | else { | 429 | else { |
430 | barrier(); /* critical section before exit code. */ | ||
406 | t->rcu_read_lock_nesting = INT_MIN; | 431 | t->rcu_read_lock_nesting = INT_MIN; |
407 | barrier(); /* assign before ->rcu_read_unlock_special load */ | 432 | barrier(); /* assign before ->rcu_read_unlock_special load */ |
408 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 433 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) |
@@ -466,16 +491,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
466 | * Scan the current list of tasks blocked within RCU read-side critical | 491 | * Scan the current list of tasks blocked within RCU read-side critical |
467 | * sections, printing out the tid of each. | 492 | * sections, printing out the tid of each. |
468 | */ | 493 | */ |
469 | static void rcu_print_task_stall(struct rcu_node *rnp) | 494 | static int rcu_print_task_stall(struct rcu_node *rnp) |
470 | { | 495 | { |
471 | struct task_struct *t; | 496 | struct task_struct *t; |
497 | int ndetected = 0; | ||
472 | 498 | ||
473 | if (!rcu_preempt_blocked_readers_cgp(rnp)) | 499 | if (!rcu_preempt_blocked_readers_cgp(rnp)) |
474 | return; | 500 | return 0; |
475 | t = list_entry(rnp->gp_tasks, | 501 | t = list_entry(rnp->gp_tasks, |
476 | struct task_struct, rcu_node_entry); | 502 | struct task_struct, rcu_node_entry); |
477 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) | 503 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { |
478 | printk(" P%d", t->pid); | 504 | printk(" P%d", t->pid); |
505 | ndetected++; | ||
506 | } | ||
507 | return ndetected; | ||
479 | } | 508 | } |
480 | 509 | ||
481 | /* | 510 | /* |
@@ -656,18 +685,9 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
656 | */ | 685 | */ |
657 | void synchronize_rcu(void) | 686 | void synchronize_rcu(void) |
658 | { | 687 | { |
659 | struct rcu_synchronize rcu; | ||
660 | |||
661 | if (!rcu_scheduler_active) | 688 | if (!rcu_scheduler_active) |
662 | return; | 689 | return; |
663 | 690 | wait_rcu_gp(call_rcu); | |
664 | init_rcu_head_on_stack(&rcu.head); | ||
665 | init_completion(&rcu.completion); | ||
666 | /* Will wake me after RCU finished. */ | ||
667 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
668 | /* Wait for it. */ | ||
669 | wait_for_completion(&rcu.completion); | ||
670 | destroy_rcu_head_on_stack(&rcu.head); | ||
671 | } | 691 | } |
672 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 692 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
673 | 693 | ||
@@ -968,8 +988,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
968 | * Because preemptible RCU does not exist, we never have to check for | 988 | * Because preemptible RCU does not exist, we never have to check for |
969 | * tasks blocked within RCU read-side critical sections. | 989 | * tasks blocked within RCU read-side critical sections. |
970 | */ | 990 | */ |
971 | static void rcu_print_task_stall(struct rcu_node *rnp) | 991 | static int rcu_print_task_stall(struct rcu_node *rnp) |
972 | { | 992 | { |
993 | return 0; | ||
973 | } | 994 | } |
974 | 995 | ||
975 | /* | 996 | /* |
@@ -1136,6 +1157,8 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) | |||
1136 | 1157 | ||
1137 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | 1158 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
1138 | 1159 | ||
1160 | static struct lock_class_key rcu_boost_class; | ||
1161 | |||
1139 | /* | 1162 | /* |
1140 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | 1163 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks |
1141 | * or ->boost_tasks, advancing the pointer to the next task in the | 1164 | * or ->boost_tasks, advancing the pointer to the next task in the |
@@ -1198,8 +1221,10 @@ static int rcu_boost(struct rcu_node *rnp) | |||
1198 | */ | 1221 | */ |
1199 | t = container_of(tb, struct task_struct, rcu_node_entry); | 1222 | t = container_of(tb, struct task_struct, rcu_node_entry); |
1200 | rt_mutex_init_proxy_locked(&mtx, t); | 1223 | rt_mutex_init_proxy_locked(&mtx, t); |
1224 | /* Avoid lockdep false positives. This rt_mutex is its own thing. */ | ||
1225 | lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class, | ||
1226 | "rcu_boost_mutex"); | ||
1201 | t->rcu_boost_mutex = &mtx; | 1227 | t->rcu_boost_mutex = &mtx; |
1202 | t->rcu_boosted = 1; | ||
1203 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1228 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1204 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ | 1229 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ |
1205 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | 1230 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
@@ -1228,9 +1253,12 @@ static int rcu_boost_kthread(void *arg) | |||
1228 | int spincnt = 0; | 1253 | int spincnt = 0; |
1229 | int more2boost; | 1254 | int more2boost; |
1230 | 1255 | ||
1256 | trace_rcu_utilization("Start boost kthread@init"); | ||
1231 | for (;;) { | 1257 | for (;;) { |
1232 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | 1258 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; |
1259 | trace_rcu_utilization("End boost kthread@rcu_wait"); | ||
1233 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); | 1260 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); |
1261 | trace_rcu_utilization("Start boost kthread@rcu_wait"); | ||
1234 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | 1262 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; |
1235 | more2boost = rcu_boost(rnp); | 1263 | more2boost = rcu_boost(rnp); |
1236 | if (more2boost) | 1264 | if (more2boost) |
@@ -1238,11 +1266,14 @@ static int rcu_boost_kthread(void *arg) | |||
1238 | else | 1266 | else |
1239 | spincnt = 0; | 1267 | spincnt = 0; |
1240 | if (spincnt > 10) { | 1268 | if (spincnt > 10) { |
1269 | trace_rcu_utilization("End boost kthread@rcu_yield"); | ||
1241 | rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); | 1270 | rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); |
1271 | trace_rcu_utilization("Start boost kthread@rcu_yield"); | ||
1242 | spincnt = 0; | 1272 | spincnt = 0; |
1243 | } | 1273 | } |
1244 | } | 1274 | } |
1245 | /* NOTREACHED */ | 1275 | /* NOTREACHED */ |
1276 | trace_rcu_utilization("End boost kthread@notreached"); | ||
1246 | return 0; | 1277 | return 0; |
1247 | } | 1278 | } |
1248 | 1279 | ||
@@ -1291,11 +1322,9 @@ static void invoke_rcu_callbacks_kthread(void) | |||
1291 | 1322 | ||
1292 | local_irq_save(flags); | 1323 | local_irq_save(flags); |
1293 | __this_cpu_write(rcu_cpu_has_work, 1); | 1324 | __this_cpu_write(rcu_cpu_has_work, 1); |
1294 | if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { | 1325 | if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && |
1295 | local_irq_restore(flags); | 1326 | current != __this_cpu_read(rcu_cpu_kthread_task)) |
1296 | return; | 1327 | wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); |
1297 | } | ||
1298 | wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); | ||
1299 | local_irq_restore(flags); | 1328 | local_irq_restore(flags); |
1300 | } | 1329 | } |
1301 | 1330 | ||
@@ -1343,13 +1372,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
1343 | if (rnp->boost_kthread_task != NULL) | 1372 | if (rnp->boost_kthread_task != NULL) |
1344 | return 0; | 1373 | return 0; |
1345 | t = kthread_create(rcu_boost_kthread, (void *)rnp, | 1374 | t = kthread_create(rcu_boost_kthread, (void *)rnp, |
1346 | "rcub%d", rnp_index); | 1375 | "rcub/%d", rnp_index); |
1347 | if (IS_ERR(t)) | 1376 | if (IS_ERR(t)) |
1348 | return PTR_ERR(t); | 1377 | return PTR_ERR(t); |
1349 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1378 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1350 | rnp->boost_kthread_task = t; | 1379 | rnp->boost_kthread_task = t; |
1351 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1380 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1352 | sp.sched_priority = RCU_KTHREAD_PRIO; | 1381 | sp.sched_priority = RCU_BOOST_PRIO; |
1353 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | 1382 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); |
1354 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | 1383 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ |
1355 | return 0; | 1384 | return 0; |
@@ -1444,6 +1473,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg) | |||
1444 | { | 1473 | { |
1445 | struct sched_param sp; | 1474 | struct sched_param sp; |
1446 | struct timer_list yield_timer; | 1475 | struct timer_list yield_timer; |
1476 | int prio = current->rt_priority; | ||
1447 | 1477 | ||
1448 | setup_timer_on_stack(&yield_timer, f, arg); | 1478 | setup_timer_on_stack(&yield_timer, f, arg); |
1449 | mod_timer(&yield_timer, jiffies + 2); | 1479 | mod_timer(&yield_timer, jiffies + 2); |
@@ -1451,7 +1481,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg) | |||
1451 | sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); | 1481 | sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); |
1452 | set_user_nice(current, 19); | 1482 | set_user_nice(current, 19); |
1453 | schedule(); | 1483 | schedule(); |
1454 | sp.sched_priority = RCU_KTHREAD_PRIO; | 1484 | set_user_nice(current, 0); |
1485 | sp.sched_priority = prio; | ||
1455 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | 1486 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); |
1456 | del_timer(&yield_timer); | 1487 | del_timer(&yield_timer); |
1457 | } | 1488 | } |
@@ -1489,7 +1520,8 @@ static int rcu_cpu_kthread_should_stop(int cpu) | |||
1489 | 1520 | ||
1490 | /* | 1521 | /* |
1491 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces the | 1522 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces the |
1492 | * earlier RCU softirq. | 1523 | * RCU softirq used in flavors and configurations of RCU that do not |
1524 | * support RCU priority boosting. | ||
1493 | */ | 1525 | */ |
1494 | static int rcu_cpu_kthread(void *arg) | 1526 | static int rcu_cpu_kthread(void *arg) |
1495 | { | 1527 | { |
@@ -1500,9 +1532,12 @@ static int rcu_cpu_kthread(void *arg) | |||
1500 | char work; | 1532 | char work; |
1501 | char *workp = &per_cpu(rcu_cpu_has_work, cpu); | 1533 | char *workp = &per_cpu(rcu_cpu_has_work, cpu); |
1502 | 1534 | ||
1535 | trace_rcu_utilization("Start CPU kthread@init"); | ||
1503 | for (;;) { | 1536 | for (;;) { |
1504 | *statusp = RCU_KTHREAD_WAITING; | 1537 | *statusp = RCU_KTHREAD_WAITING; |
1538 | trace_rcu_utilization("End CPU kthread@rcu_wait"); | ||
1505 | rcu_wait(*workp != 0 || kthread_should_stop()); | 1539 | rcu_wait(*workp != 0 || kthread_should_stop()); |
1540 | trace_rcu_utilization("Start CPU kthread@rcu_wait"); | ||
1506 | local_bh_disable(); | 1541 | local_bh_disable(); |
1507 | if (rcu_cpu_kthread_should_stop(cpu)) { | 1542 | if (rcu_cpu_kthread_should_stop(cpu)) { |
1508 | local_bh_enable(); | 1543 | local_bh_enable(); |
@@ -1523,11 +1558,14 @@ static int rcu_cpu_kthread(void *arg) | |||
1523 | spincnt = 0; | 1558 | spincnt = 0; |
1524 | if (spincnt > 10) { | 1559 | if (spincnt > 10) { |
1525 | *statusp = RCU_KTHREAD_YIELDING; | 1560 | *statusp = RCU_KTHREAD_YIELDING; |
1561 | trace_rcu_utilization("End CPU kthread@rcu_yield"); | ||
1526 | rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); | 1562 | rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); |
1563 | trace_rcu_utilization("Start CPU kthread@rcu_yield"); | ||
1527 | spincnt = 0; | 1564 | spincnt = 0; |
1528 | } | 1565 | } |
1529 | } | 1566 | } |
1530 | *statusp = RCU_KTHREAD_STOPPED; | 1567 | *statusp = RCU_KTHREAD_STOPPED; |
1568 | trace_rcu_utilization("End CPU kthread@term"); | ||
1531 | return 0; | 1569 | return 0; |
1532 | } | 1570 | } |
1533 | 1571 | ||
@@ -1560,7 +1598,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) | |||
1560 | if (!rcu_scheduler_fully_active || | 1598 | if (!rcu_scheduler_fully_active || |
1561 | per_cpu(rcu_cpu_kthread_task, cpu) != NULL) | 1599 | per_cpu(rcu_cpu_kthread_task, cpu) != NULL) |
1562 | return 0; | 1600 | return 0; |
1563 | t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); | 1601 | t = kthread_create_on_node(rcu_cpu_kthread, |
1602 | (void *)(long)cpu, | ||
1603 | cpu_to_node(cpu), | ||
1604 | "rcuc/%d", cpu); | ||
1564 | if (IS_ERR(t)) | 1605 | if (IS_ERR(t)) |
1565 | return PTR_ERR(t); | 1606 | return PTR_ERR(t); |
1566 | if (cpu_online(cpu)) | 1607 | if (cpu_online(cpu)) |
@@ -1669,7 +1710,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, | |||
1669 | return 0; | 1710 | return 0; |
1670 | if (rnp->node_kthread_task == NULL) { | 1711 | if (rnp->node_kthread_task == NULL) { |
1671 | t = kthread_create(rcu_node_kthread, (void *)rnp, | 1712 | t = kthread_create(rcu_node_kthread, (void *)rnp, |
1672 | "rcun%d", rnp_index); | 1713 | "rcun/%d", rnp_index); |
1673 | if (IS_ERR(t)) | 1714 | if (IS_ERR(t)) |
1674 | return PTR_ERR(t); | 1715 | return PTR_ERR(t); |
1675 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1716 | raw_spin_lock_irqsave(&rnp->lock, flags); |
@@ -1907,15 +1948,6 @@ int rcu_needs_cpu(int cpu) | |||
1907 | return rcu_needs_cpu_quick_check(cpu); | 1948 | return rcu_needs_cpu_quick_check(cpu); |
1908 | } | 1949 | } |
1909 | 1950 | ||
1910 | /* | ||
1911 | * Check to see if we need to continue a callback-flush operations to | ||
1912 | * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle | ||
1913 | * entry is not configured, so we never do need to. | ||
1914 | */ | ||
1915 | static void rcu_needs_cpu_flush(void) | ||
1916 | { | ||
1917 | } | ||
1918 | |||
1919 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1951 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1920 | 1952 | ||
1921 | #define RCU_NEEDS_CPU_FLUSHES 5 | 1953 | #define RCU_NEEDS_CPU_FLUSHES 5 |
@@ -1991,20 +2023,4 @@ int rcu_needs_cpu(int cpu) | |||
1991 | return c; | 2023 | return c; |
1992 | } | 2024 | } |
1993 | 2025 | ||
1994 | /* | ||
1995 | * Check to see if we need to continue a callback-flush operations to | ||
1996 | * allow the last CPU to enter dyntick-idle mode. | ||
1997 | */ | ||
1998 | static void rcu_needs_cpu_flush(void) | ||
1999 | { | ||
2000 | int cpu = smp_processor_id(); | ||
2001 | unsigned long flags; | ||
2002 | |||
2003 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) | ||
2004 | return; | ||
2005 | local_irq_save(flags); | ||
2006 | (void)rcu_needs_cpu(cpu); | ||
2007 | local_irq_restore(flags); | ||
2008 | } | ||
2009 | |||
2010 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2026 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 3b0c0986afc0..9feffa4c0695 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -48,11 +48,6 @@ | |||
48 | 48 | ||
49 | #ifdef CONFIG_RCU_BOOST | 49 | #ifdef CONFIG_RCU_BOOST |
50 | 50 | ||
51 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
52 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); | ||
53 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
54 | DECLARE_PER_CPU(char, rcu_cpu_has_work); | ||
55 | |||
56 | static char convert_kthread_status(unsigned int kthread_status) | 51 | static char convert_kthread_status(unsigned int kthread_status) |
57 | { | 52 | { |
58 | if (kthread_status > RCU_KTHREAD_MAX) | 53 | if (kthread_status > RCU_KTHREAD_MAX) |
@@ -66,11 +61,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
66 | { | 61 | { |
67 | if (!rdp->beenonline) | 62 | if (!rdp->beenonline) |
68 | return; | 63 | return; |
69 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", | 64 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", |
70 | rdp->cpu, | 65 | rdp->cpu, |
71 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 66 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
72 | rdp->completed, rdp->gpnum, | 67 | rdp->completed, rdp->gpnum, |
73 | rdp->passed_quiesc, rdp->passed_quiesc_completed, | 68 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, |
74 | rdp->qs_pending); | 69 | rdp->qs_pending); |
75 | #ifdef CONFIG_NO_HZ | 70 | #ifdef CONFIG_NO_HZ |
76 | seq_printf(m, " dt=%d/%d/%d df=%lu", | 71 | seq_printf(m, " dt=%d/%d/%d df=%lu", |
@@ -144,7 +139,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
144 | rdp->cpu, | 139 | rdp->cpu, |
145 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", | 140 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", |
146 | rdp->completed, rdp->gpnum, | 141 | rdp->completed, rdp->gpnum, |
147 | rdp->passed_quiesc, rdp->passed_quiesc_completed, | 142 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, |
148 | rdp->qs_pending); | 143 | rdp->qs_pending); |
149 | #ifdef CONFIG_NO_HZ | 144 | #ifdef CONFIG_NO_HZ |
150 | seq_printf(m, ",%d,%d,%d,%lu", | 145 | seq_printf(m, ",%d,%d,%d,%lu", |
@@ -175,7 +170,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
175 | 170 | ||
176 | static int show_rcudata_csv(struct seq_file *m, void *unused) | 171 | static int show_rcudata_csv(struct seq_file *m, void *unused) |
177 | { | 172 | { |
178 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); | 173 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); |
179 | #ifdef CONFIG_NO_HZ | 174 | #ifdef CONFIG_NO_HZ |
180 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); | 175 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); |
181 | #endif /* #ifdef CONFIG_NO_HZ */ | 176 | #endif /* #ifdef CONFIG_NO_HZ */ |
diff --git a/kernel/resource.c b/kernel/resource.c index 3b3cedc52592..c8dc249da5ce 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -419,6 +419,9 @@ static int __find_resource(struct resource *root, struct resource *old, | |||
419 | else | 419 | else |
420 | tmp.end = root->end; | 420 | tmp.end = root->end; |
421 | 421 | ||
422 | if (tmp.end < tmp.start) | ||
423 | goto next; | ||
424 | |||
422 | resource_clip(&tmp, constraint->min, constraint->max); | 425 | resource_clip(&tmp, constraint->min, constraint->max); |
423 | arch_remove_reservations(&tmp); | 426 | arch_remove_reservations(&tmp); |
424 | 427 | ||
@@ -436,8 +439,10 @@ static int __find_resource(struct resource *root, struct resource *old, | |||
436 | return 0; | 439 | return 0; |
437 | } | 440 | } |
438 | } | 441 | } |
439 | if (!this) | 442 | |
443 | next: if (!this || this->end == root->end) | ||
440 | break; | 444 | break; |
445 | |||
441 | if (this != old) | 446 | if (this != old) |
442 | tmp.start = this->end + 1; | 447 | tmp.start = this->end + 1; |
443 | this = this->sibling; | 448 | this = this->sibling; |
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 3c7cbc2c33be..a2e7e7210f3e 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c | |||
@@ -29,61 +29,6 @@ | |||
29 | 29 | ||
30 | #include "rtmutex_common.h" | 30 | #include "rtmutex_common.h" |
31 | 31 | ||
32 | # define TRACE_WARN_ON(x) WARN_ON(x) | ||
33 | # define TRACE_BUG_ON(x) BUG_ON(x) | ||
34 | |||
35 | # define TRACE_OFF() \ | ||
36 | do { \ | ||
37 | if (rt_trace_on) { \ | ||
38 | rt_trace_on = 0; \ | ||
39 | console_verbose(); \ | ||
40 | if (raw_spin_is_locked(&current->pi_lock)) \ | ||
41 | raw_spin_unlock(&current->pi_lock); \ | ||
42 | } \ | ||
43 | } while (0) | ||
44 | |||
45 | # define TRACE_OFF_NOLOCK() \ | ||
46 | do { \ | ||
47 | if (rt_trace_on) { \ | ||
48 | rt_trace_on = 0; \ | ||
49 | console_verbose(); \ | ||
50 | } \ | ||
51 | } while (0) | ||
52 | |||
53 | # define TRACE_BUG_LOCKED() \ | ||
54 | do { \ | ||
55 | TRACE_OFF(); \ | ||
56 | BUG(); \ | ||
57 | } while (0) | ||
58 | |||
59 | # define TRACE_WARN_ON_LOCKED(c) \ | ||
60 | do { \ | ||
61 | if (unlikely(c)) { \ | ||
62 | TRACE_OFF(); \ | ||
63 | WARN_ON(1); \ | ||
64 | } \ | ||
65 | } while (0) | ||
66 | |||
67 | # define TRACE_BUG_ON_LOCKED(c) \ | ||
68 | do { \ | ||
69 | if (unlikely(c)) \ | ||
70 | TRACE_BUG_LOCKED(); \ | ||
71 | } while (0) | ||
72 | |||
73 | #ifdef CONFIG_SMP | ||
74 | # define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) | ||
75 | #else | ||
76 | # define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) | ||
77 | #endif | ||
78 | |||
79 | /* | ||
80 | * deadlock detection flag. We turn it off when we detect | ||
81 | * the first problem because we dont want to recurse back | ||
82 | * into the tracing code when doing error printk or | ||
83 | * executing a BUG(): | ||
84 | */ | ||
85 | static int rt_trace_on = 1; | ||
86 | |||
87 | static void printk_task(struct task_struct *p) | 32 | static void printk_task(struct task_struct *p) |
88 | { | 33 | { |
89 | if (p) | 34 | if (p) |
@@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) | |||
111 | 56 | ||
112 | void rt_mutex_debug_task_free(struct task_struct *task) | 57 | void rt_mutex_debug_task_free(struct task_struct *task) |
113 | { | 58 | { |
114 | WARN_ON(!plist_head_empty(&task->pi_waiters)); | 59 | DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); |
115 | WARN_ON(task->pi_blocked_on); | 60 | DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); |
116 | } | 61 | } |
117 | 62 | ||
118 | /* | 63 | /* |
@@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, | |||
125 | { | 70 | { |
126 | struct task_struct *task; | 71 | struct task_struct *task; |
127 | 72 | ||
128 | if (!rt_trace_on || detect || !act_waiter) | 73 | if (!debug_locks || detect || !act_waiter) |
129 | return; | 74 | return; |
130 | 75 | ||
131 | task = rt_mutex_owner(act_waiter->lock); | 76 | task = rt_mutex_owner(act_waiter->lock); |
@@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | |||
139 | { | 84 | { |
140 | struct task_struct *task; | 85 | struct task_struct *task; |
141 | 86 | ||
142 | if (!waiter->deadlock_lock || !rt_trace_on) | 87 | if (!waiter->deadlock_lock || !debug_locks) |
143 | return; | 88 | return; |
144 | 89 | ||
145 | rcu_read_lock(); | 90 | rcu_read_lock(); |
@@ -149,7 +94,10 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | |||
149 | return; | 94 | return; |
150 | } | 95 | } |
151 | 96 | ||
152 | TRACE_OFF_NOLOCK(); | 97 | if (!debug_locks_off()) { |
98 | rcu_read_unlock(); | ||
99 | return; | ||
100 | } | ||
153 | 101 | ||
154 | printk("\n============================================\n"); | 102 | printk("\n============================================\n"); |
155 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | 103 | printk( "[ BUG: circular locking deadlock detected! ]\n"); |
@@ -180,7 +128,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | |||
180 | 128 | ||
181 | printk("[ turning off deadlock detection." | 129 | printk("[ turning off deadlock detection." |
182 | "Please report this trace. ]\n\n"); | 130 | "Please report this trace. ]\n\n"); |
183 | local_irq_disable(); | ||
184 | } | 131 | } |
185 | 132 | ||
186 | void debug_rt_mutex_lock(struct rt_mutex *lock) | 133 | void debug_rt_mutex_lock(struct rt_mutex *lock) |
@@ -189,7 +136,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock) | |||
189 | 136 | ||
190 | void debug_rt_mutex_unlock(struct rt_mutex *lock) | 137 | void debug_rt_mutex_unlock(struct rt_mutex *lock) |
191 | { | 138 | { |
192 | TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); | 139 | DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); |
193 | } | 140 | } |
194 | 141 | ||
195 | void | 142 | void |
@@ -199,7 +146,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) | |||
199 | 146 | ||
200 | void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) | 147 | void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) |
201 | { | 148 | { |
202 | TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); | 149 | DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); |
203 | } | 150 | } |
204 | 151 | ||
205 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | 152 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) |
@@ -213,8 +160,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | |||
213 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | 160 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) |
214 | { | 161 | { |
215 | put_pid(waiter->deadlock_task_pid); | 162 | put_pid(waiter->deadlock_task_pid); |
216 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); | 163 | DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); |
217 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | 164 | DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); |
218 | memset(waiter, 0x22, sizeof(*waiter)); | 165 | memset(waiter, 0x22, sizeof(*waiter)); |
219 | } | 166 | } |
220 | 167 | ||
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 255e1662acdb..5e8d9cce7470 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -579,6 +579,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
579 | struct rt_mutex_waiter *waiter) | 579 | struct rt_mutex_waiter *waiter) |
580 | { | 580 | { |
581 | int ret = 0; | 581 | int ret = 0; |
582 | int was_disabled; | ||
582 | 583 | ||
583 | for (;;) { | 584 | for (;;) { |
584 | /* Try to acquire the lock: */ | 585 | /* Try to acquire the lock: */ |
@@ -601,10 +602,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
601 | 602 | ||
602 | raw_spin_unlock(&lock->wait_lock); | 603 | raw_spin_unlock(&lock->wait_lock); |
603 | 604 | ||
605 | was_disabled = irqs_disabled(); | ||
606 | if (was_disabled) | ||
607 | local_irq_enable(); | ||
608 | |||
604 | debug_rt_mutex_print_deadlock(waiter); | 609 | debug_rt_mutex_print_deadlock(waiter); |
605 | 610 | ||
606 | schedule_rt_mutex(lock); | 611 | schedule_rt_mutex(lock); |
607 | 612 | ||
613 | if (was_disabled) | ||
614 | local_irq_disable(); | ||
615 | |||
608 | raw_spin_lock(&lock->wait_lock); | 616 | raw_spin_lock(&lock->wait_lock); |
609 | set_current_state(state); | 617 | set_current_state(state); |
610 | } | 618 | } |
diff --git a/kernel/sched.c b/kernel/sched.c index ec5f472bc5b9..d87c6e5d4e8c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void) | |||
196 | return sysctl_sched_rt_runtime >= 0; | 196 | return sysctl_sched_rt_runtime >= 0; |
197 | } | 197 | } |
198 | 198 | ||
199 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 199 | static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
200 | { | 200 | { |
201 | ktime_t now; | 201 | unsigned long delta; |
202 | ktime_t soft, hard, now; | ||
203 | |||
204 | for (;;) { | ||
205 | if (hrtimer_active(period_timer)) | ||
206 | break; | ||
207 | |||
208 | now = hrtimer_cb_get_time(period_timer); | ||
209 | hrtimer_forward(period_timer, now, period); | ||
202 | 210 | ||
211 | soft = hrtimer_get_softexpires(period_timer); | ||
212 | hard = hrtimer_get_expires(period_timer); | ||
213 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
214 | __hrtimer_start_range_ns(period_timer, soft, delta, | ||
215 | HRTIMER_MODE_ABS_PINNED, 0); | ||
216 | } | ||
217 | } | ||
218 | |||
219 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
220 | { | ||
203 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | 221 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) |
204 | return; | 222 | return; |
205 | 223 | ||
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
207 | return; | 225 | return; |
208 | 226 | ||
209 | raw_spin_lock(&rt_b->rt_runtime_lock); | 227 | raw_spin_lock(&rt_b->rt_runtime_lock); |
210 | for (;;) { | 228 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); |
211 | unsigned long delta; | ||
212 | ktime_t soft, hard; | ||
213 | |||
214 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
215 | break; | ||
216 | |||
217 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | ||
218 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | ||
219 | |||
220 | soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); | ||
221 | hard = hrtimer_get_expires(&rt_b->rt_period_timer); | ||
222 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
223 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, | ||
224 | HRTIMER_MODE_ABS_PINNED, 0); | ||
225 | } | ||
226 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 229 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
227 | } | 230 | } |
228 | 231 | ||
@@ -247,6 +250,24 @@ struct cfs_rq; | |||
247 | 250 | ||
248 | static LIST_HEAD(task_groups); | 251 | static LIST_HEAD(task_groups); |
249 | 252 | ||
253 | struct cfs_bandwidth { | ||
254 | #ifdef CONFIG_CFS_BANDWIDTH | ||
255 | raw_spinlock_t lock; | ||
256 | ktime_t period; | ||
257 | u64 quota, runtime; | ||
258 | s64 hierarchal_quota; | ||
259 | u64 runtime_expires; | ||
260 | |||
261 | int idle, timer_active; | ||
262 | struct hrtimer period_timer, slack_timer; | ||
263 | struct list_head throttled_cfs_rq; | ||
264 | |||
265 | /* statistics */ | ||
266 | int nr_periods, nr_throttled; | ||
267 | u64 throttled_time; | ||
268 | #endif | ||
269 | }; | ||
270 | |||
250 | /* task group related information */ | 271 | /* task group related information */ |
251 | struct task_group { | 272 | struct task_group { |
252 | struct cgroup_subsys_state css; | 273 | struct cgroup_subsys_state css; |
@@ -278,6 +299,8 @@ struct task_group { | |||
278 | #ifdef CONFIG_SCHED_AUTOGROUP | 299 | #ifdef CONFIG_SCHED_AUTOGROUP |
279 | struct autogroup *autogroup; | 300 | struct autogroup *autogroup; |
280 | #endif | 301 | #endif |
302 | |||
303 | struct cfs_bandwidth cfs_bandwidth; | ||
281 | }; | 304 | }; |
282 | 305 | ||
283 | /* task_group_lock serializes the addition/removal of task groups */ | 306 | /* task_group_lock serializes the addition/removal of task groups */ |
@@ -311,7 +334,7 @@ struct task_group root_task_group; | |||
311 | /* CFS-related fields in a runqueue */ | 334 | /* CFS-related fields in a runqueue */ |
312 | struct cfs_rq { | 335 | struct cfs_rq { |
313 | struct load_weight load; | 336 | struct load_weight load; |
314 | unsigned long nr_running; | 337 | unsigned long nr_running, h_nr_running; |
315 | 338 | ||
316 | u64 exec_clock; | 339 | u64 exec_clock; |
317 | u64 min_vruntime; | 340 | u64 min_vruntime; |
@@ -377,9 +400,120 @@ struct cfs_rq { | |||
377 | 400 | ||
378 | unsigned long load_contribution; | 401 | unsigned long load_contribution; |
379 | #endif | 402 | #endif |
403 | #ifdef CONFIG_CFS_BANDWIDTH | ||
404 | int runtime_enabled; | ||
405 | u64 runtime_expires; | ||
406 | s64 runtime_remaining; | ||
407 | |||
408 | u64 throttled_timestamp; | ||
409 | int throttled, throttle_count; | ||
410 | struct list_head throttled_list; | ||
411 | #endif | ||
380 | #endif | 412 | #endif |
381 | }; | 413 | }; |
382 | 414 | ||
415 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
416 | #ifdef CONFIG_CFS_BANDWIDTH | ||
417 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
418 | { | ||
419 | return &tg->cfs_bandwidth; | ||
420 | } | ||
421 | |||
422 | static inline u64 default_cfs_period(void); | ||
423 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
424 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
425 | |||
426 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
427 | { | ||
428 | struct cfs_bandwidth *cfs_b = | ||
429 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
430 | do_sched_cfs_slack_timer(cfs_b); | ||
431 | |||
432 | return HRTIMER_NORESTART; | ||
433 | } | ||
434 | |||
435 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
436 | { | ||
437 | struct cfs_bandwidth *cfs_b = | ||
438 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
439 | ktime_t now; | ||
440 | int overrun; | ||
441 | int idle = 0; | ||
442 | |||
443 | for (;;) { | ||
444 | now = hrtimer_cb_get_time(timer); | ||
445 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
446 | |||
447 | if (!overrun) | ||
448 | break; | ||
449 | |||
450 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
451 | } | ||
452 | |||
453 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
454 | } | ||
455 | |||
456 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
457 | { | ||
458 | raw_spin_lock_init(&cfs_b->lock); | ||
459 | cfs_b->runtime = 0; | ||
460 | cfs_b->quota = RUNTIME_INF; | ||
461 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
462 | |||
463 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
464 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
465 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
466 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
467 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
468 | } | ||
469 | |||
470 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
471 | { | ||
472 | cfs_rq->runtime_enabled = 0; | ||
473 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
474 | } | ||
475 | |||
476 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
477 | static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
478 | { | ||
479 | /* | ||
480 | * The timer may be active because we're trying to set a new bandwidth | ||
481 | * period or because we're racing with the tear-down path | ||
482 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
483 | * terminates). In either case we ensure that it's re-programmed | ||
484 | */ | ||
485 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
486 | raw_spin_unlock(&cfs_b->lock); | ||
487 | /* ensure cfs_b->lock is available while we wait */ | ||
488 | hrtimer_cancel(&cfs_b->period_timer); | ||
489 | |||
490 | raw_spin_lock(&cfs_b->lock); | ||
491 | /* if someone else restarted the timer then we're done */ | ||
492 | if (cfs_b->timer_active) | ||
493 | return; | ||
494 | } | ||
495 | |||
496 | cfs_b->timer_active = 1; | ||
497 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
498 | } | ||
499 | |||
500 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
501 | { | ||
502 | hrtimer_cancel(&cfs_b->period_timer); | ||
503 | hrtimer_cancel(&cfs_b->slack_timer); | ||
504 | } | ||
505 | #else | ||
506 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
507 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
508 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
509 | |||
510 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
511 | { | ||
512 | return NULL; | ||
513 | } | ||
514 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
515 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
516 | |||
383 | /* Real-Time classes' related field in a runqueue: */ | 517 | /* Real-Time classes' related field in a runqueue: */ |
384 | struct rt_rq { | 518 | struct rt_rq { |
385 | struct rt_prio_array active; | 519 | struct rt_prio_array active; |
@@ -510,7 +644,7 @@ struct rq { | |||
510 | 644 | ||
511 | unsigned long cpu_power; | 645 | unsigned long cpu_power; |
512 | 646 | ||
513 | unsigned char idle_at_tick; | 647 | unsigned char idle_balance; |
514 | /* For active balancing */ | 648 | /* For active balancing */ |
515 | int post_schedule; | 649 | int post_schedule; |
516 | int active_balance; | 650 | int active_balance; |
@@ -520,8 +654,6 @@ struct rq { | |||
520 | int cpu; | 654 | int cpu; |
521 | int online; | 655 | int online; |
522 | 656 | ||
523 | unsigned long avg_load_per_task; | ||
524 | |||
525 | u64 rt_avg; | 657 | u64 rt_avg; |
526 | u64 age_stamp; | 658 | u64 age_stamp; |
527 | u64 idle_stamp; | 659 | u64 idle_stamp; |
@@ -570,7 +702,7 @@ struct rq { | |||
570 | #endif | 702 | #endif |
571 | 703 | ||
572 | #ifdef CONFIG_SMP | 704 | #ifdef CONFIG_SMP |
573 | struct task_struct *wake_list; | 705 | struct llist_head wake_list; |
574 | #endif | 706 | #endif |
575 | }; | 707 | }; |
576 | 708 | ||
@@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu) | |||
1272 | smp_send_reschedule(cpu); | 1404 | smp_send_reschedule(cpu); |
1273 | } | 1405 | } |
1274 | 1406 | ||
1407 | static inline bool got_nohz_idle_kick(void) | ||
1408 | { | ||
1409 | return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; | ||
1410 | } | ||
1411 | |||
1412 | #else /* CONFIG_NO_HZ */ | ||
1413 | |||
1414 | static inline bool got_nohz_idle_kick(void) | ||
1415 | { | ||
1416 | return false; | ||
1417 | } | ||
1418 | |||
1275 | #endif /* CONFIG_NO_HZ */ | 1419 | #endif /* CONFIG_NO_HZ */ |
1276 | 1420 | ||
1277 | static u64 sched_avg_period(void) | 1421 | static u64 sched_avg_period(void) |
@@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
1471 | update_load_sub(&rq->load, load); | 1615 | update_load_sub(&rq->load, load); |
1472 | } | 1616 | } |
1473 | 1617 | ||
1474 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) | 1618 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
1619 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) | ||
1475 | typedef int (*tg_visitor)(struct task_group *, void *); | 1620 | typedef int (*tg_visitor)(struct task_group *, void *); |
1476 | 1621 | ||
1477 | /* | 1622 | /* |
1478 | * Iterate the full tree, calling @down when first entering a node and @up when | 1623 | * Iterate task_group tree rooted at *from, calling @down when first entering a |
1479 | * leaving it for the final time. | 1624 | * node and @up when leaving it for the final time. |
1625 | * | ||
1626 | * Caller must hold rcu_lock or sufficient equivalent. | ||
1480 | */ | 1627 | */ |
1481 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | 1628 | static int walk_tg_tree_from(struct task_group *from, |
1629 | tg_visitor down, tg_visitor up, void *data) | ||
1482 | { | 1630 | { |
1483 | struct task_group *parent, *child; | 1631 | struct task_group *parent, *child; |
1484 | int ret; | 1632 | int ret; |
1485 | 1633 | ||
1486 | rcu_read_lock(); | 1634 | parent = from; |
1487 | parent = &root_task_group; | 1635 | |
1488 | down: | 1636 | down: |
1489 | ret = (*down)(parent, data); | 1637 | ret = (*down)(parent, data); |
1490 | if (ret) | 1638 | if (ret) |
1491 | goto out_unlock; | 1639 | goto out; |
1492 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1640 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
1493 | parent = child; | 1641 | parent = child; |
1494 | goto down; | 1642 | goto down; |
@@ -1497,19 +1645,29 @@ up: | |||
1497 | continue; | 1645 | continue; |
1498 | } | 1646 | } |
1499 | ret = (*up)(parent, data); | 1647 | ret = (*up)(parent, data); |
1500 | if (ret) | 1648 | if (ret || parent == from) |
1501 | goto out_unlock; | 1649 | goto out; |
1502 | 1650 | ||
1503 | child = parent; | 1651 | child = parent; |
1504 | parent = parent->parent; | 1652 | parent = parent->parent; |
1505 | if (parent) | 1653 | if (parent) |
1506 | goto up; | 1654 | goto up; |
1507 | out_unlock: | 1655 | out: |
1508 | rcu_read_unlock(); | ||
1509 | |||
1510 | return ret; | 1656 | return ret; |
1511 | } | 1657 | } |
1512 | 1658 | ||
1659 | /* | ||
1660 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1661 | * leaving it for the final time. | ||
1662 | * | ||
1663 | * Caller must hold rcu_lock or sufficient equivalent. | ||
1664 | */ | ||
1665 | |||
1666 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
1667 | { | ||
1668 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
1669 | } | ||
1670 | |||
1513 | static int tg_nop(struct task_group *tg, void *data) | 1671 | static int tg_nop(struct task_group *tg, void *data) |
1514 | { | 1672 | { |
1515 | return 0; | 1673 | return 0; |
@@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1569 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 1727 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); |
1570 | 1728 | ||
1571 | if (nr_running) | 1729 | if (nr_running) |
1572 | rq->avg_load_per_task = rq->load.weight / nr_running; | 1730 | return rq->load.weight / nr_running; |
1573 | else | ||
1574 | rq->avg_load_per_task = 0; | ||
1575 | 1731 | ||
1576 | return rq->avg_load_per_task; | 1732 | return 0; |
1577 | } | 1733 | } |
1578 | 1734 | ||
1579 | #ifdef CONFIG_PREEMPT | 1735 | #ifdef CONFIG_PREEMPT |
@@ -1739,7 +1895,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1739 | #ifdef CONFIG_SMP | 1895 | #ifdef CONFIG_SMP |
1740 | /* | 1896 | /* |
1741 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | 1897 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be |
1742 | * successfuly executed on another CPU. We must ensure that updates of | 1898 | * successfully executed on another CPU. We must ensure that updates of |
1743 | * per-task data have been completed by this moment. | 1899 | * per-task data have been completed by this moment. |
1744 | */ | 1900 | */ |
1745 | smp_wmb(); | 1901 | smp_wmb(); |
@@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1806 | rq->nr_uninterruptible--; | 1962 | rq->nr_uninterruptible--; |
1807 | 1963 | ||
1808 | enqueue_task(rq, p, flags); | 1964 | enqueue_task(rq, p, flags); |
1809 | inc_nr_running(rq); | ||
1810 | } | 1965 | } |
1811 | 1966 | ||
1812 | /* | 1967 | /* |
@@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1818 | rq->nr_uninterruptible++; | 1973 | rq->nr_uninterruptible++; |
1819 | 1974 | ||
1820 | dequeue_task(rq, p, flags); | 1975 | dequeue_task(rq, p, flags); |
1821 | dec_nr_running(rq); | ||
1822 | } | 1976 | } |
1823 | 1977 | ||
1824 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1978 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2390 | 2544 | ||
2391 | /* Look for allowed, online CPU in same node. */ | 2545 | /* Look for allowed, online CPU in same node. */ |
2392 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) | 2546 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) |
2393 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 2547 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
2394 | return dest_cpu; | 2548 | return dest_cpu; |
2395 | 2549 | ||
2396 | /* Any allowed, online CPU? */ | 2550 | /* Any allowed, online CPU? */ |
2397 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); | 2551 | dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); |
2398 | if (dest_cpu < nr_cpu_ids) | 2552 | if (dest_cpu < nr_cpu_ids) |
2399 | return dest_cpu; | 2553 | return dest_cpu; |
2400 | 2554 | ||
@@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | |||
2431 | * [ this allows ->select_task() to simply return task_cpu(p) and | 2585 | * [ this allows ->select_task() to simply return task_cpu(p) and |
2432 | * not worry about this generic constraint ] | 2586 | * not worry about this generic constraint ] |
2433 | */ | 2587 | */ |
2434 | if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || | 2588 | if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || |
2435 | !cpu_online(cpu))) | 2589 | !cpu_online(cpu))) |
2436 | cpu = select_fallback_rq(task_cpu(p), p); | 2590 | cpu = select_fallback_rq(task_cpu(p), p); |
2437 | 2591 | ||
@@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
2556 | } | 2710 | } |
2557 | 2711 | ||
2558 | #ifdef CONFIG_SMP | 2712 | #ifdef CONFIG_SMP |
2559 | static void sched_ttwu_do_pending(struct task_struct *list) | 2713 | static void sched_ttwu_pending(void) |
2560 | { | 2714 | { |
2561 | struct rq *rq = this_rq(); | 2715 | struct rq *rq = this_rq(); |
2716 | struct llist_node *llist = llist_del_all(&rq->wake_list); | ||
2717 | struct task_struct *p; | ||
2562 | 2718 | ||
2563 | raw_spin_lock(&rq->lock); | 2719 | raw_spin_lock(&rq->lock); |
2564 | 2720 | ||
2565 | while (list) { | 2721 | while (llist) { |
2566 | struct task_struct *p = list; | 2722 | p = llist_entry(llist, struct task_struct, wake_entry); |
2567 | list = list->wake_entry; | 2723 | llist = llist_next(llist); |
2568 | ttwu_do_activate(rq, p, 0); | 2724 | ttwu_do_activate(rq, p, 0); |
2569 | } | 2725 | } |
2570 | 2726 | ||
2571 | raw_spin_unlock(&rq->lock); | 2727 | raw_spin_unlock(&rq->lock); |
2572 | } | 2728 | } |
2573 | 2729 | ||
2574 | #ifdef CONFIG_HOTPLUG_CPU | ||
2575 | |||
2576 | static void sched_ttwu_pending(void) | ||
2577 | { | ||
2578 | struct rq *rq = this_rq(); | ||
2579 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2580 | |||
2581 | if (!list) | ||
2582 | return; | ||
2583 | |||
2584 | sched_ttwu_do_pending(list); | ||
2585 | } | ||
2586 | |||
2587 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2588 | |||
2589 | void scheduler_ipi(void) | 2730 | void scheduler_ipi(void) |
2590 | { | 2731 | { |
2591 | struct rq *rq = this_rq(); | 2732 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) |
2592 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2593 | |||
2594 | if (!list) | ||
2595 | return; | 2733 | return; |
2596 | 2734 | ||
2597 | /* | 2735 | /* |
@@ -2608,25 +2746,21 @@ void scheduler_ipi(void) | |||
2608 | * somewhat pessimize the simple resched case. | 2746 | * somewhat pessimize the simple resched case. |
2609 | */ | 2747 | */ |
2610 | irq_enter(); | 2748 | irq_enter(); |
2611 | sched_ttwu_do_pending(list); | 2749 | sched_ttwu_pending(); |
2750 | |||
2751 | /* | ||
2752 | * Check if someone kicked us for doing the nohz idle load balance. | ||
2753 | */ | ||
2754 | if (unlikely(got_nohz_idle_kick() && !need_resched())) { | ||
2755 | this_rq()->idle_balance = 1; | ||
2756 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
2757 | } | ||
2612 | irq_exit(); | 2758 | irq_exit(); |
2613 | } | 2759 | } |
2614 | 2760 | ||
2615 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | 2761 | static void ttwu_queue_remote(struct task_struct *p, int cpu) |
2616 | { | 2762 | { |
2617 | struct rq *rq = cpu_rq(cpu); | 2763 | if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) |
2618 | struct task_struct *next = rq->wake_list; | ||
2619 | |||
2620 | for (;;) { | ||
2621 | struct task_struct *old = next; | ||
2622 | |||
2623 | p->wake_entry = next; | ||
2624 | next = cmpxchg(&rq->wake_list, old, p); | ||
2625 | if (next == old) | ||
2626 | break; | ||
2627 | } | ||
2628 | |||
2629 | if (!next) | ||
2630 | smp_send_reschedule(cpu); | 2764 | smp_send_reschedule(cpu); |
2631 | } | 2765 | } |
2632 | 2766 | ||
@@ -2848,19 +2982,23 @@ void sched_fork(struct task_struct *p) | |||
2848 | p->state = TASK_RUNNING; | 2982 | p->state = TASK_RUNNING; |
2849 | 2983 | ||
2850 | /* | 2984 | /* |
2985 | * Make sure we do not leak PI boosting priority to the child. | ||
2986 | */ | ||
2987 | p->prio = current->normal_prio; | ||
2988 | |||
2989 | /* | ||
2851 | * Revert to default priority/policy on fork if requested. | 2990 | * Revert to default priority/policy on fork if requested. |
2852 | */ | 2991 | */ |
2853 | if (unlikely(p->sched_reset_on_fork)) { | 2992 | if (unlikely(p->sched_reset_on_fork)) { |
2854 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { | 2993 | if (task_has_rt_policy(p)) { |
2855 | p->policy = SCHED_NORMAL; | 2994 | p->policy = SCHED_NORMAL; |
2856 | p->normal_prio = p->static_prio; | ||
2857 | } | ||
2858 | |||
2859 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2860 | p->static_prio = NICE_TO_PRIO(0); | 2995 | p->static_prio = NICE_TO_PRIO(0); |
2861 | p->normal_prio = p->static_prio; | 2996 | p->rt_priority = 0; |
2862 | set_load_weight(p); | 2997 | } else if (PRIO_TO_NICE(p->static_prio) < 0) |
2863 | } | 2998 | p->static_prio = NICE_TO_PRIO(0); |
2999 | |||
3000 | p->prio = p->normal_prio = __normal_prio(p); | ||
3001 | set_load_weight(p); | ||
2864 | 3002 | ||
2865 | /* | 3003 | /* |
2866 | * We don't need the reset flag anymore after the fork. It has | 3004 | * We don't need the reset flag anymore after the fork. It has |
@@ -2869,11 +3007,6 @@ void sched_fork(struct task_struct *p) | |||
2869 | p->sched_reset_on_fork = 0; | 3007 | p->sched_reset_on_fork = 0; |
2870 | } | 3008 | } |
2871 | 3009 | ||
2872 | /* | ||
2873 | * Make sure we do not leak PI boosting priority to the child. | ||
2874 | */ | ||
2875 | p->prio = current->normal_prio; | ||
2876 | |||
2877 | if (!rt_prio(p->prio)) | 3010 | if (!rt_prio(p->prio)) |
2878 | p->sched_class = &fair_sched_class; | 3011 | p->sched_class = &fair_sched_class; |
2879 | 3012 | ||
@@ -3725,30 +3858,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3725 | } | 3858 | } |
3726 | 3859 | ||
3727 | /* | 3860 | /* |
3728 | * Return sum_exec_runtime for the thread group. | ||
3729 | * In case the task is currently running, return the sum plus current's | ||
3730 | * pending runtime that have not been accounted yet. | ||
3731 | * | ||
3732 | * Note that the thread group might have other running tasks as well, | ||
3733 | * so the return value not includes other pending runtime that other | ||
3734 | * running tasks might have. | ||
3735 | */ | ||
3736 | unsigned long long thread_group_sched_runtime(struct task_struct *p) | ||
3737 | { | ||
3738 | struct task_cputime totals; | ||
3739 | unsigned long flags; | ||
3740 | struct rq *rq; | ||
3741 | u64 ns; | ||
3742 | |||
3743 | rq = task_rq_lock(p, &flags); | ||
3744 | thread_group_cputime(p, &totals); | ||
3745 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | ||
3746 | task_rq_unlock(rq, p, &flags); | ||
3747 | |||
3748 | return ns; | ||
3749 | } | ||
3750 | |||
3751 | /* | ||
3752 | * Account user cpu time to a process. | 3861 | * Account user cpu time to a process. |
3753 | * @p: the process that the cpu time gets accounted to | 3862 | * @p: the process that the cpu time gets accounted to |
3754 | * @cputime: the cpu time spent in user space since the last update | 3863 | * @cputime: the cpu time spent in user space since the last update |
@@ -4140,7 +4249,7 @@ void scheduler_tick(void) | |||
4140 | perf_event_task_tick(); | 4249 | perf_event_task_tick(); |
4141 | 4250 | ||
4142 | #ifdef CONFIG_SMP | 4251 | #ifdef CONFIG_SMP |
4143 | rq->idle_at_tick = idle_cpu(cpu); | 4252 | rq->idle_balance = idle_cpu(cpu); |
4144 | trigger_load_balance(rq, cpu); | 4253 | trigger_load_balance(rq, cpu); |
4145 | #endif | 4254 | #endif |
4146 | } | 4255 | } |
@@ -4237,6 +4346,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
4237 | */ | 4346 | */ |
4238 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) | 4347 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) |
4239 | __schedule_bug(prev); | 4348 | __schedule_bug(prev); |
4349 | rcu_sleep_check(); | ||
4240 | 4350 | ||
4241 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4351 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
4242 | 4352 | ||
@@ -4263,7 +4373,7 @@ pick_next_task(struct rq *rq) | |||
4263 | * Optimization: we know that if all tasks are in | 4373 | * Optimization: we know that if all tasks are in |
4264 | * the fair class we can call that function directly: | 4374 | * the fair class we can call that function directly: |
4265 | */ | 4375 | */ |
4266 | if (likely(rq->nr_running == rq->cfs.nr_running)) { | 4376 | if (likely(rq->nr_running == rq->cfs.h_nr_running)) { |
4267 | p = fair_sched_class.pick_next_task(rq); | 4377 | p = fair_sched_class.pick_next_task(rq); |
4268 | if (likely(p)) | 4378 | if (likely(p)) |
4269 | return p; | 4379 | return p; |
@@ -4372,7 +4482,7 @@ static inline void sched_submit_work(struct task_struct *tsk) | |||
4372 | blk_schedule_flush_plug(tsk); | 4482 | blk_schedule_flush_plug(tsk); |
4373 | } | 4483 | } |
4374 | 4484 | ||
4375 | asmlinkage void schedule(void) | 4485 | asmlinkage void __sched schedule(void) |
4376 | { | 4486 | { |
4377 | struct task_struct *tsk = current; | 4487 | struct task_struct *tsk = current; |
4378 | 4488 | ||
@@ -5049,7 +5159,20 @@ EXPORT_SYMBOL(task_nice); | |||
5049 | */ | 5159 | */ |
5050 | int idle_cpu(int cpu) | 5160 | int idle_cpu(int cpu) |
5051 | { | 5161 | { |
5052 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 5162 | struct rq *rq = cpu_rq(cpu); |
5163 | |||
5164 | if (rq->curr != rq->idle) | ||
5165 | return 0; | ||
5166 | |||
5167 | if (rq->nr_running) | ||
5168 | return 0; | ||
5169 | |||
5170 | #ifdef CONFIG_SMP | ||
5171 | if (!llist_empty(&rq->wake_list)) | ||
5172 | return 0; | ||
5173 | #endif | ||
5174 | |||
5175 | return 1; | ||
5053 | } | 5176 | } |
5054 | 5177 | ||
5055 | /** | 5178 | /** |
@@ -5899,7 +6022,7 @@ void show_state_filter(unsigned long state_filter) | |||
5899 | printk(KERN_INFO | 6022 | printk(KERN_INFO |
5900 | " task PC stack pid father\n"); | 6023 | " task PC stack pid father\n"); |
5901 | #endif | 6024 | #endif |
5902 | read_lock(&tasklist_lock); | 6025 | rcu_read_lock(); |
5903 | do_each_thread(g, p) { | 6026 | do_each_thread(g, p) { |
5904 | /* | 6027 | /* |
5905 | * reset the NMI-timeout, listing all files on a slow | 6028 | * reset the NMI-timeout, listing all files on a slow |
@@ -5915,7 +6038,7 @@ void show_state_filter(unsigned long state_filter) | |||
5915 | #ifdef CONFIG_SCHED_DEBUG | 6038 | #ifdef CONFIG_SCHED_DEBUG |
5916 | sysrq_sched_debug_show(); | 6039 | sysrq_sched_debug_show(); |
5917 | #endif | 6040 | #endif |
5918 | read_unlock(&tasklist_lock); | 6041 | rcu_read_unlock(); |
5919 | /* | 6042 | /* |
5920 | * Only show locks if all tasks are dumped: | 6043 | * Only show locks if all tasks are dumped: |
5921 | */ | 6044 | */ |
@@ -5979,15 +6102,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5979 | } | 6102 | } |
5980 | 6103 | ||
5981 | /* | 6104 | /* |
5982 | * In a system that switches off the HZ timer nohz_cpu_mask | ||
5983 | * indicates which cpus entered this state. This is used | ||
5984 | * in the rcu update to wait only for active cpus. For system | ||
5985 | * which do not switch off the HZ timer nohz_cpu_mask should | ||
5986 | * always be CPU_BITS_NONE. | ||
5987 | */ | ||
5988 | cpumask_var_t nohz_cpu_mask; | ||
5989 | |||
5990 | /* | ||
5991 | * Increase the granularity value when there are more CPUs, | 6105 | * Increase the granularity value when there are more CPUs, |
5992 | * because with more CPUs the 'effective latency' as visible | 6106 | * because with more CPUs the 'effective latency' as visible |
5993 | * to users decreases. But the relationship is not linear, | 6107 | * to users decreases. But the relationship is not linear, |
@@ -6039,10 +6153,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
6039 | { | 6153 | { |
6040 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 6154 | if (p->sched_class && p->sched_class->set_cpus_allowed) |
6041 | p->sched_class->set_cpus_allowed(p, new_mask); | 6155 | p->sched_class->set_cpus_allowed(p, new_mask); |
6042 | else { | 6156 | |
6043 | cpumask_copy(&p->cpus_allowed, new_mask); | 6157 | cpumask_copy(&p->cpus_allowed, new_mask); |
6044 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 6158 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); |
6045 | } | ||
6046 | } | 6159 | } |
6047 | 6160 | ||
6048 | /* | 6161 | /* |
@@ -6140,7 +6253,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
6140 | if (task_cpu(p) != src_cpu) | 6253 | if (task_cpu(p) != src_cpu) |
6141 | goto done; | 6254 | goto done; |
6142 | /* Affinity changed (again). */ | 6255 | /* Affinity changed (again). */ |
6143 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 6256 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
6144 | goto fail; | 6257 | goto fail; |
6145 | 6258 | ||
6146 | /* | 6259 | /* |
@@ -6221,6 +6334,30 @@ static void calc_global_load_remove(struct rq *rq) | |||
6221 | rq->calc_load_active = 0; | 6334 | rq->calc_load_active = 0; |
6222 | } | 6335 | } |
6223 | 6336 | ||
6337 | #ifdef CONFIG_CFS_BANDWIDTH | ||
6338 | static void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
6339 | { | ||
6340 | struct cfs_rq *cfs_rq; | ||
6341 | |||
6342 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
6343 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
6344 | |||
6345 | if (!cfs_rq->runtime_enabled) | ||
6346 | continue; | ||
6347 | |||
6348 | /* | ||
6349 | * clock_task is not advancing so we just need to make sure | ||
6350 | * there's some valid quota amount | ||
6351 | */ | ||
6352 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
6353 | if (cfs_rq_throttled(cfs_rq)) | ||
6354 | unthrottle_cfs_rq(cfs_rq); | ||
6355 | } | ||
6356 | } | ||
6357 | #else | ||
6358 | static void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
6359 | #endif | ||
6360 | |||
6224 | /* | 6361 | /* |
6225 | * Migrate all tasks from the rq, sleeping tasks will be migrated by | 6362 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
6226 | * try_to_wake_up()->select_task_rq(). | 6363 | * try_to_wake_up()->select_task_rq(). |
@@ -6246,6 +6383,9 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
6246 | */ | 6383 | */ |
6247 | rq->stop = NULL; | 6384 | rq->stop = NULL; |
6248 | 6385 | ||
6386 | /* Ensure any throttled groups are reachable by pick_next_task */ | ||
6387 | unthrottle_offline_cfs_rqs(rq); | ||
6388 | |||
6249 | for ( ; ; ) { | 6389 | for ( ; ; ) { |
6250 | /* | 6390 | /* |
6251 | * There's this thread running, bail when that's the only | 6391 | * There's this thread running, bail when that's the only |
@@ -7989,6 +8129,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7989 | /* allow initial update_cfs_load() to truncate */ | 8129 | /* allow initial update_cfs_load() to truncate */ |
7990 | cfs_rq->load_stamp = 1; | 8130 | cfs_rq->load_stamp = 1; |
7991 | #endif | 8131 | #endif |
8132 | init_cfs_rq_runtime(cfs_rq); | ||
7992 | 8133 | ||
7993 | tg->cfs_rq[cpu] = cfs_rq; | 8134 | tg->cfs_rq[cpu] = cfs_rq; |
7994 | tg->se[cpu] = se; | 8135 | tg->se[cpu] = se; |
@@ -8128,6 +8269,7 @@ void __init sched_init(void) | |||
8128 | * We achieve this by letting root_task_group's tasks sit | 8269 | * We achieve this by letting root_task_group's tasks sit |
8129 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). | 8270 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
8130 | */ | 8271 | */ |
8272 | init_cfs_bandwidth(&root_task_group.cfs_bandwidth); | ||
8131 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); | 8273 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
8132 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8274 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8133 | 8275 | ||
@@ -8157,7 +8299,6 @@ void __init sched_init(void) | |||
8157 | rq_attach_root(rq, &def_root_domain); | 8299 | rq_attach_root(rq, &def_root_domain); |
8158 | #ifdef CONFIG_NO_HZ | 8300 | #ifdef CONFIG_NO_HZ |
8159 | rq->nohz_balance_kick = 0; | 8301 | rq->nohz_balance_kick = 0; |
8160 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | ||
8161 | #endif | 8302 | #endif |
8162 | #endif | 8303 | #endif |
8163 | init_rq_hrtick(rq); | 8304 | init_rq_hrtick(rq); |
@@ -8199,8 +8340,6 @@ void __init sched_init(void) | |||
8199 | */ | 8340 | */ |
8200 | current->sched_class = &fair_sched_class; | 8341 | current->sched_class = &fair_sched_class; |
8201 | 8342 | ||
8202 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | ||
8203 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | ||
8204 | #ifdef CONFIG_SMP | 8343 | #ifdef CONFIG_SMP |
8205 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | 8344 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); |
8206 | #ifdef CONFIG_NO_HZ | 8345 | #ifdef CONFIG_NO_HZ |
@@ -8230,6 +8369,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) | |||
8230 | { | 8369 | { |
8231 | static unsigned long prev_jiffy; /* ratelimiting */ | 8370 | static unsigned long prev_jiffy; /* ratelimiting */ |
8232 | 8371 | ||
8372 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | ||
8233 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || | 8373 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
8234 | system_state != SYSTEM_RUNNING || oops_in_progress) | 8374 | system_state != SYSTEM_RUNNING || oops_in_progress) |
8235 | return; | 8375 | return; |
@@ -8369,6 +8509,8 @@ static void free_fair_sched_group(struct task_group *tg) | |||
8369 | { | 8509 | { |
8370 | int i; | 8510 | int i; |
8371 | 8511 | ||
8512 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8513 | |||
8372 | for_each_possible_cpu(i) { | 8514 | for_each_possible_cpu(i) { |
8373 | if (tg->cfs_rq) | 8515 | if (tg->cfs_rq) |
8374 | kfree(tg->cfs_rq[i]); | 8516 | kfree(tg->cfs_rq[i]); |
@@ -8396,6 +8538,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8396 | 8538 | ||
8397 | tg->shares = NICE_0_LOAD; | 8539 | tg->shares = NICE_0_LOAD; |
8398 | 8540 | ||
8541 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8542 | |||
8399 | for_each_possible_cpu(i) { | 8543 | for_each_possible_cpu(i) { |
8400 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8544 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8401 | GFP_KERNEL, cpu_to_node(i)); | 8545 | GFP_KERNEL, cpu_to_node(i)); |
@@ -8671,12 +8815,7 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
8671 | } | 8815 | } |
8672 | #endif | 8816 | #endif |
8673 | 8817 | ||
8674 | #ifdef CONFIG_RT_GROUP_SCHED | 8818 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) |
8675 | /* | ||
8676 | * Ensure that the real time constraints are schedulable. | ||
8677 | */ | ||
8678 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
8679 | |||
8680 | static unsigned long to_ratio(u64 period, u64 runtime) | 8819 | static unsigned long to_ratio(u64 period, u64 runtime) |
8681 | { | 8820 | { |
8682 | if (runtime == RUNTIME_INF) | 8821 | if (runtime == RUNTIME_INF) |
@@ -8684,6 +8823,13 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
8684 | 8823 | ||
8685 | return div64_u64(runtime << 20, period); | 8824 | return div64_u64(runtime << 20, period); |
8686 | } | 8825 | } |
8826 | #endif | ||
8827 | |||
8828 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8829 | /* | ||
8830 | * Ensure that the real time constraints are schedulable. | ||
8831 | */ | ||
8832 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
8687 | 8833 | ||
8688 | /* Must be called with tasklist_lock held */ | 8834 | /* Must be called with tasklist_lock held */ |
8689 | static inline int tg_has_rt_tasks(struct task_group *tg) | 8835 | static inline int tg_has_rt_tasks(struct task_group *tg) |
@@ -8704,7 +8850,7 @@ struct rt_schedulable_data { | |||
8704 | u64 rt_runtime; | 8850 | u64 rt_runtime; |
8705 | }; | 8851 | }; |
8706 | 8852 | ||
8707 | static int tg_schedulable(struct task_group *tg, void *data) | 8853 | static int tg_rt_schedulable(struct task_group *tg, void *data) |
8708 | { | 8854 | { |
8709 | struct rt_schedulable_data *d = data; | 8855 | struct rt_schedulable_data *d = data; |
8710 | struct task_group *child; | 8856 | struct task_group *child; |
@@ -8762,16 +8908,22 @@ static int tg_schedulable(struct task_group *tg, void *data) | |||
8762 | 8908 | ||
8763 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8909 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8764 | { | 8910 | { |
8911 | int ret; | ||
8912 | |||
8765 | struct rt_schedulable_data data = { | 8913 | struct rt_schedulable_data data = { |
8766 | .tg = tg, | 8914 | .tg = tg, |
8767 | .rt_period = period, | 8915 | .rt_period = period, |
8768 | .rt_runtime = runtime, | 8916 | .rt_runtime = runtime, |
8769 | }; | 8917 | }; |
8770 | 8918 | ||
8771 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | 8919 | rcu_read_lock(); |
8920 | ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); | ||
8921 | rcu_read_unlock(); | ||
8922 | |||
8923 | return ret; | ||
8772 | } | 8924 | } |
8773 | 8925 | ||
8774 | static int tg_set_bandwidth(struct task_group *tg, | 8926 | static int tg_set_rt_bandwidth(struct task_group *tg, |
8775 | u64 rt_period, u64 rt_runtime) | 8927 | u64 rt_period, u64 rt_runtime) |
8776 | { | 8928 | { |
8777 | int i, err = 0; | 8929 | int i, err = 0; |
@@ -8810,7 +8962,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
8810 | if (rt_runtime_us < 0) | 8962 | if (rt_runtime_us < 0) |
8811 | rt_runtime = RUNTIME_INF; | 8963 | rt_runtime = RUNTIME_INF; |
8812 | 8964 | ||
8813 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8965 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8814 | } | 8966 | } |
8815 | 8967 | ||
8816 | long sched_group_rt_runtime(struct task_group *tg) | 8968 | long sched_group_rt_runtime(struct task_group *tg) |
@@ -8835,7 +8987,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
8835 | if (rt_period == 0) | 8987 | if (rt_period == 0) |
8836 | return -EINVAL; | 8988 | return -EINVAL; |
8837 | 8989 | ||
8838 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8990 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8839 | } | 8991 | } |
8840 | 8992 | ||
8841 | long sched_group_rt_period(struct task_group *tg) | 8993 | long sched_group_rt_period(struct task_group *tg) |
@@ -9025,6 +9177,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
9025 | 9177 | ||
9026 | return (u64) scale_load_down(tg->shares); | 9178 | return (u64) scale_load_down(tg->shares); |
9027 | } | 9179 | } |
9180 | |||
9181 | #ifdef CONFIG_CFS_BANDWIDTH | ||
9182 | static DEFINE_MUTEX(cfs_constraints_mutex); | ||
9183 | |||
9184 | const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ | ||
9185 | const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ | ||
9186 | |||
9187 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); | ||
9188 | |||
9189 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | ||
9190 | { | ||
9191 | int i, ret = 0, runtime_enabled; | ||
9192 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9193 | |||
9194 | if (tg == &root_task_group) | ||
9195 | return -EINVAL; | ||
9196 | |||
9197 | /* | ||
9198 | * Ensure we have at some amount of bandwidth every period. This is | ||
9199 | * to prevent reaching a state of large arrears when throttled via | ||
9200 | * entity_tick() resulting in prolonged exit starvation. | ||
9201 | */ | ||
9202 | if (quota < min_cfs_quota_period || period < min_cfs_quota_period) | ||
9203 | return -EINVAL; | ||
9204 | |||
9205 | /* | ||
9206 | * Likewise, bound things on the otherside by preventing insane quota | ||
9207 | * periods. This also allows us to normalize in computing quota | ||
9208 | * feasibility. | ||
9209 | */ | ||
9210 | if (period > max_cfs_quota_period) | ||
9211 | return -EINVAL; | ||
9212 | |||
9213 | mutex_lock(&cfs_constraints_mutex); | ||
9214 | ret = __cfs_schedulable(tg, period, quota); | ||
9215 | if (ret) | ||
9216 | goto out_unlock; | ||
9217 | |||
9218 | runtime_enabled = quota != RUNTIME_INF; | ||
9219 | raw_spin_lock_irq(&cfs_b->lock); | ||
9220 | cfs_b->period = ns_to_ktime(period); | ||
9221 | cfs_b->quota = quota; | ||
9222 | |||
9223 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
9224 | /* restart the period timer (if active) to handle new period expiry */ | ||
9225 | if (runtime_enabled && cfs_b->timer_active) { | ||
9226 | /* force a reprogram */ | ||
9227 | cfs_b->timer_active = 0; | ||
9228 | __start_cfs_bandwidth(cfs_b); | ||
9229 | } | ||
9230 | raw_spin_unlock_irq(&cfs_b->lock); | ||
9231 | |||
9232 | for_each_possible_cpu(i) { | ||
9233 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | ||
9234 | struct rq *rq = rq_of(cfs_rq); | ||
9235 | |||
9236 | raw_spin_lock_irq(&rq->lock); | ||
9237 | cfs_rq->runtime_enabled = runtime_enabled; | ||
9238 | cfs_rq->runtime_remaining = 0; | ||
9239 | |||
9240 | if (cfs_rq_throttled(cfs_rq)) | ||
9241 | unthrottle_cfs_rq(cfs_rq); | ||
9242 | raw_spin_unlock_irq(&rq->lock); | ||
9243 | } | ||
9244 | out_unlock: | ||
9245 | mutex_unlock(&cfs_constraints_mutex); | ||
9246 | |||
9247 | return ret; | ||
9248 | } | ||
9249 | |||
9250 | int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) | ||
9251 | { | ||
9252 | u64 quota, period; | ||
9253 | |||
9254 | period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
9255 | if (cfs_quota_us < 0) | ||
9256 | quota = RUNTIME_INF; | ||
9257 | else | ||
9258 | quota = (u64)cfs_quota_us * NSEC_PER_USEC; | ||
9259 | |||
9260 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
9261 | } | ||
9262 | |||
9263 | long tg_get_cfs_quota(struct task_group *tg) | ||
9264 | { | ||
9265 | u64 quota_us; | ||
9266 | |||
9267 | if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) | ||
9268 | return -1; | ||
9269 | |||
9270 | quota_us = tg_cfs_bandwidth(tg)->quota; | ||
9271 | do_div(quota_us, NSEC_PER_USEC); | ||
9272 | |||
9273 | return quota_us; | ||
9274 | } | ||
9275 | |||
9276 | int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) | ||
9277 | { | ||
9278 | u64 quota, period; | ||
9279 | |||
9280 | period = (u64)cfs_period_us * NSEC_PER_USEC; | ||
9281 | quota = tg_cfs_bandwidth(tg)->quota; | ||
9282 | |||
9283 | if (period <= 0) | ||
9284 | return -EINVAL; | ||
9285 | |||
9286 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
9287 | } | ||
9288 | |||
9289 | long tg_get_cfs_period(struct task_group *tg) | ||
9290 | { | ||
9291 | u64 cfs_period_us; | ||
9292 | |||
9293 | cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
9294 | do_div(cfs_period_us, NSEC_PER_USEC); | ||
9295 | |||
9296 | return cfs_period_us; | ||
9297 | } | ||
9298 | |||
9299 | static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) | ||
9300 | { | ||
9301 | return tg_get_cfs_quota(cgroup_tg(cgrp)); | ||
9302 | } | ||
9303 | |||
9304 | static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, | ||
9305 | s64 cfs_quota_us) | ||
9306 | { | ||
9307 | return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); | ||
9308 | } | ||
9309 | |||
9310 | static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) | ||
9311 | { | ||
9312 | return tg_get_cfs_period(cgroup_tg(cgrp)); | ||
9313 | } | ||
9314 | |||
9315 | static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, | ||
9316 | u64 cfs_period_us) | ||
9317 | { | ||
9318 | return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); | ||
9319 | } | ||
9320 | |||
9321 | struct cfs_schedulable_data { | ||
9322 | struct task_group *tg; | ||
9323 | u64 period, quota; | ||
9324 | }; | ||
9325 | |||
9326 | /* | ||
9327 | * normalize group quota/period to be quota/max_period | ||
9328 | * note: units are usecs | ||
9329 | */ | ||
9330 | static u64 normalize_cfs_quota(struct task_group *tg, | ||
9331 | struct cfs_schedulable_data *d) | ||
9332 | { | ||
9333 | u64 quota, period; | ||
9334 | |||
9335 | if (tg == d->tg) { | ||
9336 | period = d->period; | ||
9337 | quota = d->quota; | ||
9338 | } else { | ||
9339 | period = tg_get_cfs_period(tg); | ||
9340 | quota = tg_get_cfs_quota(tg); | ||
9341 | } | ||
9342 | |||
9343 | /* note: these should typically be equivalent */ | ||
9344 | if (quota == RUNTIME_INF || quota == -1) | ||
9345 | return RUNTIME_INF; | ||
9346 | |||
9347 | return to_ratio(period, quota); | ||
9348 | } | ||
9349 | |||
9350 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | ||
9351 | { | ||
9352 | struct cfs_schedulable_data *d = data; | ||
9353 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9354 | s64 quota = 0, parent_quota = -1; | ||
9355 | |||
9356 | if (!tg->parent) { | ||
9357 | quota = RUNTIME_INF; | ||
9358 | } else { | ||
9359 | struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); | ||
9360 | |||
9361 | quota = normalize_cfs_quota(tg, d); | ||
9362 | parent_quota = parent_b->hierarchal_quota; | ||
9363 | |||
9364 | /* | ||
9365 | * ensure max(child_quota) <= parent_quota, inherit when no | ||
9366 | * limit is set | ||
9367 | */ | ||
9368 | if (quota == RUNTIME_INF) | ||
9369 | quota = parent_quota; | ||
9370 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) | ||
9371 | return -EINVAL; | ||
9372 | } | ||
9373 | cfs_b->hierarchal_quota = quota; | ||
9374 | |||
9375 | return 0; | ||
9376 | } | ||
9377 | |||
9378 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | ||
9379 | { | ||
9380 | int ret; | ||
9381 | struct cfs_schedulable_data data = { | ||
9382 | .tg = tg, | ||
9383 | .period = period, | ||
9384 | .quota = quota, | ||
9385 | }; | ||
9386 | |||
9387 | if (quota != RUNTIME_INF) { | ||
9388 | do_div(data.period, NSEC_PER_USEC); | ||
9389 | do_div(data.quota, NSEC_PER_USEC); | ||
9390 | } | ||
9391 | |||
9392 | rcu_read_lock(); | ||
9393 | ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); | ||
9394 | rcu_read_unlock(); | ||
9395 | |||
9396 | return ret; | ||
9397 | } | ||
9398 | |||
9399 | static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
9400 | struct cgroup_map_cb *cb) | ||
9401 | { | ||
9402 | struct task_group *tg = cgroup_tg(cgrp); | ||
9403 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9404 | |||
9405 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | ||
9406 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | ||
9407 | cb->fill(cb, "throttled_time", cfs_b->throttled_time); | ||
9408 | |||
9409 | return 0; | ||
9410 | } | ||
9411 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
9028 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 9412 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
9029 | 9413 | ||
9030 | #ifdef CONFIG_RT_GROUP_SCHED | 9414 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -9059,6 +9443,22 @@ static struct cftype cpu_files[] = { | |||
9059 | .write_u64 = cpu_shares_write_u64, | 9443 | .write_u64 = cpu_shares_write_u64, |
9060 | }, | 9444 | }, |
9061 | #endif | 9445 | #endif |
9446 | #ifdef CONFIG_CFS_BANDWIDTH | ||
9447 | { | ||
9448 | .name = "cfs_quota_us", | ||
9449 | .read_s64 = cpu_cfs_quota_read_s64, | ||
9450 | .write_s64 = cpu_cfs_quota_write_s64, | ||
9451 | }, | ||
9452 | { | ||
9453 | .name = "cfs_period_us", | ||
9454 | .read_u64 = cpu_cfs_period_read_u64, | ||
9455 | .write_u64 = cpu_cfs_period_write_u64, | ||
9456 | }, | ||
9457 | { | ||
9458 | .name = "stat", | ||
9459 | .read_map = cpu_stats_show, | ||
9460 | }, | ||
9461 | #endif | ||
9062 | #ifdef CONFIG_RT_GROUP_SCHED | 9462 | #ifdef CONFIG_RT_GROUP_SCHED |
9063 | { | 9463 | { |
9064 | .name = "rt_runtime_us", | 9464 | .name = "rt_runtime_us", |
@@ -9368,4 +9768,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9368 | .subsys_id = cpuacct_subsys_id, | 9768 | .subsys_id = cpuacct_subsys_id, |
9369 | }; | 9769 | }; |
9370 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9770 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9371 | |||
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 2722dc1b4138..a86cf9d9eb11 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
@@ -47,9 +47,6 @@ static int convert_prio(int prio) | |||
47 | return cpupri; | 47 | return cpupri; |
48 | } | 48 | } |
49 | 49 | ||
50 | #define for_each_cpupri_active(array, idx) \ | ||
51 | for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) | ||
52 | |||
53 | /** | 50 | /** |
54 | * cpupri_find - find the best (lowest-pri) CPU in the system | 51 | * cpupri_find - find the best (lowest-pri) CPU in the system |
55 | * @cp: The cpupri context | 52 | * @cp: The cpupri context |
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
71 | int idx = 0; | 68 | int idx = 0; |
72 | int task_pri = convert_prio(p->prio); | 69 | int task_pri = convert_prio(p->prio); |
73 | 70 | ||
74 | for_each_cpupri_active(cp->pri_active, idx) { | 71 | if (task_pri >= MAX_RT_PRIO) |
75 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; | 72 | return 0; |
76 | 73 | ||
77 | if (idx >= task_pri) | 74 | for (idx = 0; idx < task_pri; idx++) { |
78 | break; | 75 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; |
76 | int skip = 0; | ||
77 | |||
78 | if (!atomic_read(&(vec)->count)) | ||
79 | skip = 1; | ||
80 | /* | ||
81 | * When looking at the vector, we need to read the counter, | ||
82 | * do a memory barrier, then read the mask. | ||
83 | * | ||
84 | * Note: This is still all racey, but we can deal with it. | ||
85 | * Ideally, we only want to look at masks that are set. | ||
86 | * | ||
87 | * If a mask is not set, then the only thing wrong is that we | ||
88 | * did a little more work than necessary. | ||
89 | * | ||
90 | * If we read a zero count but the mask is set, because of the | ||
91 | * memory barriers, that can only happen when the highest prio | ||
92 | * task for a run queue has left the run queue, in which case, | ||
93 | * it will be followed by a pull. If the task we are processing | ||
94 | * fails to find a proper place to go, that pull request will | ||
95 | * pull this task if the run queue is running at a lower | ||
96 | * priority. | ||
97 | */ | ||
98 | smp_rmb(); | ||
99 | |||
100 | /* Need to do the rmb for every iteration */ | ||
101 | if (skip) | ||
102 | continue; | ||
79 | 103 | ||
80 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) | 104 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) |
81 | continue; | 105 | continue; |
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
115 | { | 139 | { |
116 | int *currpri = &cp->cpu_to_pri[cpu]; | 140 | int *currpri = &cp->cpu_to_pri[cpu]; |
117 | int oldpri = *currpri; | 141 | int oldpri = *currpri; |
118 | unsigned long flags; | 142 | int do_mb = 0; |
119 | 143 | ||
120 | newpri = convert_prio(newpri); | 144 | newpri = convert_prio(newpri); |
121 | 145 | ||
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
128 | * If the cpu was currently mapped to a different value, we | 152 | * If the cpu was currently mapped to a different value, we |
129 | * need to map it to the new value then remove the old value. | 153 | * need to map it to the new value then remove the old value. |
130 | * Note, we must add the new value first, otherwise we risk the | 154 | * Note, we must add the new value first, otherwise we risk the |
131 | * cpu being cleared from pri_active, and this cpu could be | 155 | * cpu being missed by the priority loop in cpupri_find. |
132 | * missed for a push or pull. | ||
133 | */ | 156 | */ |
134 | if (likely(newpri != CPUPRI_INVALID)) { | 157 | if (likely(newpri != CPUPRI_INVALID)) { |
135 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; | 158 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; |
136 | 159 | ||
137 | raw_spin_lock_irqsave(&vec->lock, flags); | ||
138 | |||
139 | cpumask_set_cpu(cpu, vec->mask); | 160 | cpumask_set_cpu(cpu, vec->mask); |
140 | vec->count++; | 161 | /* |
141 | if (vec->count == 1) | 162 | * When adding a new vector, we update the mask first, |
142 | set_bit(newpri, cp->pri_active); | 163 | * do a write memory barrier, and then update the count, to |
143 | 164 | * make sure the vector is visible when count is set. | |
144 | raw_spin_unlock_irqrestore(&vec->lock, flags); | 165 | */ |
166 | smp_mb__before_atomic_inc(); | ||
167 | atomic_inc(&(vec)->count); | ||
168 | do_mb = 1; | ||
145 | } | 169 | } |
146 | if (likely(oldpri != CPUPRI_INVALID)) { | 170 | if (likely(oldpri != CPUPRI_INVALID)) { |
147 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; | 171 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; |
148 | 172 | ||
149 | raw_spin_lock_irqsave(&vec->lock, flags); | 173 | /* |
150 | 174 | * Because the order of modification of the vec->count | |
151 | vec->count--; | 175 | * is important, we must make sure that the update |
152 | if (!vec->count) | 176 | * of the new prio is seen before we decrement the |
153 | clear_bit(oldpri, cp->pri_active); | 177 | * old prio. This makes sure that the loop sees |
178 | * one or the other when we raise the priority of | ||
179 | * the run queue. We don't care about when we lower the | ||
180 | * priority, as that will trigger an rt pull anyway. | ||
181 | * | ||
182 | * We only need to do a memory barrier if we updated | ||
183 | * the new priority vec. | ||
184 | */ | ||
185 | if (do_mb) | ||
186 | smp_mb__after_atomic_inc(); | ||
187 | |||
188 | /* | ||
189 | * When removing from the vector, we decrement the counter first | ||
190 | * do a memory barrier and then clear the mask. | ||
191 | */ | ||
192 | atomic_dec(&(vec)->count); | ||
193 | smp_mb__after_atomic_inc(); | ||
154 | cpumask_clear_cpu(cpu, vec->mask); | 194 | cpumask_clear_cpu(cpu, vec->mask); |
155 | |||
156 | raw_spin_unlock_irqrestore(&vec->lock, flags); | ||
157 | } | 195 | } |
158 | 196 | ||
159 | *currpri = newpri; | 197 | *currpri = newpri; |
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp) | |||
175 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { | 213 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { |
176 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; | 214 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; |
177 | 215 | ||
178 | raw_spin_lock_init(&vec->lock); | 216 | atomic_set(&vec->count, 0); |
179 | vec->count = 0; | ||
180 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) | 217 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) |
181 | goto cleanup; | 218 | goto cleanup; |
182 | } | 219 | } |
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 9fc7d386fea4..f6d756173491 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h | |||
@@ -4,7 +4,6 @@ | |||
4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
5 | 5 | ||
6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) | 6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) |
7 | #define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) | ||
8 | 7 | ||
9 | #define CPUPRI_INVALID -1 | 8 | #define CPUPRI_INVALID -1 |
10 | #define CPUPRI_IDLE 0 | 9 | #define CPUPRI_IDLE 0 |
@@ -12,14 +11,12 @@ | |||
12 | /* values 2-101 are RT priorities 0-99 */ | 11 | /* values 2-101 are RT priorities 0-99 */ |
13 | 12 | ||
14 | struct cpupri_vec { | 13 | struct cpupri_vec { |
15 | raw_spinlock_t lock; | 14 | atomic_t count; |
16 | int count; | 15 | cpumask_var_t mask; |
17 | cpumask_var_t mask; | ||
18 | }; | 16 | }; |
19 | 17 | ||
20 | struct cpupri { | 18 | struct cpupri { |
21 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | 19 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; |
22 | long pri_active[CPUPRI_NR_PRI_WORDS]; | ||
23 | int cpu_to_pri[NR_CPUS]; | 20 | int cpu_to_pri[NR_CPUS]; |
24 | }; | 21 | }; |
25 | 22 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bc8ee9993814..5c9e67923b7c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
89 | */ | 89 | */ |
90 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | 90 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; |
91 | 91 | ||
92 | #ifdef CONFIG_CFS_BANDWIDTH | ||
93 | /* | ||
94 | * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool | ||
95 | * each time a cfs_rq requests quota. | ||
96 | * | ||
97 | * Note: in the case that the slice exceeds the runtime remaining (either due | ||
98 | * to consumption or the quota being specified to be smaller than the slice) | ||
99 | * we will always only issue the remaining available time. | ||
100 | * | ||
101 | * default: 5 msec, units: microseconds | ||
102 | */ | ||
103 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | ||
104 | #endif | ||
105 | |||
92 | static const struct sched_class fair_sched_class; | 106 | static const struct sched_class fair_sched_class; |
93 | 107 | ||
94 | /************************************************************** | 108 | /************************************************************** |
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) | |||
292 | 306 | ||
293 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 307 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
294 | 308 | ||
309 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
310 | unsigned long delta_exec); | ||
295 | 311 | ||
296 | /************************************************************** | 312 | /************************************************************** |
297 | * Scheduling class tree data structure manipulation methods: | 313 | * Scheduling class tree data structure manipulation methods: |
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
583 | cpuacct_charge(curtask, delta_exec); | 599 | cpuacct_charge(curtask, delta_exec); |
584 | account_group_exec_runtime(curtask, delta_exec); | 600 | account_group_exec_runtime(curtask, delta_exec); |
585 | } | 601 | } |
602 | |||
603 | account_cfs_rq_runtime(cfs_rq, delta_exec); | ||
586 | } | 604 | } |
587 | 605 | ||
588 | static inline void | 606 | static inline void |
@@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
688 | } | 706 | } |
689 | 707 | ||
690 | #ifdef CONFIG_FAIR_GROUP_SCHED | 708 | #ifdef CONFIG_FAIR_GROUP_SCHED |
709 | /* we need this in update_cfs_load and load-balance functions below */ | ||
710 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | ||
691 | # ifdef CONFIG_SMP | 711 | # ifdef CONFIG_SMP |
692 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | 712 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, |
693 | int global_update) | 713 | int global_update) |
@@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
710 | u64 now, delta; | 730 | u64 now, delta; |
711 | unsigned long load = cfs_rq->load.weight; | 731 | unsigned long load = cfs_rq->load.weight; |
712 | 732 | ||
713 | if (cfs_rq->tg == &root_task_group) | 733 | if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) |
714 | return; | 734 | return; |
715 | 735 | ||
716 | now = rq_of(cfs_rq)->clock_task; | 736 | now = rq_of(cfs_rq)->clock_task; |
@@ -819,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
819 | 839 | ||
820 | tg = cfs_rq->tg; | 840 | tg = cfs_rq->tg; |
821 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | 841 | se = tg->se[cpu_of(rq_of(cfs_rq))]; |
822 | if (!se) | 842 | if (!se || throttled_hierarchy(cfs_rq)) |
823 | return; | 843 | return; |
824 | #ifndef CONFIG_SMP | 844 | #ifndef CONFIG_SMP |
825 | if (likely(se->load.weight == tg->shares)) | 845 | if (likely(se->load.weight == tg->shares)) |
@@ -950,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
950 | se->vruntime = vruntime; | 970 | se->vruntime = vruntime; |
951 | } | 971 | } |
952 | 972 | ||
973 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); | ||
974 | |||
953 | static void | 975 | static void |
954 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 976 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
955 | { | 977 | { |
@@ -979,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
979 | __enqueue_entity(cfs_rq, se); | 1001 | __enqueue_entity(cfs_rq, se); |
980 | se->on_rq = 1; | 1002 | se->on_rq = 1; |
981 | 1003 | ||
982 | if (cfs_rq->nr_running == 1) | 1004 | if (cfs_rq->nr_running == 1) { |
983 | list_add_leaf_cfs_rq(cfs_rq); | 1005 | list_add_leaf_cfs_rq(cfs_rq); |
1006 | check_enqueue_throttle(cfs_rq); | ||
1007 | } | ||
984 | } | 1008 | } |
985 | 1009 | ||
986 | static void __clear_buddies_last(struct sched_entity *se) | 1010 | static void __clear_buddies_last(struct sched_entity *se) |
@@ -1028,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1028 | __clear_buddies_skip(se); | 1052 | __clear_buddies_skip(se); |
1029 | } | 1053 | } |
1030 | 1054 | ||
1055 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); | ||
1056 | |||
1031 | static void | 1057 | static void |
1032 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 1058 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
1033 | { | 1059 | { |
@@ -1066,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1066 | if (!(flags & DEQUEUE_SLEEP)) | 1092 | if (!(flags & DEQUEUE_SLEEP)) |
1067 | se->vruntime -= cfs_rq->min_vruntime; | 1093 | se->vruntime -= cfs_rq->min_vruntime; |
1068 | 1094 | ||
1095 | /* return excess runtime on last dequeue */ | ||
1096 | return_cfs_rq_runtime(cfs_rq); | ||
1097 | |||
1069 | update_min_vruntime(cfs_rq); | 1098 | update_min_vruntime(cfs_rq); |
1070 | update_cfs_shares(cfs_rq); | 1099 | update_cfs_shares(cfs_rq); |
1071 | } | 1100 | } |
@@ -1077,6 +1106,8 @@ static void | |||
1077 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 1106 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
1078 | { | 1107 | { |
1079 | unsigned long ideal_runtime, delta_exec; | 1108 | unsigned long ideal_runtime, delta_exec; |
1109 | struct sched_entity *se; | ||
1110 | s64 delta; | ||
1080 | 1111 | ||
1081 | ideal_runtime = sched_slice(cfs_rq, curr); | 1112 | ideal_runtime = sched_slice(cfs_rq, curr); |
1082 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 1113 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
@@ -1095,22 +1126,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
1095 | * narrow margin doesn't have to wait for a full slice. | 1126 | * narrow margin doesn't have to wait for a full slice. |
1096 | * This also mitigates buddy induced latencies under load. | 1127 | * This also mitigates buddy induced latencies under load. |
1097 | */ | 1128 | */ |
1098 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
1099 | return; | ||
1100 | |||
1101 | if (delta_exec < sysctl_sched_min_granularity) | 1129 | if (delta_exec < sysctl_sched_min_granularity) |
1102 | return; | 1130 | return; |
1103 | 1131 | ||
1104 | if (cfs_rq->nr_running > 1) { | 1132 | se = __pick_first_entity(cfs_rq); |
1105 | struct sched_entity *se = __pick_first_entity(cfs_rq); | 1133 | delta = curr->vruntime - se->vruntime; |
1106 | s64 delta = curr->vruntime - se->vruntime; | ||
1107 | 1134 | ||
1108 | if (delta < 0) | 1135 | if (delta < 0) |
1109 | return; | 1136 | return; |
1110 | 1137 | ||
1111 | if (delta > ideal_runtime) | 1138 | if (delta > ideal_runtime) |
1112 | resched_task(rq_of(cfs_rq)->curr); | 1139 | resched_task(rq_of(cfs_rq)->curr); |
1113 | } | ||
1114 | } | 1140 | } |
1115 | 1141 | ||
1116 | static void | 1142 | static void |
@@ -1185,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
1185 | return se; | 1211 | return se; |
1186 | } | 1212 | } |
1187 | 1213 | ||
1214 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); | ||
1215 | |||
1188 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | 1216 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) |
1189 | { | 1217 | { |
1190 | /* | 1218 | /* |
@@ -1194,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
1194 | if (prev->on_rq) | 1222 | if (prev->on_rq) |
1195 | update_curr(cfs_rq); | 1223 | update_curr(cfs_rq); |
1196 | 1224 | ||
1225 | /* throttle cfs_rqs exceeding runtime */ | ||
1226 | check_cfs_rq_runtime(cfs_rq); | ||
1227 | |||
1197 | check_spread(cfs_rq, prev); | 1228 | check_spread(cfs_rq, prev); |
1198 | if (prev->on_rq) { | 1229 | if (prev->on_rq) { |
1199 | update_stats_wait_start(cfs_rq, prev); | 1230 | update_stats_wait_start(cfs_rq, prev); |
@@ -1233,10 +1264,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
1233 | return; | 1264 | return; |
1234 | #endif | 1265 | #endif |
1235 | 1266 | ||
1236 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) | 1267 | if (cfs_rq->nr_running > 1) |
1237 | check_preempt_tick(cfs_rq, curr); | 1268 | check_preempt_tick(cfs_rq, curr); |
1238 | } | 1269 | } |
1239 | 1270 | ||
1271 | |||
1272 | /************************************************** | ||
1273 | * CFS bandwidth control machinery | ||
1274 | */ | ||
1275 | |||
1276 | #ifdef CONFIG_CFS_BANDWIDTH | ||
1277 | /* | ||
1278 | * default period for cfs group bandwidth. | ||
1279 | * default: 0.1s, units: nanoseconds | ||
1280 | */ | ||
1281 | static inline u64 default_cfs_period(void) | ||
1282 | { | ||
1283 | return 100000000ULL; | ||
1284 | } | ||
1285 | |||
1286 | static inline u64 sched_cfs_bandwidth_slice(void) | ||
1287 | { | ||
1288 | return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; | ||
1289 | } | ||
1290 | |||
1291 | /* | ||
1292 | * Replenish runtime according to assigned quota and update expiration time. | ||
1293 | * We use sched_clock_cpu directly instead of rq->clock to avoid adding | ||
1294 | * additional synchronization around rq->lock. | ||
1295 | * | ||
1296 | * requires cfs_b->lock | ||
1297 | */ | ||
1298 | static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | ||
1299 | { | ||
1300 | u64 now; | ||
1301 | |||
1302 | if (cfs_b->quota == RUNTIME_INF) | ||
1303 | return; | ||
1304 | |||
1305 | now = sched_clock_cpu(smp_processor_id()); | ||
1306 | cfs_b->runtime = cfs_b->quota; | ||
1307 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); | ||
1308 | } | ||
1309 | |||
1310 | /* returns 0 on failure to allocate runtime */ | ||
1311 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1312 | { | ||
1313 | struct task_group *tg = cfs_rq->tg; | ||
1314 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
1315 | u64 amount = 0, min_amount, expires; | ||
1316 | |||
1317 | /* note: this is a positive sum as runtime_remaining <= 0 */ | ||
1318 | min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; | ||
1319 | |||
1320 | raw_spin_lock(&cfs_b->lock); | ||
1321 | if (cfs_b->quota == RUNTIME_INF) | ||
1322 | amount = min_amount; | ||
1323 | else { | ||
1324 | /* | ||
1325 | * If the bandwidth pool has become inactive, then at least one | ||
1326 | * period must have elapsed since the last consumption. | ||
1327 | * Refresh the global state and ensure bandwidth timer becomes | ||
1328 | * active. | ||
1329 | */ | ||
1330 | if (!cfs_b->timer_active) { | ||
1331 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
1332 | __start_cfs_bandwidth(cfs_b); | ||
1333 | } | ||
1334 | |||
1335 | if (cfs_b->runtime > 0) { | ||
1336 | amount = min(cfs_b->runtime, min_amount); | ||
1337 | cfs_b->runtime -= amount; | ||
1338 | cfs_b->idle = 0; | ||
1339 | } | ||
1340 | } | ||
1341 | expires = cfs_b->runtime_expires; | ||
1342 | raw_spin_unlock(&cfs_b->lock); | ||
1343 | |||
1344 | cfs_rq->runtime_remaining += amount; | ||
1345 | /* | ||
1346 | * we may have advanced our local expiration to account for allowed | ||
1347 | * spread between our sched_clock and the one on which runtime was | ||
1348 | * issued. | ||
1349 | */ | ||
1350 | if ((s64)(expires - cfs_rq->runtime_expires) > 0) | ||
1351 | cfs_rq->runtime_expires = expires; | ||
1352 | |||
1353 | return cfs_rq->runtime_remaining > 0; | ||
1354 | } | ||
1355 | |||
1356 | /* | ||
1357 | * Note: This depends on the synchronization provided by sched_clock and the | ||
1358 | * fact that rq->clock snapshots this value. | ||
1359 | */ | ||
1360 | static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1361 | { | ||
1362 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1363 | struct rq *rq = rq_of(cfs_rq); | ||
1364 | |||
1365 | /* if the deadline is ahead of our clock, nothing to do */ | ||
1366 | if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) | ||
1367 | return; | ||
1368 | |||
1369 | if (cfs_rq->runtime_remaining < 0) | ||
1370 | return; | ||
1371 | |||
1372 | /* | ||
1373 | * If the local deadline has passed we have to consider the | ||
1374 | * possibility that our sched_clock is 'fast' and the global deadline | ||
1375 | * has not truly expired. | ||
1376 | * | ||
1377 | * Fortunately we can determine whether this is the case by checking | |
1378 | * whether the global deadline has advanced. | ||
1379 | */ | ||
1380 | |||
1381 | if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { | ||
1382 | /* extend local deadline, drift is bounded above by 2 ticks */ | ||
1383 | cfs_rq->runtime_expires += TICK_NSEC; | ||
1384 | } else { | ||
1385 | /* global deadline is ahead, expiration has passed */ | ||
1386 | cfs_rq->runtime_remaining = 0; | ||
1387 | } | ||
1388 | } | ||
1389 | |||
1390 | static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
1391 | unsigned long delta_exec) | ||
1392 | { | ||
1393 | /* dock delta_exec before expiring quota (as it could span periods) */ | ||
1394 | cfs_rq->runtime_remaining -= delta_exec; | ||
1395 | expire_cfs_rq_runtime(cfs_rq); | ||
1396 | |||
1397 | if (likely(cfs_rq->runtime_remaining > 0)) | ||
1398 | return; | ||
1399 | |||
1400 | /* | ||
1401 | * if we're unable to extend our runtime we resched so that the active | ||
1402 | * hierarchy can be throttled | ||
1403 | */ | ||
1404 | if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) | ||
1405 | resched_task(rq_of(cfs_rq)->curr); | ||
1406 | } | ||
1407 | |||
1408 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
1409 | unsigned long delta_exec) | ||
1410 | { | ||
1411 | if (!cfs_rq->runtime_enabled) | ||
1412 | return; | ||
1413 | |||
1414 | __account_cfs_rq_runtime(cfs_rq, delta_exec); | ||
1415 | } | ||
1416 | |||
1417 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | ||
1418 | { | ||
1419 | return cfs_rq->throttled; | ||
1420 | } | ||
1421 | |||
1422 | /* check whether cfs_rq, or any parent, is throttled */ | ||
1423 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | ||
1424 | { | ||
1425 | return cfs_rq->throttle_count; | ||
1426 | } | ||
1427 | |||
1428 | /* | ||
1429 | * Ensure that neither of the group entities corresponding to src_cpu or | ||
1430 | * dest_cpu are members of a throttled hierarchy when performing group | ||
1431 | * load-balance operations. | ||
1432 | */ | ||
1433 | static inline int throttled_lb_pair(struct task_group *tg, | ||
1434 | int src_cpu, int dest_cpu) | ||
1435 | { | ||
1436 | struct cfs_rq *src_cfs_rq, *dest_cfs_rq; | ||
1437 | |||
1438 | src_cfs_rq = tg->cfs_rq[src_cpu]; | ||
1439 | dest_cfs_rq = tg->cfs_rq[dest_cpu]; | ||
1440 | |||
1441 | return throttled_hierarchy(src_cfs_rq) || | ||
1442 | throttled_hierarchy(dest_cfs_rq); | ||
1443 | } | ||
1444 | |||
1445 | /* updated child weight may affect parent so we have to do this bottom up */ | ||
1446 | static int tg_unthrottle_up(struct task_group *tg, void *data) | ||
1447 | { | ||
1448 | struct rq *rq = data; | ||
1449 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
1450 | |||
1451 | cfs_rq->throttle_count--; | ||
1452 | #ifdef CONFIG_SMP | ||
1453 | if (!cfs_rq->throttle_count) { | ||
1454 | u64 delta = rq->clock_task - cfs_rq->load_stamp; | ||
1455 | |||
1456 | /* leaving throttled state, advance shares averaging windows */ | ||
1457 | cfs_rq->load_stamp += delta; | ||
1458 | cfs_rq->load_last += delta; | ||
1459 | |||
1460 | /* update entity weight now that we are on_rq again */ | ||
1461 | update_cfs_shares(cfs_rq); | ||
1462 | } | ||
1463 | #endif | ||
1464 | |||
1465 | return 0; | ||
1466 | } | ||
1467 | |||
1468 | static int tg_throttle_down(struct task_group *tg, void *data) | ||
1469 | { | ||
1470 | struct rq *rq = data; | ||
1471 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
1472 | |||
1473 | /* group is entering throttled state, record last load */ | ||
1474 | if (!cfs_rq->throttle_count) | ||
1475 | update_cfs_load(cfs_rq, 0); | ||
1476 | cfs_rq->throttle_count++; | ||
1477 | |||
1478 | return 0; | ||
1479 | } | ||
1480 | |||
1481 | static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | ||
1482 | { | ||
1483 | struct rq *rq = rq_of(cfs_rq); | ||
1484 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1485 | struct sched_entity *se; | ||
1486 | long task_delta, dequeue = 1; | ||
1487 | |||
1488 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | ||
1489 | |||
1490 | /* account load preceding throttle */ | ||
1491 | rcu_read_lock(); | ||
1492 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); | ||
1493 | rcu_read_unlock(); | ||
1494 | |||
1495 | task_delta = cfs_rq->h_nr_running; | ||
1496 | for_each_sched_entity(se) { | ||
1497 | struct cfs_rq *qcfs_rq = cfs_rq_of(se); | ||
1498 | /* throttled entity or throttle-on-deactivate */ | ||
1499 | if (!se->on_rq) | ||
1500 | break; | ||
1501 | |||
1502 | if (dequeue) | ||
1503 | dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); | ||
1504 | qcfs_rq->h_nr_running -= task_delta; | ||
1505 | |||
1506 | if (qcfs_rq->load.weight) | ||
1507 | dequeue = 0; | ||
1508 | } | ||
1509 | |||
1510 | if (!se) | ||
1511 | rq->nr_running -= task_delta; | ||
1512 | |||
1513 | cfs_rq->throttled = 1; | ||
1514 | cfs_rq->throttled_timestamp = rq->clock; | ||
1515 | raw_spin_lock(&cfs_b->lock); | ||
1516 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | ||
1517 | raw_spin_unlock(&cfs_b->lock); | ||
1518 | } | ||
1519 | |||
1520 | static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | ||
1521 | { | ||
1522 | struct rq *rq = rq_of(cfs_rq); | ||
1523 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1524 | struct sched_entity *se; | ||
1525 | int enqueue = 1; | ||
1526 | long task_delta; | ||
1527 | |||
1528 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | ||
1529 | |||
1530 | cfs_rq->throttled = 0; | ||
1531 | raw_spin_lock(&cfs_b->lock); | ||
1532 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; | ||
1533 | list_del_rcu(&cfs_rq->throttled_list); | ||
1534 | raw_spin_unlock(&cfs_b->lock); | ||
1535 | cfs_rq->throttled_timestamp = 0; | ||
1536 | |||
1537 | update_rq_clock(rq); | ||
1538 | /* update hierarchical throttle state */ | ||
1539 | walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); | ||
1540 | |||
1541 | if (!cfs_rq->load.weight) | ||
1542 | return; | ||
1543 | |||
1544 | task_delta = cfs_rq->h_nr_running; | ||
1545 | for_each_sched_entity(se) { | ||
1546 | if (se->on_rq) | ||
1547 | enqueue = 0; | ||
1548 | |||
1549 | cfs_rq = cfs_rq_of(se); | ||
1550 | if (enqueue) | ||
1551 | enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); | ||
1552 | cfs_rq->h_nr_running += task_delta; | ||
1553 | |||
1554 | if (cfs_rq_throttled(cfs_rq)) | ||
1555 | break; | ||
1556 | } | ||
1557 | |||
1558 | if (!se) | ||
1559 | rq->nr_running += task_delta; | ||
1560 | |||
1561 | /* determine whether we need to wake up potentially idle cpu */ | ||
1562 | if (rq->curr == rq->idle && rq->cfs.nr_running) | ||
1563 | resched_task(rq->curr); | ||
1564 | } | ||
1565 | |||
1566 | static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | ||
1567 | u64 remaining, u64 expires) | ||
1568 | { | ||
1569 | struct cfs_rq *cfs_rq; | ||
1570 | u64 runtime = remaining; | ||
1571 | |||
1572 | rcu_read_lock(); | ||
1573 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, | ||
1574 | throttled_list) { | ||
1575 | struct rq *rq = rq_of(cfs_rq); | ||
1576 | |||
1577 | raw_spin_lock(&rq->lock); | ||
1578 | if (!cfs_rq_throttled(cfs_rq)) | ||
1579 | goto next; | ||
1580 | |||
1581 | runtime = -cfs_rq->runtime_remaining + 1; | ||
1582 | if (runtime > remaining) | ||
1583 | runtime = remaining; | ||
1584 | remaining -= runtime; | ||
1585 | |||
1586 | cfs_rq->runtime_remaining += runtime; | ||
1587 | cfs_rq->runtime_expires = expires; | ||
1588 | |||
1589 | /* we check whether we're throttled above */ | ||
1590 | if (cfs_rq->runtime_remaining > 0) | ||
1591 | unthrottle_cfs_rq(cfs_rq); | ||
1592 | |||
1593 | next: | ||
1594 | raw_spin_unlock(&rq->lock); | ||
1595 | |||
1596 | if (!remaining) | ||
1597 | break; | ||
1598 | } | ||
1599 | rcu_read_unlock(); | ||
1600 | |||
1601 | return remaining; | ||
1602 | } | ||
1603 | |||
1604 | /* | ||
1605 | * Responsible for refilling a task_group's bandwidth and unthrottling its | ||
1606 | * cfs_rqs as appropriate. If there has been no activity within the last | ||
1607 | * period the timer is deactivated until scheduling resumes; cfs_b->idle is | ||
1608 | * used to track this state. | ||
1609 | */ | ||
1610 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | ||
1611 | { | ||
1612 | u64 runtime, runtime_expires; | ||
1613 | int idle = 1, throttled; | ||
1614 | |||
1615 | raw_spin_lock(&cfs_b->lock); | ||
1616 | /* no need to continue the timer with no bandwidth constraint */ | ||
1617 | if (cfs_b->quota == RUNTIME_INF) | ||
1618 | goto out_unlock; | ||
1619 | |||
1620 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | ||
1621 | /* idle depends on !throttled (for the case of a large deficit) */ | ||
1622 | idle = cfs_b->idle && !throttled; | ||
1623 | cfs_b->nr_periods += overrun; | ||
1624 | |||
1625 | /* if we're going inactive then everything else can be deferred */ | ||
1626 | if (idle) | ||
1627 | goto out_unlock; | ||
1628 | |||
1629 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
1630 | |||
1631 | if (!throttled) { | ||
1632 | /* mark as potentially idle for the upcoming period */ | ||
1633 | cfs_b->idle = 1; | ||
1634 | goto out_unlock; | ||
1635 | } | ||
1636 | |||
1637 | /* account preceding periods in which throttling occurred */ | ||
1638 | cfs_b->nr_throttled += overrun; | ||
1639 | |||
1640 | /* | ||
1641 | * There are throttled entities so we must first use the new bandwidth | ||
1642 | * to unthrottle them before making it generally available. This | ||
1643 | * ensures that all existing debts will be paid before a new cfs_rq is | ||
1644 | * allowed to run. | ||
1645 | */ | ||
1646 | runtime = cfs_b->runtime; | ||
1647 | runtime_expires = cfs_b->runtime_expires; | ||
1648 | cfs_b->runtime = 0; | ||
1649 | |||
1650 | /* | ||
1651 | * This check is repeated as we are holding onto the new bandwidth | ||
1652 | * while we unthrottle. This can potentially race with an unthrottled | ||
1653 | * group trying to acquire new bandwidth from the global pool. | ||
1654 | */ | ||
1655 | while (throttled && runtime > 0) { | ||
1656 | raw_spin_unlock(&cfs_b->lock); | ||
1657 | /* we can't nest cfs_b->lock while distributing bandwidth */ | ||
1658 | runtime = distribute_cfs_runtime(cfs_b, runtime, | ||
1659 | runtime_expires); | ||
1660 | raw_spin_lock(&cfs_b->lock); | ||
1661 | |||
1662 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | ||
1663 | } | ||
1664 | |||
1665 | /* return (any) remaining runtime */ | ||
1666 | cfs_b->runtime = runtime; | ||
1667 | /* | ||
1668 | * While we are ensured activity in the period following an | ||
1669 | * unthrottle, this also covers the case in which the new bandwidth is | ||
1670 | * insufficient to cover the existing bandwidth deficit. (Forcing the | ||
1671 | * timer to remain active while there are any throttled entities.) | ||
1672 | */ | ||
1673 | cfs_b->idle = 0; | ||
1674 | out_unlock: | ||
1675 | if (idle) | ||
1676 | cfs_b->timer_active = 0; | ||
1677 | raw_spin_unlock(&cfs_b->lock); | ||
1678 | |||
1679 | return idle; | ||
1680 | } | ||
1681 | |||
1682 | /* a cfs_rq won't donate quota below this amount */ | ||
1683 | static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; | ||
1684 | /* minimum remaining period time to redistribute slack quota */ | ||
1685 | static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; | ||
1686 | /* how long we wait to gather additional slack before distributing */ | ||
1687 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; | ||
1688 | |||
1689 | /* are we near the end of the current quota period? */ | ||
1690 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) | ||
1691 | { | ||
1692 | struct hrtimer *refresh_timer = &cfs_b->period_timer; | ||
1693 | u64 remaining; | ||
1694 | |||
1695 | /* if the call-back is running a quota refresh is already occurring */ | ||
1696 | if (hrtimer_callback_running(refresh_timer)) | ||
1697 | return 1; | ||
1698 | |||
1699 | /* is a quota refresh about to occur? */ | ||
1700 | remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); | ||
1701 | if (remaining < min_expire) | ||
1702 | return 1; | ||
1703 | |||
1704 | return 0; | ||
1705 | } | ||
1706 | |||
1707 | static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) | ||
1708 | { | ||
1709 | u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; | ||
1710 | |||
1711 | /* if there's a quota refresh soon don't bother with slack */ | ||
1712 | if (runtime_refresh_within(cfs_b, min_left)) | ||
1713 | return; | ||
1714 | |||
1715 | start_bandwidth_timer(&cfs_b->slack_timer, | ||
1716 | ns_to_ktime(cfs_bandwidth_slack_period)); | ||
1717 | } | ||
1718 | |||
1719 | /* we know any runtime found here is valid as update_curr() precedes return */ | ||
1720 | static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1721 | { | ||
1722 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1723 | s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; | ||
1724 | |||
1725 | if (slack_runtime <= 0) | ||
1726 | return; | ||
1727 | |||
1728 | raw_spin_lock(&cfs_b->lock); | ||
1729 | if (cfs_b->quota != RUNTIME_INF && | ||
1730 | cfs_rq->runtime_expires == cfs_b->runtime_expires) { | ||
1731 | cfs_b->runtime += slack_runtime; | ||
1732 | |||
1733 | /* we are under rq->lock, defer unthrottling using a timer */ | ||
1734 | if (cfs_b->runtime > sched_cfs_bandwidth_slice() && | ||
1735 | !list_empty(&cfs_b->throttled_cfs_rq)) | ||
1736 | start_cfs_slack_bandwidth(cfs_b); | ||
1737 | } | ||
1738 | raw_spin_unlock(&cfs_b->lock); | ||
1739 | |||
1740 | /* even if it's not valid for return we don't want to try again */ | ||
1741 | cfs_rq->runtime_remaining -= slack_runtime; | ||
1742 | } | ||
1743 | |||
1744 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1745 | { | ||
1746 | if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) | ||
1747 | return; | ||
1748 | |||
1749 | __return_cfs_rq_runtime(cfs_rq); | ||
1750 | } | ||
1751 | |||
1752 | /* | ||
1753 | * This is done with a timer (instead of inline with bandwidth return) since | ||
1754 | * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. | ||
1755 | */ | ||
1756 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | ||
1757 | { | ||
1758 | u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); | ||
1759 | u64 expires; | ||
1760 | |||
1761 | /* confirm we're still not at a refresh boundary */ | ||
1762 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) | ||
1763 | return; | ||
1764 | |||
1765 | raw_spin_lock(&cfs_b->lock); | ||
1766 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | ||
1767 | runtime = cfs_b->runtime; | ||
1768 | cfs_b->runtime = 0; | ||
1769 | } | ||
1770 | expires = cfs_b->runtime_expires; | ||
1771 | raw_spin_unlock(&cfs_b->lock); | ||
1772 | |||
1773 | if (!runtime) | ||
1774 | return; | ||
1775 | |||
1776 | runtime = distribute_cfs_runtime(cfs_b, runtime, expires); | ||
1777 | |||
1778 | raw_spin_lock(&cfs_b->lock); | ||
1779 | if (expires == cfs_b->runtime_expires) | ||
1780 | cfs_b->runtime = runtime; | ||
1781 | raw_spin_unlock(&cfs_b->lock); | ||
1782 | } | ||
1783 | |||
1784 | /* | ||
1785 | * When a group wakes up we want to make sure that its quota is not already | ||
1786 | * expired/exceeded, otherwise it may be allowed to steal additional ticks of | ||
1787 | * runtime as update_curr() throttling can not trigger until it's on-rq. | |
1788 | */ | ||
1789 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | ||
1790 | { | ||
1791 | /* an active group must be handled by the update_curr()->put() path */ | ||
1792 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | ||
1793 | return; | ||
1794 | |||
1795 | /* ensure the group is not already throttled */ | ||
1796 | if (cfs_rq_throttled(cfs_rq)) | ||
1797 | return; | ||
1798 | |||
1799 | /* update runtime allocation */ | ||
1800 | account_cfs_rq_runtime(cfs_rq, 0); | ||
1801 | if (cfs_rq->runtime_remaining <= 0) | ||
1802 | throttle_cfs_rq(cfs_rq); | ||
1803 | } | ||
1804 | |||
1805 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | ||
1806 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1807 | { | ||
1808 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) | ||
1809 | return; | ||
1810 | |||
1811 | /* | ||
1812 | * it's possible for a throttled entity to be forced into a running | ||
1813 | * state (e.g. set_curr_task), in this case we're finished. | ||
1814 | */ | ||
1815 | if (cfs_rq_throttled(cfs_rq)) | ||
1816 | return; | ||
1817 | |||
1818 | throttle_cfs_rq(cfs_rq); | ||
1819 | } | ||
1820 | #else | ||
1821 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
1822 | unsigned long delta_exec) {} | ||
1823 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
1824 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | ||
1825 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
1826 | |||
1827 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | ||
1828 | { | ||
1829 | return 0; | ||
1830 | } | ||
1831 | |||
1832 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | ||
1833 | { | ||
1834 | return 0; | ||
1835 | } | ||
1836 | |||
1837 | static inline int throttled_lb_pair(struct task_group *tg, | ||
1838 | int src_cpu, int dest_cpu) | ||
1839 | { | ||
1840 | return 0; | ||
1841 | } | ||
1842 | #endif | ||
1843 | |||
1240 | /************************************************** | 1844 | /************************************************** |
1241 | * CFS operations on tasks: | 1845 | * CFS operations on tasks: |
1242 | */ | 1846 | */ |
@@ -1313,16 +1917,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1313 | break; | 1917 | break; |
1314 | cfs_rq = cfs_rq_of(se); | 1918 | cfs_rq = cfs_rq_of(se); |
1315 | enqueue_entity(cfs_rq, se, flags); | 1919 | enqueue_entity(cfs_rq, se, flags); |
1920 | |||
1921 | /* | ||
1922 | * end evaluation on encountering a throttled cfs_rq | ||
1923 | * | ||
1924 | * note: in the case of encountering a throttled cfs_rq we will | ||
1925 | * post the final h_nr_running increment below. | ||
1926 | */ | ||
1927 | if (cfs_rq_throttled(cfs_rq)) | ||
1928 | break; | ||
1929 | cfs_rq->h_nr_running++; | ||
1930 | |||
1316 | flags = ENQUEUE_WAKEUP; | 1931 | flags = ENQUEUE_WAKEUP; |
1317 | } | 1932 | } |
1318 | 1933 | ||
1319 | for_each_sched_entity(se) { | 1934 | for_each_sched_entity(se) { |
1320 | cfs_rq = cfs_rq_of(se); | 1935 | cfs_rq = cfs_rq_of(se); |
1936 | cfs_rq->h_nr_running++; | ||
1937 | |||
1938 | if (cfs_rq_throttled(cfs_rq)) | ||
1939 | break; | ||
1321 | 1940 | ||
1322 | update_cfs_load(cfs_rq, 0); | 1941 | update_cfs_load(cfs_rq, 0); |
1323 | update_cfs_shares(cfs_rq); | 1942 | update_cfs_shares(cfs_rq); |
1324 | } | 1943 | } |
1325 | 1944 | ||
1945 | if (!se) | ||
1946 | inc_nr_running(rq); | ||
1326 | hrtick_update(rq); | 1947 | hrtick_update(rq); |
1327 | } | 1948 | } |
1328 | 1949 | ||
@@ -1343,6 +1964,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1343 | cfs_rq = cfs_rq_of(se); | 1964 | cfs_rq = cfs_rq_of(se); |
1344 | dequeue_entity(cfs_rq, se, flags); | 1965 | dequeue_entity(cfs_rq, se, flags); |
1345 | 1966 | ||
1967 | /* | ||
1968 | * end evaluation on encountering a throttled cfs_rq | ||
1969 | * | ||
1970 | * note: in the case of encountering a throttled cfs_rq we will | ||
1971 | * post the final h_nr_running decrement below. | ||
1972 | */ | ||
1973 | if (cfs_rq_throttled(cfs_rq)) | ||
1974 | break; | ||
1975 | cfs_rq->h_nr_running--; | ||
1976 | |||
1346 | /* Don't dequeue parent if it has other entities besides us */ | 1977 | /* Don't dequeue parent if it has other entities besides us */ |
1347 | if (cfs_rq->load.weight) { | 1978 | if (cfs_rq->load.weight) { |
1348 | /* | 1979 | /* |
@@ -1361,11 +1992,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1361 | 1992 | ||
1362 | for_each_sched_entity(se) { | 1993 | for_each_sched_entity(se) { |
1363 | cfs_rq = cfs_rq_of(se); | 1994 | cfs_rq = cfs_rq_of(se); |
1995 | cfs_rq->h_nr_running--; | ||
1996 | |||
1997 | if (cfs_rq_throttled(cfs_rq)) | ||
1998 | break; | ||
1364 | 1999 | ||
1365 | update_cfs_load(cfs_rq, 0); | 2000 | update_cfs_load(cfs_rq, 0); |
1366 | update_cfs_shares(cfs_rq); | 2001 | update_cfs_shares(cfs_rq); |
1367 | } | 2002 | } |
1368 | 2003 | ||
2004 | if (!se) | ||
2005 | dec_nr_running(rq); | ||
1369 | hrtick_update(rq); | 2006 | hrtick_update(rq); |
1370 | } | 2007 | } |
1371 | 2008 | ||
@@ -1434,7 +2071,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
1434 | 2071 | ||
1435 | return wl; | 2072 | return wl; |
1436 | } | 2073 | } |
1437 | |||
1438 | #else | 2074 | #else |
1439 | 2075 | ||
1440 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | 2076 | static inline unsigned long effective_load(struct task_group *tg, int cpu, |
@@ -1547,7 +2183,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
1547 | 2183 | ||
1548 | /* Skip over this group if it has no CPUs allowed */ | 2184 | /* Skip over this group if it has no CPUs allowed */ |
1549 | if (!cpumask_intersects(sched_group_cpus(group), | 2185 | if (!cpumask_intersects(sched_group_cpus(group), |
1550 | &p->cpus_allowed)) | 2186 | tsk_cpus_allowed(p))) |
1551 | continue; | 2187 | continue; |
1552 | 2188 | ||
1553 | local_group = cpumask_test_cpu(this_cpu, | 2189 | local_group = cpumask_test_cpu(this_cpu, |
@@ -1593,7 +2229,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
1593 | int i; | 2229 | int i; |
1594 | 2230 | ||
1595 | /* Traverse only the allowed CPUs */ | 2231 | /* Traverse only the allowed CPUs */ |
1596 | for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { | 2232 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
1597 | load = weighted_cpuload(i); | 2233 | load = weighted_cpuload(i); |
1598 | 2234 | ||
1599 | if (load < min_load || (load == min_load && i == this_cpu)) { | 2235 | if (load < min_load || (load == min_load && i == this_cpu)) { |
@@ -1637,7 +2273,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1637 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | 2273 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) |
1638 | break; | 2274 | break; |
1639 | 2275 | ||
1640 | for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { | 2276 | for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) { |
1641 | if (idle_cpu(i)) { | 2277 | if (idle_cpu(i)) { |
1642 | target = i; | 2278 | target = i; |
1643 | break; | 2279 | break; |
@@ -1680,7 +2316,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
1680 | int sync = wake_flags & WF_SYNC; | 2316 | int sync = wake_flags & WF_SYNC; |
1681 | 2317 | ||
1682 | if (sd_flag & SD_BALANCE_WAKE) { | 2318 | if (sd_flag & SD_BALANCE_WAKE) { |
1683 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) | 2319 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
1684 | want_affine = 1; | 2320 | want_affine = 1; |
1685 | new_cpu = prev_cpu; | 2321 | new_cpu = prev_cpu; |
1686 | } | 2322 | } |
@@ -1875,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1875 | if (unlikely(se == pse)) | 2511 | if (unlikely(se == pse)) |
1876 | return; | 2512 | return; |
1877 | 2513 | ||
2514 | /* | ||
2515 | * This is possible from callers such as pull_task(), in which we | ||
2516 | * unconditionally check_preempt_curr() after an enqueue (which may have | ||
2517 | * led to a throttle). This both saves work and prevents false | ||
2518 | * next-buddy nomination below. | ||
2519 | */ | ||
2520 | if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) | ||
2521 | return; | ||
2522 | |||
1878 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { | 2523 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { |
1879 | set_next_buddy(pse); | 2524 | set_next_buddy(pse); |
1880 | next_buddy_marked = 1; | 2525 | next_buddy_marked = 1; |
@@ -1883,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1883 | /* | 2528 | /* |
1884 | * We can come here with TIF_NEED_RESCHED already set from new task | 2529 | * We can come here with TIF_NEED_RESCHED already set from new task |
1885 | * wake up path. | 2530 | * wake up path. |
2531 | * | ||
2532 | * Note: this also catches the edge-case of curr being in a throttled | ||
2533 | * group (e.g. via set_curr_task), since update_curr() (in the | ||
2534 | * enqueue of curr) will have resulted in resched being set. This | ||
2535 | * prevents us from potentially nominating it as a false LAST_BUDDY | ||
2536 | * below. | ||
1886 | */ | 2537 | */ |
1887 | if (test_tsk_need_resched(curr)) | 2538 | if (test_tsk_need_resched(curr)) |
1888 | return; | 2539 | return; |
@@ -1899,10 +2550,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1899 | if (unlikely(p->policy != SCHED_NORMAL)) | 2550 | if (unlikely(p->policy != SCHED_NORMAL)) |
1900 | return; | 2551 | return; |
1901 | 2552 | ||
1902 | |||
1903 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
1904 | return; | ||
1905 | |||
1906 | find_matching_se(&se, &pse); | 2553 | find_matching_se(&se, &pse); |
1907 | update_curr(cfs_rq_of(se)); | 2554 | update_curr(cfs_rq_of(se)); |
1908 | BUG_ON(!pse); | 2555 | BUG_ON(!pse); |
@@ -2005,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
2005 | { | 2652 | { |
2006 | struct sched_entity *se = &p->se; | 2653 | struct sched_entity *se = &p->se; |
2007 | 2654 | ||
2008 | if (!se->on_rq) | 2655 | /* throttled hierarchies are not runnable */ |
2656 | if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) | ||
2009 | return false; | 2657 | return false; |
2010 | 2658 | ||
2011 | /* Tell the scheduler that we'd really like pse to run next. */ | 2659 | /* Tell the scheduler that we'd really like pse to run next. */ |
@@ -2049,7 +2697,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2049 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 2697 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
2050 | * 3) are cache-hot on their current CPU. | 2698 | * 3) are cache-hot on their current CPU. |
2051 | */ | 2699 | */ |
2052 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | 2700 | if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { |
2053 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 2701 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
2054 | return 0; | 2702 | return 0; |
2055 | } | 2703 | } |
@@ -2102,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2102 | 2750 | ||
2103 | for_each_leaf_cfs_rq(busiest, cfs_rq) { | 2751 | for_each_leaf_cfs_rq(busiest, cfs_rq) { |
2104 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { | 2752 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { |
2753 | if (throttled_lb_pair(task_group(p), | ||
2754 | busiest->cpu, this_cpu)) | ||
2755 | break; | ||
2105 | 2756 | ||
2106 | if (!can_migrate_task(p, busiest, this_cpu, | 2757 | if (!can_migrate_task(p, busiest, this_cpu, |
2107 | sd, idle, &pinned)) | 2758 | sd, idle, &pinned)) |
@@ -2217,8 +2868,13 @@ static void update_shares(int cpu) | |||
2217 | * Iterates the task_group tree in a bottom up fashion, see | 2868 | * Iterates the task_group tree in a bottom up fashion, see |
2218 | * list_add_leaf_cfs_rq() for details. | 2869 | * list_add_leaf_cfs_rq() for details. |
2219 | */ | 2870 | */ |
2220 | for_each_leaf_cfs_rq(rq, cfs_rq) | 2871 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
2872 | /* throttled entities do not contribute to load */ | ||
2873 | if (throttled_hierarchy(cfs_rq)) | ||
2874 | continue; | ||
2875 | |||
2221 | update_shares_cpu(cfs_rq->tg, cpu); | 2876 | update_shares_cpu(cfs_rq->tg, cpu); |
2877 | } | ||
2222 | rcu_read_unlock(); | 2878 | rcu_read_unlock(); |
2223 | } | 2879 | } |
2224 | 2880 | ||
@@ -2268,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2268 | u64 rem_load, moved_load; | 2924 | u64 rem_load, moved_load; |
2269 | 2925 | ||
2270 | /* | 2926 | /* |
2271 | * empty group | 2927 | * empty group or part of a throttled hierarchy |
2272 | */ | 2928 | */ |
2273 | if (!busiest_cfs_rq->task_weight) | 2929 | if (!busiest_cfs_rq->task_weight || |
2930 | throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) | ||
2274 | continue; | 2931 | continue; |
2275 | 2932 | ||
2276 | rem_load = (u64)rem_load_move * busiest_weight; | 2933 | rem_load = (u64)rem_load_move * busiest_weight; |
@@ -3430,7 +4087,7 @@ redo: | |||
3430 | * moved to this_cpu | 4087 | * moved to this_cpu |
3431 | */ | 4088 | */ |
3432 | if (!cpumask_test_cpu(this_cpu, | 4089 | if (!cpumask_test_cpu(this_cpu, |
3433 | &busiest->curr->cpus_allowed)) { | 4090 | tsk_cpus_allowed(busiest->curr))) { |
3434 | raw_spin_unlock_irqrestore(&busiest->lock, | 4091 | raw_spin_unlock_irqrestore(&busiest->lock, |
3435 | flags); | 4092 | flags); |
3436 | all_pinned = 1; | 4093 | all_pinned = 1; |
@@ -3612,22 +4269,6 @@ out_unlock: | |||
3612 | } | 4269 | } |
3613 | 4270 | ||
3614 | #ifdef CONFIG_NO_HZ | 4271 | #ifdef CONFIG_NO_HZ |
3615 | |||
3616 | static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); | ||
3617 | |||
3618 | static void trigger_sched_softirq(void *data) | ||
3619 | { | ||
3620 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
3621 | } | ||
3622 | |||
3623 | static inline void init_sched_softirq_csd(struct call_single_data *csd) | ||
3624 | { | ||
3625 | csd->func = trigger_sched_softirq; | ||
3626 | csd->info = NULL; | ||
3627 | csd->flags = 0; | ||
3628 | csd->priv = 0; | ||
3629 | } | ||
3630 | |||
3631 | /* | 4272 | /* |
3632 | * idle load balancing details | 4273 | * idle load balancing details |
3633 | * - One of the idle CPUs nominates itself as idle load_balancer, while | 4274 | * - One of the idle CPUs nominates itself as idle load_balancer, while |
@@ -3667,7 +4308,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
3667 | struct sched_domain *sd; | 4308 | struct sched_domain *sd; |
3668 | 4309 | ||
3669 | for_each_domain(cpu, sd) | 4310 | for_each_domain(cpu, sd) |
3670 | if (sd && (sd->flags & flag)) | 4311 | if (sd->flags & flag) |
3671 | break; | 4312 | break; |
3672 | 4313 | ||
3673 | return sd; | 4314 | return sd; |
@@ -3793,11 +4434,16 @@ static void nohz_balancer_kick(int cpu) | |||
3793 | } | 4434 | } |
3794 | 4435 | ||
3795 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | 4436 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { |
3796 | struct call_single_data *cp; | ||
3797 | |||
3798 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | 4437 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; |
3799 | cp = &per_cpu(remote_sched_softirq_cb, cpu); | 4438 | |
3800 | __smp_call_function_single(ilb_cpu, cp, 0); | 4439 | smp_mb(); |
4440 | /* | ||
4441 | * Use smp_send_reschedule() instead of resched_cpu(). | ||
4442 | * This way we generate a sched IPI on the target cpu which | ||
4443 | * is idle. And the softirq performing nohz idle load balance | ||
4444 | * will be run before returning from the IPI. | ||
4445 | */ | ||
4446 | smp_send_reschedule(ilb_cpu); | ||
3801 | } | 4447 | } |
3802 | return; | 4448 | return; |
3803 | } | 4449 | } |
@@ -4030,7 +4676,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
4030 | if (time_before(now, nohz.next_balance)) | 4676 | if (time_before(now, nohz.next_balance)) |
4031 | return 0; | 4677 | return 0; |
4032 | 4678 | ||
4033 | if (rq->idle_at_tick) | 4679 | if (idle_cpu(cpu)) |
4034 | return 0; | 4680 | return 0; |
4035 | 4681 | ||
4036 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | 4682 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); |
@@ -4066,7 +4712,7 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
4066 | { | 4712 | { |
4067 | int this_cpu = smp_processor_id(); | 4713 | int this_cpu = smp_processor_id(); |
4068 | struct rq *this_rq = cpu_rq(this_cpu); | 4714 | struct rq *this_rq = cpu_rq(this_cpu); |
4069 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | 4715 | enum cpu_idle_type idle = this_rq->idle_balance ? |
4070 | CPU_IDLE : CPU_NOT_IDLE; | 4716 | CPU_IDLE : CPU_NOT_IDLE; |
4071 | 4717 | ||
4072 | rebalance_domains(this_cpu, idle); | 4718 | rebalance_domains(this_cpu, idle); |
@@ -4251,8 +4897,13 @@ static void set_curr_task_fair(struct rq *rq) | |||
4251 | { | 4897 | { |
4252 | struct sched_entity *se = &rq->curr->se; | 4898 | struct sched_entity *se = &rq->curr->se; |
4253 | 4899 | ||
4254 | for_each_sched_entity(se) | 4900 | for_each_sched_entity(se) { |
4255 | set_next_entity(cfs_rq_of(se), se); | 4901 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
4902 | |||
4903 | set_next_entity(cfs_rq, se); | ||
4904 | /* ensure bandwidth has been allocated on our new cfs_rq */ | ||
4905 | account_cfs_rq_runtime(cfs_rq, 0); | ||
4906 | } | ||
4256 | } | 4907 | } |
4257 | 4908 | ||
4258 | #ifdef CONFIG_FAIR_GROUP_SCHED | 4909 | #ifdef CONFIG_FAIR_GROUP_SCHED |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 2e74677cb040..efa0a7b75dde 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) | |||
12 | SCHED_FEAT(START_DEBIT, 1) | 12 | SCHED_FEAT(START_DEBIT, 1) |
13 | 13 | ||
14 | /* | 14 | /* |
15 | * Should wakeups try to preempt running tasks. | ||
16 | */ | ||
17 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | ||
18 | |||
19 | /* | ||
20 | * Based on load and program behaviour, see if it makes sense to place | 15 | * Based on load and program behaviour, see if it makes sense to place |
21 | * a newly woken task on the same cpu as the task that woke it -- | 16 | * a newly woken task on the same cpu as the task that woke it -- |
22 | * improve cache locality. Typically used with SYNC wakeups as | 17 | * improve cache locality. Typically used with SYNC wakeups as |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 97540f0c9e47..056cbd2e2a27 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
124 | update_rt_migration(rt_rq); | 124 | update_rt_migration(rt_rq); |
125 | } | 125 | } |
126 | 126 | ||
127 | static inline int has_pushable_tasks(struct rq *rq) | ||
128 | { | ||
129 | return !plist_head_empty(&rq->rt.pushable_tasks); | ||
130 | } | ||
131 | |||
127 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) | 132 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) |
128 | { | 133 | { |
129 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 134 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
130 | plist_node_init(&p->pushable_tasks, p->prio); | 135 | plist_node_init(&p->pushable_tasks, p->prio); |
131 | plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); | 136 | plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); |
137 | |||
138 | /* Update the highest prio pushable task */ | ||
139 | if (p->prio < rq->rt.highest_prio.next) | ||
140 | rq->rt.highest_prio.next = p->prio; | ||
132 | } | 141 | } |
133 | 142 | ||
134 | static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) | 143 | static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) |
135 | { | 144 | { |
136 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 145 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
137 | } | ||
138 | 146 | ||
139 | static inline int has_pushable_tasks(struct rq *rq) | 147 | /* Update the new highest prio pushable task */ |
140 | { | 148 | if (has_pushable_tasks(rq)) { |
141 | return !plist_head_empty(&rq->rt.pushable_tasks); | 149 | p = plist_first_entry(&rq->rt.pushable_tasks, |
150 | struct task_struct, pushable_tasks); | ||
151 | rq->rt.highest_prio.next = p->prio; | ||
152 | } else | ||
153 | rq->rt.highest_prio.next = MAX_RT_PRIO; | ||
142 | } | 154 | } |
143 | 155 | ||
144 | #else | 156 | #else |
@@ -643,6 +655,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
643 | 655 | ||
644 | if (rt_rq->rt_time > runtime) { | 656 | if (rt_rq->rt_time > runtime) { |
645 | rt_rq->rt_throttled = 1; | 657 | rt_rq->rt_throttled = 1; |
658 | printk_once(KERN_WARNING "sched: RT throttling activated\n"); | ||
646 | if (rt_rq_throttled(rt_rq)) { | 659 | if (rt_rq_throttled(rt_rq)) { |
647 | sched_rt_rq_dequeue(rt_rq); | 660 | sched_rt_rq_dequeue(rt_rq); |
648 | return 1; | 661 | return 1; |
@@ -698,47 +711,13 @@ static void update_curr_rt(struct rq *rq) | |||
698 | 711 | ||
699 | #if defined CONFIG_SMP | 712 | #if defined CONFIG_SMP |
700 | 713 | ||
701 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); | ||
702 | |||
703 | static inline int next_prio(struct rq *rq) | ||
704 | { | ||
705 | struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); | ||
706 | |||
707 | if (next && rt_prio(next->prio)) | ||
708 | return next->prio; | ||
709 | else | ||
710 | return MAX_RT_PRIO; | ||
711 | } | ||
712 | |||
713 | static void | 714 | static void |
714 | inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | 715 | inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) |
715 | { | 716 | { |
716 | struct rq *rq = rq_of_rt_rq(rt_rq); | 717 | struct rq *rq = rq_of_rt_rq(rt_rq); |
717 | 718 | ||
718 | if (prio < prev_prio) { | 719 | if (rq->online && prio < prev_prio) |
719 | 720 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); | |
720 | /* | ||
721 | * If the new task is higher in priority than anything on the | ||
722 | * run-queue, we know that the previous high becomes our | ||
723 | * next-highest. | ||
724 | */ | ||
725 | rt_rq->highest_prio.next = prev_prio; | ||
726 | |||
727 | if (rq->online) | ||
728 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); | ||
729 | |||
730 | } else if (prio == rt_rq->highest_prio.curr) | ||
731 | /* | ||
732 | * If the next task is equal in priority to the highest on | ||
733 | * the run-queue, then we implicitly know that the next highest | ||
734 | * task cannot be any lower than current | ||
735 | */ | ||
736 | rt_rq->highest_prio.next = prio; | ||
737 | else if (prio < rt_rq->highest_prio.next) | ||
738 | /* | ||
739 | * Otherwise, we need to recompute next-highest | ||
740 | */ | ||
741 | rt_rq->highest_prio.next = next_prio(rq); | ||
742 | } | 721 | } |
743 | 722 | ||
744 | static void | 723 | static void |
@@ -746,9 +725,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | |||
746 | { | 725 | { |
747 | struct rq *rq = rq_of_rt_rq(rt_rq); | 726 | struct rq *rq = rq_of_rt_rq(rt_rq); |
748 | 727 | ||
749 | if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) | ||
750 | rt_rq->highest_prio.next = next_prio(rq); | ||
751 | |||
752 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) | 728 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) |
753 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); | 729 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); |
754 | } | 730 | } |
@@ -961,6 +937,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
961 | 937 | ||
962 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 938 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) |
963 | enqueue_pushable_task(rq, p); | 939 | enqueue_pushable_task(rq, p); |
940 | |||
941 | inc_nr_running(rq); | ||
964 | } | 942 | } |
965 | 943 | ||
966 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | 944 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) |
@@ -971,6 +949,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
971 | dequeue_rt_entity(rt_se); | 949 | dequeue_rt_entity(rt_se); |
972 | 950 | ||
973 | dequeue_pushable_task(rq, p); | 951 | dequeue_pushable_task(rq, p); |
952 | |||
953 | dec_nr_running(rq); | ||
974 | } | 954 | } |
975 | 955 | ||
976 | /* | 956 | /* |
@@ -1017,10 +997,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1017 | struct rq *rq; | 997 | struct rq *rq; |
1018 | int cpu; | 998 | int cpu; |
1019 | 999 | ||
1020 | if (sd_flag != SD_BALANCE_WAKE) | ||
1021 | return smp_processor_id(); | ||
1022 | |||
1023 | cpu = task_cpu(p); | 1000 | cpu = task_cpu(p); |
1001 | |||
1002 | /* For anything but wake ups and forks, just return the task_cpu */ | ||
1003 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | ||
1004 | goto out; | ||
1005 | |||
1024 | rq = cpu_rq(cpu); | 1006 | rq = cpu_rq(cpu); |
1025 | 1007 | ||
1026 | rcu_read_lock(); | 1008 | rcu_read_lock(); |
@@ -1050,7 +1032,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1050 | */ | 1032 | */ |
1051 | if (curr && unlikely(rt_task(curr)) && | 1033 | if (curr && unlikely(rt_task(curr)) && |
1052 | (curr->rt.nr_cpus_allowed < 2 || | 1034 | (curr->rt.nr_cpus_allowed < 2 || |
1053 | curr->prio < p->prio) && | 1035 | curr->prio <= p->prio) && |
1054 | (p->rt.nr_cpus_allowed > 1)) { | 1036 | (p->rt.nr_cpus_allowed > 1)) { |
1055 | int target = find_lowest_rq(p); | 1037 | int target = find_lowest_rq(p); |
1056 | 1038 | ||
@@ -1059,6 +1041,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1059 | } | 1041 | } |
1060 | rcu_read_unlock(); | 1042 | rcu_read_unlock(); |
1061 | 1043 | ||
1044 | out: | ||
1062 | return cpu; | 1045 | return cpu; |
1063 | } | 1046 | } |
1064 | 1047 | ||
@@ -1178,7 +1161,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) | |||
1178 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | 1161 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
1179 | { | 1162 | { |
1180 | update_curr_rt(rq); | 1163 | update_curr_rt(rq); |
1181 | p->se.exec_start = 0; | ||
1182 | 1164 | ||
1183 | /* | 1165 | /* |
1184 | * The previous task needs to be made eligible for pushing | 1166 | * The previous task needs to be made eligible for pushing |
@@ -1198,7 +1180,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); | |||
1198 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | 1180 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
1199 | { | 1181 | { |
1200 | if (!task_running(rq, p) && | 1182 | if (!task_running(rq, p) && |
1201 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | 1183 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && |
1202 | (p->rt.nr_cpus_allowed > 1)) | 1184 | (p->rt.nr_cpus_allowed > 1)) |
1203 | return 1; | 1185 | return 1; |
1204 | return 0; | 1186 | return 0; |
@@ -1343,7 +1325,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1343 | */ | 1325 | */ |
1344 | if (unlikely(task_rq(task) != rq || | 1326 | if (unlikely(task_rq(task) != rq || |
1345 | !cpumask_test_cpu(lowest_rq->cpu, | 1327 | !cpumask_test_cpu(lowest_rq->cpu, |
1346 | &task->cpus_allowed) || | 1328 | tsk_cpus_allowed(task)) || |
1347 | task_running(rq, task) || | 1329 | task_running(rq, task) || |
1348 | !task->on_rq)) { | 1330 | !task->on_rq)) { |
1349 | 1331 | ||
@@ -1394,6 +1376,7 @@ static int push_rt_task(struct rq *rq) | |||
1394 | { | 1376 | { |
1395 | struct task_struct *next_task; | 1377 | struct task_struct *next_task; |
1396 | struct rq *lowest_rq; | 1378 | struct rq *lowest_rq; |
1379 | int ret = 0; | ||
1397 | 1380 | ||
1398 | if (!rq->rt.overloaded) | 1381 | if (!rq->rt.overloaded) |
1399 | return 0; | 1382 | return 0; |
@@ -1426,7 +1409,7 @@ retry: | |||
1426 | if (!lowest_rq) { | 1409 | if (!lowest_rq) { |
1427 | struct task_struct *task; | 1410 | struct task_struct *task; |
1428 | /* | 1411 | /* |
1429 | * find lock_lowest_rq releases rq->lock | 1412 | * find_lock_lowest_rq releases rq->lock |
1430 | * so it is possible that next_task has migrated. | 1413 | * so it is possible that next_task has migrated. |
1431 | * | 1414 | * |
1432 | * We need to make sure that the task is still on the same | 1415 | * We need to make sure that the task is still on the same |
@@ -1436,12 +1419,11 @@ retry: | |||
1436 | task = pick_next_pushable_task(rq); | 1419 | task = pick_next_pushable_task(rq); |
1437 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | 1420 | if (task_cpu(next_task) == rq->cpu && task == next_task) { |
1438 | /* | 1421 | /* |
1439 | * If we get here, the task hasn't moved at all, but | 1422 | * The task hasn't migrated, and is still the next |
1440 | * it has failed to push. We will not try again, | 1423 | * eligible task, but we failed to find a run-queue |
1441 | * since the other cpus will pull from us when they | 1424 | * to push it to. Do not retry in this case, since |
1442 | * are ready. | 1425 | * other cpus will pull from us when ready. |
1443 | */ | 1426 | */ |
1444 | dequeue_pushable_task(rq, next_task); | ||
1445 | goto out; | 1427 | goto out; |
1446 | } | 1428 | } |
1447 | 1429 | ||
@@ -1460,6 +1442,7 @@ retry: | |||
1460 | deactivate_task(rq, next_task, 0); | 1442 | deactivate_task(rq, next_task, 0); |
1461 | set_task_cpu(next_task, lowest_rq->cpu); | 1443 | set_task_cpu(next_task, lowest_rq->cpu); |
1462 | activate_task(lowest_rq, next_task, 0); | 1444 | activate_task(lowest_rq, next_task, 0); |
1445 | ret = 1; | ||
1463 | 1446 | ||
1464 | resched_task(lowest_rq->curr); | 1447 | resched_task(lowest_rq->curr); |
1465 | 1448 | ||
@@ -1468,7 +1451,7 @@ retry: | |||
1468 | out: | 1451 | out: |
1469 | put_task_struct(next_task); | 1452 | put_task_struct(next_task); |
1470 | 1453 | ||
1471 | return 1; | 1454 | return ret; |
1472 | } | 1455 | } |
1473 | 1456 | ||
1474 | static void push_rt_tasks(struct rq *rq) | 1457 | static void push_rt_tasks(struct rq *rq) |
@@ -1581,7 +1564,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1581 | p->rt.nr_cpus_allowed > 1 && | 1564 | p->rt.nr_cpus_allowed > 1 && |
1582 | rt_task(rq->curr) && | 1565 | rt_task(rq->curr) && |
1583 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1566 | (rq->curr->rt.nr_cpus_allowed < 2 || |
1584 | rq->curr->prio < p->prio)) | 1567 | rq->curr->prio <= p->prio)) |
1585 | push_rt_tasks(rq); | 1568 | push_rt_tasks(rq); |
1586 | } | 1569 | } |
1587 | 1570 | ||
@@ -1626,9 +1609,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
1626 | 1609 | ||
1627 | update_rt_migration(&rq->rt); | 1610 | update_rt_migration(&rq->rt); |
1628 | } | 1611 | } |
1629 | |||
1630 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
1631 | p->rt.nr_cpus_allowed = weight; | ||
1632 | } | 1612 | } |
1633 | 1613 | ||
1634 | /* Assumes rq->lock is held */ | 1614 | /* Assumes rq->lock is held */ |
@@ -1863,4 +1843,3 @@ static void print_rt_stats(struct seq_file *m, int cpu) | |||
1863 | rcu_read_unlock(); | 1843 | rcu_read_unlock(); |
1864 | } | 1844 | } |
1865 | #endif /* CONFIG_SCHED_DEBUG */ | 1845 | #endif /* CONFIG_SCHED_DEBUG */ |
1866 | |||
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 331e01bcd026..87f9e36ea56e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -282,10 +282,10 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
282 | if (!cputimer->running) | 282 | if (!cputimer->running) |
283 | return; | 283 | return; |
284 | 284 | ||
285 | spin_lock(&cputimer->lock); | 285 | raw_spin_lock(&cputimer->lock); |
286 | cputimer->cputime.utime = | 286 | cputimer->cputime.utime = |
287 | cputime_add(cputimer->cputime.utime, cputime); | 287 | cputime_add(cputimer->cputime.utime, cputime); |
288 | spin_unlock(&cputimer->lock); | 288 | raw_spin_unlock(&cputimer->lock); |
289 | } | 289 | } |
290 | 290 | ||
291 | /** | 291 | /** |
@@ -306,10 +306,10 @@ static inline void account_group_system_time(struct task_struct *tsk, | |||
306 | if (!cputimer->running) | 306 | if (!cputimer->running) |
307 | return; | 307 | return; |
308 | 308 | ||
309 | spin_lock(&cputimer->lock); | 309 | raw_spin_lock(&cputimer->lock); |
310 | cputimer->cputime.stime = | 310 | cputimer->cputime.stime = |
311 | cputime_add(cputimer->cputime.stime, cputime); | 311 | cputime_add(cputimer->cputime.stime, cputime); |
312 | spin_unlock(&cputimer->lock); | 312 | raw_spin_unlock(&cputimer->lock); |
313 | } | 313 | } |
314 | 314 | ||
315 | /** | 315 | /** |
@@ -330,7 +330,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, | |||
330 | if (!cputimer->running) | 330 | if (!cputimer->running) |
331 | return; | 331 | return; |
332 | 332 | ||
333 | spin_lock(&cputimer->lock); | 333 | raw_spin_lock(&cputimer->lock); |
334 | cputimer->cputime.sum_exec_runtime += ns; | 334 | cputimer->cputime.sum_exec_runtime += ns; |
335 | spin_unlock(&cputimer->lock); | 335 | raw_spin_unlock(&cputimer->lock); |
336 | } | 336 | } |
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 6f437632afab..8b44e7fa7fb3 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
34 | static void | 34 | static void |
35 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 35 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
36 | { | 36 | { |
37 | inc_nr_running(rq); | ||
37 | } | 38 | } |
38 | 39 | ||
39 | static void | 40 | static void |
40 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 41 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
41 | { | 42 | { |
43 | dec_nr_running(rq); | ||
42 | } | 44 | } |
43 | 45 | ||
44 | static void yield_task_stop(struct rq *rq) | 46 | static void yield_task_stop(struct rq *rq) |
diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 94a62c0d4ade..d831841e55a7 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c | |||
@@ -54,12 +54,12 @@ void down(struct semaphore *sem) | |||
54 | { | 54 | { |
55 | unsigned long flags; | 55 | unsigned long flags; |
56 | 56 | ||
57 | spin_lock_irqsave(&sem->lock, flags); | 57 | raw_spin_lock_irqsave(&sem->lock, flags); |
58 | if (likely(sem->count > 0)) | 58 | if (likely(sem->count > 0)) |
59 | sem->count--; | 59 | sem->count--; |
60 | else | 60 | else |
61 | __down(sem); | 61 | __down(sem); |
62 | spin_unlock_irqrestore(&sem->lock, flags); | 62 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
63 | } | 63 | } |
64 | EXPORT_SYMBOL(down); | 64 | EXPORT_SYMBOL(down); |
65 | 65 | ||
@@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem) | |||
77 | unsigned long flags; | 77 | unsigned long flags; |
78 | int result = 0; | 78 | int result = 0; |
79 | 79 | ||
80 | spin_lock_irqsave(&sem->lock, flags); | 80 | raw_spin_lock_irqsave(&sem->lock, flags); |
81 | if (likely(sem->count > 0)) | 81 | if (likely(sem->count > 0)) |
82 | sem->count--; | 82 | sem->count--; |
83 | else | 83 | else |
84 | result = __down_interruptible(sem); | 84 | result = __down_interruptible(sem); |
85 | spin_unlock_irqrestore(&sem->lock, flags); | 85 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
86 | 86 | ||
87 | return result; | 87 | return result; |
88 | } | 88 | } |
@@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem) | |||
103 | unsigned long flags; | 103 | unsigned long flags; |
104 | int result = 0; | 104 | int result = 0; |
105 | 105 | ||
106 | spin_lock_irqsave(&sem->lock, flags); | 106 | raw_spin_lock_irqsave(&sem->lock, flags); |
107 | if (likely(sem->count > 0)) | 107 | if (likely(sem->count > 0)) |
108 | sem->count--; | 108 | sem->count--; |
109 | else | 109 | else |
110 | result = __down_killable(sem); | 110 | result = __down_killable(sem); |
111 | spin_unlock_irqrestore(&sem->lock, flags); | 111 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
112 | 112 | ||
113 | return result; | 113 | return result; |
114 | } | 114 | } |
@@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem) | |||
132 | unsigned long flags; | 132 | unsigned long flags; |
133 | int count; | 133 | int count; |
134 | 134 | ||
135 | spin_lock_irqsave(&sem->lock, flags); | 135 | raw_spin_lock_irqsave(&sem->lock, flags); |
136 | count = sem->count - 1; | 136 | count = sem->count - 1; |
137 | if (likely(count >= 0)) | 137 | if (likely(count >= 0)) |
138 | sem->count = count; | 138 | sem->count = count; |
139 | spin_unlock_irqrestore(&sem->lock, flags); | 139 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
140 | 140 | ||
141 | return (count < 0); | 141 | return (count < 0); |
142 | } | 142 | } |
@@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies) | |||
157 | unsigned long flags; | 157 | unsigned long flags; |
158 | int result = 0; | 158 | int result = 0; |
159 | 159 | ||
160 | spin_lock_irqsave(&sem->lock, flags); | 160 | raw_spin_lock_irqsave(&sem->lock, flags); |
161 | if (likely(sem->count > 0)) | 161 | if (likely(sem->count > 0)) |
162 | sem->count--; | 162 | sem->count--; |
163 | else | 163 | else |
164 | result = __down_timeout(sem, jiffies); | 164 | result = __down_timeout(sem, jiffies); |
165 | spin_unlock_irqrestore(&sem->lock, flags); | 165 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
166 | 166 | ||
167 | return result; | 167 | return result; |
168 | } | 168 | } |
@@ -179,12 +179,12 @@ void up(struct semaphore *sem) | |||
179 | { | 179 | { |
180 | unsigned long flags; | 180 | unsigned long flags; |
181 | 181 | ||
182 | spin_lock_irqsave(&sem->lock, flags); | 182 | raw_spin_lock_irqsave(&sem->lock, flags); |
183 | if (likely(list_empty(&sem->wait_list))) | 183 | if (likely(list_empty(&sem->wait_list))) |
184 | sem->count++; | 184 | sem->count++; |
185 | else | 185 | else |
186 | __up(sem); | 186 | __up(sem); |
187 | spin_unlock_irqrestore(&sem->lock, flags); | 187 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
188 | } | 188 | } |
189 | EXPORT_SYMBOL(up); | 189 | EXPORT_SYMBOL(up); |
190 | 190 | ||
@@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state, | |||
217 | if (timeout <= 0) | 217 | if (timeout <= 0) |
218 | goto timed_out; | 218 | goto timed_out; |
219 | __set_task_state(task, state); | 219 | __set_task_state(task, state); |
220 | spin_unlock_irq(&sem->lock); | 220 | raw_spin_unlock_irq(&sem->lock); |
221 | timeout = schedule_timeout(timeout); | 221 | timeout = schedule_timeout(timeout); |
222 | spin_lock_irq(&sem->lock); | 222 | raw_spin_lock_irq(&sem->lock); |
223 | if (waiter.up) | 223 | if (waiter.up) |
224 | return 0; | 224 | return 0; |
225 | } | 225 | } |
diff --git a/kernel/signal.c b/kernel/signal.c index 291c9700be75..d252be2d3de5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1344,13 +1344,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid) | |||
1344 | return error; | 1344 | return error; |
1345 | } | 1345 | } |
1346 | 1346 | ||
1347 | static int kill_as_cred_perm(const struct cred *cred, | ||
1348 | struct task_struct *target) | ||
1349 | { | ||
1350 | const struct cred *pcred = __task_cred(target); | ||
1351 | if (cred->user_ns != pcred->user_ns) | ||
1352 | return 0; | ||
1353 | if (cred->euid != pcred->suid && cred->euid != pcred->uid && | ||
1354 | cred->uid != pcred->suid && cred->uid != pcred->uid) | ||
1355 | return 0; | ||
1356 | return 1; | ||
1357 | } | ||
1358 | |||
1347 | /* like kill_pid_info(), but doesn't use uid/euid of "current" */ | 1359 | /* like kill_pid_info(), but doesn't use uid/euid of "current" */ |
1348 | int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, | 1360 | int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, |
1349 | uid_t uid, uid_t euid, u32 secid) | 1361 | const struct cred *cred, u32 secid) |
1350 | { | 1362 | { |
1351 | int ret = -EINVAL; | 1363 | int ret = -EINVAL; |
1352 | struct task_struct *p; | 1364 | struct task_struct *p; |
1353 | const struct cred *pcred; | ||
1354 | unsigned long flags; | 1365 | unsigned long flags; |
1355 | 1366 | ||
1356 | if (!valid_signal(sig)) | 1367 | if (!valid_signal(sig)) |
@@ -1362,10 +1373,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, | |||
1362 | ret = -ESRCH; | 1373 | ret = -ESRCH; |
1363 | goto out_unlock; | 1374 | goto out_unlock; |
1364 | } | 1375 | } |
1365 | pcred = __task_cred(p); | 1376 | if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { |
1366 | if (si_fromuser(info) && | ||
1367 | euid != pcred->suid && euid != pcred->uid && | ||
1368 | uid != pcred->suid && uid != pcred->uid) { | ||
1369 | ret = -EPERM; | 1377 | ret = -EPERM; |
1370 | goto out_unlock; | 1378 | goto out_unlock; |
1371 | } | 1379 | } |
@@ -1384,7 +1392,7 @@ out_unlock: | |||
1384 | rcu_read_unlock(); | 1392 | rcu_read_unlock(); |
1385 | return ret; | 1393 | return ret; |
1386 | } | 1394 | } |
1387 | EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); | 1395 | EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); |
1388 | 1396 | ||
1389 | /* | 1397 | /* |
1390 | * kill_something_info() interprets pid in interesting ways just like kill(2). | 1398 | * kill_something_info() interprets pid in interesting ways just like kill(2). |
diff --git a/kernel/sys.c b/kernel/sys.c index 18ee1d2f6474..58459509b14c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1172,7 +1172,7 @@ DECLARE_RWSEM(uts_sem); | |||
1172 | static int override_release(char __user *release, int len) | 1172 | static int override_release(char __user *release, int len) |
1173 | { | 1173 | { |
1174 | int ret = 0; | 1174 | int ret = 0; |
1175 | char buf[len]; | 1175 | char buf[65]; |
1176 | 1176 | ||
1177 | if (current->personality & UNAME26) { | 1177 | if (current->personality & UNAME26) { |
1178 | char *rest = UTS_RELEASE; | 1178 | char *rest = UTS_RELEASE; |
@@ -1759,6 +1759,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1759 | sizeof(me->comm) - 1) < 0) | 1759 | sizeof(me->comm) - 1) < 0) |
1760 | return -EFAULT; | 1760 | return -EFAULT; |
1761 | set_task_comm(me, comm); | 1761 | set_task_comm(me, comm); |
1762 | proc_comm_connector(me); | ||
1762 | return 0; | 1763 | return 0; |
1763 | case PR_GET_NAME: | 1764 | case PR_GET_NAME: |
1764 | get_task_comm(comm, me); | 1765 | get_task_comm(comm, me); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 11d65b531e50..2d2ecdcc8cdb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = { | |||
379 | .extra2 = &one, | 379 | .extra2 = &one, |
380 | }, | 380 | }, |
381 | #endif | 381 | #endif |
382 | #ifdef CONFIG_CFS_BANDWIDTH | ||
383 | { | ||
384 | .procname = "sched_cfs_bandwidth_slice_us", | ||
385 | .data = &sysctl_sched_cfs_bandwidth_slice, | ||
386 | .maxlen = sizeof(unsigned int), | ||
387 | .mode = 0644, | ||
388 | .proc_handler = proc_dointvec_minmax, | ||
389 | .extra1 = &one, | ||
390 | }, | ||
391 | #endif | ||
382 | #ifdef CONFIG_PROVE_LOCKING | 392 | #ifdef CONFIG_PROVE_LOCKING |
383 | { | 393 | { |
384 | .procname = "prove_locking", | 394 | .procname = "prove_locking", |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e8bffbe2ba4b..6318b511afa1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = { | |||
214 | { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, | 214 | { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, |
215 | { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, | 215 | { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, |
216 | { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, | 216 | { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, |
217 | { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, | 217 | /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ |
218 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, | 218 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, |
219 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, | 219 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, |
220 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, | 220 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index e19ce1454ee1..e66046456f4f 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -655,6 +655,7 @@ static struct genl_ops taskstats_ops = { | |||
655 | .cmd = TASKSTATS_CMD_GET, | 655 | .cmd = TASKSTATS_CMD_GET, |
656 | .doit = taskstats_user_cmd, | 656 | .doit = taskstats_user_cmd, |
657 | .policy = taskstats_cmd_get_policy, | 657 | .policy = taskstats_cmd_get_policy, |
658 | .flags = GENL_ADMIN_PERM, | ||
658 | }; | 659 | }; |
659 | 660 | ||
660 | static struct genl_ops cgroupstats_ops = { | 661 | static struct genl_ops cgroupstats_ops = { |
diff --git a/kernel/time.c b/kernel/time.c index 8e8dc6d705c9..d77606214529 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval); | |||
575 | /* | 575 | /* |
576 | * Convert jiffies/jiffies_64 to clock_t and back. | 576 | * Convert jiffies/jiffies_64 to clock_t and back. |
577 | */ | 577 | */ |
578 | clock_t jiffies_to_clock_t(long x) | 578 | clock_t jiffies_to_clock_t(unsigned long x) |
579 | { | 579 | { |
580 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 | 580 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 |
581 | # if HZ < USER_HZ | 581 | # if HZ < USER_HZ |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f06a8a365648..b26c2228fe92 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -27,3 +27,5 @@ config GENERIC_CLOCKEVENTS_BUILD | |||
27 | default y | 27 | default y |
28 | depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR | 28 | depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR |
29 | 29 | ||
30 | config GENERIC_CLOCKEVENTS_MIN_ADJUST | ||
31 | bool | ||
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ea5e1a928d5b..c436e790b21b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -53,27 +53,6 @@ static struct rtc_device *rtcdev; | |||
53 | static DEFINE_SPINLOCK(rtcdev_lock); | 53 | static DEFINE_SPINLOCK(rtcdev_lock); |
54 | 54 | ||
55 | /** | 55 | /** |
56 | * has_wakealarm - check rtc device has wakealarm ability | ||
57 | * @dev: current device | ||
58 | * @name_ptr: name to be returned | ||
59 | * | ||
60 | * This helper function checks to see if the rtc device can wake | ||
61 | * from suspend. | ||
62 | */ | ||
63 | static int has_wakealarm(struct device *dev, void *name_ptr) | ||
64 | { | ||
65 | struct rtc_device *candidate = to_rtc_device(dev); | ||
66 | |||
67 | if (!candidate->ops->set_alarm) | ||
68 | return 0; | ||
69 | if (!device_may_wakeup(candidate->dev.parent)) | ||
70 | return 0; | ||
71 | |||
72 | *(const char **)name_ptr = dev_name(dev); | ||
73 | return 1; | ||
74 | } | ||
75 | |||
76 | /** | ||
77 | * alarmtimer_get_rtcdev - Return selected rtcdevice | 56 | * alarmtimer_get_rtcdev - Return selected rtcdevice |
78 | * | 57 | * |
79 | * This function returns the rtc device to use for wakealarms. | 58 | * This function returns the rtc device to use for wakealarms. |
@@ -82,37 +61,64 @@ static int has_wakealarm(struct device *dev, void *name_ptr) | |||
82 | */ | 61 | */ |
83 | static struct rtc_device *alarmtimer_get_rtcdev(void) | 62 | static struct rtc_device *alarmtimer_get_rtcdev(void) |
84 | { | 63 | { |
85 | struct device *dev; | ||
86 | char *str; | ||
87 | unsigned long flags; | 64 | unsigned long flags; |
88 | struct rtc_device *ret; | 65 | struct rtc_device *ret; |
89 | 66 | ||
90 | spin_lock_irqsave(&rtcdev_lock, flags); | 67 | spin_lock_irqsave(&rtcdev_lock, flags); |
91 | if (!rtcdev) { | ||
92 | /* Find an rtc device and init the rtc_timer */ | ||
93 | dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); | ||
94 | /* If we have a device then str is valid. See has_wakealarm() */ | ||
95 | if (dev) { | ||
96 | rtcdev = rtc_class_open(str); | ||
97 | /* | ||
98 | * Drop the reference we got in class_find_device, | ||
99 | * rtc_open takes its own. | ||
100 | */ | ||
101 | put_device(dev); | ||
102 | rtc_timer_init(&rtctimer, NULL, NULL); | ||
103 | } | ||
104 | } | ||
105 | ret = rtcdev; | 68 | ret = rtcdev; |
106 | spin_unlock_irqrestore(&rtcdev_lock, flags); | 69 | spin_unlock_irqrestore(&rtcdev_lock, flags); |
107 | 70 | ||
108 | return ret; | 71 | return ret; |
109 | } | 72 | } |
73 | |||
74 | |||
75 | static int alarmtimer_rtc_add_device(struct device *dev, | ||
76 | struct class_interface *class_intf) | ||
77 | { | ||
78 | unsigned long flags; | ||
79 | struct rtc_device *rtc = to_rtc_device(dev); | ||
80 | |||
81 | if (rtcdev) | ||
82 | return -EBUSY; | ||
83 | |||
84 | if (!rtc->ops->set_alarm) | ||
85 | return -1; | ||
86 | if (!device_may_wakeup(rtc->dev.parent)) | ||
87 | return -1; | ||
88 | |||
89 | spin_lock_irqsave(&rtcdev_lock, flags); | ||
90 | if (!rtcdev) { | ||
91 | rtcdev = rtc; | ||
92 | /* hold a reference so it doesn't go away */ | ||
93 | get_device(dev); | ||
94 | } | ||
95 | spin_unlock_irqrestore(&rtcdev_lock, flags); | ||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | static struct class_interface alarmtimer_rtc_interface = { | ||
100 | .add_dev = &alarmtimer_rtc_add_device, | ||
101 | }; | ||
102 | |||
103 | static int alarmtimer_rtc_interface_setup(void) | ||
104 | { | ||
105 | alarmtimer_rtc_interface.class = rtc_class; | ||
106 | return class_interface_register(&alarmtimer_rtc_interface); | ||
107 | } | ||
108 | static void alarmtimer_rtc_interface_remove(void) | ||
109 | { | ||
110 | class_interface_unregister(&alarmtimer_rtc_interface); | ||
111 | } | ||
110 | #else | 112 | #else |
111 | #define alarmtimer_get_rtcdev() (0) | 113 | static inline struct rtc_device *alarmtimer_get_rtcdev(void) |
112 | #define rtcdev (0) | 114 | { |
115 | return NULL; | ||
116 | } | ||
117 | #define rtcdev (NULL) | ||
118 | static inline int alarmtimer_rtc_interface_setup(void) { return 0; } | ||
119 | static inline void alarmtimer_rtc_interface_remove(void) { } | ||
113 | #endif | 120 | #endif |
114 | 121 | ||
115 | |||
116 | /** | 122 | /** |
117 | * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue | 123 | * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue |
118 | * @base: pointer to the base where the timer is being run | 124 | * @base: pointer to the base where the timer is being run |
@@ -126,6 +132,8 @@ static struct rtc_device *alarmtimer_get_rtcdev(void) | |||
126 | static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) | 132 | static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) |
127 | { | 133 | { |
128 | timerqueue_add(&base->timerqueue, &alarm->node); | 134 | timerqueue_add(&base->timerqueue, &alarm->node); |
135 | alarm->state |= ALARMTIMER_STATE_ENQUEUED; | ||
136 | |||
129 | if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { | 137 | if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { |
130 | hrtimer_try_to_cancel(&base->timer); | 138 | hrtimer_try_to_cancel(&base->timer); |
131 | hrtimer_start(&base->timer, alarm->node.expires, | 139 | hrtimer_start(&base->timer, alarm->node.expires, |
@@ -147,7 +155,12 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) | |||
147 | { | 155 | { |
148 | struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); | 156 | struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); |
149 | 157 | ||
158 | if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) | ||
159 | return; | ||
160 | |||
150 | timerqueue_del(&base->timerqueue, &alarm->node); | 161 | timerqueue_del(&base->timerqueue, &alarm->node); |
162 | alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; | ||
163 | |||
151 | if (next == &alarm->node) { | 164 | if (next == &alarm->node) { |
152 | hrtimer_try_to_cancel(&base->timer); | 165 | hrtimer_try_to_cancel(&base->timer); |
153 | next = timerqueue_getnext(&base->timerqueue); | 166 | next = timerqueue_getnext(&base->timerqueue); |
@@ -174,6 +187,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | |||
174 | unsigned long flags; | 187 | unsigned long flags; |
175 | ktime_t now; | 188 | ktime_t now; |
176 | int ret = HRTIMER_NORESTART; | 189 | int ret = HRTIMER_NORESTART; |
190 | int restart = ALARMTIMER_NORESTART; | ||
177 | 191 | ||
178 | spin_lock_irqsave(&base->lock, flags); | 192 | spin_lock_irqsave(&base->lock, flags); |
179 | now = base->gettime(); | 193 | now = base->gettime(); |
@@ -187,17 +201,19 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | |||
187 | alarm = container_of(next, struct alarm, node); | 201 | alarm = container_of(next, struct alarm, node); |
188 | 202 | ||
189 | timerqueue_del(&base->timerqueue, &alarm->node); | 203 | timerqueue_del(&base->timerqueue, &alarm->node); |
190 | alarm->enabled = 0; | 204 | alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; |
191 | /* Re-add periodic timers */ | 205 | |
192 | if (alarm->period.tv64) { | 206 | alarm->state |= ALARMTIMER_STATE_CALLBACK; |
193 | alarm->node.expires = ktime_add(expired, alarm->period); | ||
194 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
195 | alarm->enabled = 1; | ||
196 | } | ||
197 | spin_unlock_irqrestore(&base->lock, flags); | 207 | spin_unlock_irqrestore(&base->lock, flags); |
198 | if (alarm->function) | 208 | if (alarm->function) |
199 | alarm->function(alarm); | 209 | restart = alarm->function(alarm, now); |
200 | spin_lock_irqsave(&base->lock, flags); | 210 | spin_lock_irqsave(&base->lock, flags); |
211 | alarm->state &= ~ALARMTIMER_STATE_CALLBACK; | ||
212 | |||
213 | if (restart != ALARMTIMER_NORESTART) { | ||
214 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
215 | alarm->state |= ALARMTIMER_STATE_ENQUEUED; | ||
216 | } | ||
201 | } | 217 | } |
202 | 218 | ||
203 | if (next) { | 219 | if (next) { |
@@ -234,7 +250,7 @@ static int alarmtimer_suspend(struct device *dev) | |||
234 | freezer_delta = ktime_set(0, 0); | 250 | freezer_delta = ktime_set(0, 0); |
235 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | 251 | spin_unlock_irqrestore(&freezer_delta_lock, flags); |
236 | 252 | ||
237 | rtc = rtcdev; | 253 | rtc = alarmtimer_get_rtcdev(); |
238 | /* If we have no rtcdev, just return */ | 254 | /* If we have no rtcdev, just return */ |
239 | if (!rtc) | 255 | if (!rtc) |
240 | return 0; | 256 | return 0; |
@@ -299,53 +315,111 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) | |||
299 | * @function: callback that is run when the alarm fires | 315 | * @function: callback that is run when the alarm fires |
300 | */ | 316 | */ |
301 | void alarm_init(struct alarm *alarm, enum alarmtimer_type type, | 317 | void alarm_init(struct alarm *alarm, enum alarmtimer_type type, |
302 | void (*function)(struct alarm *)) | 318 | enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) |
303 | { | 319 | { |
304 | timerqueue_init(&alarm->node); | 320 | timerqueue_init(&alarm->node); |
305 | alarm->period = ktime_set(0, 0); | ||
306 | alarm->function = function; | 321 | alarm->function = function; |
307 | alarm->type = type; | 322 | alarm->type = type; |
308 | alarm->enabled = 0; | 323 | alarm->state = ALARMTIMER_STATE_INACTIVE; |
309 | } | 324 | } |
310 | 325 | ||
311 | /** | 326 | /** |
312 | * alarm_start - Sets an alarm to fire | 327 | * alarm_start - Sets an alarm to fire |
313 | * @alarm: ptr to alarm to set | 328 | * @alarm: ptr to alarm to set |
314 | * @start: time to run the alarm | 329 | * @start: time to run the alarm |
315 | * @period: period at which the alarm will recur | ||
316 | */ | 330 | */ |
317 | void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) | 331 | void alarm_start(struct alarm *alarm, ktime_t start) |
318 | { | 332 | { |
319 | struct alarm_base *base = &alarm_bases[alarm->type]; | 333 | struct alarm_base *base = &alarm_bases[alarm->type]; |
320 | unsigned long flags; | 334 | unsigned long flags; |
321 | 335 | ||
322 | spin_lock_irqsave(&base->lock, flags); | 336 | spin_lock_irqsave(&base->lock, flags); |
323 | if (alarm->enabled) | 337 | if (alarmtimer_active(alarm)) |
324 | alarmtimer_remove(base, alarm); | 338 | alarmtimer_remove(base, alarm); |
325 | alarm->node.expires = start; | 339 | alarm->node.expires = start; |
326 | alarm->period = period; | ||
327 | alarmtimer_enqueue(base, alarm); | 340 | alarmtimer_enqueue(base, alarm); |
328 | alarm->enabled = 1; | ||
329 | spin_unlock_irqrestore(&base->lock, flags); | 341 | spin_unlock_irqrestore(&base->lock, flags); |
330 | } | 342 | } |
331 | 343 | ||
332 | /** | 344 | /** |
333 | * alarm_cancel - Tries to cancel an alarm timer | 345 | * alarm_try_to_cancel - Tries to cancel an alarm timer |
334 | * @alarm: ptr to alarm to be canceled | 346 | * @alarm: ptr to alarm to be canceled |
347 | * | ||
348 | * Returns 1 if the timer was canceled, 0 if it was not running, | ||
349 | * and -1 if the callback was running | ||
335 | */ | 350 | */ |
336 | void alarm_cancel(struct alarm *alarm) | 351 | int alarm_try_to_cancel(struct alarm *alarm) |
337 | { | 352 | { |
338 | struct alarm_base *base = &alarm_bases[alarm->type]; | 353 | struct alarm_base *base = &alarm_bases[alarm->type]; |
339 | unsigned long flags; | 354 | unsigned long flags; |
340 | 355 | int ret = -1; | |
341 | spin_lock_irqsave(&base->lock, flags); | 356 | spin_lock_irqsave(&base->lock, flags); |
342 | if (alarm->enabled) | 357 | |
358 | if (alarmtimer_callback_running(alarm)) | ||
359 | goto out; | ||
360 | |||
361 | if (alarmtimer_is_queued(alarm)) { | ||
343 | alarmtimer_remove(base, alarm); | 362 | alarmtimer_remove(base, alarm); |
344 | alarm->enabled = 0; | 363 | ret = 1; |
364 | } else | ||
365 | ret = 0; | ||
366 | out: | ||
345 | spin_unlock_irqrestore(&base->lock, flags); | 367 | spin_unlock_irqrestore(&base->lock, flags); |
368 | return ret; | ||
369 | } | ||
370 | |||
371 | |||
372 | /** | ||
373 | * alarm_cancel - Spins trying to cancel an alarm timer until it is done | ||
374 | * @alarm: ptr to alarm to be canceled | ||
375 | * | ||
376 | * Returns 1 if the timer was canceled, 0 if it was not active. | ||
377 | */ | ||
378 | int alarm_cancel(struct alarm *alarm) | ||
379 | { | ||
380 | for (;;) { | ||
381 | int ret = alarm_try_to_cancel(alarm); | ||
382 | if (ret >= 0) | ||
383 | return ret; | ||
384 | cpu_relax(); | ||
385 | } | ||
386 | } | ||
387 | |||
388 | |||
389 | u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) | ||
390 | { | ||
391 | u64 overrun = 1; | ||
392 | ktime_t delta; | ||
393 | |||
394 | delta = ktime_sub(now, alarm->node.expires); | ||
395 | |||
396 | if (delta.tv64 < 0) | ||
397 | return 0; | ||
398 | |||
399 | if (unlikely(delta.tv64 >= interval.tv64)) { | ||
400 | s64 incr = ktime_to_ns(interval); | ||
401 | |||
402 | overrun = ktime_divns(delta, incr); | ||
403 | |||
404 | alarm->node.expires = ktime_add_ns(alarm->node.expires, | ||
405 | incr*overrun); | ||
406 | |||
407 | if (alarm->node.expires.tv64 > now.tv64) | ||
408 | return overrun; | ||
409 | /* | ||
410 | * This (and the ktime_add() below) is the | ||
411 | * correction for exact: | ||
412 | */ | ||
413 | overrun++; | ||
414 | } | ||
415 | |||
416 | alarm->node.expires = ktime_add(alarm->node.expires, interval); | ||
417 | return overrun; | ||
346 | } | 418 | } |
347 | 419 | ||
348 | 420 | ||
421 | |||
422 | |||
349 | /** | 423 | /** |
350 | * clock2alarm - helper that converts from clockid to alarmtypes | 424 | * clock2alarm - helper that converts from clockid to alarmtypes |
351 | * @clockid: clockid. | 425 | * @clockid: clockid. |
@@ -365,12 +439,21 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) | |||
365 | * | 439 | * |
366 | * Posix timer callback for expired alarm timers. | 440 | * Posix timer callback for expired alarm timers. |
367 | */ | 441 | */ |
368 | static void alarm_handle_timer(struct alarm *alarm) | 442 | static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, |
443 | ktime_t now) | ||
369 | { | 444 | { |
370 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, | 445 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, |
371 | it.alarmtimer); | 446 | it.alarm.alarmtimer); |
372 | if (posix_timer_event(ptr, 0) != 0) | 447 | if (posix_timer_event(ptr, 0) != 0) |
373 | ptr->it_overrun++; | 448 | ptr->it_overrun++; |
449 | |||
450 | /* Re-add periodic timers */ | ||
451 | if (ptr->it.alarm.interval.tv64) { | ||
452 | ptr->it_overrun += alarm_forward(alarm, now, | ||
453 | ptr->it.alarm.interval); | ||
454 | return ALARMTIMER_RESTART; | ||
455 | } | ||
456 | return ALARMTIMER_NORESTART; | ||
374 | } | 457 | } |
375 | 458 | ||
376 | /** | 459 | /** |
@@ -427,7 +510,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) | |||
427 | 510 | ||
428 | type = clock2alarm(new_timer->it_clock); | 511 | type = clock2alarm(new_timer->it_clock); |
429 | base = &alarm_bases[type]; | 512 | base = &alarm_bases[type]; |
430 | alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); | 513 | alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); |
431 | return 0; | 514 | return 0; |
432 | } | 515 | } |
433 | 516 | ||
@@ -444,9 +527,9 @@ static void alarm_timer_get(struct k_itimer *timr, | |||
444 | memset(cur_setting, 0, sizeof(struct itimerspec)); | 527 | memset(cur_setting, 0, sizeof(struct itimerspec)); |
445 | 528 | ||
446 | cur_setting->it_interval = | 529 | cur_setting->it_interval = |
447 | ktime_to_timespec(timr->it.alarmtimer.period); | 530 | ktime_to_timespec(timr->it.alarm.interval); |
448 | cur_setting->it_value = | 531 | cur_setting->it_value = |
449 | ktime_to_timespec(timr->it.alarmtimer.node.expires); | 532 | ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); |
450 | return; | 533 | return; |
451 | } | 534 | } |
452 | 535 | ||
@@ -461,7 +544,9 @@ static int alarm_timer_del(struct k_itimer *timr) | |||
461 | if (!rtcdev) | 544 | if (!rtcdev) |
462 | return -ENOTSUPP; | 545 | return -ENOTSUPP; |
463 | 546 | ||
464 | alarm_cancel(&timr->it.alarmtimer); | 547 | if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) |
548 | return TIMER_RETRY; | ||
549 | |||
465 | return 0; | 550 | return 0; |
466 | } | 551 | } |
467 | 552 | ||
@@ -481,25 +566,17 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, | |||
481 | if (!rtcdev) | 566 | if (!rtcdev) |
482 | return -ENOTSUPP; | 567 | return -ENOTSUPP; |
483 | 568 | ||
484 | /* | ||
485 | * XXX HACK! Currently we can DOS a system if the interval | ||
486 | * period on alarmtimers is too small. Cap the interval here | ||
487 | * to 100us and solve this properly in a future patch! -jstultz | ||
488 | */ | ||
489 | if ((new_setting->it_interval.tv_sec == 0) && | ||
490 | (new_setting->it_interval.tv_nsec < 100000)) | ||
491 | new_setting->it_interval.tv_nsec = 100000; | ||
492 | |||
493 | if (old_setting) | 569 | if (old_setting) |
494 | alarm_timer_get(timr, old_setting); | 570 | alarm_timer_get(timr, old_setting); |
495 | 571 | ||
496 | /* If the timer was already set, cancel it */ | 572 | /* If the timer was already set, cancel it */ |
497 | alarm_cancel(&timr->it.alarmtimer); | 573 | if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) |
574 | return TIMER_RETRY; | ||
498 | 575 | ||
499 | /* start the timer */ | 576 | /* start the timer */ |
500 | alarm_start(&timr->it.alarmtimer, | 577 | timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); |
501 | timespec_to_ktime(new_setting->it_value), | 578 | alarm_start(&timr->it.alarm.alarmtimer, |
502 | timespec_to_ktime(new_setting->it_interval)); | 579 | timespec_to_ktime(new_setting->it_value)); |
503 | return 0; | 580 | return 0; |
504 | } | 581 | } |
505 | 582 | ||
@@ -509,13 +586,15 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, | |||
509 | * | 586 | * |
510 | * Wakes up the task that set the alarmtimer | 587 | * Wakes up the task that set the alarmtimer |
511 | */ | 588 | */ |
512 | static void alarmtimer_nsleep_wakeup(struct alarm *alarm) | 589 | static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, |
590 | ktime_t now) | ||
513 | { | 591 | { |
514 | struct task_struct *task = (struct task_struct *)alarm->data; | 592 | struct task_struct *task = (struct task_struct *)alarm->data; |
515 | 593 | ||
516 | alarm->data = NULL; | 594 | alarm->data = NULL; |
517 | if (task) | 595 | if (task) |
518 | wake_up_process(task); | 596 | wake_up_process(task); |
597 | return ALARMTIMER_NORESTART; | ||
519 | } | 598 | } |
520 | 599 | ||
521 | /** | 600 | /** |
@@ -530,7 +609,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) | |||
530 | alarm->data = (void *)current; | 609 | alarm->data = (void *)current; |
531 | do { | 610 | do { |
532 | set_current_state(TASK_INTERRUPTIBLE); | 611 | set_current_state(TASK_INTERRUPTIBLE); |
533 | alarm_start(alarm, absexp, ktime_set(0, 0)); | 612 | alarm_start(alarm, absexp); |
534 | if (likely(alarm->data)) | 613 | if (likely(alarm->data)) |
535 | schedule(); | 614 | schedule(); |
536 | 615 | ||
@@ -691,6 +770,7 @@ static struct platform_driver alarmtimer_driver = { | |||
691 | */ | 770 | */ |
692 | static int __init alarmtimer_init(void) | 771 | static int __init alarmtimer_init(void) |
693 | { | 772 | { |
773 | struct platform_device *pdev; | ||
694 | int error = 0; | 774 | int error = 0; |
695 | int i; | 775 | int i; |
696 | struct k_clock alarm_clock = { | 776 | struct k_clock alarm_clock = { |
@@ -719,10 +799,26 @@ static int __init alarmtimer_init(void) | |||
719 | HRTIMER_MODE_ABS); | 799 | HRTIMER_MODE_ABS); |
720 | alarm_bases[i].timer.function = alarmtimer_fired; | 800 | alarm_bases[i].timer.function = alarmtimer_fired; |
721 | } | 801 | } |
802 | |||
803 | error = alarmtimer_rtc_interface_setup(); | ||
804 | if (error) | ||
805 | return error; | ||
806 | |||
722 | error = platform_driver_register(&alarmtimer_driver); | 807 | error = platform_driver_register(&alarmtimer_driver); |
723 | platform_device_register_simple("alarmtimer", -1, NULL, 0); | 808 | if (error) |
809 | goto out_if; | ||
724 | 810 | ||
811 | pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); | ||
812 | if (IS_ERR(pdev)) { | ||
813 | error = PTR_ERR(pdev); | ||
814 | goto out_drv; | ||
815 | } | ||
816 | return 0; | ||
817 | |||
818 | out_drv: | ||
819 | platform_driver_unregister(&alarmtimer_driver); | ||
820 | out_if: | ||
821 | alarmtimer_rtc_interface_remove(); | ||
725 | return error; | 822 | return error; |
726 | } | 823 | } |
727 | device_initcall(alarmtimer_init); | 824 | device_initcall(alarmtimer_init); |
728 | |||
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index e4c699dfa4e8..1ecd6ba36d6c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -94,42 +94,143 @@ void clockevents_shutdown(struct clock_event_device *dev) | |||
94 | dev->next_event.tv64 = KTIME_MAX; | 94 | dev->next_event.tv64 = KTIME_MAX; |
95 | } | 95 | } |
96 | 96 | ||
97 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST | ||
98 | |||
99 | /* Limit min_delta to a jiffie */ | ||
100 | #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) | ||
101 | |||
102 | /** | ||
103 | * clockevents_increase_min_delta - raise minimum delta of a clock event device | ||
104 | * @dev: device to increase the minimum delta | ||
105 | * | ||
106 | * Returns 0 on success, -ETIME when the minimum delta reached the limit. | ||
107 | */ | ||
108 | static int clockevents_increase_min_delta(struct clock_event_device *dev) | ||
109 | { | ||
110 | /* Nothing to do if we already reached the limit */ | ||
111 | if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { | ||
112 | printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); | ||
113 | dev->next_event.tv64 = KTIME_MAX; | ||
114 | return -ETIME; | ||
115 | } | ||
116 | |||
117 | if (dev->min_delta_ns < 5000) | ||
118 | dev->min_delta_ns = 5000; | ||
119 | else | ||
120 | dev->min_delta_ns += dev->min_delta_ns >> 1; | ||
121 | |||
122 | if (dev->min_delta_ns > MIN_DELTA_LIMIT) | ||
123 | dev->min_delta_ns = MIN_DELTA_LIMIT; | ||
124 | |||
125 | printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", | ||
126 | dev->name ? dev->name : "?", | ||
127 | (unsigned long long) dev->min_delta_ns); | ||
128 | return 0; | ||
129 | } | ||
130 | |||
131 | /** | ||
132 | * clockevents_program_min_delta - Set clock event device to the minimum delay. | ||
133 | * @dev: device to program | ||
134 | * | ||
135 | * Returns 0 on success, -ETIME when the retry loop failed. | ||
136 | */ | ||
137 | static int clockevents_program_min_delta(struct clock_event_device *dev) | ||
138 | { | ||
139 | unsigned long long clc; | ||
140 | int64_t delta; | ||
141 | int i; | ||
142 | |||
143 | for (i = 0;;) { | ||
144 | delta = dev->min_delta_ns; | ||
145 | dev->next_event = ktime_add_ns(ktime_get(), delta); | ||
146 | |||
147 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | ||
148 | return 0; | ||
149 | |||
150 | dev->retries++; | ||
151 | clc = ((unsigned long long) delta * dev->mult) >> dev->shift; | ||
152 | if (dev->set_next_event((unsigned long) clc, dev) == 0) | ||
153 | return 0; | ||
154 | |||
155 | if (++i > 2) { | ||
156 | /* | ||
157 | * We tried 3 times to program the device with the | ||
158 | * given min_delta_ns. Try to increase the minimum | ||
159 | * delta, if that fails as well get out of here. | ||
160 | */ | ||
161 | if (clockevents_increase_min_delta(dev)) | ||
162 | return -ETIME; | ||
163 | i = 0; | ||
164 | } | ||
165 | } | ||
166 | } | ||
167 | |||
168 | #else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ | ||
169 | |||
170 | /** | ||
171 | * clockevents_program_min_delta - Set clock event device to the minimum delay. | ||
172 | * @dev: device to program | ||
173 | * | ||
174 | * Returns 0 on success, -ETIME when the retry loop failed. | ||
175 | */ | ||
176 | static int clockevents_program_min_delta(struct clock_event_device *dev) | ||
177 | { | ||
178 | unsigned long long clc; | ||
179 | int64_t delta; | ||
180 | |||
181 | delta = dev->min_delta_ns; | ||
182 | dev->next_event = ktime_add_ns(ktime_get(), delta); | ||
183 | |||
184 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | ||
185 | return 0; | ||
186 | |||
187 | dev->retries++; | ||
188 | clc = ((unsigned long long) delta * dev->mult) >> dev->shift; | ||
189 | return dev->set_next_event((unsigned long) clc, dev); | ||
190 | } | ||
191 | |||
192 | #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ | ||
193 | |||
97 | /** | 194 | /** |
98 | * clockevents_program_event - Reprogram the clock event device. | 195 | * clockevents_program_event - Reprogram the clock event device. |
196 | * @dev: device to program | ||
99 | * @expires: absolute expiry time (monotonic clock) | 197 | * @expires: absolute expiry time (monotonic clock) |
198 | * @force: program minimum delay if expires can not be set | ||
100 | * | 199 | * |
101 | * Returns 0 on success, -ETIME when the event is in the past. | 200 | * Returns 0 on success, -ETIME when the event is in the past. |
102 | */ | 201 | */ |
103 | int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | 202 | int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, |
104 | ktime_t now) | 203 | bool force) |
105 | { | 204 | { |
106 | unsigned long long clc; | 205 | unsigned long long clc; |
107 | int64_t delta; | 206 | int64_t delta; |
207 | int rc; | ||
108 | 208 | ||
109 | if (unlikely(expires.tv64 < 0)) { | 209 | if (unlikely(expires.tv64 < 0)) { |
110 | WARN_ON_ONCE(1); | 210 | WARN_ON_ONCE(1); |
111 | return -ETIME; | 211 | return -ETIME; |
112 | } | 212 | } |
113 | 213 | ||
114 | delta = ktime_to_ns(ktime_sub(expires, now)); | ||
115 | |||
116 | if (delta <= 0) | ||
117 | return -ETIME; | ||
118 | |||
119 | dev->next_event = expires; | 214 | dev->next_event = expires; |
120 | 215 | ||
121 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 216 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) |
122 | return 0; | 217 | return 0; |
123 | 218 | ||
124 | if (delta > dev->max_delta_ns) | 219 | /* Shortcut for clockevent devices that can deal with ktime. */ |
125 | delta = dev->max_delta_ns; | 220 | if (dev->features & CLOCK_EVT_FEAT_KTIME) |
126 | if (delta < dev->min_delta_ns) | 221 | return dev->set_next_ktime(expires, dev); |
127 | delta = dev->min_delta_ns; | 222 | |
223 | delta = ktime_to_ns(ktime_sub(expires, ktime_get())); | ||
224 | if (delta <= 0) | ||
225 | return force ? clockevents_program_min_delta(dev) : -ETIME; | ||
128 | 226 | ||
129 | clc = delta * dev->mult; | 227 | delta = min(delta, (int64_t) dev->max_delta_ns); |
130 | clc >>= dev->shift; | 228 | delta = max(delta, (int64_t) dev->min_delta_ns); |
131 | 229 | ||
132 | return dev->set_next_event((unsigned long) clc, dev); | 230 | clc = ((unsigned long long) delta * dev->mult) >> dev->shift; |
231 | rc = dev->set_next_event((unsigned long) clc, dev); | ||
232 | |||
233 | return (rc && force) ? clockevents_program_min_delta(dev) : rc; | ||
133 | } | 234 | } |
134 | 235 | ||
135 | /** | 236 | /** |
@@ -258,7 +359,7 @@ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) | |||
258 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | 359 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) |
259 | return 0; | 360 | return 0; |
260 | 361 | ||
261 | return clockevents_program_event(dev, dev->next_event, ktime_get()); | 362 | return clockevents_program_event(dev, dev->next_event, false); |
262 | } | 363 | } |
263 | 364 | ||
264 | /* | 365 | /* |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0980f0d9a0a..cf52fda2e096 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -186,6 +186,7 @@ static struct timer_list watchdog_timer; | |||
186 | static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); | 186 | static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); |
187 | static DEFINE_SPINLOCK(watchdog_lock); | 187 | static DEFINE_SPINLOCK(watchdog_lock); |
188 | static int watchdog_running; | 188 | static int watchdog_running; |
189 | static atomic_t watchdog_reset_pending; | ||
189 | 190 | ||
190 | static int clocksource_watchdog_kthread(void *data); | 191 | static int clocksource_watchdog_kthread(void *data); |
191 | static void __clocksource_change_rating(struct clocksource *cs, int rating); | 192 | static void __clocksource_change_rating(struct clocksource *cs, int rating); |
@@ -247,12 +248,14 @@ static void clocksource_watchdog(unsigned long data) | |||
247 | struct clocksource *cs; | 248 | struct clocksource *cs; |
248 | cycle_t csnow, wdnow; | 249 | cycle_t csnow, wdnow; |
249 | int64_t wd_nsec, cs_nsec; | 250 | int64_t wd_nsec, cs_nsec; |
250 | int next_cpu; | 251 | int next_cpu, reset_pending; |
251 | 252 | ||
252 | spin_lock(&watchdog_lock); | 253 | spin_lock(&watchdog_lock); |
253 | if (!watchdog_running) | 254 | if (!watchdog_running) |
254 | goto out; | 255 | goto out; |
255 | 256 | ||
257 | reset_pending = atomic_read(&watchdog_reset_pending); | ||
258 | |||
256 | list_for_each_entry(cs, &watchdog_list, wd_list) { | 259 | list_for_each_entry(cs, &watchdog_list, wd_list) { |
257 | 260 | ||
258 | /* Clocksource already marked unstable? */ | 261 | /* Clocksource already marked unstable? */ |
@@ -268,7 +271,8 @@ static void clocksource_watchdog(unsigned long data) | |||
268 | local_irq_enable(); | 271 | local_irq_enable(); |
269 | 272 | ||
270 | /* Clocksource initialized ? */ | 273 | /* Clocksource initialized ? */ |
271 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { | 274 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || |
275 | atomic_read(&watchdog_reset_pending)) { | ||
272 | cs->flags |= CLOCK_SOURCE_WATCHDOG; | 276 | cs->flags |= CLOCK_SOURCE_WATCHDOG; |
273 | cs->wd_last = wdnow; | 277 | cs->wd_last = wdnow; |
274 | cs->cs_last = csnow; | 278 | cs->cs_last = csnow; |
@@ -283,8 +287,11 @@ static void clocksource_watchdog(unsigned long data) | |||
283 | cs->cs_last = csnow; | 287 | cs->cs_last = csnow; |
284 | cs->wd_last = wdnow; | 288 | cs->wd_last = wdnow; |
285 | 289 | ||
290 | if (atomic_read(&watchdog_reset_pending)) | ||
291 | continue; | ||
292 | |||
286 | /* Check the deviation from the watchdog clocksource. */ | 293 | /* Check the deviation from the watchdog clocksource. */ |
287 | if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { | 294 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { |
288 | clocksource_unstable(cs, cs_nsec - wd_nsec); | 295 | clocksource_unstable(cs, cs_nsec - wd_nsec); |
289 | continue; | 296 | continue; |
290 | } | 297 | } |
@@ -303,6 +310,13 @@ static void clocksource_watchdog(unsigned long data) | |||
303 | } | 310 | } |
304 | 311 | ||
305 | /* | 312 | /* |
313 | * We only clear the watchdog_reset_pending, when we did a | ||
314 | * full cycle through all clocksources. | ||
315 | */ | ||
316 | if (reset_pending) | ||
317 | atomic_dec(&watchdog_reset_pending); | ||
318 | |||
319 | /* | ||
306 | * Cycle through CPUs to check if the CPUs stay synchronized | 320 | * Cycle through CPUs to check if the CPUs stay synchronized |
307 | * to each other. | 321 | * to each other. |
308 | */ | 322 | */ |
@@ -344,23 +358,7 @@ static inline void clocksource_reset_watchdog(void) | |||
344 | 358 | ||
345 | static void clocksource_resume_watchdog(void) | 359 | static void clocksource_resume_watchdog(void) |
346 | { | 360 | { |
347 | unsigned long flags; | 361 | atomic_inc(&watchdog_reset_pending); |
348 | |||
349 | /* | ||
350 | * We use trylock here to avoid a potential dead lock when | ||
351 | * kgdb calls this code after the kernel has been stopped with | ||
352 | * watchdog_lock held. When watchdog_lock is held we just | ||
353 | * return and accept, that the watchdog might trigger and mark | ||
354 | * the monitored clock source (usually TSC) unstable. | ||
355 | * | ||
356 | * This does not affect the other caller clocksource_resume() | ||
357 | * because at this point the kernel is UP, interrupts are | ||
358 | * disabled and nothing can hold watchdog_lock. | ||
359 | */ | ||
360 | if (!spin_trylock_irqsave(&watchdog_lock, flags)) | ||
361 | return; | ||
362 | clocksource_reset_watchdog(); | ||
363 | spin_unlock_irqrestore(&watchdog_lock, flags); | ||
364 | } | 362 | } |
365 | 363 | ||
366 | static void clocksource_enqueue_watchdog(struct clocksource *cs) | 364 | static void clocksource_enqueue_watchdog(struct clocksource *cs) |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c7218d132738..f954282d9a82 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -194,7 +194,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | |||
194 | for (next = dev->next_event; ;) { | 194 | for (next = dev->next_event; ;) { |
195 | next = ktime_add(next, tick_period); | 195 | next = ktime_add(next, tick_period); |
196 | 196 | ||
197 | if (!clockevents_program_event(dev, next, ktime_get())) | 197 | if (!clockevents_program_event(dev, next, false)) |
198 | return; | 198 | return; |
199 | tick_do_periodic_broadcast(); | 199 | tick_do_periodic_broadcast(); |
200 | } | 200 | } |
@@ -373,7 +373,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force) | |||
373 | { | 373 | { |
374 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | 374 | struct clock_event_device *bc = tick_broadcast_device.evtdev; |
375 | 375 | ||
376 | return tick_dev_program_event(bc, expires, force); | 376 | return clockevents_program_event(bc, expires, force); |
377 | } | 377 | } |
378 | 378 | ||
379 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 379 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 119528de8235..da6c9ecad4e4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -94,7 +94,7 @@ void tick_handle_periodic(struct clock_event_device *dev) | |||
94 | */ | 94 | */ |
95 | next = ktime_add(dev->next_event, tick_period); | 95 | next = ktime_add(dev->next_event, tick_period); |
96 | for (;;) { | 96 | for (;;) { |
97 | if (!clockevents_program_event(dev, next, ktime_get())) | 97 | if (!clockevents_program_event(dev, next, false)) |
98 | return; | 98 | return; |
99 | /* | 99 | /* |
100 | * Have to be careful here. If we're in oneshot mode, | 100 | * Have to be careful here. If we're in oneshot mode, |
@@ -137,7 +137,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
137 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 137 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); |
138 | 138 | ||
139 | for (;;) { | 139 | for (;;) { |
140 | if (!clockevents_program_event(dev, next, ktime_get())) | 140 | if (!clockevents_program_event(dev, next, false)) |
141 | return; | 141 | return; |
142 | next = ktime_add(next, tick_period); | 142 | next = ktime_add(next, tick_period); |
143 | } | 143 | } |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 1009b06d6f89..4e265b901fed 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -26,8 +26,6 @@ extern void clockevents_shutdown(struct clock_event_device *dev); | |||
26 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | 26 | extern void tick_setup_oneshot(struct clock_event_device *newdev, |
27 | void (*handler)(struct clock_event_device *), | 27 | void (*handler)(struct clock_event_device *), |
28 | ktime_t nextevt); | 28 | ktime_t nextevt); |
29 | extern int tick_dev_program_event(struct clock_event_device *dev, | ||
30 | ktime_t expires, int force); | ||
31 | extern int tick_program_event(ktime_t expires, int force); | 29 | extern int tick_program_event(ktime_t expires, int force); |
32 | extern void tick_oneshot_notify(void); | 30 | extern void tick_oneshot_notify(void); |
33 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | 31 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); |
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 2d04411a5f05..824109060a33 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -21,74 +21,6 @@ | |||
21 | 21 | ||
22 | #include "tick-internal.h" | 22 | #include "tick-internal.h" |
23 | 23 | ||
24 | /* Limit min_delta to a jiffie */ | ||
25 | #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) | ||
26 | |||
27 | static int tick_increase_min_delta(struct clock_event_device *dev) | ||
28 | { | ||
29 | /* Nothing to do if we already reached the limit */ | ||
30 | if (dev->min_delta_ns >= MIN_DELTA_LIMIT) | ||
31 | return -ETIME; | ||
32 | |||
33 | if (dev->min_delta_ns < 5000) | ||
34 | dev->min_delta_ns = 5000; | ||
35 | else | ||
36 | dev->min_delta_ns += dev->min_delta_ns >> 1; | ||
37 | |||
38 | if (dev->min_delta_ns > MIN_DELTA_LIMIT) | ||
39 | dev->min_delta_ns = MIN_DELTA_LIMIT; | ||
40 | |||
41 | printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", | ||
42 | dev->name ? dev->name : "?", | ||
43 | (unsigned long long) dev->min_delta_ns); | ||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | /** | ||
48 | * tick_program_event internal worker function | ||
49 | */ | ||
50 | int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, | ||
51 | int force) | ||
52 | { | ||
53 | ktime_t now = ktime_get(); | ||
54 | int i; | ||
55 | |||
56 | for (i = 0;;) { | ||
57 | int ret = clockevents_program_event(dev, expires, now); | ||
58 | |||
59 | if (!ret || !force) | ||
60 | return ret; | ||
61 | |||
62 | dev->retries++; | ||
63 | /* | ||
64 | * We tried 3 times to program the device with the given | ||
65 | * min_delta_ns. If that's not working then we increase it | ||
66 | * and emit a warning. | ||
67 | */ | ||
68 | if (++i > 2) { | ||
69 | /* Increase the min. delta and try again */ | ||
70 | if (tick_increase_min_delta(dev)) { | ||
71 | /* | ||
72 | * Get out of the loop if min_delta_ns | ||
73 | * hit the limit already. That's | ||
74 | * better than staying here forever. | ||
75 | * | ||
76 | * We clear next_event so we have a | ||
77 | * chance that the box survives. | ||
78 | */ | ||
79 | printk(KERN_WARNING | ||
80 | "CE: Reprogramming failure. Giving up\n"); | ||
81 | dev->next_event.tv64 = KTIME_MAX; | ||
82 | return -ETIME; | ||
83 | } | ||
84 | i = 0; | ||
85 | } | ||
86 | |||
87 | now = ktime_get(); | ||
88 | expires = ktime_add_ns(now, dev->min_delta_ns); | ||
89 | } | ||
90 | } | ||
91 | |||
92 | /** | 24 | /** |
93 | * tick_program_event | 25 | * tick_program_event |
94 | */ | 26 | */ |
@@ -96,7 +28,7 @@ int tick_program_event(ktime_t expires, int force) | |||
96 | { | 28 | { |
97 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); | 29 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
98 | 30 | ||
99 | return tick_dev_program_event(dev, expires, force); | 31 | return clockevents_program_event(dev, expires, force); |
100 | } | 32 | } |
101 | 33 | ||
102 | /** | 34 | /** |
@@ -104,11 +36,10 @@ int tick_program_event(ktime_t expires, int force) | |||
104 | */ | 36 | */ |
105 | void tick_resume_oneshot(void) | 37 | void tick_resume_oneshot(void) |
106 | { | 38 | { |
107 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
108 | struct clock_event_device *dev = td->evtdev; | ||
109 | 40 | ||
110 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 41 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); |
111 | tick_program_event(ktime_get(), 1); | 42 | clockevents_program_event(dev, ktime_get(), true); |
112 | } | 43 | } |
113 | 44 | ||
114 | /** | 45 | /** |
@@ -120,7 +51,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, | |||
120 | { | 51 | { |
121 | newdev->event_handler = handler; | 52 | newdev->event_handler = handler; |
122 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); | 53 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); |
123 | tick_dev_program_event(newdev, next_event, 1); | 54 | clockevents_program_event(newdev, next_event, true); |
124 | } | 55 | } |
125 | 56 | ||
126 | /** | 57 | /** |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d5097c44b407..40420644d0ba 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now) | |||
139 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 139 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
140 | unsigned long flags; | 140 | unsigned long flags; |
141 | 141 | ||
142 | cpumask_clear_cpu(cpu, nohz_cpu_mask); | ||
143 | ts->idle_waketime = now; | 142 | ts->idle_waketime = now; |
144 | 143 | ||
145 | local_irq_save(flags); | 144 | local_irq_save(flags); |
@@ -159,9 +158,10 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda | |||
159 | 158 | ||
160 | if (ts->idle_active) { | 159 | if (ts->idle_active) { |
161 | delta = ktime_sub(now, ts->idle_entrytime); | 160 | delta = ktime_sub(now, ts->idle_entrytime); |
162 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
163 | if (nr_iowait_cpu(cpu) > 0) | 161 | if (nr_iowait_cpu(cpu) > 0) |
164 | ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); | 162 | ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); |
163 | else | ||
164 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
165 | ts->idle_entrytime = now; | 165 | ts->idle_entrytime = now; |
166 | } | 166 | } |
167 | 167 | ||
@@ -197,11 +197,11 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) | |||
197 | /** | 197 | /** |
198 | * get_cpu_idle_time_us - get the total idle time of a cpu | 198 | * get_cpu_idle_time_us - get the total idle time of a cpu |
199 | * @cpu: CPU number to query | 199 | * @cpu: CPU number to query |
200 | * @last_update_time: variable to store update time in | 200 | * @last_update_time: variable to store update time in. Do not update |
201 | * counters if NULL. | ||
201 | * | 202 | * |
202 | * Return the cummulative idle time (since boot) for a given | 203 | * Return the cummulative idle time (since boot) for a given |
203 | * CPU, in microseconds. The idle time returned includes | 204 | * CPU, in microseconds. |
204 | * the iowait time (unlike what "top" and co report). | ||
205 | * | 205 | * |
206 | * This time is measured via accounting rather than sampling, | 206 | * This time is measured via accounting rather than sampling, |
207 | * and is as accurate as ktime_get() is. | 207 | * and is as accurate as ktime_get() is. |
@@ -211,20 +211,35 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) | |||
211 | u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | 211 | u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) |
212 | { | 212 | { |
213 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 213 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
214 | ktime_t now, idle; | ||
214 | 215 | ||
215 | if (!tick_nohz_enabled) | 216 | if (!tick_nohz_enabled) |
216 | return -1; | 217 | return -1; |
217 | 218 | ||
218 | update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); | 219 | now = ktime_get(); |
220 | if (last_update_time) { | ||
221 | update_ts_time_stats(cpu, ts, now, last_update_time); | ||
222 | idle = ts->idle_sleeptime; | ||
223 | } else { | ||
224 | if (ts->idle_active && !nr_iowait_cpu(cpu)) { | ||
225 | ktime_t delta = ktime_sub(now, ts->idle_entrytime); | ||
226 | |||
227 | idle = ktime_add(ts->idle_sleeptime, delta); | ||
228 | } else { | ||
229 | idle = ts->idle_sleeptime; | ||
230 | } | ||
231 | } | ||
232 | |||
233 | return ktime_to_us(idle); | ||
219 | 234 | ||
220 | return ktime_to_us(ts->idle_sleeptime); | ||
221 | } | 235 | } |
222 | EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); | 236 | EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); |
223 | 237 | ||
224 | /* | 238 | /** |
225 | * get_cpu_iowait_time_us - get the total iowait time of a cpu | 239 | * get_cpu_iowait_time_us - get the total iowait time of a cpu |
226 | * @cpu: CPU number to query | 240 | * @cpu: CPU number to query |
227 | * @last_update_time: variable to store update time in | 241 | * @last_update_time: variable to store update time in. Do not update |
242 | * counters if NULL. | ||
228 | * | 243 | * |
229 | * Return the cummulative iowait time (since boot) for a given | 244 | * Return the cummulative iowait time (since boot) for a given |
230 | * CPU, in microseconds. | 245 | * CPU, in microseconds. |
@@ -237,13 +252,26 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); | |||
237 | u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) | 252 | u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) |
238 | { | 253 | { |
239 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 254 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
255 | ktime_t now, iowait; | ||
240 | 256 | ||
241 | if (!tick_nohz_enabled) | 257 | if (!tick_nohz_enabled) |
242 | return -1; | 258 | return -1; |
243 | 259 | ||
244 | update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); | 260 | now = ktime_get(); |
261 | if (last_update_time) { | ||
262 | update_ts_time_stats(cpu, ts, now, last_update_time); | ||
263 | iowait = ts->iowait_sleeptime; | ||
264 | } else { | ||
265 | if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { | ||
266 | ktime_t delta = ktime_sub(now, ts->idle_entrytime); | ||
267 | |||
268 | iowait = ktime_add(ts->iowait_sleeptime, delta); | ||
269 | } else { | ||
270 | iowait = ts->iowait_sleeptime; | ||
271 | } | ||
272 | } | ||
245 | 273 | ||
246 | return ktime_to_us(ts->iowait_sleeptime); | 274 | return ktime_to_us(iowait); |
247 | } | 275 | } |
248 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | 276 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); |
249 | 277 | ||
@@ -389,9 +417,6 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
389 | else | 417 | else |
390 | expires.tv64 = KTIME_MAX; | 418 | expires.tv64 = KTIME_MAX; |
391 | 419 | ||
392 | if (delta_jiffies > 1) | ||
393 | cpumask_set_cpu(cpu, nohz_cpu_mask); | ||
394 | |||
395 | /* Skip reprogram of event if its not changed */ | 420 | /* Skip reprogram of event if its not changed */ |
396 | if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) | 421 | if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) |
397 | goto out; | 422 | goto out; |
@@ -441,7 +466,6 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
441 | * softirq. | 466 | * softirq. |
442 | */ | 467 | */ |
443 | tick_do_update_jiffies64(ktime_get()); | 468 | tick_do_update_jiffies64(ktime_get()); |
444 | cpumask_clear_cpu(cpu, nohz_cpu_mask); | ||
445 | } | 469 | } |
446 | raise_softirq_irqoff(TIMER_SOFTIRQ); | 470 | raise_softirq_irqoff(TIMER_SOFTIRQ); |
447 | out: | 471 | out: |
@@ -524,7 +548,6 @@ void tick_nohz_restart_sched_tick(void) | |||
524 | /* Update jiffies first */ | 548 | /* Update jiffies first */ |
525 | select_nohz_load_balancer(0); | 549 | select_nohz_load_balancer(0); |
526 | tick_do_update_jiffies64(now); | 550 | tick_do_update_jiffies64(now); |
527 | cpumask_clear_cpu(cpu, nohz_cpu_mask); | ||
528 | 551 | ||
529 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 552 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
530 | /* | 553 | /* |
@@ -640,8 +663,6 @@ static void tick_nohz_switch_to_nohz(void) | |||
640 | next = ktime_add(next, tick_period); | 663 | next = ktime_add(next, tick_period); |
641 | } | 664 | } |
642 | local_irq_enable(); | 665 | local_irq_enable(); |
643 | |||
644 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); | ||
645 | } | 666 | } |
646 | 667 | ||
647 | /* | 668 | /* |
@@ -793,10 +814,8 @@ void tick_setup_sched_timer(void) | |||
793 | } | 814 | } |
794 | 815 | ||
795 | #ifdef CONFIG_NO_HZ | 816 | #ifdef CONFIG_NO_HZ |
796 | if (tick_nohz_enabled) { | 817 | if (tick_nohz_enabled) |
797 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | 818 | ts->nohz_mode = NOHZ_MODE_HIGHRES; |
798 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); | ||
799 | } | ||
800 | #endif | 819 | #endif |
801 | } | 820 | } |
802 | #endif /* HIGH_RES_TIMERS */ | 821 | #endif /* HIGH_RES_TIMERS */ |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index a5d0a3a85dd8..0b537f27b559 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -81,7 +81,7 @@ struct entry { | |||
81 | /* | 81 | /* |
82 | * Spinlock protecting the tables - not taken during lookup: | 82 | * Spinlock protecting the tables - not taken during lookup: |
83 | */ | 83 | */ |
84 | static DEFINE_SPINLOCK(table_lock); | 84 | static DEFINE_RAW_SPINLOCK(table_lock); |
85 | 85 | ||
86 | /* | 86 | /* |
87 | * Per-CPU lookup locks for fast hash lookup: | 87 | * Per-CPU lookup locks for fast hash lookup: |
@@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) | |||
188 | prev = NULL; | 188 | prev = NULL; |
189 | curr = *head; | 189 | curr = *head; |
190 | 190 | ||
191 | spin_lock(&table_lock); | 191 | raw_spin_lock(&table_lock); |
192 | /* | 192 | /* |
193 | * Make sure we have not raced with another CPU: | 193 | * Make sure we have not raced with another CPU: |
194 | */ | 194 | */ |
@@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) | |||
215 | *head = curr; | 215 | *head = curr; |
216 | } | 216 | } |
217 | out_unlock: | 217 | out_unlock: |
218 | spin_unlock(&table_lock); | 218 | raw_spin_unlock(&table_lock); |
219 | 219 | ||
220 | return curr; | 220 | return curr; |
221 | } | 221 | } |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 761c510a06c5..5f39a07fe5ea 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES | |||
15 | KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING | 15 | KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING |
16 | endif | 16 | endif |
17 | 17 | ||
18 | CFLAGS_trace_events_filter.o := -I$(src) | ||
19 | |||
18 | # | 20 | # |
19 | # Make the trace clocks available generally: it's infrastructure | 21 | # Make the trace clocks available generally: it's infrastructure |
20 | # relied on by ptrace for example: | 22 | # relied on by ptrace for example: |
@@ -53,6 +55,9 @@ endif | |||
53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 56 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
55 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o | 57 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o |
58 | ifeq ($(CONFIG_PM_RUNTIME),y) | ||
59 | obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o | ||
60 | endif | ||
56 | ifeq ($(CONFIG_TRACING),y) | 61 | ifeq ($(CONFIG_TRACING),y) |
57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | 62 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o |
58 | endif | 63 | endif |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c3e4575e7829..077d85387908 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -3863,6 +3863,14 @@ void ftrace_kill(void) | |||
3863 | } | 3863 | } |
3864 | 3864 | ||
3865 | /** | 3865 | /** |
3866 | * Test if ftrace is dead or not. | ||
3867 | */ | ||
3868 | int ftrace_is_dead(void) | ||
3869 | { | ||
3870 | return ftrace_disabled; | ||
3871 | } | ||
3872 | |||
3873 | /** | ||
3866 | * register_ftrace_function - register a function for profiling | 3874 | * register_ftrace_function - register a function for profiling |
3867 | * @ops - ops structure that holds the function for profiling. | 3875 | * @ops - ops structure that holds the function for profiling. |
3868 | * | 3876 | * |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 731201bf4acc..f5b7b5c1195b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -478,7 +478,7 @@ struct ring_buffer_per_cpu { | |||
478 | int cpu; | 478 | int cpu; |
479 | atomic_t record_disabled; | 479 | atomic_t record_disabled; |
480 | struct ring_buffer *buffer; | 480 | struct ring_buffer *buffer; |
481 | spinlock_t reader_lock; /* serialize readers */ | 481 | raw_spinlock_t reader_lock; /* serialize readers */ |
482 | arch_spinlock_t lock; | 482 | arch_spinlock_t lock; |
483 | struct lock_class_key lock_key; | 483 | struct lock_class_key lock_key; |
484 | struct list_head *pages; | 484 | struct list_head *pages; |
@@ -488,12 +488,14 @@ struct ring_buffer_per_cpu { | |||
488 | struct buffer_page *reader_page; | 488 | struct buffer_page *reader_page; |
489 | unsigned long lost_events; | 489 | unsigned long lost_events; |
490 | unsigned long last_overrun; | 490 | unsigned long last_overrun; |
491 | local_t entries_bytes; | ||
491 | local_t commit_overrun; | 492 | local_t commit_overrun; |
492 | local_t overrun; | 493 | local_t overrun; |
493 | local_t entries; | 494 | local_t entries; |
494 | local_t committing; | 495 | local_t committing; |
495 | local_t commits; | 496 | local_t commits; |
496 | unsigned long read; | 497 | unsigned long read; |
498 | unsigned long read_bytes; | ||
497 | u64 write_stamp; | 499 | u64 write_stamp; |
498 | u64 read_stamp; | 500 | u64 read_stamp; |
499 | }; | 501 | }; |
@@ -1062,7 +1064,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1062 | 1064 | ||
1063 | cpu_buffer->cpu = cpu; | 1065 | cpu_buffer->cpu = cpu; |
1064 | cpu_buffer->buffer = buffer; | 1066 | cpu_buffer->buffer = buffer; |
1065 | spin_lock_init(&cpu_buffer->reader_lock); | 1067 | raw_spin_lock_init(&cpu_buffer->reader_lock); |
1066 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); | 1068 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); |
1067 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 1069 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
1068 | 1070 | ||
@@ -1259,7 +1261,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | |||
1259 | struct list_head *p; | 1261 | struct list_head *p; |
1260 | unsigned i; | 1262 | unsigned i; |
1261 | 1263 | ||
1262 | spin_lock_irq(&cpu_buffer->reader_lock); | 1264 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1263 | rb_head_page_deactivate(cpu_buffer); | 1265 | rb_head_page_deactivate(cpu_buffer); |
1264 | 1266 | ||
1265 | for (i = 0; i < nr_pages; i++) { | 1267 | for (i = 0; i < nr_pages; i++) { |
@@ -1277,7 +1279,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | |||
1277 | rb_check_pages(cpu_buffer); | 1279 | rb_check_pages(cpu_buffer); |
1278 | 1280 | ||
1279 | out: | 1281 | out: |
1280 | spin_unlock_irq(&cpu_buffer->reader_lock); | 1282 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1281 | } | 1283 | } |
1282 | 1284 | ||
1283 | static void | 1285 | static void |
@@ -1288,7 +1290,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1288 | struct list_head *p; | 1290 | struct list_head *p; |
1289 | unsigned i; | 1291 | unsigned i; |
1290 | 1292 | ||
1291 | spin_lock_irq(&cpu_buffer->reader_lock); | 1293 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1292 | rb_head_page_deactivate(cpu_buffer); | 1294 | rb_head_page_deactivate(cpu_buffer); |
1293 | 1295 | ||
1294 | for (i = 0; i < nr_pages; i++) { | 1296 | for (i = 0; i < nr_pages; i++) { |
@@ -1303,7 +1305,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1303 | rb_check_pages(cpu_buffer); | 1305 | rb_check_pages(cpu_buffer); |
1304 | 1306 | ||
1305 | out: | 1307 | out: |
1306 | spin_unlock_irq(&cpu_buffer->reader_lock); | 1308 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1307 | } | 1309 | } |
1308 | 1310 | ||
1309 | /** | 1311 | /** |
@@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, | |||
1708 | * the counters. | 1710 | * the counters. |
1709 | */ | 1711 | */ |
1710 | local_add(entries, &cpu_buffer->overrun); | 1712 | local_add(entries, &cpu_buffer->overrun); |
1713 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | ||
1711 | 1714 | ||
1712 | /* | 1715 | /* |
1713 | * The entries will be zeroed out when we move the | 1716 | * The entries will be zeroed out when we move the |
@@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1863 | event = __rb_page_index(tail_page, tail); | 1866 | event = __rb_page_index(tail_page, tail); |
1864 | kmemcheck_annotate_bitfield(event, bitfield); | 1867 | kmemcheck_annotate_bitfield(event, bitfield); |
1865 | 1868 | ||
1869 | /* account for padding bytes */ | ||
1870 | local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); | ||
1871 | |||
1866 | /* | 1872 | /* |
1867 | * Save the original length to the meta data. | 1873 | * Save the original length to the meta data. |
1868 | * This will be used by the reader to add lost event | 1874 | * This will be used by the reader to add lost event |
@@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | |||
2054 | if (!tail) | 2060 | if (!tail) |
2055 | tail_page->page->time_stamp = ts; | 2061 | tail_page->page->time_stamp = ts; |
2056 | 2062 | ||
2063 | /* account for these added bytes */ | ||
2064 | local_add(length, &cpu_buffer->entries_bytes); | ||
2065 | |||
2057 | return event; | 2066 | return event; |
2058 | } | 2067 | } |
2059 | 2068 | ||
@@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | |||
2076 | if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { | 2085 | if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { |
2077 | unsigned long write_mask = | 2086 | unsigned long write_mask = |
2078 | local_read(&bpage->write) & ~RB_WRITE_MASK; | 2087 | local_read(&bpage->write) & ~RB_WRITE_MASK; |
2088 | unsigned long event_length = rb_event_length(event); | ||
2079 | /* | 2089 | /* |
2080 | * This is on the tail page. It is possible that | 2090 | * This is on the tail page. It is possible that |
2081 | * a write could come in and move the tail page | 2091 | * a write could come in and move the tail page |
@@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | |||
2085 | old_index += write_mask; | 2095 | old_index += write_mask; |
2086 | new_index += write_mask; | 2096 | new_index += write_mask; |
2087 | index = local_cmpxchg(&bpage->write, old_index, new_index); | 2097 | index = local_cmpxchg(&bpage->write, old_index, new_index); |
2088 | if (index == old_index) | 2098 | if (index == old_index) { |
2099 | /* update counters */ | ||
2100 | local_sub(event_length, &cpu_buffer->entries_bytes); | ||
2089 | return 1; | 2101 | return 1; |
2102 | } | ||
2090 | } | 2103 | } |
2091 | 2104 | ||
2092 | /* could not discard */ | 2105 | /* could not discard */ |
@@ -2661,6 +2674,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) | |||
2661 | } | 2674 | } |
2662 | 2675 | ||
2663 | /** | 2676 | /** |
2677 | * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer | ||
2678 | * @buffer: The ring buffer | ||
2679 | * @cpu: The per CPU buffer to read from. | ||
2680 | */ | ||
2681 | unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) | ||
2682 | { | ||
2683 | unsigned long flags; | ||
2684 | struct ring_buffer_per_cpu *cpu_buffer; | ||
2685 | struct buffer_page *bpage; | ||
2686 | unsigned long ret; | ||
2687 | |||
2688 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
2689 | return 0; | ||
2690 | |||
2691 | cpu_buffer = buffer->buffers[cpu]; | ||
2692 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | ||
2693 | /* | ||
2694 | * if the tail is on reader_page, oldest time stamp is on the reader | ||
2695 | * page | ||
2696 | */ | ||
2697 | if (cpu_buffer->tail_page == cpu_buffer->reader_page) | ||
2698 | bpage = cpu_buffer->reader_page; | ||
2699 | else | ||
2700 | bpage = rb_set_head_page(cpu_buffer); | ||
2701 | ret = bpage->page->time_stamp; | ||
2702 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | ||
2703 | |||
2704 | return ret; | ||
2705 | } | ||
2706 | EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); | ||
2707 | |||
2708 | /** | ||
2709 | * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer | ||
2710 | * @buffer: The ring buffer | ||
2711 | * @cpu: The per CPU buffer to read from. | ||
2712 | */ | ||
2713 | unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu) | ||
2714 | { | ||
2715 | struct ring_buffer_per_cpu *cpu_buffer; | ||
2716 | unsigned long ret; | ||
2717 | |||
2718 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
2719 | return 0; | ||
2720 | |||
2721 | cpu_buffer = buffer->buffers[cpu]; | ||
2722 | ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; | ||
2723 | |||
2724 | return ret; | ||
2725 | } | ||
2726 | EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); | ||
2727 | |||
2728 | /** | ||
2664 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer | 2729 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer |
2665 | * @buffer: The ring buffer | 2730 | * @buffer: The ring buffer |
2666 | * @cpu: The per CPU buffer to get the entries from. | 2731 | * @cpu: The per CPU buffer to get the entries from. |
@@ -2804,9 +2869,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) | |||
2804 | 2869 | ||
2805 | cpu_buffer = iter->cpu_buffer; | 2870 | cpu_buffer = iter->cpu_buffer; |
2806 | 2871 | ||
2807 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 2872 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
2808 | rb_iter_reset(iter); | 2873 | rb_iter_reset(iter); |
2809 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 2874 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
2810 | } | 2875 | } |
2811 | EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); | 2876 | EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); |
2812 | 2877 | ||
@@ -3265,12 +3330,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, | |||
3265 | again: | 3330 | again: |
3266 | local_irq_save(flags); | 3331 | local_irq_save(flags); |
3267 | if (dolock) | 3332 | if (dolock) |
3268 | spin_lock(&cpu_buffer->reader_lock); | 3333 | raw_spin_lock(&cpu_buffer->reader_lock); |
3269 | event = rb_buffer_peek(cpu_buffer, ts, lost_events); | 3334 | event = rb_buffer_peek(cpu_buffer, ts, lost_events); |
3270 | if (event && event->type_len == RINGBUF_TYPE_PADDING) | 3335 | if (event && event->type_len == RINGBUF_TYPE_PADDING) |
3271 | rb_advance_reader(cpu_buffer); | 3336 | rb_advance_reader(cpu_buffer); |
3272 | if (dolock) | 3337 | if (dolock) |
3273 | spin_unlock(&cpu_buffer->reader_lock); | 3338 | raw_spin_unlock(&cpu_buffer->reader_lock); |
3274 | local_irq_restore(flags); | 3339 | local_irq_restore(flags); |
3275 | 3340 | ||
3276 | if (event && event->type_len == RINGBUF_TYPE_PADDING) | 3341 | if (event && event->type_len == RINGBUF_TYPE_PADDING) |
@@ -3295,9 +3360,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) | |||
3295 | unsigned long flags; | 3360 | unsigned long flags; |
3296 | 3361 | ||
3297 | again: | 3362 | again: |
3298 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3363 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3299 | event = rb_iter_peek(iter, ts); | 3364 | event = rb_iter_peek(iter, ts); |
3300 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3365 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3301 | 3366 | ||
3302 | if (event && event->type_len == RINGBUF_TYPE_PADDING) | 3367 | if (event && event->type_len == RINGBUF_TYPE_PADDING) |
3303 | goto again; | 3368 | goto again; |
@@ -3337,7 +3402,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, | |||
3337 | cpu_buffer = buffer->buffers[cpu]; | 3402 | cpu_buffer = buffer->buffers[cpu]; |
3338 | local_irq_save(flags); | 3403 | local_irq_save(flags); |
3339 | if (dolock) | 3404 | if (dolock) |
3340 | spin_lock(&cpu_buffer->reader_lock); | 3405 | raw_spin_lock(&cpu_buffer->reader_lock); |
3341 | 3406 | ||
3342 | event = rb_buffer_peek(cpu_buffer, ts, lost_events); | 3407 | event = rb_buffer_peek(cpu_buffer, ts, lost_events); |
3343 | if (event) { | 3408 | if (event) { |
@@ -3346,7 +3411,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, | |||
3346 | } | 3411 | } |
3347 | 3412 | ||
3348 | if (dolock) | 3413 | if (dolock) |
3349 | spin_unlock(&cpu_buffer->reader_lock); | 3414 | raw_spin_unlock(&cpu_buffer->reader_lock); |
3350 | local_irq_restore(flags); | 3415 | local_irq_restore(flags); |
3351 | 3416 | ||
3352 | out: | 3417 | out: |
@@ -3438,11 +3503,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter) | |||
3438 | 3503 | ||
3439 | cpu_buffer = iter->cpu_buffer; | 3504 | cpu_buffer = iter->cpu_buffer; |
3440 | 3505 | ||
3441 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3506 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3442 | arch_spin_lock(&cpu_buffer->lock); | 3507 | arch_spin_lock(&cpu_buffer->lock); |
3443 | rb_iter_reset(iter); | 3508 | rb_iter_reset(iter); |
3444 | arch_spin_unlock(&cpu_buffer->lock); | 3509 | arch_spin_unlock(&cpu_buffer->lock); |
3445 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3510 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3446 | } | 3511 | } |
3447 | EXPORT_SYMBOL_GPL(ring_buffer_read_start); | 3512 | EXPORT_SYMBOL_GPL(ring_buffer_read_start); |
3448 | 3513 | ||
@@ -3477,7 +3542,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) | |||
3477 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3542 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3478 | unsigned long flags; | 3543 | unsigned long flags; |
3479 | 3544 | ||
3480 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3545 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3481 | again: | 3546 | again: |
3482 | event = rb_iter_peek(iter, ts); | 3547 | event = rb_iter_peek(iter, ts); |
3483 | if (!event) | 3548 | if (!event) |
@@ -3488,7 +3553,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) | |||
3488 | 3553 | ||
3489 | rb_advance_iter(iter); | 3554 | rb_advance_iter(iter); |
3490 | out: | 3555 | out: |
3491 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3556 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3492 | 3557 | ||
3493 | return event; | 3558 | return event; |
3494 | } | 3559 | } |
@@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
3527 | cpu_buffer->reader_page->read = 0; | 3592 | cpu_buffer->reader_page->read = 0; |
3528 | 3593 | ||
3529 | local_set(&cpu_buffer->commit_overrun, 0); | 3594 | local_set(&cpu_buffer->commit_overrun, 0); |
3595 | local_set(&cpu_buffer->entries_bytes, 0); | ||
3530 | local_set(&cpu_buffer->overrun, 0); | 3596 | local_set(&cpu_buffer->overrun, 0); |
3531 | local_set(&cpu_buffer->entries, 0); | 3597 | local_set(&cpu_buffer->entries, 0); |
3532 | local_set(&cpu_buffer->committing, 0); | 3598 | local_set(&cpu_buffer->committing, 0); |
3533 | local_set(&cpu_buffer->commits, 0); | 3599 | local_set(&cpu_buffer->commits, 0); |
3534 | cpu_buffer->read = 0; | 3600 | cpu_buffer->read = 0; |
3601 | cpu_buffer->read_bytes = 0; | ||
3535 | 3602 | ||
3536 | cpu_buffer->write_stamp = 0; | 3603 | cpu_buffer->write_stamp = 0; |
3537 | cpu_buffer->read_stamp = 0; | 3604 | cpu_buffer->read_stamp = 0; |
@@ -3557,7 +3624,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3557 | 3624 | ||
3558 | atomic_inc(&cpu_buffer->record_disabled); | 3625 | atomic_inc(&cpu_buffer->record_disabled); |
3559 | 3626 | ||
3560 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3627 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3561 | 3628 | ||
3562 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) | 3629 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) |
3563 | goto out; | 3630 | goto out; |
@@ -3569,7 +3636,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3569 | arch_spin_unlock(&cpu_buffer->lock); | 3636 | arch_spin_unlock(&cpu_buffer->lock); |
3570 | 3637 | ||
3571 | out: | 3638 | out: |
3572 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3639 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3573 | 3640 | ||
3574 | atomic_dec(&cpu_buffer->record_disabled); | 3641 | atomic_dec(&cpu_buffer->record_disabled); |
3575 | } | 3642 | } |
@@ -3607,10 +3674,10 @@ int ring_buffer_empty(struct ring_buffer *buffer) | |||
3607 | cpu_buffer = buffer->buffers[cpu]; | 3674 | cpu_buffer = buffer->buffers[cpu]; |
3608 | local_irq_save(flags); | 3675 | local_irq_save(flags); |
3609 | if (dolock) | 3676 | if (dolock) |
3610 | spin_lock(&cpu_buffer->reader_lock); | 3677 | raw_spin_lock(&cpu_buffer->reader_lock); |
3611 | ret = rb_per_cpu_empty(cpu_buffer); | 3678 | ret = rb_per_cpu_empty(cpu_buffer); |
3612 | if (dolock) | 3679 | if (dolock) |
3613 | spin_unlock(&cpu_buffer->reader_lock); | 3680 | raw_spin_unlock(&cpu_buffer->reader_lock); |
3614 | local_irq_restore(flags); | 3681 | local_irq_restore(flags); |
3615 | 3682 | ||
3616 | if (!ret) | 3683 | if (!ret) |
@@ -3641,10 +3708,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) | |||
3641 | cpu_buffer = buffer->buffers[cpu]; | 3708 | cpu_buffer = buffer->buffers[cpu]; |
3642 | local_irq_save(flags); | 3709 | local_irq_save(flags); |
3643 | if (dolock) | 3710 | if (dolock) |
3644 | spin_lock(&cpu_buffer->reader_lock); | 3711 | raw_spin_lock(&cpu_buffer->reader_lock); |
3645 | ret = rb_per_cpu_empty(cpu_buffer); | 3712 | ret = rb_per_cpu_empty(cpu_buffer); |
3646 | if (dolock) | 3713 | if (dolock) |
3647 | spin_unlock(&cpu_buffer->reader_lock); | 3714 | raw_spin_unlock(&cpu_buffer->reader_lock); |
3648 | local_irq_restore(flags); | 3715 | local_irq_restore(flags); |
3649 | 3716 | ||
3650 | return ret; | 3717 | return ret; |
@@ -3841,7 +3908,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3841 | if (!bpage) | 3908 | if (!bpage) |
3842 | goto out; | 3909 | goto out; |
3843 | 3910 | ||
3844 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3911 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3845 | 3912 | ||
3846 | reader = rb_get_reader_page(cpu_buffer); | 3913 | reader = rb_get_reader_page(cpu_buffer); |
3847 | if (!reader) | 3914 | if (!reader) |
@@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3918 | } else { | 3985 | } else { |
3919 | /* update the entry counter */ | 3986 | /* update the entry counter */ |
3920 | cpu_buffer->read += rb_page_entries(reader); | 3987 | cpu_buffer->read += rb_page_entries(reader); |
3988 | cpu_buffer->read_bytes += BUF_PAGE_SIZE; | ||
3921 | 3989 | ||
3922 | /* swap the pages */ | 3990 | /* swap the pages */ |
3923 | rb_init_page(bpage); | 3991 | rb_init_page(bpage); |
@@ -3964,7 +4032,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3964 | memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); | 4032 | memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); |
3965 | 4033 | ||
3966 | out_unlock: | 4034 | out_unlock: |
3967 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 4035 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3968 | 4036 | ||
3969 | out: | 4037 | out: |
3970 | return ret; | 4038 | return ret; |
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c new file mode 100644 index 000000000000..4b3b5eaf94d1 --- /dev/null +++ b/kernel/trace/rpm-traces.c | |||
@@ -0,0 +1,20 @@ | |||
1 | /* | ||
2 | * Power trace points | ||
3 | * | ||
4 | * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com> | ||
5 | */ | ||
6 | |||
7 | #include <linux/string.h> | ||
8 | #include <linux/types.h> | ||
9 | #include <linux/workqueue.h> | ||
10 | #include <linux/sched.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/usb.h> | ||
13 | |||
14 | #define CREATE_TRACE_POINTS | ||
15 | #include <trace/events/rpm.h> | ||
16 | |||
17 | EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int); | ||
18 | EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle); | ||
19 | EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend); | ||
20 | EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume); | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5df02c69b1d..f2bd275bb60f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -341,7 +341,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | |||
341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; | 341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; |
342 | 342 | ||
343 | static int trace_stop_count; | 343 | static int trace_stop_count; |
344 | static DEFINE_SPINLOCK(tracing_start_lock); | 344 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); |
345 | 345 | ||
346 | static void wakeup_work_handler(struct work_struct *work) | 346 | static void wakeup_work_handler(struct work_struct *work) |
347 | { | 347 | { |
@@ -435,6 +435,7 @@ static struct { | |||
435 | } trace_clocks[] = { | 435 | } trace_clocks[] = { |
436 | { trace_clock_local, "local" }, | 436 | { trace_clock_local, "local" }, |
437 | { trace_clock_global, "global" }, | 437 | { trace_clock_global, "global" }, |
438 | { trace_clock_counter, "counter" }, | ||
438 | }; | 439 | }; |
439 | 440 | ||
440 | int trace_clock_id; | 441 | int trace_clock_id; |
@@ -960,7 +961,7 @@ void tracing_start(void) | |||
960 | if (tracing_disabled) | 961 | if (tracing_disabled) |
961 | return; | 962 | return; |
962 | 963 | ||
963 | spin_lock_irqsave(&tracing_start_lock, flags); | 964 | raw_spin_lock_irqsave(&tracing_start_lock, flags); |
964 | if (--trace_stop_count) { | 965 | if (--trace_stop_count) { |
965 | if (trace_stop_count < 0) { | 966 | if (trace_stop_count < 0) { |
966 | /* Someone screwed up their debugging */ | 967 | /* Someone screwed up their debugging */ |
@@ -985,7 +986,7 @@ void tracing_start(void) | |||
985 | 986 | ||
986 | ftrace_start(); | 987 | ftrace_start(); |
987 | out: | 988 | out: |
988 | spin_unlock_irqrestore(&tracing_start_lock, flags); | 989 | raw_spin_unlock_irqrestore(&tracing_start_lock, flags); |
989 | } | 990 | } |
990 | 991 | ||
991 | /** | 992 | /** |
@@ -1000,7 +1001,7 @@ void tracing_stop(void) | |||
1000 | unsigned long flags; | 1001 | unsigned long flags; |
1001 | 1002 | ||
1002 | ftrace_stop(); | 1003 | ftrace_stop(); |
1003 | spin_lock_irqsave(&tracing_start_lock, flags); | 1004 | raw_spin_lock_irqsave(&tracing_start_lock, flags); |
1004 | if (trace_stop_count++) | 1005 | if (trace_stop_count++) |
1005 | goto out; | 1006 | goto out; |
1006 | 1007 | ||
@@ -1018,7 +1019,7 @@ void tracing_stop(void) | |||
1018 | arch_spin_unlock(&ftrace_max_lock); | 1019 | arch_spin_unlock(&ftrace_max_lock); |
1019 | 1020 | ||
1020 | out: | 1021 | out: |
1021 | spin_unlock_irqrestore(&tracing_start_lock, flags); | 1022 | raw_spin_unlock_irqrestore(&tracing_start_lock, flags); |
1022 | } | 1023 | } |
1023 | 1024 | ||
1024 | void trace_stop_cmdline_recording(void); | 1025 | void trace_stop_cmdline_recording(void); |
@@ -2159,6 +2160,14 @@ void trace_default_header(struct seq_file *m) | |||
2159 | } | 2160 | } |
2160 | } | 2161 | } |
2161 | 2162 | ||
2163 | static void test_ftrace_alive(struct seq_file *m) | ||
2164 | { | ||
2165 | if (!ftrace_is_dead()) | ||
2166 | return; | ||
2167 | seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); | ||
2168 | seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); | ||
2169 | } | ||
2170 | |||
2162 | static int s_show(struct seq_file *m, void *v) | 2171 | static int s_show(struct seq_file *m, void *v) |
2163 | { | 2172 | { |
2164 | struct trace_iterator *iter = v; | 2173 | struct trace_iterator *iter = v; |
@@ -2168,6 +2177,7 @@ static int s_show(struct seq_file *m, void *v) | |||
2168 | if (iter->tr) { | 2177 | if (iter->tr) { |
2169 | seq_printf(m, "# tracer: %s\n", iter->trace->name); | 2178 | seq_printf(m, "# tracer: %s\n", iter->trace->name); |
2170 | seq_puts(m, "#\n"); | 2179 | seq_puts(m, "#\n"); |
2180 | test_ftrace_alive(m); | ||
2171 | } | 2181 | } |
2172 | if (iter->trace && iter->trace->print_header) | 2182 | if (iter->trace && iter->trace->print_header) |
2173 | iter->trace->print_header(m); | 2183 | iter->trace->print_header(m); |
@@ -2710,9 +2720,9 @@ static const char readme_msg[] = | |||
2710 | "# cat /sys/kernel/debug/tracing/trace_options\n" | 2720 | "# cat /sys/kernel/debug/tracing/trace_options\n" |
2711 | "noprint-parent nosym-offset nosym-addr noverbose\n" | 2721 | "noprint-parent nosym-offset nosym-addr noverbose\n" |
2712 | "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" | 2722 | "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" |
2713 | "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" | 2723 | "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" |
2714 | "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" | 2724 | "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" |
2715 | "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" | 2725 | "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" |
2716 | ; | 2726 | ; |
2717 | 2727 | ||
2718 | static ssize_t | 2728 | static ssize_t |
@@ -3569,6 +3579,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3569 | } | 3579 | } |
3570 | 3580 | ||
3571 | static ssize_t | 3581 | static ssize_t |
3582 | tracing_total_entries_read(struct file *filp, char __user *ubuf, | ||
3583 | size_t cnt, loff_t *ppos) | ||
3584 | { | ||
3585 | struct trace_array *tr = filp->private_data; | ||
3586 | char buf[64]; | ||
3587 | int r, cpu; | ||
3588 | unsigned long size = 0, expanded_size = 0; | ||
3589 | |||
3590 | mutex_lock(&trace_types_lock); | ||
3591 | for_each_tracing_cpu(cpu) { | ||
3592 | size += tr->entries >> 10; | ||
3593 | if (!ring_buffer_expanded) | ||
3594 | expanded_size += trace_buf_size >> 10; | ||
3595 | } | ||
3596 | if (ring_buffer_expanded) | ||
3597 | r = sprintf(buf, "%lu\n", size); | ||
3598 | else | ||
3599 | r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); | ||
3600 | mutex_unlock(&trace_types_lock); | ||
3601 | |||
3602 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
3603 | } | ||
3604 | |||
3605 | static ssize_t | ||
3572 | tracing_free_buffer_write(struct file *filp, const char __user *ubuf, | 3606 | tracing_free_buffer_write(struct file *filp, const char __user *ubuf, |
3573 | size_t cnt, loff_t *ppos) | 3607 | size_t cnt, loff_t *ppos) |
3574 | { | 3608 | { |
@@ -3594,22 +3628,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) | |||
3594 | return 0; | 3628 | return 0; |
3595 | } | 3629 | } |
3596 | 3630 | ||
3597 | static int mark_printk(const char *fmt, ...) | ||
3598 | { | ||
3599 | int ret; | ||
3600 | va_list args; | ||
3601 | va_start(args, fmt); | ||
3602 | ret = trace_vprintk(0, fmt, args); | ||
3603 | va_end(args); | ||
3604 | return ret; | ||
3605 | } | ||
3606 | |||
3607 | static ssize_t | 3631 | static ssize_t |
3608 | tracing_mark_write(struct file *filp, const char __user *ubuf, | 3632 | tracing_mark_write(struct file *filp, const char __user *ubuf, |
3609 | size_t cnt, loff_t *fpos) | 3633 | size_t cnt, loff_t *fpos) |
3610 | { | 3634 | { |
3611 | char *buf; | 3635 | unsigned long addr = (unsigned long)ubuf; |
3612 | size_t written; | 3636 | struct ring_buffer_event *event; |
3637 | struct ring_buffer *buffer; | ||
3638 | struct print_entry *entry; | ||
3639 | unsigned long irq_flags; | ||
3640 | struct page *pages[2]; | ||
3641 | int nr_pages = 1; | ||
3642 | ssize_t written; | ||
3643 | void *page1; | ||
3644 | void *page2; | ||
3645 | int offset; | ||
3646 | int size; | ||
3647 | int len; | ||
3648 | int ret; | ||
3613 | 3649 | ||
3614 | if (tracing_disabled) | 3650 | if (tracing_disabled) |
3615 | return -EINVAL; | 3651 | return -EINVAL; |
@@ -3617,28 +3653,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3617 | if (cnt > TRACE_BUF_SIZE) | 3653 | if (cnt > TRACE_BUF_SIZE) |
3618 | cnt = TRACE_BUF_SIZE; | 3654 | cnt = TRACE_BUF_SIZE; |
3619 | 3655 | ||
3620 | buf = kmalloc(cnt + 2, GFP_KERNEL); | 3656 | /* |
3621 | if (buf == NULL) | 3657 | * Userspace is injecting traces into the kernel trace buffer. |
3622 | return -ENOMEM; | 3658 | * We want to be as non intrusive as possible. |
3659 | * To do so, we do not want to allocate any special buffers | ||
3660 | * or take any locks, but instead write the userspace data | ||
3661 | * straight into the ring buffer. | ||
3662 | * | ||
3663 | * First we need to pin the userspace buffer into memory, | ||
3664 | * which, most likely it is, because it just referenced it. | ||
3665 | * But there's no guarantee that it is. By using get_user_pages_fast() | ||
3666 | * and kmap_atomic/kunmap_atomic() we can get access to the | ||
3667 | * pages directly. We then write the data directly into the | ||
3668 | * ring buffer. | ||
3669 | */ | ||
3670 | BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); | ||
3623 | 3671 | ||
3624 | if (copy_from_user(buf, ubuf, cnt)) { | 3672 | /* check if we cross pages */ |
3625 | kfree(buf); | 3673 | if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) |
3626 | return -EFAULT; | 3674 | nr_pages = 2; |
3675 | |||
3676 | offset = addr & (PAGE_SIZE - 1); | ||
3677 | addr &= PAGE_MASK; | ||
3678 | |||
3679 | ret = get_user_pages_fast(addr, nr_pages, 0, pages); | ||
3680 | if (ret < nr_pages) { | ||
3681 | while (--ret >= 0) | ||
3682 | put_page(pages[ret]); | ||
3683 | written = -EFAULT; | ||
3684 | goto out; | ||
3685 | } | ||
3686 | |||
3687 | page1 = kmap_atomic(pages[0]); | ||
3688 | if (nr_pages == 2) | ||
3689 | page2 = kmap_atomic(pages[1]); | ||
3690 | |||
3691 | local_save_flags(irq_flags); | ||
3692 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ | ||
3693 | buffer = global_trace.buffer; | ||
3694 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, | ||
3695 | irq_flags, preempt_count()); | ||
3696 | if (!event) { | ||
3697 | /* Ring buffer disabled, return as if not open for write */ | ||
3698 | written = -EBADF; | ||
3699 | goto out_unlock; | ||
3627 | } | 3700 | } |
3628 | if (buf[cnt-1] != '\n') { | 3701 | |
3629 | buf[cnt] = '\n'; | 3702 | entry = ring_buffer_event_data(event); |
3630 | buf[cnt+1] = '\0'; | 3703 | entry->ip = _THIS_IP_; |
3704 | |||
3705 | if (nr_pages == 2) { | ||
3706 | len = PAGE_SIZE - offset; | ||
3707 | memcpy(&entry->buf, page1 + offset, len); | ||
3708 | memcpy(&entry->buf[len], page2, cnt - len); | ||
3631 | } else | 3709 | } else |
3632 | buf[cnt] = '\0'; | 3710 | memcpy(&entry->buf, page1 + offset, cnt); |
3633 | 3711 | ||
3634 | written = mark_printk("%s", buf); | 3712 | if (entry->buf[cnt - 1] != '\n') { |
3635 | kfree(buf); | 3713 | entry->buf[cnt] = '\n'; |
3636 | *fpos += written; | 3714 | entry->buf[cnt + 1] = '\0'; |
3715 | } else | ||
3716 | entry->buf[cnt] = '\0'; | ||
3717 | |||
3718 | ring_buffer_unlock_commit(buffer, event); | ||
3637 | 3719 | ||
3638 | /* don't tell userspace we wrote more - it might confuse them */ | 3720 | written = cnt; |
3639 | if (written > cnt) | ||
3640 | written = cnt; | ||
3641 | 3721 | ||
3722 | *fpos += written; | ||
3723 | |||
3724 | out_unlock: | ||
3725 | if (nr_pages == 2) | ||
3726 | kunmap_atomic(page2); | ||
3727 | kunmap_atomic(page1); | ||
3728 | while (nr_pages > 0) | ||
3729 | put_page(pages[--nr_pages]); | ||
3730 | out: | ||
3642 | return written; | 3731 | return written; |
3643 | } | 3732 | } |
3644 | 3733 | ||
@@ -3739,6 +3828,12 @@ static const struct file_operations tracing_entries_fops = { | |||
3739 | .llseek = generic_file_llseek, | 3828 | .llseek = generic_file_llseek, |
3740 | }; | 3829 | }; |
3741 | 3830 | ||
3831 | static const struct file_operations tracing_total_entries_fops = { | ||
3832 | .open = tracing_open_generic, | ||
3833 | .read = tracing_total_entries_read, | ||
3834 | .llseek = generic_file_llseek, | ||
3835 | }; | ||
3836 | |||
3742 | static const struct file_operations tracing_free_buffer_fops = { | 3837 | static const struct file_operations tracing_free_buffer_fops = { |
3743 | .write = tracing_free_buffer_write, | 3838 | .write = tracing_free_buffer_write, |
3744 | .release = tracing_free_buffer_release, | 3839 | .release = tracing_free_buffer_release, |
@@ -3808,8 +3903,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
3808 | if (info->read < PAGE_SIZE) | 3903 | if (info->read < PAGE_SIZE) |
3809 | goto read; | 3904 | goto read; |
3810 | 3905 | ||
3811 | info->read = 0; | ||
3812 | |||
3813 | trace_access_lock(info->cpu); | 3906 | trace_access_lock(info->cpu); |
3814 | ret = ring_buffer_read_page(info->tr->buffer, | 3907 | ret = ring_buffer_read_page(info->tr->buffer, |
3815 | &info->spare, | 3908 | &info->spare, |
@@ -3819,6 +3912,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
3819 | if (ret < 0) | 3912 | if (ret < 0) |
3820 | return 0; | 3913 | return 0; |
3821 | 3914 | ||
3915 | info->read = 0; | ||
3916 | |||
3822 | read: | 3917 | read: |
3823 | size = PAGE_SIZE - info->read; | 3918 | size = PAGE_SIZE - info->read; |
3824 | if (size > count) | 3919 | if (size > count) |
@@ -4026,6 +4121,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
4026 | struct trace_array *tr = &global_trace; | 4121 | struct trace_array *tr = &global_trace; |
4027 | struct trace_seq *s; | 4122 | struct trace_seq *s; |
4028 | unsigned long cnt; | 4123 | unsigned long cnt; |
4124 | unsigned long long t; | ||
4125 | unsigned long usec_rem; | ||
4029 | 4126 | ||
4030 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 4127 | s = kmalloc(sizeof(*s), GFP_KERNEL); |
4031 | if (!s) | 4128 | if (!s) |
@@ -4042,6 +4139,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
4042 | cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); | 4139 | cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); |
4043 | trace_seq_printf(s, "commit overrun: %ld\n", cnt); | 4140 | trace_seq_printf(s, "commit overrun: %ld\n", cnt); |
4044 | 4141 | ||
4142 | cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); | ||
4143 | trace_seq_printf(s, "bytes: %ld\n", cnt); | ||
4144 | |||
4145 | t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); | ||
4146 | usec_rem = do_div(t, USEC_PER_SEC); | ||
4147 | trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); | ||
4148 | |||
4149 | t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); | ||
4150 | usec_rem = do_div(t, USEC_PER_SEC); | ||
4151 | trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); | ||
4152 | |||
4045 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | 4153 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); |
4046 | 4154 | ||
4047 | kfree(s); | 4155 | kfree(s); |
@@ -4450,6 +4558,9 @@ static __init int tracer_init_debugfs(void) | |||
4450 | trace_create_file("buffer_size_kb", 0644, d_tracer, | 4558 | trace_create_file("buffer_size_kb", 0644, d_tracer, |
4451 | &global_trace, &tracing_entries_fops); | 4559 | &global_trace, &tracing_entries_fops); |
4452 | 4560 | ||
4561 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, | ||
4562 | &global_trace, &tracing_total_entries_fops); | ||
4563 | |||
4453 | trace_create_file("free_buffer", 0644, d_tracer, | 4564 | trace_create_file("free_buffer", 0644, d_tracer, |
4454 | &global_trace, &tracing_free_buffer_fops); | 4565 | &global_trace, &tracing_free_buffer_fops); |
4455 | 4566 | ||
@@ -4566,6 +4677,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
4566 | 4677 | ||
4567 | tracing_off(); | 4678 | tracing_off(); |
4568 | 4679 | ||
4680 | /* Did function tracer already get disabled? */ | ||
4681 | if (ftrace_is_dead()) { | ||
4682 | printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); | ||
4683 | printk("# MAY BE MISSING FUNCTION EVENTS\n"); | ||
4684 | } | ||
4685 | |||
4569 | if (disable_tracing) | 4686 | if (disable_tracing) |
4570 | ftrace_kill(); | 4687 | ftrace_kill(); |
4571 | 4688 | ||
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 616846bcfee5..092e1f8d18dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -579,11 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task) | |||
579 | 579 | ||
580 | return test_tsk_trace_trace(task); | 580 | return test_tsk_trace_trace(task); |
581 | } | 581 | } |
582 | extern int ftrace_is_dead(void); | ||
582 | #else | 583 | #else |
583 | static inline int ftrace_trace_task(struct task_struct *task) | 584 | static inline int ftrace_trace_task(struct task_struct *task) |
584 | { | 585 | { |
585 | return 1; | 586 | return 1; |
586 | } | 587 | } |
588 | static inline int ftrace_is_dead(void) { return 0; } | ||
587 | #endif | 589 | #endif |
588 | 590 | ||
589 | /* | 591 | /* |
@@ -761,16 +763,10 @@ struct filter_pred { | |||
761 | filter_pred_fn_t fn; | 763 | filter_pred_fn_t fn; |
762 | u64 val; | 764 | u64 val; |
763 | struct regex regex; | 765 | struct regex regex; |
764 | /* | 766 | unsigned short *ops; |
765 | * Leaf nodes use field_name, ops is used by AND and OR | 767 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
766 | * nodes. The field_name is always freed when freeing a pred. | 768 | struct ftrace_event_field *field; |
767 | * We can overload field_name for ops and have it freed | 769 | #endif |
768 | * as well. | ||
769 | */ | ||
770 | union { | ||
771 | char *field_name; | ||
772 | unsigned short *ops; | ||
773 | }; | ||
774 | int offset; | 770 | int offset; |
775 | int not; | 771 | int not; |
776 | int op; | 772 | int op; |
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 6302747a1398..394783531cbb 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
@@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void) | |||
113 | 113 | ||
114 | return now; | 114 | return now; |
115 | } | 115 | } |
116 | |||
117 | static atomic64_t trace_counter; | ||
118 | |||
119 | /* | ||
120 | * trace_clock_counter(): simply an atomic counter. | ||
121 | * Use the trace_counter "counter" for cases where you do not care | ||
122 | * about timings, but are interested in strict ordering. | ||
123 | */ | ||
124 | u64 notrace trace_clock_counter(void) | ||
125 | { | ||
126 | return atomic64_add_return(1, &trace_counter); | ||
127 | } | ||
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 256764ecccd6..816d3d074979 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -381,6 +381,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, | |||
381 | return pred; | 381 | return pred; |
382 | } | 382 | } |
383 | 383 | ||
384 | enum walk_return { | ||
385 | WALK_PRED_ABORT, | ||
386 | WALK_PRED_PARENT, | ||
387 | WALK_PRED_DEFAULT, | ||
388 | }; | ||
389 | |||
390 | typedef int (*filter_pred_walkcb_t) (enum move_type move, | ||
391 | struct filter_pred *pred, | ||
392 | int *err, void *data); | ||
393 | |||
394 | static int walk_pred_tree(struct filter_pred *preds, | ||
395 | struct filter_pred *root, | ||
396 | filter_pred_walkcb_t cb, void *data) | ||
397 | { | ||
398 | struct filter_pred *pred = root; | ||
399 | enum move_type move = MOVE_DOWN; | ||
400 | int done = 0; | ||
401 | |||
402 | if (!preds) | ||
403 | return -EINVAL; | ||
404 | |||
405 | do { | ||
406 | int err = 0, ret; | ||
407 | |||
408 | ret = cb(move, pred, &err, data); | ||
409 | if (ret == WALK_PRED_ABORT) | ||
410 | return err; | ||
411 | if (ret == WALK_PRED_PARENT) | ||
412 | goto get_parent; | ||
413 | |||
414 | switch (move) { | ||
415 | case MOVE_DOWN: | ||
416 | if (pred->left != FILTER_PRED_INVALID) { | ||
417 | pred = &preds[pred->left]; | ||
418 | continue; | ||
419 | } | ||
420 | goto get_parent; | ||
421 | case MOVE_UP_FROM_LEFT: | ||
422 | pred = &preds[pred->right]; | ||
423 | move = MOVE_DOWN; | ||
424 | continue; | ||
425 | case MOVE_UP_FROM_RIGHT: | ||
426 | get_parent: | ||
427 | if (pred == root) | ||
428 | break; | ||
429 | pred = get_pred_parent(pred, preds, | ||
430 | pred->parent, | ||
431 | &move); | ||
432 | continue; | ||
433 | } | ||
434 | done = 1; | ||
435 | } while (!done); | ||
436 | |||
437 | /* We are fine. */ | ||
438 | return 0; | ||
439 | } | ||
440 | |||
384 | /* | 441 | /* |
385 | * A series of AND or ORs were found together. Instead of | 442 | * A series of AND or ORs were found together. Instead of |
386 | * climbing up and down the tree branches, an array of the | 443 | * climbing up and down the tree branches, an array of the |
@@ -410,99 +467,91 @@ static int process_ops(struct filter_pred *preds, | |||
410 | 467 | ||
411 | for (i = 0; i < op->val; i++) { | 468 | for (i = 0; i < op->val; i++) { |
412 | pred = &preds[op->ops[i]]; | 469 | pred = &preds[op->ops[i]]; |
413 | match = pred->fn(pred, rec); | 470 | if (!WARN_ON_ONCE(!pred->fn)) |
471 | match = pred->fn(pred, rec); | ||
414 | if (!!match == type) | 472 | if (!!match == type) |
415 | return match; | 473 | return match; |
416 | } | 474 | } |
417 | return match; | 475 | return match; |
418 | } | 476 | } |
419 | 477 | ||
478 | struct filter_match_preds_data { | ||
479 | struct filter_pred *preds; | ||
480 | int match; | ||
481 | void *rec; | ||
482 | }; | ||
483 | |||
484 | static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, | ||
485 | int *err, void *data) | ||
486 | { | ||
487 | struct filter_match_preds_data *d = data; | ||
488 | |||
489 | *err = 0; | ||
490 | switch (move) { | ||
491 | case MOVE_DOWN: | ||
492 | /* only AND and OR have children */ | ||
493 | if (pred->left != FILTER_PRED_INVALID) { | ||
494 | /* If ops is set, then it was folded. */ | ||
495 | if (!pred->ops) | ||
496 | return WALK_PRED_DEFAULT; | ||
497 | /* We can treat folded ops as a leaf node */ | ||
498 | d->match = process_ops(d->preds, pred, d->rec); | ||
499 | } else { | ||
500 | if (!WARN_ON_ONCE(!pred->fn)) | ||
501 | d->match = pred->fn(pred, d->rec); | ||
502 | } | ||
503 | |||
504 | return WALK_PRED_PARENT; | ||
505 | case MOVE_UP_FROM_LEFT: | ||
506 | /* | ||
507 | * Check for short circuits. | ||
508 | * | ||
509 | * Optimization: !!match == (pred->op == OP_OR) | ||
510 | * is the same as: | ||
511 | * if ((match && pred->op == OP_OR) || | ||
512 | * (!match && pred->op == OP_AND)) | ||
513 | */ | ||
514 | if (!!d->match == (pred->op == OP_OR)) | ||
515 | return WALK_PRED_PARENT; | ||
516 | break; | ||
517 | case MOVE_UP_FROM_RIGHT: | ||
518 | break; | ||
519 | } | ||
520 | |||
521 | return WALK_PRED_DEFAULT; | ||
522 | } | ||
523 | |||
420 | /* return 1 if event matches, 0 otherwise (discard) */ | 524 | /* return 1 if event matches, 0 otherwise (discard) */ |
421 | int filter_match_preds(struct event_filter *filter, void *rec) | 525 | int filter_match_preds(struct event_filter *filter, void *rec) |
422 | { | 526 | { |
423 | int match = -1; | ||
424 | enum move_type move = MOVE_DOWN; | ||
425 | struct filter_pred *preds; | 527 | struct filter_pred *preds; |
426 | struct filter_pred *pred; | ||
427 | struct filter_pred *root; | 528 | struct filter_pred *root; |
428 | int n_preds; | 529 | struct filter_match_preds_data data = { |
429 | int done = 0; | 530 | /* match is currently meaningless */ |
531 | .match = -1, | ||
532 | .rec = rec, | ||
533 | }; | ||
534 | int n_preds, ret; | ||
430 | 535 | ||
431 | /* no filter is considered a match */ | 536 | /* no filter is considered a match */ |
432 | if (!filter) | 537 | if (!filter) |
433 | return 1; | 538 | return 1; |
434 | 539 | ||
435 | n_preds = filter->n_preds; | 540 | n_preds = filter->n_preds; |
436 | |||
437 | if (!n_preds) | 541 | if (!n_preds) |
438 | return 1; | 542 | return 1; |
439 | 543 | ||
440 | /* | 544 | /* |
441 | * n_preds, root and filter->preds are protected with preemption disabled. | 545 | * n_preds, root and filter->preds are protected with preemption disabled. |
442 | */ | 546 | */ |
443 | preds = rcu_dereference_sched(filter->preds); | ||
444 | root = rcu_dereference_sched(filter->root); | 547 | root = rcu_dereference_sched(filter->root); |
445 | if (!root) | 548 | if (!root) |
446 | return 1; | 549 | return 1; |
447 | 550 | ||
448 | pred = root; | 551 | data.preds = preds = rcu_dereference_sched(filter->preds); |
449 | 552 | ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); | |
450 | /* match is currently meaningless */ | 553 | WARN_ON(ret); |
451 | match = -1; | 554 | return data.match; |
452 | |||
453 | do { | ||
454 | switch (move) { | ||
455 | case MOVE_DOWN: | ||
456 | /* only AND and OR have children */ | ||
457 | if (pred->left != FILTER_PRED_INVALID) { | ||
458 | /* If ops is set, then it was folded. */ | ||
459 | if (!pred->ops) { | ||
460 | /* keep going to down the left side */ | ||
461 | pred = &preds[pred->left]; | ||
462 | continue; | ||
463 | } | ||
464 | /* We can treat folded ops as a leaf node */ | ||
465 | match = process_ops(preds, pred, rec); | ||
466 | } else | ||
467 | match = pred->fn(pred, rec); | ||
468 | /* If this pred is the only pred */ | ||
469 | if (pred == root) | ||
470 | break; | ||
471 | pred = get_pred_parent(pred, preds, | ||
472 | pred->parent, &move); | ||
473 | continue; | ||
474 | case MOVE_UP_FROM_LEFT: | ||
475 | /* | ||
476 | * Check for short circuits. | ||
477 | * | ||
478 | * Optimization: !!match == (pred->op == OP_OR) | ||
479 | * is the same as: | ||
480 | * if ((match && pred->op == OP_OR) || | ||
481 | * (!match && pred->op == OP_AND)) | ||
482 | */ | ||
483 | if (!!match == (pred->op == OP_OR)) { | ||
484 | if (pred == root) | ||
485 | break; | ||
486 | pred = get_pred_parent(pred, preds, | ||
487 | pred->parent, &move); | ||
488 | continue; | ||
489 | } | ||
490 | /* now go down the right side of the tree. */ | ||
491 | pred = &preds[pred->right]; | ||
492 | move = MOVE_DOWN; | ||
493 | continue; | ||
494 | case MOVE_UP_FROM_RIGHT: | ||
495 | /* We finished this equation. */ | ||
496 | if (pred == root) | ||
497 | break; | ||
498 | pred = get_pred_parent(pred, preds, | ||
499 | pred->parent, &move); | ||
500 | continue; | ||
501 | } | ||
502 | done = 1; | ||
503 | } while (!done); | ||
504 | |||
505 | return match; | ||
506 | } | 555 | } |
507 | EXPORT_SYMBOL_GPL(filter_match_preds); | 556 | EXPORT_SYMBOL_GPL(filter_match_preds); |
508 | 557 | ||
@@ -628,22 +677,6 @@ find_event_field(struct ftrace_event_call *call, char *name) | |||
628 | return __find_event_field(head, name); | 677 | return __find_event_field(head, name); |
629 | } | 678 | } |
630 | 679 | ||
631 | static void filter_free_pred(struct filter_pred *pred) | ||
632 | { | ||
633 | if (!pred) | ||
634 | return; | ||
635 | |||
636 | kfree(pred->field_name); | ||
637 | kfree(pred); | ||
638 | } | ||
639 | |||
640 | static void filter_clear_pred(struct filter_pred *pred) | ||
641 | { | ||
642 | kfree(pred->field_name); | ||
643 | pred->field_name = NULL; | ||
644 | pred->regex.len = 0; | ||
645 | } | ||
646 | |||
647 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) | 680 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) |
648 | { | 681 | { |
649 | stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); | 682 | stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); |
@@ -689,20 +722,13 @@ __pop_pred_stack(struct pred_stack *stack) | |||
689 | static int filter_set_pred(struct event_filter *filter, | 722 | static int filter_set_pred(struct event_filter *filter, |
690 | int idx, | 723 | int idx, |
691 | struct pred_stack *stack, | 724 | struct pred_stack *stack, |
692 | struct filter_pred *src, | 725 | struct filter_pred *src) |
693 | filter_pred_fn_t fn) | ||
694 | { | 726 | { |
695 | struct filter_pred *dest = &filter->preds[idx]; | 727 | struct filter_pred *dest = &filter->preds[idx]; |
696 | struct filter_pred *left; | 728 | struct filter_pred *left; |
697 | struct filter_pred *right; | 729 | struct filter_pred *right; |
698 | 730 | ||
699 | *dest = *src; | 731 | *dest = *src; |
700 | if (src->field_name) { | ||
701 | dest->field_name = kstrdup(src->field_name, GFP_KERNEL); | ||
702 | if (!dest->field_name) | ||
703 | return -ENOMEM; | ||
704 | } | ||
705 | dest->fn = fn; | ||
706 | dest->index = idx; | 732 | dest->index = idx; |
707 | 733 | ||
708 | if (dest->op == OP_OR || dest->op == OP_AND) { | 734 | if (dest->op == OP_OR || dest->op == OP_AND) { |
@@ -743,11 +769,7 @@ static int filter_set_pred(struct event_filter *filter, | |||
743 | 769 | ||
744 | static void __free_preds(struct event_filter *filter) | 770 | static void __free_preds(struct event_filter *filter) |
745 | { | 771 | { |
746 | int i; | ||
747 | |||
748 | if (filter->preds) { | 772 | if (filter->preds) { |
749 | for (i = 0; i < filter->a_preds; i++) | ||
750 | kfree(filter->preds[i].field_name); | ||
751 | kfree(filter->preds); | 773 | kfree(filter->preds); |
752 | filter->preds = NULL; | 774 | filter->preds = NULL; |
753 | } | 775 | } |
@@ -840,23 +862,19 @@ static void filter_free_subsystem_filters(struct event_subsystem *system) | |||
840 | } | 862 | } |
841 | } | 863 | } |
842 | 864 | ||
843 | static int filter_add_pred_fn(struct filter_parse_state *ps, | 865 | static int filter_add_pred(struct filter_parse_state *ps, |
844 | struct ftrace_event_call *call, | 866 | struct event_filter *filter, |
845 | struct event_filter *filter, | 867 | struct filter_pred *pred, |
846 | struct filter_pred *pred, | 868 | struct pred_stack *stack) |
847 | struct pred_stack *stack, | ||
848 | filter_pred_fn_t fn) | ||
849 | { | 869 | { |
850 | int idx, err; | 870 | int err; |
851 | 871 | ||
852 | if (WARN_ON(filter->n_preds == filter->a_preds)) { | 872 | if (WARN_ON(filter->n_preds == filter->a_preds)) { |
853 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 873 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
854 | return -ENOSPC; | 874 | return -ENOSPC; |
855 | } | 875 | } |
856 | 876 | ||
857 | idx = filter->n_preds; | 877 | err = filter_set_pred(filter, filter->n_preds, stack, pred); |
858 | filter_clear_pred(&filter->preds[idx]); | ||
859 | err = filter_set_pred(filter, idx, stack, pred, fn); | ||
860 | if (err) | 878 | if (err) |
861 | return err; | 879 | return err; |
862 | 880 | ||
@@ -937,31 +955,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, | |||
937 | return fn; | 955 | return fn; |
938 | } | 956 | } |
939 | 957 | ||
940 | static int filter_add_pred(struct filter_parse_state *ps, | 958 | static int init_pred(struct filter_parse_state *ps, |
941 | struct ftrace_event_call *call, | 959 | struct ftrace_event_field *field, |
942 | struct event_filter *filter, | 960 | struct filter_pred *pred) |
943 | struct filter_pred *pred, | 961 | |
944 | struct pred_stack *stack, | ||
945 | bool dry_run) | ||
946 | { | 962 | { |
947 | struct ftrace_event_field *field; | 963 | filter_pred_fn_t fn = filter_pred_none; |
948 | filter_pred_fn_t fn; | ||
949 | unsigned long long val; | 964 | unsigned long long val; |
950 | int ret; | 965 | int ret; |
951 | 966 | ||
952 | fn = pred->fn = filter_pred_none; | ||
953 | |||
954 | if (pred->op == OP_AND) | ||
955 | goto add_pred_fn; | ||
956 | else if (pred->op == OP_OR) | ||
957 | goto add_pred_fn; | ||
958 | |||
959 | field = find_event_field(call, pred->field_name); | ||
960 | if (!field) { | ||
961 | parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); | ||
962 | return -EINVAL; | ||
963 | } | ||
964 | |||
965 | pred->offset = field->offset; | 967 | pred->offset = field->offset; |
966 | 968 | ||
967 | if (!is_legal_op(field, pred->op)) { | 969 | if (!is_legal_op(field, pred->op)) { |
@@ -1001,9 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
1001 | if (pred->op == OP_NE) | 1003 | if (pred->op == OP_NE) |
1002 | pred->not = 1; | 1004 | pred->not = 1; |
1003 | 1005 | ||
1004 | add_pred_fn: | 1006 | pred->fn = fn; |
1005 | if (!dry_run) | ||
1006 | return filter_add_pred_fn(ps, call, filter, pred, stack, fn); | ||
1007 | return 0; | 1007 | return 0; |
1008 | } | 1008 | } |
1009 | 1009 | ||
@@ -1302,39 +1302,37 @@ parse_operand: | |||
1302 | return 0; | 1302 | return 0; |
1303 | } | 1303 | } |
1304 | 1304 | ||
1305 | static struct filter_pred *create_pred(int op, char *operand1, char *operand2) | 1305 | static struct filter_pred *create_pred(struct filter_parse_state *ps, |
1306 | struct ftrace_event_call *call, | ||
1307 | int op, char *operand1, char *operand2) | ||
1306 | { | 1308 | { |
1307 | struct filter_pred *pred; | 1309 | struct ftrace_event_field *field; |
1310 | static struct filter_pred pred; | ||
1308 | 1311 | ||
1309 | pred = kzalloc(sizeof(*pred), GFP_KERNEL); | 1312 | memset(&pred, 0, sizeof(pred)); |
1310 | if (!pred) | 1313 | pred.op = op; |
1311 | return NULL; | ||
1312 | 1314 | ||
1313 | pred->field_name = kstrdup(operand1, GFP_KERNEL); | 1315 | if (op == OP_AND || op == OP_OR) |
1314 | if (!pred->field_name) { | 1316 | return &pred; |
1315 | kfree(pred); | 1317 | |
1318 | if (!operand1 || !operand2) { | ||
1319 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); | ||
1316 | return NULL; | 1320 | return NULL; |
1317 | } | 1321 | } |
1318 | 1322 | ||
1319 | strcpy(pred->regex.pattern, operand2); | 1323 | field = find_event_field(call, operand1); |
1320 | pred->regex.len = strlen(pred->regex.pattern); | 1324 | if (!field) { |
1321 | 1325 | parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); | |
1322 | pred->op = op; | ||
1323 | |||
1324 | return pred; | ||
1325 | } | ||
1326 | |||
1327 | static struct filter_pred *create_logical_pred(int op) | ||
1328 | { | ||
1329 | struct filter_pred *pred; | ||
1330 | |||
1331 | pred = kzalloc(sizeof(*pred), GFP_KERNEL); | ||
1332 | if (!pred) | ||
1333 | return NULL; | 1326 | return NULL; |
1327 | } | ||
1334 | 1328 | ||
1335 | pred->op = op; | 1329 | strcpy(pred.regex.pattern, operand2); |
1330 | pred.regex.len = strlen(pred.regex.pattern); | ||
1336 | 1331 | ||
1337 | return pred; | 1332 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
1333 | pred.field = field; | ||
1334 | #endif | ||
1335 | return init_pred(ps, field, &pred) ? NULL : &pred; | ||
1338 | } | 1336 | } |
1339 | 1337 | ||
1340 | static int check_preds(struct filter_parse_state *ps) | 1338 | static int check_preds(struct filter_parse_state *ps) |
@@ -1375,6 +1373,23 @@ static int count_preds(struct filter_parse_state *ps) | |||
1375 | return n_preds; | 1373 | return n_preds; |
1376 | } | 1374 | } |
1377 | 1375 | ||
1376 | struct check_pred_data { | ||
1377 | int count; | ||
1378 | int max; | ||
1379 | }; | ||
1380 | |||
1381 | static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, | ||
1382 | int *err, void *data) | ||
1383 | { | ||
1384 | struct check_pred_data *d = data; | ||
1385 | |||
1386 | if (WARN_ON(d->count++ > d->max)) { | ||
1387 | *err = -EINVAL; | ||
1388 | return WALK_PRED_ABORT; | ||
1389 | } | ||
1390 | return WALK_PRED_DEFAULT; | ||
1391 | } | ||
1392 | |||
1378 | /* | 1393 | /* |
1379 | * The tree is walked at filtering of an event. If the tree is not correctly | 1394 | * The tree is walked at filtering of an event. If the tree is not correctly |
1380 | * built, it may cause an infinite loop. Check here that the tree does | 1395 | * built, it may cause an infinite loop. Check here that the tree does |
@@ -1383,107 +1398,76 @@ static int count_preds(struct filter_parse_state *ps) | |||
1383 | static int check_pred_tree(struct event_filter *filter, | 1398 | static int check_pred_tree(struct event_filter *filter, |
1384 | struct filter_pred *root) | 1399 | struct filter_pred *root) |
1385 | { | 1400 | { |
1386 | struct filter_pred *preds; | 1401 | struct check_pred_data data = { |
1387 | struct filter_pred *pred; | 1402 | /* |
1388 | enum move_type move = MOVE_DOWN; | 1403 | * The max that we can hit a node is three times. |
1389 | int count = 0; | 1404 | * Once going down, once coming up from left, and |
1390 | int done = 0; | 1405 | * once coming up from right. This is more than enough |
1391 | int max; | 1406 | * since leafs are only hit a single time. |
1392 | 1407 | */ | |
1393 | /* | 1408 | .max = 3 * filter->n_preds, |
1394 | * The max that we can hit a node is three times. | 1409 | .count = 0, |
1395 | * Once going down, once coming up from left, and | 1410 | }; |
1396 | * once coming up from right. This is more than enough | ||
1397 | * since leafs are only hit a single time. | ||
1398 | */ | ||
1399 | max = 3 * filter->n_preds; | ||
1400 | 1411 | ||
1401 | preds = filter->preds; | 1412 | return walk_pred_tree(filter->preds, root, |
1402 | if (!preds) | 1413 | check_pred_tree_cb, &data); |
1403 | return -EINVAL; | 1414 | } |
1404 | pred = root; | ||
1405 | 1415 | ||
1406 | do { | 1416 | static int count_leafs_cb(enum move_type move, struct filter_pred *pred, |
1407 | if (WARN_ON(count++ > max)) | 1417 | int *err, void *data) |
1408 | return -EINVAL; | 1418 | { |
1419 | int *count = data; | ||
1409 | 1420 | ||
1410 | switch (move) { | 1421 | if ((move == MOVE_DOWN) && |
1411 | case MOVE_DOWN: | 1422 | (pred->left == FILTER_PRED_INVALID)) |
1412 | if (pred->left != FILTER_PRED_INVALID) { | 1423 | (*count)++; |
1413 | pred = &preds[pred->left]; | ||
1414 | continue; | ||
1415 | } | ||
1416 | /* A leaf at the root is just a leaf in the tree */ | ||
1417 | if (pred == root) | ||
1418 | break; | ||
1419 | pred = get_pred_parent(pred, preds, | ||
1420 | pred->parent, &move); | ||
1421 | continue; | ||
1422 | case MOVE_UP_FROM_LEFT: | ||
1423 | pred = &preds[pred->right]; | ||
1424 | move = MOVE_DOWN; | ||
1425 | continue; | ||
1426 | case MOVE_UP_FROM_RIGHT: | ||
1427 | if (pred == root) | ||
1428 | break; | ||
1429 | pred = get_pred_parent(pred, preds, | ||
1430 | pred->parent, &move); | ||
1431 | continue; | ||
1432 | } | ||
1433 | done = 1; | ||
1434 | } while (!done); | ||
1435 | 1424 | ||
1436 | /* We are fine. */ | 1425 | return WALK_PRED_DEFAULT; |
1437 | return 0; | ||
1438 | } | 1426 | } |
1439 | 1427 | ||
1440 | static int count_leafs(struct filter_pred *preds, struct filter_pred *root) | 1428 | static int count_leafs(struct filter_pred *preds, struct filter_pred *root) |
1441 | { | 1429 | { |
1442 | struct filter_pred *pred; | 1430 | int count = 0, ret; |
1443 | enum move_type move = MOVE_DOWN; | ||
1444 | int count = 0; | ||
1445 | int done = 0; | ||
1446 | 1431 | ||
1447 | pred = root; | 1432 | ret = walk_pred_tree(preds, root, count_leafs_cb, &count); |
1433 | WARN_ON(ret); | ||
1434 | return count; | ||
1435 | } | ||
1448 | 1436 | ||
1449 | do { | 1437 | struct fold_pred_data { |
1450 | switch (move) { | 1438 | struct filter_pred *root; |
1451 | case MOVE_DOWN: | 1439 | int count; |
1452 | if (pred->left != FILTER_PRED_INVALID) { | 1440 | int children; |
1453 | pred = &preds[pred->left]; | 1441 | }; |
1454 | continue; | ||
1455 | } | ||
1456 | /* A leaf at the root is just a leaf in the tree */ | ||
1457 | if (pred == root) | ||
1458 | return 1; | ||
1459 | count++; | ||
1460 | pred = get_pred_parent(pred, preds, | ||
1461 | pred->parent, &move); | ||
1462 | continue; | ||
1463 | case MOVE_UP_FROM_LEFT: | ||
1464 | pred = &preds[pred->right]; | ||
1465 | move = MOVE_DOWN; | ||
1466 | continue; | ||
1467 | case MOVE_UP_FROM_RIGHT: | ||
1468 | if (pred == root) | ||
1469 | break; | ||
1470 | pred = get_pred_parent(pred, preds, | ||
1471 | pred->parent, &move); | ||
1472 | continue; | ||
1473 | } | ||
1474 | done = 1; | ||
1475 | } while (!done); | ||
1476 | 1442 | ||
1477 | return count; | 1443 | static int fold_pred_cb(enum move_type move, struct filter_pred *pred, |
1444 | int *err, void *data) | ||
1445 | { | ||
1446 | struct fold_pred_data *d = data; | ||
1447 | struct filter_pred *root = d->root; | ||
1448 | |||
1449 | if (move != MOVE_DOWN) | ||
1450 | return WALK_PRED_DEFAULT; | ||
1451 | if (pred->left != FILTER_PRED_INVALID) | ||
1452 | return WALK_PRED_DEFAULT; | ||
1453 | |||
1454 | if (WARN_ON(d->count == d->children)) { | ||
1455 | *err = -EINVAL; | ||
1456 | return WALK_PRED_ABORT; | ||
1457 | } | ||
1458 | |||
1459 | pred->index &= ~FILTER_PRED_FOLD; | ||
1460 | root->ops[d->count++] = pred->index; | ||
1461 | return WALK_PRED_DEFAULT; | ||
1478 | } | 1462 | } |
1479 | 1463 | ||
1480 | static int fold_pred(struct filter_pred *preds, struct filter_pred *root) | 1464 | static int fold_pred(struct filter_pred *preds, struct filter_pred *root) |
1481 | { | 1465 | { |
1482 | struct filter_pred *pred; | 1466 | struct fold_pred_data data = { |
1483 | enum move_type move = MOVE_DOWN; | 1467 | .root = root, |
1484 | int count = 0; | 1468 | .count = 0, |
1469 | }; | ||
1485 | int children; | 1470 | int children; |
1486 | int done = 0; | ||
1487 | 1471 | ||
1488 | /* No need to keep the fold flag */ | 1472 | /* No need to keep the fold flag */ |
1489 | root->index &= ~FILTER_PRED_FOLD; | 1473 | root->index &= ~FILTER_PRED_FOLD; |
@@ -1501,37 +1485,26 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) | |||
1501 | return -ENOMEM; | 1485 | return -ENOMEM; |
1502 | 1486 | ||
1503 | root->val = children; | 1487 | root->val = children; |
1488 | data.children = children; | ||
1489 | return walk_pred_tree(preds, root, fold_pred_cb, &data); | ||
1490 | } | ||
1504 | 1491 | ||
1505 | pred = root; | 1492 | static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, |
1506 | do { | 1493 | int *err, void *data) |
1507 | switch (move) { | 1494 | { |
1508 | case MOVE_DOWN: | 1495 | struct filter_pred *preds = data; |
1509 | if (pred->left != FILTER_PRED_INVALID) { | ||
1510 | pred = &preds[pred->left]; | ||
1511 | continue; | ||
1512 | } | ||
1513 | if (WARN_ON(count == children)) | ||
1514 | return -EINVAL; | ||
1515 | pred->index &= ~FILTER_PRED_FOLD; | ||
1516 | root->ops[count++] = pred->index; | ||
1517 | pred = get_pred_parent(pred, preds, | ||
1518 | pred->parent, &move); | ||
1519 | continue; | ||
1520 | case MOVE_UP_FROM_LEFT: | ||
1521 | pred = &preds[pred->right]; | ||
1522 | move = MOVE_DOWN; | ||
1523 | continue; | ||
1524 | case MOVE_UP_FROM_RIGHT: | ||
1525 | if (pred == root) | ||
1526 | break; | ||
1527 | pred = get_pred_parent(pred, preds, | ||
1528 | pred->parent, &move); | ||
1529 | continue; | ||
1530 | } | ||
1531 | done = 1; | ||
1532 | } while (!done); | ||
1533 | 1496 | ||
1534 | return 0; | 1497 | if (move != MOVE_DOWN) |
1498 | return WALK_PRED_DEFAULT; | ||
1499 | if (!(pred->index & FILTER_PRED_FOLD)) | ||
1500 | return WALK_PRED_DEFAULT; | ||
1501 | |||
1502 | *err = fold_pred(preds, pred); | ||
1503 | if (*err) | ||
1504 | return WALK_PRED_ABORT; | ||
1505 | |||
1506 | /* everything below is folded, continue with parent */ | ||
1507 | return WALK_PRED_PARENT; | ||
1535 | } | 1508 | } |
1536 | 1509 | ||
1537 | /* | 1510 | /* |
@@ -1542,51 +1515,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) | |||
1542 | static int fold_pred_tree(struct event_filter *filter, | 1515 | static int fold_pred_tree(struct event_filter *filter, |
1543 | struct filter_pred *root) | 1516 | struct filter_pred *root) |
1544 | { | 1517 | { |
1545 | struct filter_pred *preds; | 1518 | return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, |
1546 | struct filter_pred *pred; | 1519 | filter->preds); |
1547 | enum move_type move = MOVE_DOWN; | ||
1548 | int done = 0; | ||
1549 | int err; | ||
1550 | |||
1551 | preds = filter->preds; | ||
1552 | if (!preds) | ||
1553 | return -EINVAL; | ||
1554 | pred = root; | ||
1555 | |||
1556 | do { | ||
1557 | switch (move) { | ||
1558 | case MOVE_DOWN: | ||
1559 | if (pred->index & FILTER_PRED_FOLD) { | ||
1560 | err = fold_pred(preds, pred); | ||
1561 | if (err) | ||
1562 | return err; | ||
1563 | /* Folded nodes are like leafs */ | ||
1564 | } else if (pred->left != FILTER_PRED_INVALID) { | ||
1565 | pred = &preds[pred->left]; | ||
1566 | continue; | ||
1567 | } | ||
1568 | |||
1569 | /* A leaf at the root is just a leaf in the tree */ | ||
1570 | if (pred == root) | ||
1571 | break; | ||
1572 | pred = get_pred_parent(pred, preds, | ||
1573 | pred->parent, &move); | ||
1574 | continue; | ||
1575 | case MOVE_UP_FROM_LEFT: | ||
1576 | pred = &preds[pred->right]; | ||
1577 | move = MOVE_DOWN; | ||
1578 | continue; | ||
1579 | case MOVE_UP_FROM_RIGHT: | ||
1580 | if (pred == root) | ||
1581 | break; | ||
1582 | pred = get_pred_parent(pred, preds, | ||
1583 | pred->parent, &move); | ||
1584 | continue; | ||
1585 | } | ||
1586 | done = 1; | ||
1587 | } while (!done); | ||
1588 | |||
1589 | return 0; | ||
1590 | } | 1520 | } |
1591 | 1521 | ||
1592 | static int replace_preds(struct ftrace_event_call *call, | 1522 | static int replace_preds(struct ftrace_event_call *call, |
@@ -1643,27 +1573,17 @@ static int replace_preds(struct ftrace_event_call *call, | |||
1643 | goto fail; | 1573 | goto fail; |
1644 | } | 1574 | } |
1645 | 1575 | ||
1646 | if (elt->op == OP_AND || elt->op == OP_OR) { | 1576 | pred = create_pred(ps, call, elt->op, operand1, operand2); |
1647 | pred = create_logical_pred(elt->op); | 1577 | if (!pred) { |
1648 | goto add_pred; | ||
1649 | } | ||
1650 | |||
1651 | if (!operand1 || !operand2) { | ||
1652 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); | ||
1653 | err = -EINVAL; | 1578 | err = -EINVAL; |
1654 | goto fail; | 1579 | goto fail; |
1655 | } | 1580 | } |
1656 | 1581 | ||
1657 | pred = create_pred(elt->op, operand1, operand2); | 1582 | if (!dry_run) { |
1658 | add_pred: | 1583 | err = filter_add_pred(ps, filter, pred, &stack); |
1659 | if (!pred) { | 1584 | if (err) |
1660 | err = -ENOMEM; | 1585 | goto fail; |
1661 | goto fail; | ||
1662 | } | 1586 | } |
1663 | err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); | ||
1664 | filter_free_pred(pred); | ||
1665 | if (err) | ||
1666 | goto fail; | ||
1667 | 1587 | ||
1668 | operand1 = operand2 = NULL; | 1588 | operand1 = operand2 = NULL; |
1669 | } | 1589 | } |
@@ -1958,17 +1878,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
1958 | int err; | 1878 | int err; |
1959 | struct event_filter *filter; | 1879 | struct event_filter *filter; |
1960 | struct filter_parse_state *ps; | 1880 | struct filter_parse_state *ps; |
1961 | struct ftrace_event_call *call = NULL; | 1881 | struct ftrace_event_call *call; |
1962 | 1882 | ||
1963 | mutex_lock(&event_mutex); | 1883 | mutex_lock(&event_mutex); |
1964 | 1884 | ||
1965 | list_for_each_entry(call, &ftrace_events, list) { | 1885 | call = event->tp_event; |
1966 | if (call->event.type == event_id) | ||
1967 | break; | ||
1968 | } | ||
1969 | 1886 | ||
1970 | err = -EINVAL; | 1887 | err = -EINVAL; |
1971 | if (&call->list == &ftrace_events) | 1888 | if (!call) |
1972 | goto out_unlock; | 1889 | goto out_unlock; |
1973 | 1890 | ||
1974 | err = -EEXIST; | 1891 | err = -EEXIST; |
@@ -2012,3 +1929,215 @@ out_unlock: | |||
2012 | 1929 | ||
2013 | #endif /* CONFIG_PERF_EVENTS */ | 1930 | #endif /* CONFIG_PERF_EVENTS */ |
2014 | 1931 | ||
1932 | #ifdef CONFIG_FTRACE_STARTUP_TEST | ||
1933 | |||
1934 | #include <linux/types.h> | ||
1935 | #include <linux/tracepoint.h> | ||
1936 | |||
1937 | #define CREATE_TRACE_POINTS | ||
1938 | #include "trace_events_filter_test.h" | ||
1939 | |||
1940 | static int test_get_filter(char *filter_str, struct ftrace_event_call *call, | ||
1941 | struct event_filter **pfilter) | ||
1942 | { | ||
1943 | struct event_filter *filter; | ||
1944 | struct filter_parse_state *ps; | ||
1945 | int err = -ENOMEM; | ||
1946 | |||
1947 | filter = __alloc_filter(); | ||
1948 | if (!filter) | ||
1949 | goto out; | ||
1950 | |||
1951 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); | ||
1952 | if (!ps) | ||
1953 | goto free_filter; | ||
1954 | |||
1955 | parse_init(ps, filter_ops, filter_str); | ||
1956 | err = filter_parse(ps); | ||
1957 | if (err) | ||
1958 | goto free_ps; | ||
1959 | |||
1960 | err = replace_preds(call, filter, ps, filter_str, false); | ||
1961 | if (!err) | ||
1962 | *pfilter = filter; | ||
1963 | |||
1964 | free_ps: | ||
1965 | filter_opstack_clear(ps); | ||
1966 | postfix_clear(ps); | ||
1967 | kfree(ps); | ||
1968 | |||
1969 | free_filter: | ||
1970 | if (err) | ||
1971 | __free_filter(filter); | ||
1972 | |||
1973 | out: | ||
1974 | return err; | ||
1975 | } | ||
1976 | |||
1977 | #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ | ||
1978 | { \ | ||
1979 | .filter = FILTER, \ | ||
1980 | .rec = { .a = va, .b = vb, .c = vc, .d = vd, \ | ||
1981 | .e = ve, .f = vf, .g = vg, .h = vh }, \ | ||
1982 | .match = m, \ | ||
1983 | .not_visited = nvisit, \ | ||
1984 | } | ||
1985 | #define YES 1 | ||
1986 | #define NO 0 | ||
1987 | |||
1988 | static struct test_filter_data_t { | ||
1989 | char *filter; | ||
1990 | struct ftrace_raw_ftrace_test_filter rec; | ||
1991 | int match; | ||
1992 | char *not_visited; | ||
1993 | } test_filter_data[] = { | ||
1994 | #define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \ | ||
1995 | "e == 1 && f == 1 && g == 1 && h == 1" | ||
1996 | DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""), | ||
1997 | DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"), | ||
1998 | DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""), | ||
1999 | #undef FILTER | ||
2000 | #define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \ | ||
2001 | "e == 1 || f == 1 || g == 1 || h == 1" | ||
2002 | DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), | ||
2003 | DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""), | ||
2004 | DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"), | ||
2005 | #undef FILTER | ||
2006 | #define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \ | ||
2007 | "(e == 1 || f == 1) && (g == 1 || h == 1)" | ||
2008 | DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"), | ||
2009 | DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), | ||
2010 | DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"), | ||
2011 | DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"), | ||
2012 | #undef FILTER | ||
2013 | #define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \ | ||
2014 | "(e == 1 && f == 1) || (g == 1 && h == 1)" | ||
2015 | DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"), | ||
2016 | DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""), | ||
2017 | DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), | ||
2018 | #undef FILTER | ||
2019 | #define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \ | ||
2020 | "(e == 1 && f == 1) || (g == 1 && h == 1)" | ||
2021 | DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"), | ||
2022 | DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), | ||
2023 | DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""), | ||
2024 | #undef FILTER | ||
2025 | #define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \ | ||
2026 | "(e == 1 || f == 1)) && (g == 1 || h == 1)" | ||
2027 | DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"), | ||
2028 | DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), | ||
2029 | DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"), | ||
2030 | #undef FILTER | ||
2031 | #define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \ | ||
2032 | "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))" | ||
2033 | DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"), | ||
2034 | DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""), | ||
2035 | DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""), | ||
2036 | #undef FILTER | ||
2037 | #define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \ | ||
2038 | "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))" | ||
2039 | DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"), | ||
2040 | DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), | ||
2041 | DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"), | ||
2042 | }; | ||
2043 | |||
2044 | #undef DATA_REC | ||
2045 | #undef FILTER | ||
2046 | #undef YES | ||
2047 | #undef NO | ||
2048 | |||
2049 | #define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) | ||
2050 | |||
2051 | static int test_pred_visited; | ||
2052 | |||
2053 | static int test_pred_visited_fn(struct filter_pred *pred, void *event) | ||
2054 | { | ||
2055 | struct ftrace_event_field *field = pred->field; | ||
2056 | |||
2057 | test_pred_visited = 1; | ||
2058 | printk(KERN_INFO "\npred visited %s\n", field->name); | ||
2059 | return 1; | ||
2060 | } | ||
2061 | |||
2062 | static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, | ||
2063 | int *err, void *data) | ||
2064 | { | ||
2065 | char *fields = data; | ||
2066 | |||
2067 | if ((move == MOVE_DOWN) && | ||
2068 | (pred->left == FILTER_PRED_INVALID)) { | ||
2069 | struct ftrace_event_field *field = pred->field; | ||
2070 | |||
2071 | if (!field) { | ||
2072 | WARN(1, "all leafs should have field defined"); | ||
2073 | return WALK_PRED_DEFAULT; | ||
2074 | } | ||
2075 | if (!strchr(fields, *field->name)) | ||
2076 | return WALK_PRED_DEFAULT; | ||
2077 | |||
2078 | WARN_ON(!pred->fn); | ||
2079 | pred->fn = test_pred_visited_fn; | ||
2080 | } | ||
2081 | return WALK_PRED_DEFAULT; | ||
2082 | } | ||
2083 | |||
2084 | static __init int ftrace_test_event_filter(void) | ||
2085 | { | ||
2086 | int i; | ||
2087 | |||
2088 | printk(KERN_INFO "Testing ftrace filter: "); | ||
2089 | |||
2090 | for (i = 0; i < DATA_CNT; i++) { | ||
2091 | struct event_filter *filter = NULL; | ||
2092 | struct test_filter_data_t *d = &test_filter_data[i]; | ||
2093 | int err; | ||
2094 | |||
2095 | err = test_get_filter(d->filter, &event_ftrace_test_filter, | ||
2096 | &filter); | ||
2097 | if (err) { | ||
2098 | printk(KERN_INFO | ||
2099 | "Failed to get filter for '%s', err %d\n", | ||
2100 | d->filter, err); | ||
2101 | break; | ||
2102 | } | ||
2103 | |||
2104 | /* | ||
2105 | * The preemption disabling is not really needed for self | ||
2106 | * tests, but the rcu dereference will complain without it. | ||
2107 | */ | ||
2108 | preempt_disable(); | ||
2109 | if (*d->not_visited) | ||
2110 | walk_pred_tree(filter->preds, filter->root, | ||
2111 | test_walk_pred_cb, | ||
2112 | d->not_visited); | ||
2113 | |||
2114 | test_pred_visited = 0; | ||
2115 | err = filter_match_preds(filter, &d->rec); | ||
2116 | preempt_enable(); | ||
2117 | |||
2118 | __free_filter(filter); | ||
2119 | |||
2120 | if (test_pred_visited) { | ||
2121 | printk(KERN_INFO | ||
2122 | "Failed, unwanted pred visited for filter %s\n", | ||
2123 | d->filter); | ||
2124 | break; | ||
2125 | } | ||
2126 | |||
2127 | if (err != d->match) { | ||
2128 | printk(KERN_INFO | ||
2129 | "Failed to match filter '%s', expected %d\n", | ||
2130 | d->filter, d->match); | ||
2131 | break; | ||
2132 | } | ||
2133 | } | ||
2134 | |||
2135 | if (i == DATA_CNT) | ||
2136 | printk(KERN_CONT "OK\n"); | ||
2137 | |||
2138 | return 0; | ||
2139 | } | ||
2140 | |||
2141 | late_initcall(ftrace_test_event_filter); | ||
2142 | |||
2143 | #endif /* CONFIG_FTRACE_STARTUP_TEST */ | ||
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h new file mode 100644 index 000000000000..bfd4dba0d603 --- /dev/null +++ b/kernel/trace/trace_events_filter_test.h | |||
@@ -0,0 +1,50 @@ | |||
1 | #undef TRACE_SYSTEM | ||
2 | #define TRACE_SYSTEM test | ||
3 | |||
4 | #if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ) | ||
5 | #define _TRACE_TEST_H | ||
6 | |||
7 | #include <linux/tracepoint.h> | ||
8 | |||
9 | TRACE_EVENT(ftrace_test_filter, | ||
10 | |||
11 | TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h), | ||
12 | |||
13 | TP_ARGS(a, b, c, d, e, f, g, h), | ||
14 | |||
15 | TP_STRUCT__entry( | ||
16 | __field(int, a) | ||
17 | __field(int, b) | ||
18 | __field(int, c) | ||
19 | __field(int, d) | ||
20 | __field(int, e) | ||
21 | __field(int, f) | ||
22 | __field(int, g) | ||
23 | __field(int, h) | ||
24 | ), | ||
25 | |||
26 | TP_fast_assign( | ||
27 | __entry->a = a; | ||
28 | __entry->b = b; | ||
29 | __entry->c = c; | ||
30 | __entry->d = d; | ||
31 | __entry->e = e; | ||
32 | __entry->f = f; | ||
33 | __entry->g = g; | ||
34 | __entry->h = h; | ||
35 | ), | ||
36 | |||
37 | TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d", | ||
38 | __entry->a, __entry->b, __entry->c, __entry->d, | ||
39 | __entry->e, __entry->f, __entry->g, __entry->h) | ||
40 | ); | ||
41 | |||
42 | #endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */ | ||
43 | |||
44 | #undef TRACE_INCLUDE_PATH | ||
45 | #undef TRACE_INCLUDE_FILE | ||
46 | #define TRACE_INCLUDE_PATH . | ||
47 | #define TRACE_INCLUDE_FILE trace_events_filter_test | ||
48 | |||
49 | /* This part must be outside protection */ | ||
50 | #include <trace/define_trace.h> | ||
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 667aa8cc0cfc..20dad0d7a163 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly; | |||
23 | 23 | ||
24 | static DEFINE_PER_CPU(int, tracing_cpu); | 24 | static DEFINE_PER_CPU(int, tracing_cpu); |
25 | 25 | ||
26 | static DEFINE_SPINLOCK(max_trace_lock); | 26 | static DEFINE_RAW_SPINLOCK(max_trace_lock); |
27 | 27 | ||
28 | enum { | 28 | enum { |
29 | TRACER_IRQS_OFF = (1 << 1), | 29 | TRACER_IRQS_OFF = (1 << 1), |
@@ -321,7 +321,7 @@ check_critical_timing(struct trace_array *tr, | |||
321 | if (!report_latency(delta)) | 321 | if (!report_latency(delta)) |
322 | goto out; | 322 | goto out; |
323 | 323 | ||
324 | spin_lock_irqsave(&max_trace_lock, flags); | 324 | raw_spin_lock_irqsave(&max_trace_lock, flags); |
325 | 325 | ||
326 | /* check if we are still the max latency */ | 326 | /* check if we are still the max latency */ |
327 | if (!report_latency(delta)) | 327 | if (!report_latency(delta)) |
@@ -344,7 +344,7 @@ check_critical_timing(struct trace_array *tr, | |||
344 | max_sequence++; | 344 | max_sequence++; |
345 | 345 | ||
346 | out_unlock: | 346 | out_unlock: |
347 | spin_unlock_irqrestore(&max_trace_lock, flags); | 347 | raw_spin_unlock_irqrestore(&max_trace_lock, flags); |
348 | 348 | ||
349 | out: | 349 | out: |
350 | data->critical_sequence = max_sequence; | 350 | data->critical_sequence = max_sequence; |
@@ -505,13 +505,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); | |||
505 | #ifdef CONFIG_PREEMPT_TRACER | 505 | #ifdef CONFIG_PREEMPT_TRACER |
506 | void trace_preempt_on(unsigned long a0, unsigned long a1) | 506 | void trace_preempt_on(unsigned long a0, unsigned long a1) |
507 | { | 507 | { |
508 | if (preempt_trace()) | 508 | if (preempt_trace() && !irq_trace()) |
509 | stop_critical_timing(a0, a1); | 509 | stop_critical_timing(a0, a1); |
510 | } | 510 | } |
511 | 511 | ||
512 | void trace_preempt_off(unsigned long a0, unsigned long a1) | 512 | void trace_preempt_off(unsigned long a0, unsigned long a1) |
513 | { | 513 | { |
514 | if (preempt_trace()) | 514 | if (preempt_trace() && !irq_trace()) |
515 | start_critical_timing(a0, a1); | 515 | start_critical_timing(a0, a1); |
516 | } | 516 | } |
517 | #endif /* CONFIG_PREEMPT_TRACER */ | 517 | #endif /* CONFIG_PREEMPT_TRACER */ |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5fb3697bf0e5..00d527c945a4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp) | |||
836 | } | 836 | } |
837 | 837 | ||
838 | /* Unregister a trace_probe and probe_event: call with locking probe_lock */ | 838 | /* Unregister a trace_probe and probe_event: call with locking probe_lock */ |
839 | static void unregister_trace_probe(struct trace_probe *tp) | 839 | static int unregister_trace_probe(struct trace_probe *tp) |
840 | { | 840 | { |
841 | /* Enabled event can not be unregistered */ | ||
842 | if (trace_probe_is_enabled(tp)) | ||
843 | return -EBUSY; | ||
844 | |||
841 | __unregister_trace_probe(tp); | 845 | __unregister_trace_probe(tp); |
842 | list_del(&tp->list); | 846 | list_del(&tp->list); |
843 | unregister_probe_event(tp); | 847 | unregister_probe_event(tp); |
848 | |||
849 | return 0; | ||
844 | } | 850 | } |
845 | 851 | ||
846 | /* Register a trace_probe and probe_event */ | 852 | /* Register a trace_probe and probe_event */ |
@@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp) | |||
854 | /* Delete old (same name) event if exist */ | 860 | /* Delete old (same name) event if exist */ |
855 | old_tp = find_trace_probe(tp->call.name, tp->call.class->system); | 861 | old_tp = find_trace_probe(tp->call.name, tp->call.class->system); |
856 | if (old_tp) { | 862 | if (old_tp) { |
857 | unregister_trace_probe(old_tp); | 863 | ret = unregister_trace_probe(old_tp); |
864 | if (ret < 0) | ||
865 | goto end; | ||
858 | free_trace_probe(old_tp); | 866 | free_trace_probe(old_tp); |
859 | } | 867 | } |
860 | 868 | ||
@@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb, | |||
892 | mutex_lock(&probe_lock); | 900 | mutex_lock(&probe_lock); |
893 | list_for_each_entry(tp, &probe_list, list) { | 901 | list_for_each_entry(tp, &probe_list, list) { |
894 | if (trace_probe_within_module(tp, mod)) { | 902 | if (trace_probe_within_module(tp, mod)) { |
903 | /* Don't need to check busy - this should have gone. */ | ||
895 | __unregister_trace_probe(tp); | 904 | __unregister_trace_probe(tp); |
896 | ret = __register_trace_probe(tp); | 905 | ret = __register_trace_probe(tp); |
897 | if (ret) | 906 | if (ret) |
@@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv) | |||
1205 | return -ENOENT; | 1214 | return -ENOENT; |
1206 | } | 1215 | } |
1207 | /* delete an event */ | 1216 | /* delete an event */ |
1208 | unregister_trace_probe(tp); | 1217 | ret = unregister_trace_probe(tp); |
1209 | free_trace_probe(tp); | 1218 | if (ret == 0) |
1219 | free_trace_probe(tp); | ||
1210 | mutex_unlock(&probe_lock); | 1220 | mutex_unlock(&probe_lock); |
1211 | return 0; | 1221 | return ret; |
1212 | } | 1222 | } |
1213 | 1223 | ||
1214 | if (argc < 2) { | 1224 | if (argc < 2) { |
@@ -1317,18 +1327,29 @@ error: | |||
1317 | return ret; | 1327 | return ret; |
1318 | } | 1328 | } |
1319 | 1329 | ||
1320 | static void release_all_trace_probes(void) | 1330 | static int release_all_trace_probes(void) |
1321 | { | 1331 | { |
1322 | struct trace_probe *tp; | 1332 | struct trace_probe *tp; |
1333 | int ret = 0; | ||
1323 | 1334 | ||
1324 | mutex_lock(&probe_lock); | 1335 | mutex_lock(&probe_lock); |
1336 | /* Ensure no probe is in use. */ | ||
1337 | list_for_each_entry(tp, &probe_list, list) | ||
1338 | if (trace_probe_is_enabled(tp)) { | ||
1339 | ret = -EBUSY; | ||
1340 | goto end; | ||
1341 | } | ||
1325 | /* TODO: Use batch unregistration */ | 1342 | /* TODO: Use batch unregistration */ |
1326 | while (!list_empty(&probe_list)) { | 1343 | while (!list_empty(&probe_list)) { |
1327 | tp = list_entry(probe_list.next, struct trace_probe, list); | 1344 | tp = list_entry(probe_list.next, struct trace_probe, list); |
1328 | unregister_trace_probe(tp); | 1345 | unregister_trace_probe(tp); |
1329 | free_trace_probe(tp); | 1346 | free_trace_probe(tp); |
1330 | } | 1347 | } |
1348 | |||
1349 | end: | ||
1331 | mutex_unlock(&probe_lock); | 1350 | mutex_unlock(&probe_lock); |
1351 | |||
1352 | return ret; | ||
1332 | } | 1353 | } |
1333 | 1354 | ||
1334 | /* Probes listing interfaces */ | 1355 | /* Probes listing interfaces */ |
@@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = { | |||
1380 | 1401 | ||
1381 | static int probes_open(struct inode *inode, struct file *file) | 1402 | static int probes_open(struct inode *inode, struct file *file) |
1382 | { | 1403 | { |
1383 | if ((file->f_mode & FMODE_WRITE) && | 1404 | int ret; |
1384 | (file->f_flags & O_TRUNC)) | 1405 | |
1385 | release_all_trace_probes(); | 1406 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { |
1407 | ret = release_all_trace_probes(); | ||
1408 | if (ret < 0) | ||
1409 | return ret; | ||
1410 | } | ||
1386 | 1411 | ||
1387 | return seq_open(file, &probes_seq_op); | 1412 | return seq_open(file, &probes_seq_op); |
1388 | } | 1413 | } |
@@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void) | |||
2055 | 2080 | ||
2056 | ret = target(1, 2, 3, 4, 5, 6); | 2081 | ret = target(1, 2, 3, 4, 5, 6); |
2057 | 2082 | ||
2083 | /* Disable trace points before removing it */ | ||
2084 | tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); | ||
2085 | if (WARN_ON_ONCE(tp == NULL)) { | ||
2086 | pr_warning("error on getting test probe.\n"); | ||
2087 | warn++; | ||
2088 | } else | ||
2089 | disable_trace_probe(tp, TP_FLAG_TRACE); | ||
2090 | |||
2091 | tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); | ||
2092 | if (WARN_ON_ONCE(tp == NULL)) { | ||
2093 | pr_warning("error on getting 2nd test probe.\n"); | ||
2094 | warn++; | ||
2095 | } else | ||
2096 | disable_trace_probe(tp, TP_FLAG_TRACE); | ||
2097 | |||
2058 | ret = command_trace_probe("-:testprobe"); | 2098 | ret = command_trace_probe("-:testprobe"); |
2059 | if (WARN_ON_ONCE(ret)) { | 2099 | if (WARN_ON_ONCE(ret)) { |
2060 | pr_warning("error on deleting a probe.\n"); | 2100 | pr_warning("error on deleting a probe.\n"); |
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 1f06468a10d7..6fd4ffd042f9 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
59 | continue; | 59 | continue; |
60 | } | 60 | } |
61 | 61 | ||
62 | fmt = NULL; | ||
62 | tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); | 63 | tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); |
63 | if (tb_fmt) | 64 | if (tb_fmt) { |
64 | fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); | 65 | fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); |
65 | if (tb_fmt && fmt) { | 66 | if (fmt) { |
66 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); | 67 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); |
67 | strcpy(fmt, *iter); | 68 | strcpy(fmt, *iter); |
68 | tb_fmt->fmt = fmt; | 69 | tb_fmt->fmt = fmt; |
69 | *iter = tb_fmt->fmt; | 70 | } else |
70 | } else { | 71 | kfree(tb_fmt); |
71 | kfree(tb_fmt); | ||
72 | *iter = NULL; | ||
73 | } | 72 | } |
73 | *iter = fmt; | ||
74 | |||
74 | } | 75 | } |
75 | mutex_unlock(&btrace_mutex); | 76 | mutex_unlock(&btrace_mutex); |
76 | } | 77 | } |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index b219f1449c54..db110b8ae030 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[]; | |||
34 | static const int tracepoint_debug; | 34 | static const int tracepoint_debug; |
35 | 35 | ||
36 | /* | 36 | /* |
37 | * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the | 37 | * Tracepoints mutex protects the builtin and module tracepoints and the hash |
38 | * builtin and module tracepoints and the hash table. | 38 | * table, as well as the local module list. |
39 | */ | 39 | */ |
40 | static DEFINE_MUTEX(tracepoints_mutex); | 40 | static DEFINE_MUTEX(tracepoints_mutex); |
41 | 41 | ||
42 | #ifdef CONFIG_MODULES | ||
43 | /* Local list of struct module */ | ||
44 | static LIST_HEAD(tracepoint_module_list); | ||
45 | #endif /* CONFIG_MODULES */ | ||
46 | |||
42 | /* | 47 | /* |
43 | * Tracepoint hash table, containing the active tracepoints. | 48 | * Tracepoint hash table, containing the active tracepoints. |
44 | * Protected by tracepoints_mutex. | 49 | * Protected by tracepoints_mutex. |
@@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem) | |||
292 | * @end: end of the range | 297 | * @end: end of the range |
293 | * | 298 | * |
294 | * Updates the probe callback corresponding to a range of tracepoints. | 299 | * Updates the probe callback corresponding to a range of tracepoints. |
300 | * Called with tracepoints_mutex held. | ||
295 | */ | 301 | */ |
296 | void tracepoint_update_probe_range(struct tracepoint * const *begin, | 302 | static void tracepoint_update_probe_range(struct tracepoint * const *begin, |
297 | struct tracepoint * const *end) | 303 | struct tracepoint * const *end) |
298 | { | 304 | { |
299 | struct tracepoint * const *iter; | 305 | struct tracepoint * const *iter; |
300 | struct tracepoint_entry *mark_entry; | 306 | struct tracepoint_entry *mark_entry; |
@@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, | |||
302 | if (!begin) | 308 | if (!begin) |
303 | return; | 309 | return; |
304 | 310 | ||
305 | mutex_lock(&tracepoints_mutex); | ||
306 | for (iter = begin; iter < end; iter++) { | 311 | for (iter = begin; iter < end; iter++) { |
307 | mark_entry = get_tracepoint((*iter)->name); | 312 | mark_entry = get_tracepoint((*iter)->name); |
308 | if (mark_entry) { | 313 | if (mark_entry) { |
@@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, | |||
312 | disable_tracepoint(*iter); | 317 | disable_tracepoint(*iter); |
313 | } | 318 | } |
314 | } | 319 | } |
315 | mutex_unlock(&tracepoints_mutex); | ||
316 | } | 320 | } |
317 | 321 | ||
322 | #ifdef CONFIG_MODULES | ||
323 | void module_update_tracepoints(void) | ||
324 | { | ||
325 | struct tp_module *tp_mod; | ||
326 | |||
327 | list_for_each_entry(tp_mod, &tracepoint_module_list, list) | ||
328 | tracepoint_update_probe_range(tp_mod->tracepoints_ptrs, | ||
329 | tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints); | ||
330 | } | ||
331 | #else /* CONFIG_MODULES */ | ||
332 | void module_update_tracepoints(void) | ||
333 | { | ||
334 | } | ||
335 | #endif /* CONFIG_MODULES */ | ||
336 | |||
337 | |||
318 | /* | 338 | /* |
319 | * Update probes, removing the faulty probes. | 339 | * Update probes, removing the faulty probes. |
340 | * Called with tracepoints_mutex held. | ||
320 | */ | 341 | */ |
321 | static void tracepoint_update_probes(void) | 342 | static void tracepoint_update_probes(void) |
322 | { | 343 | { |
@@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data) | |||
359 | 380 | ||
360 | mutex_lock(&tracepoints_mutex); | 381 | mutex_lock(&tracepoints_mutex); |
361 | old = tracepoint_add_probe(name, probe, data); | 382 | old = tracepoint_add_probe(name, probe, data); |
362 | mutex_unlock(&tracepoints_mutex); | 383 | if (IS_ERR(old)) { |
363 | if (IS_ERR(old)) | 384 | mutex_unlock(&tracepoints_mutex); |
364 | return PTR_ERR(old); | 385 | return PTR_ERR(old); |
365 | 386 | } | |
366 | tracepoint_update_probes(); /* may update entry */ | 387 | tracepoint_update_probes(); /* may update entry */ |
388 | mutex_unlock(&tracepoints_mutex); | ||
367 | release_probes(old); | 389 | release_probes(old); |
368 | return 0; | 390 | return 0; |
369 | } | 391 | } |
@@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data) | |||
402 | 424 | ||
403 | mutex_lock(&tracepoints_mutex); | 425 | mutex_lock(&tracepoints_mutex); |
404 | old = tracepoint_remove_probe(name, probe, data); | 426 | old = tracepoint_remove_probe(name, probe, data); |
405 | mutex_unlock(&tracepoints_mutex); | 427 | if (IS_ERR(old)) { |
406 | if (IS_ERR(old)) | 428 | mutex_unlock(&tracepoints_mutex); |
407 | return PTR_ERR(old); | 429 | return PTR_ERR(old); |
408 | 430 | } | |
409 | tracepoint_update_probes(); /* may update entry */ | 431 | tracepoint_update_probes(); /* may update entry */ |
432 | mutex_unlock(&tracepoints_mutex); | ||
410 | release_probes(old); | 433 | release_probes(old); |
411 | return 0; | 434 | return 0; |
412 | } | 435 | } |
@@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void) | |||
489 | if (!list_empty(&old_probes)) | 512 | if (!list_empty(&old_probes)) |
490 | list_replace_init(&old_probes, &release_probes); | 513 | list_replace_init(&old_probes, &release_probes); |
491 | need_update = 0; | 514 | need_update = 0; |
492 | mutex_unlock(&tracepoints_mutex); | ||
493 | |||
494 | tracepoint_update_probes(); | 515 | tracepoint_update_probes(); |
516 | mutex_unlock(&tracepoints_mutex); | ||
495 | list_for_each_entry_safe(pos, next, &release_probes, u.list) { | 517 | list_for_each_entry_safe(pos, next, &release_probes, u.list) { |
496 | list_del(&pos->u.list); | 518 | list_del(&pos->u.list); |
497 | call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); | 519 | call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); |
@@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); | |||
509 | * Will return the first tracepoint in the range if the input tracepoint is | 531 | * Will return the first tracepoint in the range if the input tracepoint is |
510 | * NULL. | 532 | * NULL. |
511 | */ | 533 | */ |
512 | int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, | 534 | static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, |
513 | struct tracepoint * const *begin, struct tracepoint * const *end) | 535 | struct tracepoint * const *begin, struct tracepoint * const *end) |
514 | { | 536 | { |
515 | if (!*tracepoint && begin != end) { | 537 | if (!*tracepoint && begin != end) { |
@@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, | |||
520 | return 1; | 542 | return 1; |
521 | return 0; | 543 | return 0; |
522 | } | 544 | } |
523 | EXPORT_SYMBOL_GPL(tracepoint_get_iter_range); | ||
524 | 545 | ||
546 | #ifdef CONFIG_MODULES | ||
525 | static void tracepoint_get_iter(struct tracepoint_iter *iter) | 547 | static void tracepoint_get_iter(struct tracepoint_iter *iter) |
526 | { | 548 | { |
527 | int found = 0; | 549 | int found = 0; |
550 | struct tp_module *iter_mod; | ||
528 | 551 | ||
529 | /* Core kernel tracepoints */ | 552 | /* Core kernel tracepoints */ |
530 | if (!iter->module) { | 553 | if (!iter->module) { |
@@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter) | |||
534 | if (found) | 557 | if (found) |
535 | goto end; | 558 | goto end; |
536 | } | 559 | } |
537 | /* tracepoints in modules. */ | 560 | /* Tracepoints in modules */ |
538 | found = module_get_iter_tracepoints(iter); | 561 | mutex_lock(&tracepoints_mutex); |
562 | list_for_each_entry(iter_mod, &tracepoint_module_list, list) { | ||
563 | /* | ||
564 | * Sorted module list | ||
565 | */ | ||
566 | if (iter_mod < iter->module) | ||
567 | continue; | ||
568 | else if (iter_mod > iter->module) | ||
569 | iter->tracepoint = NULL; | ||
570 | found = tracepoint_get_iter_range(&iter->tracepoint, | ||
571 | iter_mod->tracepoints_ptrs, | ||
572 | iter_mod->tracepoints_ptrs | ||
573 | + iter_mod->num_tracepoints); | ||
574 | if (found) { | ||
575 | iter->module = iter_mod; | ||
576 | break; | ||
577 | } | ||
578 | } | ||
579 | mutex_unlock(&tracepoints_mutex); | ||
539 | end: | 580 | end: |
540 | if (!found) | 581 | if (!found) |
541 | tracepoint_iter_reset(iter); | 582 | tracepoint_iter_reset(iter); |
542 | } | 583 | } |
584 | #else /* CONFIG_MODULES */ | ||
585 | static void tracepoint_get_iter(struct tracepoint_iter *iter) | ||
586 | { | ||
587 | int found = 0; | ||
588 | |||
589 | /* Core kernel tracepoints */ | ||
590 | found = tracepoint_get_iter_range(&iter->tracepoint, | ||
591 | __start___tracepoints_ptrs, | ||
592 | __stop___tracepoints_ptrs); | ||
593 | if (!found) | ||
594 | tracepoint_iter_reset(iter); | ||
595 | } | ||
596 | #endif /* CONFIG_MODULES */ | ||
543 | 597 | ||
544 | void tracepoint_iter_start(struct tracepoint_iter *iter) | 598 | void tracepoint_iter_start(struct tracepoint_iter *iter) |
545 | { | 599 | { |
@@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop); | |||
566 | 620 | ||
567 | void tracepoint_iter_reset(struct tracepoint_iter *iter) | 621 | void tracepoint_iter_reset(struct tracepoint_iter *iter) |
568 | { | 622 | { |
623 | #ifdef CONFIG_MODULES | ||
569 | iter->module = NULL; | 624 | iter->module = NULL; |
625 | #endif /* CONFIG_MODULES */ | ||
570 | iter->tracepoint = NULL; | 626 | iter->tracepoint = NULL; |
571 | } | 627 | } |
572 | EXPORT_SYMBOL_GPL(tracepoint_iter_reset); | 628 | EXPORT_SYMBOL_GPL(tracepoint_iter_reset); |
573 | 629 | ||
574 | #ifdef CONFIG_MODULES | 630 | #ifdef CONFIG_MODULES |
631 | static int tracepoint_module_coming(struct module *mod) | ||
632 | { | ||
633 | struct tp_module *tp_mod, *iter; | ||
634 | int ret = 0; | ||
635 | |||
636 | /* | ||
637 | * We skip modules that taint the kernel, especially those with different | ||
638 | * module header (for forced load), to make sure we don't cause a crash. | ||
639 | */ | ||
640 | if (mod->taints) | ||
641 | return 0; | ||
642 | mutex_lock(&tracepoints_mutex); | ||
643 | tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); | ||
644 | if (!tp_mod) { | ||
645 | ret = -ENOMEM; | ||
646 | goto end; | ||
647 | } | ||
648 | tp_mod->num_tracepoints = mod->num_tracepoints; | ||
649 | tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; | ||
650 | |||
651 | /* | ||
652 | * tracepoint_module_list is kept sorted by struct module pointer | ||
653 | * address for iteration on tracepoints from a seq_file that can release | ||
654 | * the mutex between calls. | ||
655 | */ | ||
656 | list_for_each_entry_reverse(iter, &tracepoint_module_list, list) { | ||
657 | BUG_ON(iter == tp_mod); /* Should never be in the list twice */ | ||
658 | if (iter < tp_mod) { | ||
659 | /* We belong to the location right after iter. */ | ||
660 | list_add(&tp_mod->list, &iter->list); | ||
661 | goto module_added; | ||
662 | } | ||
663 | } | ||
664 | /* We belong to the beginning of the list */ | ||
665 | list_add(&tp_mod->list, &tracepoint_module_list); | ||
666 | module_added: | ||
667 | tracepoint_update_probe_range(mod->tracepoints_ptrs, | ||
668 | mod->tracepoints_ptrs + mod->num_tracepoints); | ||
669 | end: | ||
670 | mutex_unlock(&tracepoints_mutex); | ||
671 | return ret; | ||
672 | } | ||
673 | |||
674 | static int tracepoint_module_going(struct module *mod) | ||
675 | { | ||
676 | struct tp_module *pos; | ||
677 | |||
678 | mutex_lock(&tracepoints_mutex); | ||
679 | tracepoint_update_probe_range(mod->tracepoints_ptrs, | ||
680 | mod->tracepoints_ptrs + mod->num_tracepoints); | ||
681 | list_for_each_entry(pos, &tracepoint_module_list, list) { | ||
682 | if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) { | ||
683 | list_del(&pos->list); | ||
684 | kfree(pos); | ||
685 | break; | ||
686 | } | ||
687 | } | ||
688 | /* | ||
689 | * In the case of modules that were tainted at "coming", we'll simply | ||
690 | * walk through the list without finding it. We cannot use the "tainted" | ||
691 | * flag on "going", in case a module taints the kernel only after being | ||
692 | * loaded. | ||
693 | */ | ||
694 | mutex_unlock(&tracepoints_mutex); | ||
695 | return 0; | ||
696 | } | ||
575 | 697 | ||
576 | int tracepoint_module_notify(struct notifier_block *self, | 698 | int tracepoint_module_notify(struct notifier_block *self, |
577 | unsigned long val, void *data) | 699 | unsigned long val, void *data) |
578 | { | 700 | { |
579 | struct module *mod = data; | 701 | struct module *mod = data; |
702 | int ret = 0; | ||
580 | 703 | ||
581 | switch (val) { | 704 | switch (val) { |
582 | case MODULE_STATE_COMING: | 705 | case MODULE_STATE_COMING: |
706 | ret = tracepoint_module_coming(mod); | ||
707 | break; | ||
708 | case MODULE_STATE_LIVE: | ||
709 | break; | ||
583 | case MODULE_STATE_GOING: | 710 | case MODULE_STATE_GOING: |
584 | tracepoint_update_probe_range(mod->tracepoints_ptrs, | 711 | ret = tracepoint_module_going(mod); |
585 | mod->tracepoints_ptrs + mod->num_tracepoints); | ||
586 | break; | 712 | break; |
587 | } | 713 | } |
588 | return 0; | 714 | return ret; |
589 | } | 715 | } |
590 | 716 | ||
591 | struct notifier_block tracepoint_module_nb = { | 717 | struct notifier_block tracepoint_module_nb = { |
@@ -598,7 +724,6 @@ static int init_tracepoints(void) | |||
598 | return register_module_notifier(&tracepoint_module_nb); | 724 | return register_module_notifier(&tracepoint_module_nb); |
599 | } | 725 | } |
600 | __initcall(init_tracepoints); | 726 | __initcall(init_tracepoints); |
601 | |||
602 | #endif /* CONFIG_MODULES */ | 727 | #endif /* CONFIG_MODULES */ |
603 | 728 | ||
604 | #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS | 729 | #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 24dc60d9fa1f..5bbfac85866e 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -78,6 +78,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | |||
78 | 78 | ||
79 | #define KB 1024 | 79 | #define KB 1024 |
80 | #define MB (1024*KB) | 80 | #define MB (1024*KB) |
81 | #define KB_MASK (~(KB-1)) | ||
81 | /* | 82 | /* |
82 | * fill in extended accounting fields | 83 | * fill in extended accounting fields |
83 | */ | 84 | */ |
@@ -95,14 +96,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
95 | stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; | 96 | stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; |
96 | mmput(mm); | 97 | mmput(mm); |
97 | } | 98 | } |
98 | stats->read_char = p->ioac.rchar; | 99 | stats->read_char = p->ioac.rchar & KB_MASK; |
99 | stats->write_char = p->ioac.wchar; | 100 | stats->write_char = p->ioac.wchar & KB_MASK; |
100 | stats->read_syscalls = p->ioac.syscr; | 101 | stats->read_syscalls = p->ioac.syscr & KB_MASK; |
101 | stats->write_syscalls = p->ioac.syscw; | 102 | stats->write_syscalls = p->ioac.syscw & KB_MASK; |
102 | #ifdef CONFIG_TASK_IO_ACCOUNTING | 103 | #ifdef CONFIG_TASK_IO_ACCOUNTING |
103 | stats->read_bytes = p->ioac.read_bytes; | 104 | stats->read_bytes = p->ioac.read_bytes & KB_MASK; |
104 | stats->write_bytes = p->ioac.write_bytes; | 105 | stats->write_bytes = p->ioac.write_bytes & KB_MASK; |
105 | stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; | 106 | stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK; |
106 | #else | 107 | #else |
107 | stats->read_bytes = 0; | 108 | stats->read_bytes = 0; |
108 | stats->write_bytes = 0; | 109 | stats->write_bytes = 0; |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 36491cd5b7d4..d680381b0e9c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -321,7 +321,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
321 | */ | 321 | */ |
322 | static int watchdog(void *unused) | 322 | static int watchdog(void *unused) |
323 | { | 323 | { |
324 | static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 324 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
325 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 325 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
326 | 326 | ||
327 | sched_setscheduler(current, SCHED_FIFO, &param); | 327 | sched_setscheduler(current, SCHED_FIFO, &param); |
@@ -350,7 +350,8 @@ static int watchdog(void *unused) | |||
350 | set_current_state(TASK_INTERRUPTIBLE); | 350 | set_current_state(TASK_INTERRUPTIBLE); |
351 | } | 351 | } |
352 | __set_current_state(TASK_RUNNING); | 352 | __set_current_state(TASK_RUNNING); |
353 | 353 | param.sched_priority = 0; | |
354 | sched_setscheduler(current, SCHED_NORMAL, &param); | ||
354 | return 0; | 355 | return 0; |
355 | } | 356 | } |
356 | 357 | ||
@@ -438,7 +439,7 @@ static int watchdog_enable(int cpu) | |||
438 | 439 | ||
439 | /* create the watchdog thread */ | 440 | /* create the watchdog thread */ |
440 | if (!p) { | 441 | if (!p) { |
441 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); | 442 | p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); |
442 | if (IS_ERR(p)) { | 443 | if (IS_ERR(p)) { |
443 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); | 444 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); |
444 | if (!err) { | 445 | if (!err) { |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 25fb1b0e53fa..1783aabc6128 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -2412,8 +2412,13 @@ reflush: | |||
2412 | 2412 | ||
2413 | for_each_cwq_cpu(cpu, wq) { | 2413 | for_each_cwq_cpu(cpu, wq) { |
2414 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 2414 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
2415 | bool drained; | ||
2415 | 2416 | ||
2416 | if (!cwq->nr_active && list_empty(&cwq->delayed_works)) | 2417 | spin_lock_irq(&cwq->gcwq->lock); |
2418 | drained = !cwq->nr_active && list_empty(&cwq->delayed_works); | ||
2419 | spin_unlock_irq(&cwq->gcwq->lock); | ||
2420 | |||
2421 | if (drained) | ||
2417 | continue; | 2422 | continue; |
2418 | 2423 | ||
2419 | if (++flush_cnt == 10 || | 2424 | if (++flush_cnt == 10 || |