author    Thomas Gleixner <tglx@linutronix.de>    2011-05-20 14:06:24 -0400
committer Thomas Gleixner <tglx@linutronix.de>    2011-05-20 14:08:05 -0400
commit    250f972d85effad5b6e10da4bbd877e6a4b503b6 (patch)
tree      007393a6fc6439af7e0121dd99a6f9f9fb8405bc /kernel
parent    7372b0b122af0f6675f3ab65bfd91c8a438e0480 (diff)
parent    bbe7b8bef48c567f5ff3f6041c1fb011292e8f12 (diff)
Merge branch 'timers/urgent' into timers/core
Reason: Get upstream fixes and kfree_rcu which is necessary for a follow up patch.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
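For readers landing on this merge for kfree_rcu(): the conversion pattern pulled in here replaces an open-coded call_rcu() callback whose only job is to kfree() the enclosing object. A minimal before/after sketch with a hypothetical struct name (the real conversions appear in the cgroup and perf hunks below):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo {                            /* hypothetical object */
        int data;
        struct rcu_head rcu_head;
};

/* Before: a dedicated RCU callback exists only to free the object. */
static void free_foo_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct foo, rcu_head));
}

static void put_foo_old(struct foo *f)
{
        call_rcu(&f->rcu_head, free_foo_rcu);
}

/* After: kfree_rcu() frees the object once a grace period has elapsed. */
static void put_foo_new(struct foo *f)
{
        kfree_rcu(f, rcu_head);
}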
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/capability.c | 12
-rw-r--r--  kernel/cgroup.c | 27
-rw-r--r--  kernel/cpuset.c | 2
-rw-r--r--  kernel/cred.c | 12
-rw-r--r--  kernel/events/Makefile | 6
-rw-r--r--  kernel/events/core.c (renamed from kernel/perf_event.c) | 64
-rw-r--r--  kernel/events/hw_breakpoint.c (renamed from kernel/hw_breakpoint.c) | 0
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/extable.c | 8
-rw-r--r--  kernel/fork.c | 5
-rw-r--r--  kernel/freezer.c | 4
-rw-r--r--  kernel/hung_task.c | 2
-rw-r--r--  kernel/irq/Kconfig | 4
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/chip.c | 3
-rw-r--r--  kernel/irq/debug.h | 1
-rw-r--r--  kernel/irq/generic-chip.c | 354
-rw-r--r--  kernel/irq/irqdesc.c | 22
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/irq/settings.h | 17
-rw-r--r--  kernel/jump_label.c | 539
-rw-r--r--  kernel/kexec.c | 9
-rw-r--r--  kernel/kmod.c | 16
-rw-r--r--  kernel/ksysfs.c | 10
-rw-r--r--  kernel/lockdep.c | 206
-rw-r--r--  kernel/module.c | 105
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex-debug.h | 2
-rw-r--r--  kernel/mutex.c | 9
-rw-r--r--  kernel/mutex.h | 2
-rw-r--r--  kernel/params.c | 23
-rw-r--r--  kernel/power/Kconfig | 10
-rw-r--r--  kernel/power/hibernate.c | 58
-rw-r--r--  kernel/power/main.c | 1
-rw-r--r--  kernel/power/power.h | 4
-rw-r--r--  kernel/power/snapshot.c | 33
-rw-r--r--  kernel/power/suspend.c | 14
-rw-r--r--  kernel/power/user.c | 5
-rw-r--r--  kernel/ptrace.c | 17
-rw-r--r--  kernel/rcupdate.c | 32
-rw-r--r--  kernel/rcutiny.c | 45
-rw-r--r--  kernel/rcutiny_plugin.h | 203
-rw-r--r--  kernel/rcutorture.c | 26
-rw-r--r--  kernel/rcutree.c | 526
-rw-r--r--  kernel/rcutree.h | 104
-rw-r--r--  kernel/rcutree_plugin.h | 568
-rw-r--r--  kernel/rcutree_trace.c | 180
-rw-r--r--  kernel/sched.c | 1658
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 126
-rw-r--r--  kernel/sched_features.h | 6
-rw-r--r--  kernel/sched_idletask.c | 2
-rw-r--r--  kernel/sched_rt.c | 83
-rw-r--r--  kernel/sched_stoptask.c | 5
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/sys.c | 5
-rw-r--r--  kernel/time/clockevents.c | 64
-rw-r--r--  kernel/time/clocksource.c | 42
-rw-r--r--  kernel/time/tick-broadcast.c | 12
-rw-r--r--  kernel/trace/ftrace.c | 1261
-rw-r--r--  kernel/trace/trace.c | 16
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_events.c | 1
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_irqsoff.c | 1
-rw-r--r--  kernel/trace/trace_kprobe.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 3
-rw-r--r--  kernel/trace/trace_printk.c | 120
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 1
-rw-r--r--  kernel/trace/trace_selftest.c | 214
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 6
-rw-r--r--  kernel/trace/trace_stack.c | 1
-rw-r--r--  kernel/tracepoint.c | 23
75 files changed, 4643 insertions, 2326 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb31e73e..e9cf19155b46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_perf_event.o = -pg
25CFLAGS_REMOVE_irq_work.o = -pg 24CFLAGS_REMOVE_irq_work.o = -pg
26endif 25endif
27 26
@@ -103,8 +102,9 @@ obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/ 102obj-$(CONFIG_TRACEPOINTS) += trace/
104obj-$(CONFIG_SMP) += sched_cpupri.o 103obj-$(CONFIG_SMP) += sched_cpupri.o
105obj-$(CONFIG_IRQ_WORK) += irq_work.o 104obj-$(CONFIG_IRQ_WORK) += irq_work.o
106obj-$(CONFIG_PERF_EVENTS) += perf_event.o 105
107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 106obj-$(CONFIG_PERF_EVENTS) += events/
107
108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
109obj-$(CONFIG_PADATA) += padata.o 109obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
diff --git a/kernel/capability.c b/kernel/capability.c
index bf0c734d0c12..32a80e08ff4b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -399,3 +399,15 @@ bool task_ns_capable(struct task_struct *t, int cap)
399 return ns_capable(task_cred_xxx(t, user)->user_ns, cap); 399 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
400} 400}
401EXPORT_SYMBOL(task_ns_capable); 401EXPORT_SYMBOL(task_ns_capable);
402
403/**
404 * nsown_capable - Check superior capability to one's own user_ns
405 * @cap: The capability in question
406 *
407 * Return true if the current task has the given superior capability
408 * targeted at its own user namespace.
409 */
410bool nsown_capable(int cap)
411{
412 return ns_capable(current_user_ns(), cap);
413}
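nsown_capable() above is a thin wrapper around ns_capable() against the caller's own user namespace. A hedged usage sketch; the callsite below is hypothetical and only illustrates the intended check:

#include <linux/capability.h>
#include <linux/errno.h>

/* Hypothetical: permit an operation for tasks privileged in their own
 * user namespace, not just in the initial namespace. */
static int my_ns_protected_op(void)
{
        if (!nsown_capable(CAP_SYS_ADMIN))
                return -EPERM;
        /* ... do the privileged work ... */
        return 0;
}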
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 25c7eb52de1a..909a35510af5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -326,12 +326,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
326 return &css_set_table[index]; 326 return &css_set_table[index];
327} 327}
328 328
329static void free_css_set_rcu(struct rcu_head *obj)
330{
331 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
332 kfree(cg);
333}
334
335/* We don't maintain the lists running through each css_set to its 329/* We don't maintain the lists running through each css_set to its
336 * task until after the first call to cgroup_iter_start(). This 330 * task until after the first call to cgroup_iter_start(). This
337 * reduces the fork()/exit() overhead for people who have cgroups 331 * reduces the fork()/exit() overhead for people who have cgroups
@@ -375,7 +369,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
375 } 369 }
376 370
377 write_unlock(&css_set_lock); 371 write_unlock(&css_set_lock);
378 call_rcu(&cg->rcu_head, free_css_set_rcu); 372 kfree_rcu(cg, rcu_head);
379} 373}
380 374
381/* 375/*
@@ -812,13 +806,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
812 return ret; 806 return ret;
813} 807}
814 808
815static void free_cgroup_rcu(struct rcu_head *obj)
816{
817 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
818
819 kfree(cgrp);
820}
821
822static void cgroup_diput(struct dentry *dentry, struct inode *inode) 809static void cgroup_diput(struct dentry *dentry, struct inode *inode)
823{ 810{
824 /* is dentry a directory ? if so, kfree() associated cgroup */ 811 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -856,7 +843,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
856 */ 843 */
857 BUG_ON(!list_empty(&cgrp->pidlists)); 844 BUG_ON(!list_empty(&cgrp->pidlists));
858 845
859 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 846 kfree_rcu(cgrp, rcu_head);
860 } 847 }
861 iput(inode); 848 iput(inode);
862} 849}
@@ -4623,14 +4610,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
4623 return ret; 4610 return ret;
4624} 4611}
4625 4612
4626static void __free_css_id_cb(struct rcu_head *head)
4627{
4628 struct css_id *id;
4629
4630 id = container_of(head, struct css_id, rcu_head);
4631 kfree(id);
4632}
4633
4634void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4613void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4635{ 4614{
4636 struct css_id *id = css->id; 4615 struct css_id *id = css->id;
@@ -4645,7 +4624,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4645 spin_lock(&ss->id_lock); 4624 spin_lock(&ss->id_lock);
4646 idr_remove(&ss->idr, id->id); 4625 idr_remove(&ss->idr, id->id);
4647 spin_unlock(&ss->id_lock); 4626 spin_unlock(&ss->id_lock);
4648 call_rcu(&id->rcu_head, __free_css_id_cb); 4627 kfree_rcu(id, rcu_head);
4649} 4628}
4650EXPORT_SYMBOL_GPL(free_css_id); 4629EXPORT_SYMBOL_GPL(free_css_id);
4651 4630
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 33eee16addb8..2bb8c2e98fff 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
1159static int update_relax_domain_level(struct cpuset *cs, s64 val) 1159static int update_relax_domain_level(struct cpuset *cs, s64 val)
1160{ 1160{
1161#ifdef CONFIG_SMP 1161#ifdef CONFIG_SMP
1162 if (val < -1 || val >= SD_LV_MAX) 1162 if (val < -1 || val >= sched_domain_level_max)
1163 return -EINVAL; 1163 return -EINVAL;
1164#endif 1164#endif
1165 1165
diff --git a/kernel/cred.c b/kernel/cred.c
index 5557b55048df..8093c16b84b1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -54,6 +54,7 @@ struct cred init_cred = {
54 .cap_effective = CAP_INIT_EFF_SET, 54 .cap_effective = CAP_INIT_EFF_SET,
55 .cap_bset = CAP_INIT_BSET, 55 .cap_bset = CAP_INIT_BSET,
56 .user = INIT_USER, 56 .user = INIT_USER,
57 .user_ns = &init_user_ns,
57 .group_info = &init_groups, 58 .group_info = &init_groups,
58#ifdef CONFIG_KEYS 59#ifdef CONFIG_KEYS
59 .tgcred = &init_tgcred, 60 .tgcred = &init_tgcred,
@@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
410 goto error_put; 411 goto error_put;
411 } 412 }
412 413
414 /* cache user_ns in cred. Doesn't need a refcount because it will
415 * stay pinned by cred->user
416 */
417 new->user_ns = new->user->user_ns;
418
413#ifdef CONFIG_KEYS 419#ifdef CONFIG_KEYS
414 /* new threads get their own thread keyrings if their parent already 420 /* new threads get their own thread keyrings if their parent already
415 * had one */ 421 * had one */
@@ -741,12 +747,6 @@ int set_create_files_as(struct cred *new, struct inode *inode)
741} 747}
742EXPORT_SYMBOL(set_create_files_as); 748EXPORT_SYMBOL(set_create_files_as);
743 749
744struct user_namespace *current_user_ns(void)
745{
746 return _current_user_ns();
747}
748EXPORT_SYMBOL(current_user_ns);
749
750#ifdef CONFIG_DEBUG_CREDENTIALS 750#ifdef CONFIG_DEBUG_CREDENTIALS
751 751
752bool creds_are_invalid(const struct cred *cred) 752bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000000..1ce23d3d8394
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,6 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg
3endif
4
5obj-y := core.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/perf_event.c b/kernel/events/core.c
index 8e81a9860a0d..c09767f7db3e 100644
--- a/kernel/perf_event.c
+++ b/kernel/events/core.c
@@ -2,8 +2,8 @@
2 * Performance events core code: 2 * Performance events core code:
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
@@ -39,10 +39,10 @@
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41struct remote_function_call { 41struct remote_function_call {
42 struct task_struct *p; 42 struct task_struct *p;
43 int (*func)(void *info); 43 int (*func)(void *info);
44 void *info; 44 void *info;
45 int ret; 45 int ret;
46}; 46};
47 47
48static void remote_function(void *data) 48static void remote_function(void *data)
@@ -76,10 +76,10 @@ static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info) 76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{ 77{
78 struct remote_function_call data = { 78 struct remote_function_call data = {
79 .p = p, 79 .p = p,
80 .func = func, 80 .func = func,
81 .info = info, 81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */ 82 .ret = -ESRCH, /* No such (running) process */
83 }; 83 };
84 84
85 if (task_curr(p)) 85 if (task_curr(p))
@@ -100,10 +100,10 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info) 100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{ 101{
102 struct remote_function_call data = { 102 struct remote_function_call data = {
103 .p = NULL, 103 .p = NULL,
104 .func = func, 104 .func = func,
105 .info = info, 105 .info = info,
106 .ret = -ENXIO, /* No such CPU */ 106 .ret = -ENXIO, /* No such CPU */
107 }; 107 };
108 108
109 smp_call_function_single(cpu, remote_function, &data, 1); 109 smp_call_function_single(cpu, remote_function, &data, 1);
@@ -125,7 +125,7 @@ enum event_type_t {
125 * perf_sched_events : >0 events exist 125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */ 127 */
128atomic_t perf_sched_events __read_mostly; 128struct jump_label_key perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130 130
131static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
@@ -586,14 +586,6 @@ static void get_ctx(struct perf_event_context *ctx)
586 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 586 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
587} 587}
588 588
589static void free_ctx(struct rcu_head *head)
590{
591 struct perf_event_context *ctx;
592
593 ctx = container_of(head, struct perf_event_context, rcu_head);
594 kfree(ctx);
595}
596
597static void put_ctx(struct perf_event_context *ctx) 589static void put_ctx(struct perf_event_context *ctx)
598{ 590{
599 if (atomic_dec_and_test(&ctx->refcount)) { 591 if (atomic_dec_and_test(&ctx->refcount)) {
@@ -601,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx)
601 put_ctx(ctx->parent_ctx); 593 put_ctx(ctx->parent_ctx);
602 if (ctx->task) 594 if (ctx->task)
603 put_task_struct(ctx->task); 595 put_task_struct(ctx->task);
604 call_rcu(&ctx->rcu_head, free_ctx); 596 kfree_rcu(ctx, rcu_head);
605 } 597 }
606} 598}
607 599
@@ -5331,14 +5323,6 @@ swevent_hlist_deref(struct swevent_htable *swhash)
5331 lockdep_is_held(&swhash->hlist_mutex)); 5323 lockdep_is_held(&swhash->hlist_mutex));
5332} 5324}
5333 5325
5334static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
5335{
5336 struct swevent_hlist *hlist;
5337
5338 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
5339 kfree(hlist);
5340}
5341
5342static void swevent_hlist_release(struct swevent_htable *swhash) 5326static void swevent_hlist_release(struct swevent_htable *swhash)
5343{ 5327{
5344 struct swevent_hlist *hlist = swevent_hlist_deref(swhash); 5328 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
@@ -5347,7 +5331,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
5347 return; 5331 return;
5348 5332
5349 rcu_assign_pointer(swhash->swevent_hlist, NULL); 5333 rcu_assign_pointer(swhash->swevent_hlist, NULL);
5350 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 5334 kfree_rcu(hlist, rcu_head);
5351} 5335}
5352 5336
5353static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 5337static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
@@ -5429,7 +5413,7 @@ fail:
5429 return err; 5413 return err;
5430} 5414}
5431 5415
5432atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5416struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5433 5417
5434static void sw_perf_event_destroy(struct perf_event *event) 5418static void sw_perf_event_destroy(struct perf_event *event)
5435{ 5419{
@@ -7445,11 +7429,11 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7445} 7429}
7446 7430
7447struct cgroup_subsys perf_subsys = { 7431struct cgroup_subsys perf_subsys = {
7448 .name = "perf_event", 7432 .name = "perf_event",
7449 .subsys_id = perf_subsys_id, 7433 .subsys_id = perf_subsys_id,
7450 .create = perf_cgroup_create, 7434 .create = perf_cgroup_create,
7451 .destroy = perf_cgroup_destroy, 7435 .destroy = perf_cgroup_destroy,
7452 .exit = perf_cgroup_exit, 7436 .exit = perf_cgroup_exit,
7453 .attach = perf_cgroup_attach, 7437 .attach = perf_cgroup_attach,
7454}; 7438};
7455#endif /* CONFIG_CGROUP_PERF */ 7439#endif /* CONFIG_CGROUP_PERF */
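Note the type change above: perf_sched_events and perf_swevent_enabled[] become struct jump_label_key, matching the reworked jump label core later in this merge. A minimal sketch of the usage pattern this enables, assuming the jump_label_inc()/jump_label_dec()/static_branch() interface from that rework; the key and helper names are illustrative:

#include <linux/jump_label.h>

static struct jump_label_key my_feature_key;

static void do_slow_accounting(void)
{
        /* ... only needed while the feature is enabled ... */
}

static inline void my_fast_path(void)
{
        /* Patched to a no-op branch while the key is disabled. */
        if (static_branch(&my_feature_key))
                do_slow_accounting();
}

void my_feature_enable(void)
{
        jump_label_inc(&my_feature_key);        /* refcounted enable */
}

void my_feature_disable(void)
{
        jump_label_dec(&my_feature_key);
}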
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
diff --git a/kernel/exit.c b/kernel/exit.c
index f5d2f63bae0b..8dd874181542 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1016,7 +1016,7 @@ NORET_TYPE void do_exit(long code)
1016 /* 1016 /*
1017 * FIXME: do that only when needed, using sched_exit tracepoint 1017 * FIXME: do that only when needed, using sched_exit tracepoint
1018 */ 1018 */
1019 flush_ptrace_hw_breakpoint(tsk); 1019 ptrace_put_breakpoints(tsk);
1020 1020
1021 exit_notify(tsk, group_dead); 1021 exit_notify(tsk, group_dead);
1022#ifdef CONFIG_NUMA 1022#ifdef CONFIG_NUMA
diff --git a/kernel/extable.c b/kernel/extable.c
index 7f8f263f8524..c2d625fcda77 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -72,6 +72,14 @@ int core_kernel_text(unsigned long addr)
72 return 0; 72 return 0;
73} 73}
74 74
75int core_kernel_data(unsigned long addr)
76{
77 if (addr >= (unsigned long)_sdata &&
78 addr < (unsigned long)_edata)
79 return 1;
80 return 0;
81}
82
75int __kernel_text_address(unsigned long addr) 83int __kernel_text_address(unsigned long addr)
76{ 84{
77 if (core_kernel_text(addr)) 85 if (core_kernel_text(addr))
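core_kernel_data() above reports whether an address lies in the kernel's initialized data section (_sdata.._edata). A hedged sketch of the kind of sanity check a caller might build on it; the helper below is hypothetical (ftrace applies a similar test to dynamically allocated ops structures):

#include <linux/kernel.h>
#include <linux/module.h>

/* Hypothetical: accept only addresses in core kernel data or module memory. */
static bool addr_is_known_data(unsigned long addr)
{
        if (core_kernel_data(addr))
                return true;

        return __module_address(addr) != NULL;
}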
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548dee636b..2b44d82b8237 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1103,7 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1103 1103
1104 posix_cpu_timers_init(p); 1104 posix_cpu_timers_init(p);
1105 1105
1106 p->lock_depth = -1; /* -1 = no lock */
1107 do_posix_clock_monotonic_gettime(&p->start_time); 1106 do_posix_clock_monotonic_gettime(&p->start_time);
1108 p->real_start_time = p->start_time; 1107 p->real_start_time = p->start_time;
1109 monotonic_to_bootbased(&p->real_start_time); 1108 monotonic_to_bootbased(&p->real_start_time);
@@ -1153,7 +1152,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1153#endif 1152#endif
1154 1153
1155 /* Perform scheduler related setup. Assign this task to a CPU. */ 1154 /* Perform scheduler related setup. Assign this task to a CPU. */
1156 sched_fork(p, clone_flags); 1155 sched_fork(p);
1157 1156
1158 retval = perf_event_init_task(p); 1157 retval = perf_event_init_task(p);
1159 if (retval) 1158 if (retval)
@@ -1464,7 +1463,7 @@ long do_fork(unsigned long clone_flags,
1464 */ 1463 */
1465 p->flags &= ~PF_STARTING; 1464 p->flags &= ~PF_STARTING;
1466 1465
1467 wake_up_new_task(p, clone_flags); 1466 wake_up_new_task(p);
1468 1467
1469 tracehook_report_clone_complete(trace, regs, 1468 tracehook_report_clone_complete(trace, regs,
1470 clone_flags, nr, p); 1469 clone_flags, nr, p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 66ecd2ead215..7b01de98bb6a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -17,7 +17,7 @@ static inline void frozen_process(void)
17{ 17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN; 19 current->flags |= PF_FROZEN;
20 wmb(); 20 smp_wmb();
21 } 21 }
22 clear_freeze_flag(current); 22 clear_freeze_flag(current);
23} 23}
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only)
93 * the task as frozen and next clears its TIF_FREEZE. 93 * the task as frozen and next clears its TIF_FREEZE.
94 */ 94 */
95 if (!freezing(p)) { 95 if (!freezing(p)) {
96 rmb(); 96 smp_rmb();
97 if (frozen(p)) 97 if (frozen(p))
98 return false; 98 return false;
99 99
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 53ead174da2f..ea640120ab86 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
33/* 33/*
34 * Zero means infinite timeout - no checking done: 34 * Zero means infinite timeout - no checking done:
35 */ 35 */
36unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 36unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
37 37
38unsigned long __read_mostly sysctl_hung_task_warnings = 10; 38unsigned long __read_mostly sysctl_hung_task_warnings = 10;
39 39
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index c574f9a12c48..d1d051b38e0b 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -48,6 +48,10 @@ config IRQ_PREFLOW_FASTEOI
48config IRQ_EDGE_EOI_HANDLER 48config IRQ_EDGE_EOI_HANDLER
49 bool 49 bool
50 50
51# Generic configurable interrupt chip implementation
52config GENERIC_IRQ_CHIP
53 bool
54
51# Support forced irq threading 55# Support forced irq threading
52config IRQ_FORCED_THREADING 56config IRQ_FORCED_THREADING
53 bool 57 bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 54329cd7b3ee..73290056cfb6 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,6 @@
1 1
2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 5obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 6obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 4af1e2b244cb..d5a3009da71a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -310,6 +310,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
310out_unlock: 310out_unlock:
311 raw_spin_unlock(&desc->lock); 311 raw_spin_unlock(&desc->lock);
312} 312}
313EXPORT_SYMBOL_GPL(handle_simple_irq);
313 314
314/** 315/**
315 * handle_level_irq - Level type irq handler 316 * handle_level_irq - Level type irq handler
@@ -573,6 +574,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
573 if (handle != handle_bad_irq && is_chained) { 574 if (handle != handle_bad_irq && is_chained) {
574 irq_settings_set_noprobe(desc); 575 irq_settings_set_noprobe(desc);
575 irq_settings_set_norequest(desc); 576 irq_settings_set_norequest(desc);
577 irq_settings_set_nothread(desc);
576 irq_startup(desc); 578 irq_startup(desc);
577 } 579 }
578out: 580out:
@@ -612,6 +614,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
612 614
613 irq_put_desc_unlock(desc, flags); 615 irq_put_desc_unlock(desc, flags);
614} 616}
617EXPORT_SYMBOL_GPL(irq_modify_status);
615 618
616/** 619/**
617 * irq_cpu_online - Invoke all irq_cpu_online functions. 620 * irq_cpu_online - Invoke all irq_cpu_online functions.
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 306cba37e9a5..97a8bfadc88a 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -27,6 +27,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
27 P(IRQ_PER_CPU); 27 P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE); 28 P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST); 29 P(IRQ_NOREQUEST);
30 P(IRQ_NOTHREAD);
30 P(IRQ_NOAUTOEN); 31 P(IRQ_NOAUTOEN);
31 32
32 PS(IRQS_AUTODETECT); 33 PS(IRQS_AUTODETECT);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
new file mode 100644
index 000000000000..31a9db711906
--- /dev/null
+++ b/kernel/irq/generic-chip.c
@@ -0,0 +1,354 @@
1/*
2 * Library implementing the most common irq chip callback functions
3 *
4 * Copyright (C) 2011, Thomas Gleixner
5 */
6#include <linux/io.h>
7#include <linux/irq.h>
8#include <linux/slab.h>
9#include <linux/interrupt.h>
10#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h>
12
13#include "internals.h"
14
15static LIST_HEAD(gc_list);
16static DEFINE_RAW_SPINLOCK(gc_lock);
17
18static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
19{
20 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
21}
22
23/**
24 * irq_gc_noop - NOOP function
25 * @d: irq_data
26 */
27void irq_gc_noop(struct irq_data *d)
28{
29}
30
31/**
32 * irq_gc_mask_disable_reg - Mask chip via disable register
33 * @d: irq_data
34 *
35 * Chip has separate enable/disable registers instead of a single mask
36 * register.
37 */
38void irq_gc_mask_disable_reg(struct irq_data *d)
39{
40 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
41 u32 mask = 1 << (d->irq - gc->irq_base);
42
43 irq_gc_lock(gc);
44 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable);
45 gc->mask_cache &= ~mask;
46 irq_gc_unlock(gc);
47}
48
49/**
50 * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register
51 * @d: irq_data
52 *
53 * Chip has a single mask register. Values of this register are cached
54 * and protected by gc->lock
55 */
56void irq_gc_mask_set_bit(struct irq_data *d)
57{
58 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
59 u32 mask = 1 << (d->irq - gc->irq_base);
60
61 irq_gc_lock(gc);
62 gc->mask_cache |= mask;
63 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
64 irq_gc_unlock(gc);
65}
66
67/**
68 * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register
69 * @d: irq_data
70 *
71 * Chip has a single mask register. Values of this register are cached
72 * and protected by gc->lock
73 */
74void irq_gc_mask_clr_bit(struct irq_data *d)
75{
76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
77 u32 mask = 1 << (d->irq - gc->irq_base);
78
79 irq_gc_lock(gc);
80 gc->mask_cache &= ~mask;
81 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
82 irq_gc_unlock(gc);
83}
84
85/**
86 * irq_gc_unmask_enable_reg - Unmask chip via enable register
87 * @d: irq_data
88 *
89 * Chip has separate enable/disable registers instead of a single mask
90 * register.
91 */
92void irq_gc_unmask_enable_reg(struct irq_data *d)
93{
94 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
95 u32 mask = 1 << (d->irq - gc->irq_base);
96
97 irq_gc_lock(gc);
98 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable);
99 gc->mask_cache |= mask;
100 irq_gc_unlock(gc);
101}
102
103/**
104 * irq_gc_ack - Ack pending interrupt
105 * @d: irq_data
106 */
107void irq_gc_ack(struct irq_data *d)
108{
109 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
110 u32 mask = 1 << (d->irq - gc->irq_base);
111
112 irq_gc_lock(gc);
113 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
114 irq_gc_unlock(gc);
115}
116
117/**
118 * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
119 * @d: irq_data
120 */
121void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
122{
123 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
124 u32 mask = 1 << (d->irq - gc->irq_base);
125
126 irq_gc_lock(gc);
127 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask);
128 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
129 irq_gc_unlock(gc);
130}
131
132/**
133 * irq_gc_eoi - EOI interrupt
134 * @d: irq_data
135 */
136void irq_gc_eoi(struct irq_data *d)
137{
138 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
139 u32 mask = 1 << (d->irq - gc->irq_base);
140
141 irq_gc_lock(gc);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi);
143 irq_gc_unlock(gc);
144}
145
146/**
147 * irq_gc_set_wake - Set/clr wake bit for an interrupt
148 * @d: irq_data
149 *
150 * For chips where the wake from suspend functionality is not
151 * configured in a separate register and the wakeup active state is
152 * just stored in a bitmask.
153 */
154int irq_gc_set_wake(struct irq_data *d, unsigned int on)
155{
156 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
157 u32 mask = 1 << (d->irq - gc->irq_base);
158
159 if (!(mask & gc->wake_enabled))
160 return -EINVAL;
161
162 irq_gc_lock(gc);
163 if (on)
164 gc->wake_active |= mask;
165 else
166 gc->wake_active &= ~mask;
167 irq_gc_unlock(gc);
168 return 0;
169}
170
171/**
172 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
173 * @name: Name of the irq chip
174 * @num_ct: Number of irq_chip_type instances associated with this
175 * @irq_base: Interrupt base nr for this chip
176 * @reg_base: Register base address (virtual)
177 * @handler: Default flow handler associated with this chip
178 *
179 * Returns an initialized irq_chip_generic structure. The chip defaults
180 * to the primary (index 0) irq_chip_type and @handler
181 */
182struct irq_chip_generic *
183irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
184 void __iomem *reg_base, irq_flow_handler_t handler)
185{
186 struct irq_chip_generic *gc;
187 unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
188
189 gc = kzalloc(sz, GFP_KERNEL);
190 if (gc) {
191 raw_spin_lock_init(&gc->lock);
192 gc->num_ct = num_ct;
193 gc->irq_base = irq_base;
194 gc->reg_base = reg_base;
195 gc->chip_types->chip.name = name;
196 gc->chip_types->handler = handler;
197 }
198 return gc;
199}
200
201/*
202 * Separate lockdep class for interrupt chip which can nest irq_desc
203 * lock.
204 */
205static struct lock_class_key irq_nested_lock_class;
206
207/**
208 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
209 * @gc: Generic irq chip holding all data
210 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
211 * @flags: Flags for initialization
212 * @clr: IRQ_* bits to clear
213 * @set: IRQ_* bits to set
214 *
215 * Set up max. 32 interrupts starting from gc->irq_base. Note, this
216 * initializes all interrupts to the primary irq_chip_type and its
217 * associated handler.
218 */
219void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
220 enum irq_gc_flags flags, unsigned int clr,
221 unsigned int set)
222{
223 struct irq_chip_type *ct = gc->chip_types;
224 unsigned int i;
225
226 raw_spin_lock(&gc_lock);
227 list_add_tail(&gc->list, &gc_list);
228 raw_spin_unlock(&gc_lock);
229
230 /* Init mask cache ? */
231 if (flags & IRQ_GC_INIT_MASK_CACHE)
232 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
233
234 for (i = gc->irq_base; msk; msk >>= 1, i++) {
235 if (!msk & 0x01)
236 continue;
237
238 if (flags & IRQ_GC_INIT_NESTED_LOCK)
239 irq_set_lockdep_class(i, &irq_nested_lock_class);
240
241 irq_set_chip_and_handler(i, &ct->chip, ct->handler);
242 irq_set_chip_data(i, gc);
243 irq_modify_status(i, clr, set);
244 }
245 gc->irq_cnt = i - gc->irq_base;
246}
247
248/**
249 * irq_setup_alt_chip - Switch to alternative chip
250 * @d: irq_data for this interrupt
251 * @type Flow type to be initialized
252 *
253 * Only to be called from chip->irq_set_type() callbacks.
254 */
255int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
256{
257 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
258 struct irq_chip_type *ct = gc->chip_types;
259 unsigned int i;
260
261 for (i = 0; i < gc->num_ct; i++, ct++) {
262 if (ct->type & type) {
263 d->chip = &ct->chip;
264 irq_data_to_desc(d)->handle_irq = ct->handler;
265 return 0;
266 }
267 }
268 return -EINVAL;
269}
270
271/**
272 * irq_remove_generic_chip - Remove a chip
273 * @gc: Generic irq chip holding all data
274 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
275 * @clr: IRQ_* bits to clear
276 * @set: IRQ_* bits to set
277 *
278 * Remove up to 32 interrupts starting from gc->irq_base.
279 */
280void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
281 unsigned int clr, unsigned int set)
282{
283 unsigned int i = gc->irq_base;
284
285 raw_spin_lock(&gc_lock);
286 list_del(&gc->list);
287 raw_spin_unlock(&gc_lock);
288
289 for (; msk; msk >>= 1, i++) {
290 if (!msk & 0x01)
291 continue;
292
293 /* Remove handler first. That will mask the irq line */
294 irq_set_handler(i, NULL);
295 irq_set_chip(i, &no_irq_chip);
296 irq_set_chip_data(i, NULL);
297 irq_modify_status(i, clr, set);
298 }
299}
300
301#ifdef CONFIG_PM
302static int irq_gc_suspend(void)
303{
304 struct irq_chip_generic *gc;
305
306 list_for_each_entry(gc, &gc_list, list) {
307 struct irq_chip_type *ct = gc->chip_types;
308
309 if (ct->chip.irq_suspend)
310 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base));
311 }
312 return 0;
313}
314
315static void irq_gc_resume(void)
316{
317 struct irq_chip_generic *gc;
318
319 list_for_each_entry(gc, &gc_list, list) {
320 struct irq_chip_type *ct = gc->chip_types;
321
322 if (ct->chip.irq_resume)
323 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base));
324 }
325}
326#else
327#define irq_gc_suspend NULL
328#define irq_gc_resume NULL
329#endif
330
331static void irq_gc_shutdown(void)
332{
333 struct irq_chip_generic *gc;
334
335 list_for_each_entry(gc, &gc_list, list) {
336 struct irq_chip_type *ct = gc->chip_types;
337
338 if (ct->chip.irq_pm_shutdown)
339 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base));
340 }
341}
342
343static struct syscore_ops irq_gc_syscore_ops = {
344 .suspend = irq_gc_suspend,
345 .resume = irq_gc_resume,
346 .shutdown = irq_gc_shutdown,
347};
348
349static int __init irq_gc_init_ops(void)
350{
351 register_syscore_ops(&irq_gc_syscore_ops);
352 return 0;
353}
354device_initcall(irq_gc_init_ops);
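To make the API above concrete, here is a hedged sketch of how a platform interrupt-controller driver might instantiate a generic chip, assuming a memory-mapped controller with separate enable/disable/ack registers; the register offsets, chip name and irq base are illustrative, not taken from a real driver:

#include <linux/init.h>
#include <linux/irq.h>
#include <linux/io.h>

#define MYCHIP_ENABLE   0x00            /* hypothetical register layout */
#define MYCHIP_DISABLE  0x04
#define MYCHIP_ACK      0x08

static void __init mychip_init(void __iomem *base, unsigned int irq_base)
{
        struct irq_chip_generic *gc;
        struct irq_chip_type *ct;

        gc = irq_alloc_generic_chip("MYCHIP", 1, irq_base, base,
                                    handle_level_irq);
        if (!gc)
                return;

        ct = gc->chip_types;
        ct->chip.irq_mask = irq_gc_mask_disable_reg;
        ct->chip.irq_unmask = irq_gc_unmask_enable_reg;
        ct->chip.irq_ack = irq_gc_ack;
        ct->regs.enable = MYCHIP_ENABLE;
        ct->regs.disable = MYCHIP_DISABLE;
        ct->regs.ack = MYCHIP_ACK;

        /* Wire up 32 interrupts starting at irq_base and make them
         * requestable. */
        irq_setup_generic_chip(gc, 0xffffffff, 0, IRQ_NOREQUEST, 0);
}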
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2c039c9b9383..886e80347b32 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -22,7 +22,7 @@
22 */ 22 */
23static struct lock_class_key irq_desc_lock_class; 23static struct lock_class_key irq_desc_lock_class;
24 24
25#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) 25#if defined(CONFIG_SMP)
26static void __init init_irq_default_affinity(void) 26static void __init init_irq_default_affinity(void)
27{ 27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); 28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
@@ -290,6 +290,22 @@ static int irq_expand_nr_irqs(unsigned int nr)
290 290
291#endif /* !CONFIG_SPARSE_IRQ */ 291#endif /* !CONFIG_SPARSE_IRQ */
292 292
293/**
294 * generic_handle_irq - Invoke the handler for a particular irq
295 * @irq: The irq number to handle
296 *
297 */
298int generic_handle_irq(unsigned int irq)
299{
300 struct irq_desc *desc = irq_to_desc(irq);
301
302 if (!desc)
303 return -EINVAL;
304 generic_handle_irq_desc(irq, desc);
305 return 0;
306}
307EXPORT_SYMBOL_GPL(generic_handle_irq);
308
293/* Dynamic interrupt handling */ 309/* Dynamic interrupt handling */
294 310
295/** 311/**
@@ -311,6 +327,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
311 bitmap_clear(allocated_irqs, from, cnt); 327 bitmap_clear(allocated_irqs, from, cnt);
312 mutex_unlock(&sparse_irq_lock); 328 mutex_unlock(&sparse_irq_lock);
313} 329}
330EXPORT_SYMBOL_GPL(irq_free_descs);
314 331
315/** 332/**
316 * irq_alloc_descs - allocate and initialize a range of irq descriptors 333 * irq_alloc_descs - allocate and initialize a range of irq descriptors
@@ -351,6 +368,7 @@ err:
351 mutex_unlock(&sparse_irq_lock); 368 mutex_unlock(&sparse_irq_lock);
352 return ret; 369 return ret;
353} 370}
371EXPORT_SYMBOL_GPL(irq_alloc_descs);
354 372
355/** 373/**
356 * irq_reserve_irqs - mark irqs allocated 374 * irq_reserve_irqs - mark irqs allocated
@@ -430,7 +448,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
430 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 448 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
431} 449}
432 450
433#ifdef CONFIG_GENERIC_HARDIRQS
434unsigned int kstat_irqs(unsigned int irq) 451unsigned int kstat_irqs(unsigned int irq)
435{ 452{
436 struct irq_desc *desc = irq_to_desc(irq); 453 struct irq_desc *desc = irq_to_desc(irq);
@@ -443,4 +460,3 @@ unsigned int kstat_irqs(unsigned int irq)
443 sum += *per_cpu_ptr(desc->kstat_irqs, cpu); 460 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
444 return sum; 461 return sum;
445} 462}
446#endif /* CONFIG_GENERIC_HARDIRQS */
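generic_handle_irq() is now an out-of-line, exported helper, which lets demux/chained handlers in modular code use it. A hedged sketch of a secondary-controller demultiplexer; the status register, base pointer and irq base below are hypothetical:

#include <linux/irq.h>
#include <linux/io.h>
#include <linux/bitops.h>

#define MYCHIP_STATUS   0x0c                    /* hypothetical */

static void __iomem *mychip_base;               /* hypothetical mapping */
static unsigned int mychip_irq_base;            /* hypothetical irq range */

/* Flow handler signature of this era: (irq, desc). Registered with
 * irq_set_chained_handler(parent_irq, mychip_demux). */
static void mychip_demux(unsigned int irq, struct irq_desc *desc)
{
        u32 pending = readl(mychip_base + MYCHIP_STATUS);

        while (pending) {
                unsigned int bit = __ffs(pending);

                generic_handle_irq(mychip_irq_base + bit);
                pending &= ~(1U << bit);
        }
}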
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 07c1611f3899..f7ce0021e1c4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -900,7 +900,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
900 */ 900 */
901 new->handler = irq_nested_primary_handler; 901 new->handler = irq_nested_primary_handler;
902 } else { 902 } else {
903 irq_setup_forced_threading(new); 903 if (irq_settings_can_thread(desc))
904 irq_setup_forced_threading(new);
904 } 905 }
905 906
906 /* 907 /*
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index dd201bd35103..834899f2500f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -419,7 +419,7 @@ int show_interrupts(struct seq_file *p, void *v)
419 } else { 419 } else {
420 seq_printf(p, " %8s", "None"); 420 seq_printf(p, " %8s", "None");
421 } 421 }
422#ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL 422#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
423 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); 423 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
424#endif 424#endif
425 if (desc->name) 425 if (desc->name)
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 0d91730b6330..f1667833d444 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -8,6 +8,7 @@ enum {
8 _IRQ_LEVEL = IRQ_LEVEL, 8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE, 9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST, 10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOTHREAD = IRQ_NOTHREAD,
11 _IRQ_NOAUTOEN = IRQ_NOAUTOEN, 12 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
12 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, 13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
13 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
@@ -20,6 +21,7 @@ enum {
20#define IRQ_LEVEL GOT_YOU_MORON 21#define IRQ_LEVEL GOT_YOU_MORON
21#define IRQ_NOPROBE GOT_YOU_MORON 22#define IRQ_NOPROBE GOT_YOU_MORON
22#define IRQ_NOREQUEST GOT_YOU_MORON 23#define IRQ_NOREQUEST GOT_YOU_MORON
24#define IRQ_NOTHREAD GOT_YOU_MORON
23#define IRQ_NOAUTOEN GOT_YOU_MORON 25#define IRQ_NOAUTOEN GOT_YOU_MORON
24#define IRQ_NESTED_THREAD GOT_YOU_MORON 26#define IRQ_NESTED_THREAD GOT_YOU_MORON
25#undef IRQF_MODIFY_MASK 27#undef IRQF_MODIFY_MASK
@@ -94,6 +96,21 @@ static inline void irq_settings_set_norequest(struct irq_desc *desc)
94 desc->status_use_accessors |= _IRQ_NOREQUEST; 96 desc->status_use_accessors |= _IRQ_NOREQUEST;
95} 97}
96 98
99static inline bool irq_settings_can_thread(struct irq_desc *desc)
100{
101 return !(desc->status_use_accessors & _IRQ_NOTHREAD);
102}
103
104static inline void irq_settings_clr_nothread(struct irq_desc *desc)
105{
106 desc->status_use_accessors &= ~_IRQ_NOTHREAD;
107}
108
109static inline void irq_settings_set_nothread(struct irq_desc *desc)
110{
111 desc->status_use_accessors |= _IRQ_NOTHREAD;
112}
113
97static inline bool irq_settings_can_probe(struct irq_desc *desc) 114static inline bool irq_settings_can_probe(struct irq_desc *desc)
98{ 115{
99 return !(desc->status_use_accessors & _IRQ_NOPROBE); 116 return !(desc->status_use_accessors & _IRQ_NOPROBE);
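The new _IRQ_NOTHREAD setting pairs with the manage.c hunk above: irq_settings_can_thread() now gates irq_setup_forced_threading(), and __irq_set_handler() marks chained interrupts non-threadable. A hedged sketch of a driver or arch opting one interrupt out of forced threading via irq_modify_status(), which this merge also exports; the function name and callsite are illustrative:

#include <linux/irq.h>
#include <linux/init.h>

/* Hypothetical board fixup: keep this interrupt as a hard irq even when
 * booting with forced irq threading (threadirqs). */
static void __init my_board_mark_nothread(unsigned int irq)
{
        irq_modify_status(irq, 0, IRQ_NOTHREAD);
}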
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 3b79bd938330..74d1c099fbd1 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -2,43 +2,23 @@
2 * jump label support 2 * jump label support
3 * 3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> 4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
5 * 6 *
6 */ 7 */
7#include <linux/jump_label.h>
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
14#include <linux/sort.h> 13#include <linux/sort.h>
15#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/jump_label.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
23/* mutex to protect coming/going of the the jump_label table */ 19/* mutex to protect coming/going of the the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex); 20static DEFINE_MUTEX(jump_label_mutex);
25 21
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42void jump_label_lock(void) 22void jump_label_lock(void)
43{ 23{
44 mutex_lock(&jump_label_mutex); 24 mutex_lock(&jump_label_mutex);
@@ -49,6 +29,11 @@ void jump_label_unlock(void)
49 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
50} 30}
51 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
52static int jump_label_cmp(const void *a, const void *b) 37static int jump_label_cmp(const void *a, const void *b)
53{ 38{
54 const struct jump_entry *jea = a; 39 const struct jump_entry *jea = a;
@@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b)
64} 49}
65 50
66static void 51static void
67sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) 52jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
68{ 53{
69 unsigned long size; 54 unsigned long size;
70 55
@@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
73 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
74} 59}
75 60
76static struct jump_label_entry *get_jump_label_entry(jump_label_t key) 61static void jump_label_update(struct jump_label_key *key, int enable);
77{
78 struct hlist_head *head;
79 struct hlist_node *node;
80 struct jump_label_entry *e;
81 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
82
83 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
84 hlist_for_each_entry(e, node, head, hlist) {
85 if (key == e->key)
86 return e;
87 }
88 return NULL;
89}
90 62
91static struct jump_label_entry * 63void jump_label_inc(struct jump_label_key *key)
92add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
93{ 64{
94 struct hlist_head *head; 65 if (atomic_inc_not_zero(&key->enabled))
95 struct jump_label_entry *e; 66 return;
96 u32 hash;
97
98 e = get_jump_label_entry(key);
99 if (e)
100 return ERR_PTR(-EEXIST);
101
102 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
103 if (!e)
104 return ERR_PTR(-ENOMEM);
105
106 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
107 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
108 e->key = key;
109 e->table = table;
110 e->nr_entries = nr_entries;
111 INIT_HLIST_HEAD(&(e->modules));
112 hlist_add_head(&e->hlist, head);
113 return e;
114}
115 67
116static int 68 jump_label_lock();
117build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) 69 if (atomic_add_return(1, &key->enabled) == 1)
118{ 70 jump_label_update(key, JUMP_LABEL_ENABLE);
119 struct jump_entry *iter, *iter_begin; 71 jump_label_unlock();
120 struct jump_label_entry *entry;
121 int count;
122
123 sort_jump_label_entries(start, stop);
124 iter = start;
125 while (iter < stop) {
126 entry = get_jump_label_entry(iter->key);
127 if (!entry) {
128 iter_begin = iter;
129 count = 0;
130 while ((iter < stop) &&
131 (iter->key == iter_begin->key)) {
132 iter++;
133 count++;
134 }
135 entry = add_jump_label_entry(iter_begin->key,
136 count, iter_begin);
137 if (IS_ERR(entry))
138 return PTR_ERR(entry);
139 } else {
140 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
141 return -1;
142 }
143 }
144 return 0;
145} 72}
146 73
147/*** 74void jump_label_dec(struct jump_label_key *key)
148 * jump_label_update - update jump label text
149 * @key - key value associated with a a jump label
150 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
151 *
152 * Will enable/disable the jump for jump label @key, depending on the
153 * value of @type.
154 *
155 */
156
157void jump_label_update(unsigned long key, enum jump_label_type type)
158{ 75{
159 struct jump_entry *iter; 76 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
160 struct jump_label_entry *entry; 77 return;
161 struct hlist_node *module_node;
162 struct jump_label_module_entry *e_module;
163 int count;
164 78
165 jump_label_lock(); 79 jump_label_update(key, JUMP_LABEL_DISABLE);
166 entry = get_jump_label_entry((jump_label_t)key);
167 if (entry) {
168 count = entry->nr_entries;
169 iter = entry->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 /* eanble/disable jump labels in modules */
176 hlist_for_each_entry(e_module, module_node, &(entry->modules),
177 hlist) {
178 count = e_module->nr_entries;
179 iter = e_module->table;
180 while (count--) {
181 if (iter->key &&
182 kernel_text_address(iter->code))
183 arch_jump_label_transform(iter, type);
184 iter++;
185 }
186 }
187 }
188 jump_label_unlock(); 80 jump_label_unlock();
189} 81}
190 82
@@ -197,77 +89,33 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end)
197 return 0; 89 return 0;
198} 90}
199 91
200#ifdef CONFIG_MODULES 92static int __jump_label_text_reserved(struct jump_entry *iter_start,
201 93 struct jump_entry *iter_stop, void *start, void *end)
202static int module_conflict(void *start, void *end)
203{ 94{
204 struct hlist_head *head;
205 struct hlist_node *node, *node_next, *module_node, *module_node_next;
206 struct jump_label_entry *e;
207 struct jump_label_module_entry *e_module;
208 struct jump_entry *iter; 95 struct jump_entry *iter;
209 int i, count;
210 int conflict = 0;
211
212 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
213 head = &jump_label_table[i];
214 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
215 hlist_for_each_entry_safe(e_module, module_node,
216 module_node_next,
217 &(e->modules), hlist) {
218 count = e_module->nr_entries;
219 iter = e_module->table;
220 while (count--) {
221 if (addr_conflict(iter, start, end)) {
222 conflict = 1;
223 goto out;
224 }
225 iter++;
226 }
227 }
228 }
229 }
230out:
231 return conflict;
232}
233
234#endif
235
236/***
237 * jump_label_text_reserved - check if addr range is reserved
238 * @start: start text addr
239 * @end: end text addr
240 *
241 * checks if the text addr located between @start and @end
242 * overlaps with any of the jump label patch addresses. Code
243 * that wants to modify kernel text should first verify that
244 * it does not overlap with any of the jump label addresses.
245 * Caller must hold jump_label_mutex.
246 *
247 * returns 1 if there is an overlap, 0 otherwise
248 */
249int jump_label_text_reserved(void *start, void *end)
250{
251 struct jump_entry *iter;
252 struct jump_entry *iter_start = __start___jump_table;
253 struct jump_entry *iter_stop = __start___jump_table;
254 int conflict = 0;
255 96
256 iter = iter_start; 97 iter = iter_start;
257 while (iter < iter_stop) { 98 while (iter < iter_stop) {
258 if (addr_conflict(iter, start, end)) { 99 if (addr_conflict(iter, start, end))
259 conflict = 1; 100 return 1;
260 goto out;
261 }
262 iter++; 101 iter++;
263 } 102 }
264 103
265 /* now check modules */ 104 return 0;
266#ifdef CONFIG_MODULES 105}
267 conflict = module_conflict(start, end); 106
268#endif 107static void __jump_label_update(struct jump_label_key *key,
269out: 108 struct jump_entry *entry, int enable)
270 return conflict; 109{
110 for (; entry->key == (jump_label_t)(unsigned long)key; entry++) {
111 /*
112 * entry->code set to 0 invalidates module init text sections
113 * kernel_text_address() verifies we are not in core kernel
114 * init code, see jump_label_invalidate_module_init().
115 */
116 if (entry->code && kernel_text_address(entry->code))
117 arch_jump_label_transform(entry, enable);
118 }
271} 119}
272 120
273/* 121/*
@@ -277,142 +125,173 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr)
277{ 125{
278} 126}
279 127
280static __init int init_jump_label(void) 128static __init int jump_label_init(void)
281{ 129{
282 int ret;
283 struct jump_entry *iter_start = __start___jump_table; 130 struct jump_entry *iter_start = __start___jump_table;
284 struct jump_entry *iter_stop = __stop___jump_table; 131 struct jump_entry *iter_stop = __stop___jump_table;
132 struct jump_label_key *key = NULL;
285 struct jump_entry *iter; 133 struct jump_entry *iter;
286 134
287 jump_label_lock(); 135 jump_label_lock();
288 ret = build_jump_label_hashtable(__start___jump_table, 136 jump_label_sort_entries(iter_start, iter_stop);
289 __stop___jump_table); 137
290 iter = iter_start; 138 for (iter = iter_start; iter < iter_stop; iter++) {
291 while (iter < iter_stop) {
292 arch_jump_label_text_poke_early(iter->code); 139 arch_jump_label_text_poke_early(iter->code);
293 iter++; 140 if (iter->key == (jump_label_t)(unsigned long)key)
141 continue;
142
143 key = (struct jump_label_key *)(unsigned long)iter->key;
144 atomic_set(&key->enabled, 0);
145 key->entries = iter;
146#ifdef CONFIG_MODULES
147 key->next = NULL;
148#endif
294 } 149 }
295 jump_label_unlock(); 150 jump_label_unlock();
296 return ret; 151
152 return 0;
297} 153}
298early_initcall(init_jump_label); 154early_initcall(jump_label_init);
299 155
300#ifdef CONFIG_MODULES 156#ifdef CONFIG_MODULES
301 157
302static struct jump_label_module_entry * 158struct jump_label_mod {
303add_jump_label_module_entry(struct jump_label_entry *entry, 159 struct jump_label_mod *next;
304 struct jump_entry *iter_begin, 160 struct jump_entry *entries;
305 int count, struct module *mod) 161 struct module *mod;
162};
163
164static int __jump_label_mod_text_reserved(void *start, void *end)
165{
166 struct module *mod;
167
168 mod = __module_text_address((unsigned long)start);
169 if (!mod)
170 return 0;
171
172 WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
173
174 return __jump_label_text_reserved(mod->jump_entries,
175 mod->jump_entries + mod->num_jump_entries,
176 start, end);
177}
178
179static void __jump_label_mod_update(struct jump_label_key *key, int enable)
180{
181 struct jump_label_mod *mod = key->next;
182
183 while (mod) {
184 __jump_label_update(key, mod->entries, enable);
185 mod = mod->next;
186 }
187}
188
189/***
190 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
191 * @mod: module to patch
192 *
193 * Allow for run-time selection of the optimal nops. Before the module
194 * loads patch these with arch_get_jump_label_nop(), which is specified by
195 * the arch specific jump label code.
196 */
197void jump_label_apply_nops(struct module *mod)
306{ 198{
307 struct jump_label_module_entry *e; 199 struct jump_entry *iter_start = mod->jump_entries;
308 200 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
309 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); 201 struct jump_entry *iter;
310 if (!e) 202
311 return ERR_PTR(-ENOMEM); 203 /* if the module doesn't have jump label entries, just return */
312 e->mod = mod; 204 if (iter_start == iter_stop)
313 e->nr_entries = count; 205 return;
314 e->table = iter_begin; 206
315 hlist_add_head(&e->hlist, &entry->modules); 207 for (iter = iter_start; iter < iter_stop; iter++)
316 return e; 208 arch_jump_label_text_poke_early(iter->code);
317} 209}
318 210
319static int add_jump_label_module(struct module *mod) 211static int jump_label_add_module(struct module *mod)
320{ 212{
321 struct jump_entry *iter, *iter_begin; 213 struct jump_entry *iter_start = mod->jump_entries;
322 struct jump_label_entry *entry; 214 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
323 struct jump_label_module_entry *module_entry; 215 struct jump_entry *iter;
324 int count; 216 struct jump_label_key *key = NULL;
217 struct jump_label_mod *jlm;
325 218
326 /* if the module doesn't have jump label entries, just return */ 219 /* if the module doesn't have jump label entries, just return */
327 if (!mod->num_jump_entries) 220 if (iter_start == iter_stop)
328 return 0; 221 return 0;
329 222
330 sort_jump_label_entries(mod->jump_entries, 223 jump_label_sort_entries(iter_start, iter_stop);
331 mod->jump_entries + mod->num_jump_entries); 224
332 iter = mod->jump_entries; 225 for (iter = iter_start; iter < iter_stop; iter++) {
333 while (iter < mod->jump_entries + mod->num_jump_entries) { 226 if (iter->key == (jump_label_t)(unsigned long)key)
334 entry = get_jump_label_entry(iter->key); 227 continue;
335 iter_begin = iter; 228
336 count = 0; 229 key = (struct jump_label_key *)(unsigned long)iter->key;
337 while ((iter < mod->jump_entries + mod->num_jump_entries) && 230
338 (iter->key == iter_begin->key)) { 231 if (__module_address(iter->key) == mod) {
339 iter++; 232 atomic_set(&key->enabled, 0);
340 count++; 233 key->entries = iter;
341 } 234 key->next = NULL;
342 if (!entry) { 235 continue;
343 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
344 if (IS_ERR(entry))
345 return PTR_ERR(entry);
346 } 236 }
347 module_entry = add_jump_label_module_entry(entry, iter_begin, 237
348 count, mod); 238 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
349 if (IS_ERR(module_entry)) 239 if (!jlm)
350 return PTR_ERR(module_entry); 240 return -ENOMEM;
241
242 jlm->mod = mod;
243 jlm->entries = iter;
244 jlm->next = key->next;
245 key->next = jlm;
246
247 if (jump_label_enabled(key))
248 __jump_label_update(key, iter, JUMP_LABEL_ENABLE);
351 } 249 }
250
352 return 0; 251 return 0;
353} 252}
354 253
355static void remove_jump_label_module(struct module *mod) 254static void jump_label_del_module(struct module *mod)
356{ 255{
357 struct hlist_head *head; 256 struct jump_entry *iter_start = mod->jump_entries;
358 struct hlist_node *node, *node_next, *module_node, *module_node_next; 257 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
359 struct jump_label_entry *e; 258 struct jump_entry *iter;
360 struct jump_label_module_entry *e_module; 259 struct jump_label_key *key = NULL;
361 int i; 260 struct jump_label_mod *jlm, **prev;
362 261
363 /* if the module doesn't have jump label entries, just return */ 262 for (iter = iter_start; iter < iter_stop; iter++) {
364 if (!mod->num_jump_entries) 263 if (iter->key == (jump_label_t)(unsigned long)key)
365 return; 264 continue;
265
266 key = (struct jump_label_key *)(unsigned long)iter->key;
267
268 if (__module_address(iter->key) == mod)
269 continue;
270
271 prev = &key->next;
272 jlm = key->next;
366 273
367 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 274 while (jlm && jlm->mod != mod) {
368 head = &jump_label_table[i]; 275 prev = &jlm->next;
369 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 276 jlm = jlm->next;
370 hlist_for_each_entry_safe(e_module, module_node, 277 }
371 module_node_next, 278
372 &(e->modules), hlist) { 279 if (jlm) {
373 if (e_module->mod == mod) { 280 *prev = jlm->next;
374 hlist_del(&e_module->hlist); 281 kfree(jlm);
375 kfree(e_module);
376 }
377 }
378 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
379 hlist_del(&e->hlist);
380 kfree(e);
381 }
382 } 282 }
383 } 283 }
384} 284}
385 285
386static void remove_jump_label_module_init(struct module *mod) 286static void jump_label_invalidate_module_init(struct module *mod)
387{ 287{
388 struct hlist_head *head; 288 struct jump_entry *iter_start = mod->jump_entries;
389 struct hlist_node *node, *node_next, *module_node, *module_node_next; 289 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
390 struct jump_label_entry *e;
391 struct jump_label_module_entry *e_module;
392 struct jump_entry *iter; 290 struct jump_entry *iter;
393 int i, count;
394
395 /* if the module doesn't have jump label entries, just return */
396 if (!mod->num_jump_entries)
397 return;
398 291
399 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 292 for (iter = iter_start; iter < iter_stop; iter++) {
400 head = &jump_label_table[i]; 293 if (within_module_init(iter->code, mod))
401 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 294 iter->code = 0;
402 hlist_for_each_entry_safe(e_module, module_node,
403 module_node_next,
404 &(e->modules), hlist) {
405 if (e_module->mod != mod)
406 continue;
407 count = e_module->nr_entries;
408 iter = e_module->table;
409 while (count--) {
410 if (within_module_init(iter->code, mod))
411 iter->key = 0;
412 iter++;
413 }
414 }
415 }
416 } 295 }
417} 296}
418 297
@@ -426,59 +305,77 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
426 switch (val) { 305 switch (val) {
427 case MODULE_STATE_COMING: 306 case MODULE_STATE_COMING:
428 jump_label_lock(); 307 jump_label_lock();
429 ret = add_jump_label_module(mod); 308 ret = jump_label_add_module(mod);
430 if (ret) 309 if (ret)
431 remove_jump_label_module(mod); 310 jump_label_del_module(mod);
432 jump_label_unlock(); 311 jump_label_unlock();
433 break; 312 break;
434 case MODULE_STATE_GOING: 313 case MODULE_STATE_GOING:
435 jump_label_lock(); 314 jump_label_lock();
436 remove_jump_label_module(mod); 315 jump_label_del_module(mod);
437 jump_label_unlock(); 316 jump_label_unlock();
438 break; 317 break;
439 case MODULE_STATE_LIVE: 318 case MODULE_STATE_LIVE:
440 jump_label_lock(); 319 jump_label_lock();
441 remove_jump_label_module_init(mod); 320 jump_label_invalidate_module_init(mod);
442 jump_label_unlock(); 321 jump_label_unlock();
443 break; 322 break;
444 } 323 }
445 return ret;
446}
447 324
448/*** 325 return notifier_from_errno(ret);
449 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
450 * @mod: module to patch
451 *
452 * Allow for run-time selection of the optimal nops. Before the module
453 * loads patch these with arch_get_jump_label_nop(), which is specified by
454 * the arch specific jump label code.
455 */
456void jump_label_apply_nops(struct module *mod)
457{
458 struct jump_entry *iter;
459
460 /* if the module doesn't have jump label entries, just return */
461 if (!mod->num_jump_entries)
462 return;
463
464 iter = mod->jump_entries;
465 while (iter < mod->jump_entries + mod->num_jump_entries) {
466 arch_jump_label_text_poke_early(iter->code);
467 iter++;
468 }
469} 326}
470 327
471struct notifier_block jump_label_module_nb = { 328struct notifier_block jump_label_module_nb = {
472 .notifier_call = jump_label_module_notify, 329 .notifier_call = jump_label_module_notify,
473 .priority = 0, 330 .priority = 1, /* higher than tracepoints */
474}; 331};
475 332
476static __init int init_jump_label_module(void) 333static __init int jump_label_init_module(void)
477{ 334{
478 return register_module_notifier(&jump_label_module_nb); 335 return register_module_notifier(&jump_label_module_nb);
479} 336}
480early_initcall(init_jump_label_module); 337early_initcall(jump_label_init_module);
481 338
482#endif /* CONFIG_MODULES */ 339#endif /* CONFIG_MODULES */
483 340
341/***
342 * jump_label_text_reserved - check if addr range is reserved
343 * @start: start text addr
344 * @end: end text addr
345 *
346 * checks if the text addr located between @start and @end
347 * overlaps with any of the jump label patch addresses. Code
348 * that wants to modify kernel text should first verify that
349 * it does not overlap with any of the jump label addresses.
350 * Caller must hold jump_label_mutex.
351 *
352 * returns 1 if there is an overlap, 0 otherwise
353 */
354int jump_label_text_reserved(void *start, void *end)
355{
356 int ret = __jump_label_text_reserved(__start___jump_table,
357 __stop___jump_table, start, end);
358
359 if (ret)
360 return ret;
361
362#ifdef CONFIG_MODULES
363 ret = __jump_label_mod_text_reserved(start, end);
364#endif
365 return ret;
366}
367
368static void jump_label_update(struct jump_label_key *key, int enable)
369{
370 struct jump_entry *entry = key->entries;
371
372 /* if there are no users, entry can be NULL */
373 if (entry)
374 __jump_label_update(key, entry, enable);
375
376#ifdef CONFIG_MODULES
377 __jump_label_mod_update(key, enable);
378#endif
379}
380
484#endif 381#endif
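
The rewritten jump_label_del_module() above replaces the old hash-table bookkeeping with a per-key singly linked list of struct jump_label_mod entries, unlinked with a pointer-to-pointer walk. A minimal userspace sketch of that unlink pattern follows; the struct and function names here are invented for illustration, only the prev/jlm technique mirrors the diff.

#include <stdio.h>
#include <stdlib.h>

struct jump_label_mod_sketch {
        struct jump_label_mod_sketch *next;
        int mod;                /* stands in for the struct module pointer */
};

static void del_module_entries(struct jump_label_mod_sketch **head, int mod)
{
        struct jump_label_mod_sketch **prev = head;
        struct jump_label_mod_sketch *jlm = *head;

        /* Walk to the matching entry while keeping a pointer to the link
         * that points at it, exactly like prev/jlm in the diff. */
        while (jlm && jlm->mod != mod) {
                prev = &jlm->next;
                jlm = jlm->next;
        }
        if (jlm) {
                *prev = jlm->next;      /* splice it out ... */
                free(jlm);              /* ... and drop it, as kfree(jlm) does */
        }
}

int main(void)
{
        struct jump_label_mod_sketch *head = NULL, *n;
        int i;

        for (i = 1; i <= 3; i++) {      /* build the list 3 -> 2 -> 1 */
                n = malloc(sizeof(*n));
                if (!n)
                        return 1;
                n->mod = i;
                n->next = head;
                head = n;
        }

        del_module_entries(&head, 2);   /* unlink the middle entry */

        while (head) {                  /* prints "mod 3" then "mod 1" */
                n = head->next;
                printf("mod %d\n", head->mod);
                free(head);
                head = n;
        }
        return 0;
}
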
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87b77de03dd3..8d814cbc8109 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1531,13 +1531,7 @@ int kernel_kexec(void)
1531 if (error) 1531 if (error)
1532 goto Enable_cpus; 1532 goto Enable_cpus;
1533 local_irq_disable(); 1533 local_irq_disable();
1534 /* Suspend system devices */ 1534 error = syscore_suspend();
1535 error = sysdev_suspend(PMSG_FREEZE);
1536 if (!error) {
1537 error = syscore_suspend();
1538 if (error)
1539 sysdev_resume();
1540 }
1541 if (error) 1535 if (error)
1542 goto Enable_irqs; 1536 goto Enable_irqs;
1543 } else 1537 } else
@@ -1553,7 +1547,6 @@ int kernel_kexec(void)
1553#ifdef CONFIG_KEXEC_JUMP 1547#ifdef CONFIG_KEXEC_JUMP
1554 if (kexec_image->preserve_context) { 1548 if (kexec_image->preserve_context) {
1555 syscore_resume(); 1549 syscore_resume();
1556 sysdev_resume();
1557 Enable_irqs: 1550 Enable_irqs:
1558 local_irq_enable(); 1551 local_irq_enable();
1559 Enable_cpus: 1552 Enable_cpus:
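
With sysdev_suspend()/sysdev_resume() gone, the kexec-jump path relies solely on the syscore callbacks. A hedged sketch of how low-level code hooks into that stage, assuming the 2.6.39-era syscore_ops API; the example_* names are hypothetical.

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int example_syscore_suspend(void)
{
        /* Called once, late, with interrupts disabled. */
        return 0;               /* returning non-zero aborts the transition */
}

static void example_syscore_resume(void)
{
        /* Mirror of whatever the suspend callback did. */
}

static struct syscore_ops example_syscore_ops = {
        .suspend = example_syscore_suspend,
        .resume  = example_syscore_resume,
};

static int __init example_syscore_init(void)
{
        register_syscore_ops(&example_syscore_ops);
        return 0;
}
device_initcall(example_syscore_init);
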
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9cd0591c96a2..5ae0ff38425f 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -245,7 +245,6 @@ static void __call_usermodehelper(struct work_struct *work)
245 } 245 }
246} 246}
247 247
248#ifdef CONFIG_PM_SLEEP
249/* 248/*
250 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 249 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
251 * (used for preventing user land processes from being created after the user 250 * (used for preventing user land processes from being created after the user
@@ -301,6 +300,15 @@ void usermodehelper_enable(void)
301 usermodehelper_disabled = 0; 300 usermodehelper_disabled = 0;
302} 301}
303 302
303/**
304 * usermodehelper_is_disabled - check if new helpers are allowed to be started
305 */
306bool usermodehelper_is_disabled(void)
307{
308 return usermodehelper_disabled;
309}
310EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
311
304static void helper_lock(void) 312static void helper_lock(void)
305{ 313{
306 atomic_inc(&running_helpers); 314 atomic_inc(&running_helpers);
@@ -312,12 +320,6 @@ static void helper_unlock(void)
312 if (atomic_dec_and_test(&running_helpers)) 320 if (atomic_dec_and_test(&running_helpers))
313 wake_up(&running_helpers_waitq); 321 wake_up(&running_helpers_waitq);
314} 322}
315#else /* CONFIG_PM_SLEEP */
316#define usermodehelper_disabled 0
317
318static inline void helper_lock(void) {}
319static inline void helper_unlock(void) {}
320#endif /* CONFIG_PM_SLEEP */
321 323
322/** 324/**
323 * call_usermodehelper_setup - prepare to call a usermode helper 325 * call_usermodehelper_setup - prepare to call a usermode helper
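
usermodehelper_is_disabled() gives callers outside the PM core a way to honour the freeze window before spawning helpers. A hedged sketch of a caller: load_helper_fs_module() is hypothetical, while usermodehelper_is_disabled() and request_module() are the real interfaces.

#include <linux/errno.h>
#include <linux/kmod.h>

static int load_helper_fs_module(const char *fstype)
{
        /* While userspace is frozen, spawning modprobe would just block. */
        if (usermodehelper_is_disabled())
                return -EBUSY;

        return request_module("fs-%s", fstype);
}
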
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0b624e791805..3b053c04dd86 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -16,6 +16,7 @@
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/capability.h>
19 20
20#define KERNEL_ATTR_RO(_name) \ 21#define KERNEL_ATTR_RO(_name) \
21static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 22static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo);
131 132
132#endif /* CONFIG_KEXEC */ 133#endif /* CONFIG_KEXEC */
133 134
135/* whether file capabilities are enabled */
136static ssize_t fscaps_show(struct kobject *kobj,
137 struct kobj_attribute *attr, char *buf)
138{
139 return sprintf(buf, "%d\n", file_caps_enabled);
140}
141KERNEL_ATTR_RO(fscaps);
142
134/* 143/*
135 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 144 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
136 */ 145 */
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj;
158EXPORT_SYMBOL_GPL(kernel_kobj); 167EXPORT_SYMBOL_GPL(kernel_kobj);
159 168
160static struct attribute * kernel_attrs[] = { 169static struct attribute * kernel_attrs[] = {
170 &fscaps_attr.attr,
161#if defined(CONFIG_HOTPLUG) 171#if defined(CONFIG_HOTPLUG)
162 &uevent_seqnum_attr.attr, 172 &uevent_seqnum_attr.attr,
163 &uevent_helper_attr.attr, 173 &uevent_helper_attr.attr,
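
The fscaps attribute follows the usual KERNEL_ATTR_RO() pattern: a *_show() callback plus an entry in kernel_attrs[]. A sketch of adding another read-only attribute the same way; example_ro and its value are invented for illustration.

/* read-only attribute following the fscaps pattern above */
static ssize_t example_ro_show(struct kobject *kobj,
                               struct kobj_attribute *attr, char *buf)
{
        return sprintf(buf, "%d\n", 1);
}
KERNEL_ATTR_RO(example_ro);

/* plus one more line in kernel_attrs[]:
 *      &example_ro_attr.attr,
 */
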
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 53a68956f131..63437d065ac8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
490 usage[i] = '\0'; 490 usage[i] = '\0';
491} 491}
492 492
493static int __print_lock_name(struct lock_class *class)
494{
495 char str[KSYM_NAME_LEN];
496 const char *name;
497
498 name = class->name;
499 if (!name)
500 name = __get_key_name(class->key, str);
501
502 return printk("%s", name);
503}
504
493static void print_lock_name(struct lock_class *class) 505static void print_lock_name(struct lock_class *class)
494{ 506{
495 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; 507 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
@@ -1053,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth)
1053 return 0; 1065 return 0;
1054} 1066}
1055 1067
1068static void
1069print_circular_lock_scenario(struct held_lock *src,
1070 struct held_lock *tgt,
1071 struct lock_list *prt)
1072{
1073 struct lock_class *source = hlock_class(src);
1074 struct lock_class *target = hlock_class(tgt);
1075 struct lock_class *parent = prt->class;
1076
1077 /*
1078 * A direct locking problem where unsafe_class lock is taken
1079 * directly by safe_class lock, then all we need to show
1080 * is the deadlock scenario, as it is obvious that the
1081 * unsafe lock is taken under the safe lock.
1082 *
1083 * But if there is a chain instead, where the safe lock takes
1084 * an intermediate lock (middle_class) where this lock is
1085 * not the same as the safe lock, then the lock chain is
1086 * used to describe the problem. Otherwise we would need
1087 * to show a different CPU case for each link in the chain
1088 * from the safe_class lock to the unsafe_class lock.
1089 */
1090 if (parent != source) {
1091 printk("Chain exists of:\n ");
1092 __print_lock_name(source);
1093 printk(" --> ");
1094 __print_lock_name(parent);
1095 printk(" --> ");
1096 __print_lock_name(target);
1097 printk("\n\n");
1098 }
1099
1100 printk(" Possible unsafe locking scenario:\n\n");
1101 printk(" CPU0 CPU1\n");
1102 printk(" ---- ----\n");
1103 printk(" lock(");
1104 __print_lock_name(target);
1105 printk(");\n");
1106 printk(" lock(");
1107 __print_lock_name(parent);
1108 printk(");\n");
1109 printk(" lock(");
1110 __print_lock_name(target);
1111 printk(");\n");
1112 printk(" lock(");
1113 __print_lock_name(source);
1114 printk(");\n");
1115 printk("\n *** DEADLOCK ***\n\n");
1116}
1117
1056/* 1118/*
1057 * When a circular dependency is detected, print the 1119 * When a circular dependency is detected, print the
1058 * header first: 1120 * header first:
@@ -1096,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1096{ 1158{
1097 struct task_struct *curr = current; 1159 struct task_struct *curr = current;
1098 struct lock_list *parent; 1160 struct lock_list *parent;
1161 struct lock_list *first_parent;
1099 int depth; 1162 int depth;
1100 1163
1101 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1164 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
@@ -1109,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1109 print_circular_bug_header(target, depth, check_src, check_tgt); 1172 print_circular_bug_header(target, depth, check_src, check_tgt);
1110 1173
1111 parent = get_lock_parent(target); 1174 parent = get_lock_parent(target);
1175 first_parent = parent;
1112 1176
1113 while (parent) { 1177 while (parent) {
1114 print_circular_bug_entry(parent, --depth); 1178 print_circular_bug_entry(parent, --depth);
@@ -1116,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this,
1116 } 1180 }
1117 1181
1118 printk("\nother info that might help us debug this:\n\n"); 1182 printk("\nother info that might help us debug this:\n\n");
1183 print_circular_lock_scenario(check_src, check_tgt,
1184 first_parent);
1185
1119 lockdep_print_held_locks(curr); 1186 lockdep_print_held_locks(curr);
1120 1187
1121 printk("\nstack backtrace:\n"); 1188 printk("\nstack backtrace:\n");
@@ -1314,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1314 printk("\n"); 1381 printk("\n");
1315 1382
1316 if (depth == 0 && (entry != root)) { 1383 if (depth == 0 && (entry != root)) {
1317 printk("lockdep:%s bad BFS generated tree\n", __func__); 1384 printk("lockdep:%s bad path found in chain graph\n", __func__);
1318 break; 1385 break;
1319 } 1386 }
1320 1387
@@ -1325,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1325 return; 1392 return;
1326} 1393}
1327 1394
1395static void
1396print_irq_lock_scenario(struct lock_list *safe_entry,
1397 struct lock_list *unsafe_entry,
1398 struct lock_class *prev_class,
1399 struct lock_class *next_class)
1400{
1401 struct lock_class *safe_class = safe_entry->class;
1402 struct lock_class *unsafe_class = unsafe_entry->class;
1403 struct lock_class *middle_class = prev_class;
1404
1405 if (middle_class == safe_class)
1406 middle_class = next_class;
1407
1408 /*
1409 * A direct locking problem where unsafe_class lock is taken
1410 * directly by safe_class lock, then all we need to show
1411 * is the deadlock scenario, as it is obvious that the
1412 * unsafe lock is taken under the safe lock.
1413 *
1414 * But if there is a chain instead, where the safe lock takes
1415 * an intermediate lock (middle_class) where this lock is
1416 * not the same as the safe lock, then the lock chain is
1417 * used to describe the problem. Otherwise we would need
1418 * to show a different CPU case for each link in the chain
1419 * from the safe_class lock to the unsafe_class lock.
1420 */
1421 if (middle_class != unsafe_class) {
1422 printk("Chain exists of:\n ");
1423 __print_lock_name(safe_class);
1424 printk(" --> ");
1425 __print_lock_name(middle_class);
1426 printk(" --> ");
1427 __print_lock_name(unsafe_class);
1428 printk("\n\n");
1429 }
1430
1431 printk(" Possible interrupt unsafe locking scenario:\n\n");
1432 printk(" CPU0 CPU1\n");
1433 printk(" ---- ----\n");
1434 printk(" lock(");
1435 __print_lock_name(unsafe_class);
1436 printk(");\n");
1437 printk(" local_irq_disable();\n");
1438 printk(" lock(");
1439 __print_lock_name(safe_class);
1440 printk(");\n");
1441 printk(" lock(");
1442 __print_lock_name(middle_class);
1443 printk(");\n");
1444 printk(" <Interrupt>\n");
1445 printk(" lock(");
1446 __print_lock_name(safe_class);
1447 printk(");\n");
1448 printk("\n *** DEADLOCK ***\n\n");
1449}
1450
1328static int 1451static int
1329print_bad_irq_dependency(struct task_struct *curr, 1452print_bad_irq_dependency(struct task_struct *curr,
1330 struct lock_list *prev_root, 1453 struct lock_list *prev_root,
@@ -1376,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr,
1376 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); 1499 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1377 1500
1378 printk("\nother info that might help us debug this:\n\n"); 1501 printk("\nother info that might help us debug this:\n\n");
1502 print_irq_lock_scenario(backwards_entry, forwards_entry,
1503 hlock_class(prev), hlock_class(next));
1504
1379 lockdep_print_held_locks(curr); 1505 lockdep_print_held_locks(curr);
1380 1506
1381 printk("\nthe dependencies between %s-irq-safe lock", irqclass); 1507 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
@@ -1539,6 +1665,26 @@ static inline void inc_chains(void)
1539 1665
1540#endif 1666#endif
1541 1667
1668static void
1669print_deadlock_scenario(struct held_lock *nxt,
1670 struct held_lock *prv)
1671{
1672 struct lock_class *next = hlock_class(nxt);
1673 struct lock_class *prev = hlock_class(prv);
1674
1675 printk(" Possible unsafe locking scenario:\n\n");
1676 printk(" CPU0\n");
1677 printk(" ----\n");
1678 printk(" lock(");
1679 __print_lock_name(prev);
1680 printk(");\n");
1681 printk(" lock(");
1682 __print_lock_name(next);
1683 printk(");\n");
1684 printk("\n *** DEADLOCK ***\n\n");
1685 printk(" May be due to missing lock nesting notation\n\n");
1686}
1687
1542static int 1688static int
1543print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 1689print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1544 struct held_lock *next) 1690 struct held_lock *next)
@@ -1557,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1557 print_lock(prev); 1703 print_lock(prev);
1558 1704
1559 printk("\nother info that might help us debug this:\n"); 1705 printk("\nother info that might help us debug this:\n");
1706 print_deadlock_scenario(next, prev);
1560 lockdep_print_held_locks(curr); 1707 lockdep_print_held_locks(curr);
1561 1708
1562 printk("\nstack backtrace:\n"); 1709 printk("\nstack backtrace:\n");
@@ -1826,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1826 struct list_head *hash_head = chainhashentry(chain_key); 1973 struct list_head *hash_head = chainhashentry(chain_key);
1827 struct lock_chain *chain; 1974 struct lock_chain *chain;
1828 struct held_lock *hlock_curr, *hlock_next; 1975 struct held_lock *hlock_curr, *hlock_next;
1829 int i, j, n, cn; 1976 int i, j;
1830 1977
1831 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1832 return 0; 1979 return 0;
@@ -1886,15 +2033,9 @@ cache_hit:
1886 } 2033 }
1887 i++; 2034 i++;
1888 chain->depth = curr->lockdep_depth + 1 - i; 2035 chain->depth = curr->lockdep_depth + 1 - i;
1889 cn = nr_chain_hlocks; 2036 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1890 while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { 2037 chain->base = nr_chain_hlocks;
1891 n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); 2038 nr_chain_hlocks += chain->depth;
1892 if (n == cn)
1893 break;
1894 cn = n;
1895 }
1896 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1897 chain->base = cn;
1898 for (j = 0; j < chain->depth - 1; j++, i++) { 2039 for (j = 0; j < chain->depth - 1; j++, i++) {
1899 int lock_id = curr->held_locks[i].class_idx - 1; 2040 int lock_id = curr->held_locks[i].class_idx - 1;
1900 chain_hlocks[chain->base + j] = lock_id; 2041 chain_hlocks[chain->base + j] = lock_id;
@@ -2011,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr)
2011#endif 2152#endif
2012} 2153}
2013 2154
2155static void
2156print_usage_bug_scenario(struct held_lock *lock)
2157{
2158 struct lock_class *class = hlock_class(lock);
2159
2160 printk(" Possible unsafe locking scenario:\n\n");
2161 printk(" CPU0\n");
2162 printk(" ----\n");
2163 printk(" lock(");
2164 __print_lock_name(class);
2165 printk(");\n");
2166 printk(" <Interrupt>\n");
2167 printk(" lock(");
2168 __print_lock_name(class);
2169 printk(");\n");
2170 printk("\n *** DEADLOCK ***\n\n");
2171}
2172
2014static int 2173static int
2015print_usage_bug(struct task_struct *curr, struct held_lock *this, 2174print_usage_bug(struct task_struct *curr, struct held_lock *this,
2016 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 2175 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
@@ -2039,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2039 2198
2040 print_irqtrace_events(curr); 2199 print_irqtrace_events(curr);
2041 printk("\nother info that might help us debug this:\n"); 2200 printk("\nother info that might help us debug this:\n");
2201 print_usage_bug_scenario(this);
2202
2042 lockdep_print_held_locks(curr); 2203 lockdep_print_held_locks(curr);
2043 2204
2044 printk("\nstack backtrace:\n"); 2205 printk("\nstack backtrace:\n");
@@ -2073,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2073 struct held_lock *this, int forwards, 2234 struct held_lock *this, int forwards,
2074 const char *irqclass) 2235 const char *irqclass)
2075{ 2236{
2237 struct lock_list *entry = other;
2238 struct lock_list *middle = NULL;
2239 int depth;
2240
2076 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2077 return 0; 2242 return 0;
2078 2243
@@ -2091,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr,
2091 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2256 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
2092 2257
2093 printk("\nother info that might help us debug this:\n"); 2258 printk("\nother info that might help us debug this:\n");
2259
2260 /* Find a middle lock (if one exists) */
2261 depth = get_lock_depth(other);
2262 do {
2263 if (depth == 0 && (entry != root)) {
2264 printk("lockdep:%s bad path found in chain graph\n", __func__);
2265 break;
2266 }
2267 middle = entry;
2268 entry = get_lock_parent(entry);
2269 depth--;
2270 } while (entry && entry != root && (depth >= 0));
2271 if (forwards)
2272 print_irq_lock_scenario(root, other,
2273 middle ? middle->class : root->class, other->class);
2274 else
2275 print_irq_lock_scenario(other, root,
2276 middle ? middle->class : other->class, root->class);
2277
2094 lockdep_print_held_locks(curr); 2278 lockdep_print_held_locks(curr);
2095 2279
2096 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); 2280 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
diff --git a/kernel/module.c b/kernel/module.c
index d5938a5c19c4..22879725678d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -57,6 +57,7 @@
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h> 59#include <linux/pfn.h>
60#include <linux/bsearch.h>
60 61
61#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
62#include <trace/events/module.h> 63#include <trace/events/module.h>
@@ -240,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr,
240 struct module *owner, 241 struct module *owner,
241 bool (*fn)(const struct symsearch *syms, 242 bool (*fn)(const struct symsearch *syms,
242 struct module *owner, 243 struct module *owner,
243 unsigned int symnum, void *data), 244 void *data),
244 void *data) 245 void *data)
245{ 246{
246 unsigned int i, j; 247 unsigned int j;
247 248
248 for (j = 0; j < arrsize; j++) { 249 for (j = 0; j < arrsize; j++) {
249 for (i = 0; i < arr[j].stop - arr[j].start; i++) 250 if (fn(&arr[j], owner, data))
250 if (fn(&arr[j], owner, i, data)) 251 return true;
251 return true;
252 } 252 }
253 253
254 return false; 254 return false;
255} 255}
256 256
257/* Returns true as soon as fn returns true, otherwise false. */ 257/* Returns true as soon as fn returns true, otherwise false. */
258bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, 258bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
259 unsigned int symnum, void *data), void *data) 259 struct module *owner,
260 void *data),
261 void *data)
260{ 262{
261 struct module *mod; 263 struct module *mod;
262 static const struct symsearch arr[] = { 264 static const struct symsearch arr[] = {
@@ -309,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
309 } 311 }
310 return false; 312 return false;
311} 313}
312EXPORT_SYMBOL_GPL(each_symbol); 314EXPORT_SYMBOL_GPL(each_symbol_section);
313 315
314struct find_symbol_arg { 316struct find_symbol_arg {
315 /* Input */ 317 /* Input */
@@ -323,15 +325,12 @@ struct find_symbol_arg {
323 const struct kernel_symbol *sym; 325 const struct kernel_symbol *sym;
324}; 326};
325 327
326static bool find_symbol_in_section(const struct symsearch *syms, 328static bool check_symbol(const struct symsearch *syms,
327 struct module *owner, 329 struct module *owner,
328 unsigned int symnum, void *data) 330 unsigned int symnum, void *data)
329{ 331{
330 struct find_symbol_arg *fsa = data; 332 struct find_symbol_arg *fsa = data;
331 333
332 if (strcmp(syms->start[symnum].name, fsa->name) != 0)
333 return false;
334
335 if (!fsa->gplok) { 334 if (!fsa->gplok) {
336 if (syms->licence == GPL_ONLY) 335 if (syms->licence == GPL_ONLY)
337 return false; 336 return false;
@@ -365,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms,
365 return true; 364 return true;
366} 365}
367 366
367static int cmp_name(const void *va, const void *vb)
368{
369 const char *a;
370 const struct kernel_symbol *b;
371 a = va; b = vb;
372 return strcmp(a, b->name);
373}
374
375static bool find_symbol_in_section(const struct symsearch *syms,
376 struct module *owner,
377 void *data)
378{
379 struct find_symbol_arg *fsa = data;
380 struct kernel_symbol *sym;
381
382 sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
383 sizeof(struct kernel_symbol), cmp_name);
384
385 if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
386 return true;
387
388 return false;
389}
390
368/* Find a symbol and return it, along with, (optional) crc and 391/* Find a symbol and return it, along with, (optional) crc and
369 * (optional) module which owns it. Needs preempt disabled or module_mutex. */ 392 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
370const struct kernel_symbol *find_symbol(const char *name, 393const struct kernel_symbol *find_symbol(const char *name,
@@ -379,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name,
379 fsa.gplok = gplok; 402 fsa.gplok = gplok;
380 fsa.warn = warn; 403 fsa.warn = warn;
381 404
382 if (each_symbol(find_symbol_in_section, &fsa)) { 405 if (each_symbol_section(find_symbol_in_section, &fsa)) {
383 if (owner) 406 if (owner)
384 *owner = fsa.owner; 407 *owner = fsa.owner;
385 if (crc) 408 if (crc)
@@ -1607,27 +1630,28 @@ static void set_section_ro_nx(void *base,
1607 } 1630 }
1608} 1631}
1609 1632
1610/* Setting memory back to RW+NX before releasing it */ 1633static void unset_module_core_ro_nx(struct module *mod)
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{ 1634{
1613 unsigned long total_pages; 1635 set_page_attributes(mod->module_core + mod->core_text_size,
1614 1636 mod->module_core + mod->core_size,
1615 if (mod->module_core == module_region) { 1637 set_memory_x);
1616 /* Set core as NX+RW */ 1638 set_page_attributes(mod->module_core,
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); 1639 mod->module_core + mod->core_ro_size,
1618 set_memory_nx((unsigned long)mod->module_core, total_pages); 1640 set_memory_rw);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages); 1641}
1620 1642
1621 } else if (mod->module_init == module_region) { 1643static void unset_module_init_ro_nx(struct module *mod)
1622 /* Set init as NX+RW */ 1644{
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); 1645 set_page_attributes(mod->module_init + mod->init_text_size,
1624 set_memory_nx((unsigned long)mod->module_init, total_pages); 1646 mod->module_init + mod->init_size,
1625 set_memory_rw((unsigned long)mod->module_init, total_pages); 1647 set_memory_x);
1626 } 1648 set_page_attributes(mod->module_init,
1649 mod->module_init + mod->init_ro_size,
1650 set_memory_rw);
1627} 1651}
1628 1652
1629/* Iterate through all modules and set each module's text as RW */ 1653/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw() 1654void set_all_modules_text_rw(void)
1631{ 1655{
1632 struct module *mod; 1656 struct module *mod;
1633 1657
@@ -1648,7 +1672,7 @@ void set_all_modules_text_rw()
1648} 1672}
1649 1673
1650/* Iterate through all modules and set each module's text as RO */ 1674/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro() 1675void set_all_modules_text_ro(void)
1652{ 1676{
1653 struct module *mod; 1677 struct module *mod;
1654 1678
@@ -1669,7 +1693,8 @@ void set_all_modules_text_ro()
1669} 1693}
1670#else 1694#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } 1695static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } 1696static void unset_module_core_ro_nx(struct module *mod) { }
1697static void unset_module_init_ro_nx(struct module *mod) { }
1673#endif 1698#endif
1674 1699
1675/* Free a module, remove from lists, etc. */ 1700/* Free a module, remove from lists, etc. */
@@ -1696,7 +1721,7 @@ static void free_module(struct module *mod)
1696 destroy_params(mod->kp, mod->num_kp); 1721 destroy_params(mod->kp, mod->num_kp);
1697 1722
1698 /* This may be NULL, but that's OK */ 1723 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init); 1724 unset_module_init_ro_nx(mod);
1700 module_free(mod, mod->module_init); 1725 module_free(mod, mod->module_init);
1701 kfree(mod->args); 1726 kfree(mod->args);
1702 percpu_modfree(mod); 1727 percpu_modfree(mod);
@@ -1705,7 +1730,7 @@ static void free_module(struct module *mod)
1705 lockdep_free_key_range(mod->module_core, mod->core_size); 1730 lockdep_free_key_range(mod->module_core, mod->core_size);
1706 1731
1707 /* Finally, free the core (containing the module structure) */ 1732 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core); 1733 unset_module_core_ro_nx(mod);
1709 module_free(mod, mod->module_core); 1734 module_free(mod, mod->module_core);
1710 1735
1711#ifdef CONFIG_MPU 1736#ifdef CONFIG_MPU
@@ -2030,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
2030 const struct kernel_symbol *start, 2055 const struct kernel_symbol *start,
2031 const struct kernel_symbol *stop) 2056 const struct kernel_symbol *stop)
2032{ 2057{
2033 const struct kernel_symbol *ks = start; 2058 return bsearch(name, start, stop - start,
2034 for (; ks < stop; ks++) 2059 sizeof(struct kernel_symbol), cmp_name);
2035 if (strcmp(ks->name, name) == 0)
2036 return ks;
2037 return NULL;
2038} 2060}
2039 2061
2040static int is_exported(const char *name, unsigned long value, 2062static int is_exported(const char *name, unsigned long value,
@@ -2931,10 +2953,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2931 mod->symtab = mod->core_symtab; 2953 mod->symtab = mod->core_symtab;
2932 mod->strtab = mod->core_strtab; 2954 mod->strtab = mod->core_strtab;
2933#endif 2955#endif
2934 unset_section_ro_nx(mod, mod->module_init); 2956 unset_module_init_ro_nx(mod);
2935 module_free(mod, mod->module_init); 2957 module_free(mod, mod->module_init);
2936 mod->module_init = NULL; 2958 mod->module_init = NULL;
2937 mod->init_size = 0; 2959 mod->init_size = 0;
2960 mod->init_ro_size = 0;
2938 mod->init_text_size = 0; 2961 mod->init_text_size = 0;
2939 mutex_unlock(&module_mutex); 2962 mutex_unlock(&module_mutex);
2940 2963
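
find_symbol_in_section() and lookup_symbol() now binary-search the exported-symbol tables with cmp_name(), which assumes those tables are sorted by name at build time. A standalone userspace sketch of the same lookup; the struct layout and sample table are illustrative only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct kernel_symbol {
        unsigned long value;
        const char *name;
};

/* key is the symbol name, elt is a table entry -- same shape as cmp_name() */
static int cmp_name(const void *va, const void *vb)
{
        const char *a = va;
        const struct kernel_symbol *b = vb;

        return strcmp(a, b->name);
}

int main(void)
{
        /* The table must already be sorted by name. */
        struct kernel_symbol tab[] = {
                { 0x1000, "alpha"  },
                { 0x2000, "printk" },
                { 0x3000, "zeta"   },
        };
        struct kernel_symbol *sym;

        sym = bsearch("printk", tab, sizeof(tab) / sizeof(tab[0]),
                      sizeof(tab[0]), cmp_name);
        if (sym)
                printf("%s = %#lx\n", sym->name, sym->value);
        return 0;
}
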
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 78 DEBUG_LOCKS_WARN_ON(lock->owner != current);
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 80 mutex_clear_owner(lock);
81} 81}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current_thread_info(); 32 lock->owner = current;
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index c4195fa98900..2c938e2337cd 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
160 */ 160 */
161 161
162 for (;;) { 162 for (;;) {
163 struct thread_info *owner; 163 struct task_struct *owner;
164
165 /*
166 * If we own the BKL, then don't spin. The owner of
167 * the mutex might be waiting on us to release the BKL.
168 */
169 if (unlikely(current->lock_depth >= 0))
170 break;
171 164
172 /* 165 /*
173 * If there's an owner, wait for it to either 166 * If there's an owner, wait for it to either
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
19#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current_thread_info(); 22 lock->owner = current;
23} 23}
24 24
25static inline void mutex_clear_owner(struct mutex *lock) 25static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/params.c b/kernel/params.c
index 7ab388a48a2e..ed72e1330862 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp);
297int param_set_bool(const char *val, const struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
298{ 298{
299 bool v; 299 bool v;
300 int ret;
300 301
301 /* No equals means "set"... */ 302 /* No equals means "set"... */
302 if (!val) val = "1"; 303 if (!val) val = "1";
303 304
304 /* One of =[yYnN01] */ 305 /* One of =[yYnN01] */
305 switch (val[0]) { 306 ret = strtobool(val, &v);
306 case 'y': case 'Y': case '1': 307 if (ret)
307 v = true; 308 return ret;
308 break;
309 case 'n': case 'N': case '0':
310 v = false;
311 break;
312 default:
313 return -EINVAL;
314 }
315 309
316 if (kp->flags & KPARAM_ISBOOL) 310 if (kp->flags & KPARAM_ISBOOL)
317 *(bool *)kp->arg = v; 311 *(bool *)kp->arg = v;
@@ -821,15 +815,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
821 return sprintf(buf, "%s\n", vattr->version); 815 return sprintf(buf, "%s\n", vattr->version);
822} 816}
823 817
824extern struct module_version_attribute __start___modver[], __stop___modver[]; 818extern const struct module_version_attribute *__start___modver[];
819extern const struct module_version_attribute *__stop___modver[];
825 820
826static void __init version_sysfs_builtin(void) 821static void __init version_sysfs_builtin(void)
827{ 822{
828 const struct module_version_attribute *vattr; 823 const struct module_version_attribute **p;
829 struct module_kobject *mk; 824 struct module_kobject *mk;
830 int err; 825 int err;
831 826
832 for (vattr = __start___modver; vattr < __stop___modver; vattr++) { 827 for (p = __start___modver; p < __stop___modver; p++) {
828 const struct module_version_attribute *vattr = *p;
829
833 mk = locate_module_kobject(vattr->module_name); 830 mk = locate_module_kobject(vattr->module_name);
834 if (mk) { 831 if (mk) {
835 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); 832 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
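
param_set_bool() now delegates the [yYnN01] parsing to strtobool(). A userspace sketch of the accepted forms; strtobool_sketch() is a stand-in written for illustration, not the kernel function itself.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* accepts the first character of [yYnN01], anything else is -EINVAL */
static int strtobool_sketch(const char *s, bool *res)
{
        switch (s[0]) {
        case 'y': case 'Y': case '1':
                *res = true;
                return 0;
        case 'n': case 'N': case '0':
                *res = false;
                return 0;
        default:
                return -EINVAL;
        }
}

int main(void)
{
        const char *inputs[] = { "y", "N", "1", "maybe" };
        bool v;
        int i;

        for (i = 0; i < 4; i++)
                printf("%-5s -> %d\n", inputs[i],
                       strtobool_sketch(inputs[i], &v) ? -1 : (int)v);
        return 0;
}
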
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6de9a8fc3417..87f4d24b55b0 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -125,12 +125,6 @@ config PM_DEBUG
125 code. This is helpful when debugging and reporting PM bugs, like 125 code. This is helpful when debugging and reporting PM bugs, like
126 suspend support. 126 suspend support.
127 127
128config PM_VERBOSE
129 bool "Verbose Power Management debugging"
130 depends on PM_DEBUG
131 ---help---
132 This option enables verbose messages from the Power Management code.
133
134config PM_ADVANCED_DEBUG 128config PM_ADVANCED_DEBUG
135 bool "Extra PM attributes in sysfs for low-level debugging/testing" 129 bool "Extra PM attributes in sysfs for low-level debugging/testing"
136 depends on PM_DEBUG 130 depends on PM_DEBUG
@@ -229,3 +223,7 @@ config PM_OPP
229 representing individual voltage domains and provides SOC 223 representing individual voltage domains and provides SOC
230 implementations a ready to use framework to manage OPPs. 224 implementations a ready to use framework to manage OPPs.
231 For more information, read <file:Documentation/power/opp.txt> 225 For more information, read <file:Documentation/power/opp.txt>
226
227config PM_RUNTIME_CLK
228 def_bool y
229 depends on PM_RUNTIME && HAVE_CLK
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 50aae660174d..f9bec56d8825 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -272,12 +272,7 @@ static int create_image(int platform_mode)
272 272
273 local_irq_disable(); 273 local_irq_disable();
274 274
275 error = sysdev_suspend(PMSG_FREEZE); 275 error = syscore_suspend();
276 if (!error) {
277 error = syscore_suspend();
278 if (error)
279 sysdev_resume();
280 }
281 if (error) { 276 if (error) {
282 printk(KERN_ERR "PM: Some system devices failed to power down, " 277 printk(KERN_ERR "PM: Some system devices failed to power down, "
283 "aborting hibernation\n"); 278 "aborting hibernation\n");
@@ -302,7 +297,6 @@ static int create_image(int platform_mode)
302 297
303 Power_up: 298 Power_up:
304 syscore_resume(); 299 syscore_resume();
305 sysdev_resume();
306 /* NOTE: dpm_resume_noirq() is just a resume() for devices 300 /* NOTE: dpm_resume_noirq() is just a resume() for devices
307 * that suspended with irqs off ... no overall powerup. 301 * that suspended with irqs off ... no overall powerup.
308 */ 302 */
@@ -333,20 +327,25 @@ static int create_image(int platform_mode)
333 327
334int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
335{ 329{
330 pm_message_t msg = PMSG_RECOVER;
336 int error; 331 int error;
337 332
338 error = platform_begin(platform_mode); 333 error = platform_begin(platform_mode);
339 if (error) 334 if (error)
340 goto Close; 335 goto Close;
341 336
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
342 /* Preallocate image memory before shutting down devices. */ 341 /* Preallocate image memory before shutting down devices. */
343 error = hibernate_preallocate_memory(); 342 error = hibernate_preallocate_memory();
344 if (error) 343 if (error)
345 goto Close; 344 goto Complete_devices;
346 345
347 suspend_console(); 346 suspend_console();
348 pm_restrict_gfp_mask(); 347 pm_restrict_gfp_mask();
349 error = dpm_suspend_start(PMSG_FREEZE); 348 error = dpm_suspend(PMSG_FREEZE);
350 if (error) 349 if (error)
351 goto Recover_platform; 350 goto Recover_platform;
352 351
@@ -364,13 +363,17 @@ int hibernation_snapshot(int platform_mode)
364 if (error || !in_suspend) 363 if (error || !in_suspend)
365 swsusp_free(); 364 swsusp_free();
366 365
367 dpm_resume_end(in_suspend ? 366 msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE;
368 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 367 dpm_resume(msg);
369 368
370 if (error || !in_suspend) 369 if (error || !in_suspend)
371 pm_restore_gfp_mask(); 370 pm_restore_gfp_mask();
372 371
373 resume_console(); 372 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg);
376
374 Close: 377 Close:
375 platform_end(platform_mode); 378 platform_end(platform_mode);
376 return error; 379 return error;
@@ -409,12 +412,7 @@ static int resume_target_kernel(bool platform_mode)
409 412
410 local_irq_disable(); 413 local_irq_disable();
411 414
412 error = sysdev_suspend(PMSG_QUIESCE); 415 error = syscore_suspend();
413 if (!error) {
414 error = syscore_suspend();
415 if (error)
416 sysdev_resume();
417 }
418 if (error) 416 if (error)
419 goto Enable_irqs; 417 goto Enable_irqs;
420 418
@@ -442,7 +440,6 @@ static int resume_target_kernel(bool platform_mode)
442 touch_softlockup_watchdog(); 440 touch_softlockup_watchdog();
443 441
444 syscore_resume(); 442 syscore_resume();
445 sysdev_resume();
446 443
447 Enable_irqs: 444 Enable_irqs:
448 local_irq_enable(); 445 local_irq_enable();
@@ -528,7 +525,6 @@ int hibernation_platform_enter(void)
528 goto Platform_finish; 525 goto Platform_finish;
529 526
530 local_irq_disable(); 527 local_irq_disable();
531 sysdev_suspend(PMSG_HIBERNATE);
532 syscore_suspend(); 528 syscore_suspend();
533 if (pm_wakeup_pending()) { 529 if (pm_wakeup_pending()) {
534 error = -EAGAIN; 530 error = -EAGAIN;
@@ -541,7 +537,6 @@ int hibernation_platform_enter(void)
541 537
542 Power_up: 538 Power_up:
543 syscore_resume(); 539 syscore_resume();
544 sysdev_resume();
545 local_irq_enable(); 540 local_irq_enable();
546 enable_nonboot_cpus(); 541 enable_nonboot_cpus();
547 542
@@ -982,10 +977,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att
982 977
983power_attr(image_size); 978power_attr(image_size);
984 979
980static ssize_t reserved_size_show(struct kobject *kobj,
981 struct kobj_attribute *attr, char *buf)
982{
983 return sprintf(buf, "%lu\n", reserved_size);
984}
985
986static ssize_t reserved_size_store(struct kobject *kobj,
987 struct kobj_attribute *attr,
988 const char *buf, size_t n)
989{
990 unsigned long size;
991
992 if (sscanf(buf, "%lu", &size) == 1) {
993 reserved_size = size;
994 return n;
995 }
996
997 return -EINVAL;
998}
999
1000power_attr(reserved_size);
1001
985static struct attribute * g[] = { 1002static struct attribute * g[] = {
986 &disk_attr.attr, 1003 &disk_attr.attr,
987 &resume_attr.attr, 1004 &resume_attr.attr,
988 &image_size_attr.attr, 1005 &image_size_attr.attr,
1006 &reserved_size_attr.attr,
989 NULL, 1007 NULL,
990}; 1008};
991 1009
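
reserved_size is exported to userspace through the new /sys/power/reserved_size attribute (value in bytes). A small userspace sketch that reads the current value; the path comes from the attribute added above, everything else is illustrative.

#include <stdio.h>

int main(void)
{
        unsigned long size;
        FILE *f = fopen("/sys/power/reserved_size", "r");

        if (!f) {
                perror("/sys/power/reserved_size");
                return 1;
        }
        if (fscanf(f, "%lu", &size) == 1)
                printf("driver reserve for hibernation: %lu bytes\n", size);
        fclose(f);
        return 0;
}
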
diff --git a/kernel/power/main.c b/kernel/power/main.c
index de9aef8742f4..2981af4ce7cb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -337,6 +337,7 @@ static int __init pm_init(void)
337 if (error) 337 if (error)
338 return error; 338 return error;
339 hibernate_image_size_init(); 339 hibernate_image_size_init();
340 hibernate_reserved_size_init();
340 power_kobj = kobject_create_and_add("power", NULL); 341 power_kobj = kobject_create_and_add("power", NULL);
341 if (!power_kobj) 342 if (!power_kobj)
342 return -ENOMEM; 343 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 03634be55f62..9a00a0a26280 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -15,6 +15,7 @@ struct swsusp_info {
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */ 17/* kernel/power/snapshot.c */
18extern void __init hibernate_reserved_size_init(void);
18extern void __init hibernate_image_size_init(void); 19extern void __init hibernate_image_size_init(void);
19 20
20#ifdef CONFIG_ARCH_HIBERNATION_HEADER 21#ifdef CONFIG_ARCH_HIBERNATION_HEADER
@@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void);
55 56
56#else /* !CONFIG_HIBERNATION */ 57#else /* !CONFIG_HIBERNATION */
57 58
59static inline void hibernate_reserved_size_init(void) {}
58static inline void hibernate_image_size_init(void) {} 60static inline void hibernate_image_size_init(void) {}
59#endif /* !CONFIG_HIBERNATION */ 61#endif /* !CONFIG_HIBERNATION */
60 62
@@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \
72 74
73/* Preferred image size in bytes (default 500 MB) */ 75/* Preferred image size in bytes (default 500 MB) */
74extern unsigned long image_size; 76extern unsigned long image_size;
77/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
78extern unsigned long reserved_size;
75extern int in_suspend; 79extern int in_suspend;
76extern dev_t swsusp_resume_device; 80extern dev_t swsusp_resume_device;
77extern sector_t swsusp_resume_block; 81extern sector_t swsusp_resume_block;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ca0aacc24874..ace55889f702 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -41,16 +41,28 @@ static void swsusp_set_page_forbidden(struct page *);
41static void swsusp_unset_page_forbidden(struct page *); 41static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Number of bytes to reserve for memory allocations made by device drivers
45 * from their ->freeze() and ->freeze_noirq() callbacks so that they don't
46 * cause image creation to fail (tunable via /sys/power/reserved_size).
47 */
48unsigned long reserved_size;
49
50void __init hibernate_reserved_size_init(void)
51{
52 reserved_size = SPARE_PAGES * PAGE_SIZE;
53}
54
55/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 56 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, the image creating code will do its best to 57 * When it is set to N, swsusp will do its best to ensure the image
46 * ensure the image size will not exceed N bytes, but if that is 58 * size will not exceed N bytes, but if that is impossible, it will
47 * impossible, it will try to create the smallest image possible. 59 * try to create the smallest image possible.
48 */ 60 */
49unsigned long image_size; 61unsigned long image_size;
50 62
51void __init hibernate_image_size_init(void) 63void __init hibernate_image_size_init(void)
52{ 64{
53 image_size = (totalram_pages / 3) * PAGE_SIZE; 65 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
54} 66}
55 67
56/* List of PBEs needed for restoring the pages that were allocated before 68/* List of PBEs needed for restoring the pages that were allocated before
@@ -1263,11 +1275,13 @@ static unsigned long minimum_image_size(unsigned long saveable)
1263 * frame in use. We also need a number of page frames to be free during 1275 * frame in use. We also need a number of page frames to be free during
1264 * hibernation for allocations made while saving the image and for device 1276 * hibernation for allocations made while saving the image and for device
1265 * drivers, in case they need to allocate memory from their hibernation 1277 * drivers, in case they need to allocate memory from their hibernation
1266 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, 1278 * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
1267 * respectively, both of which are rough estimates). To make this happen, we 1279 * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through
1268 * compute the total number of available page frames and allocate at least 1280 * /sys/power/reserved_size, respectively). To make this happen, we compute the
1281 * total number of available page frames and allocate at least
1269 * 1282 *
1270 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES 1283 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
1284 * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
1271 * 1285 *
1272 * of them, which corresponds to the maximum size of a hibernation image. 1286 * of them, which corresponds to the maximum size of a hibernation image.
1273 * 1287 *
@@ -1322,7 +1336,8 @@ int hibernate_preallocate_memory(void)
1322 count -= totalreserve_pages; 1336 count -= totalreserve_pages;
1323 1337
1324 /* Compute the maximum number of saveable pages to leave in memory. */ 1338 /* Compute the maximum number of saveable pages to leave in memory. */
1325 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1339 max_size = (count - (size + PAGES_FOR_IO)) / 2
1340 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
1326 /* Compute the desired number of image pages specified by image_size. */ 1341 /* Compute the desired number of image pages specified by image_size. */
1327 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1342 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1328 if (size > max_size) 1343 if (size > max_size)
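
The max_size computation above replaces the fixed 2 * SPARE_PAGES term with 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE). A toy calculation with made-up numbers, just to show how the tunable feeds into the limit.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        /* All numbers below are assumptions chosen for illustration. */
        unsigned long page_size     = 4096;
        unsigned long count         = 262144;        /* usable page frames (1 GiB) */
        unsigned long size          = 512;           /* image metadata pages */
        unsigned long pages_for_io  = 1024;          /* stand-in for PAGES_FOR_IO */
        unsigned long reserved_size = 1024 * 1024;   /* 1 MiB via the sysfs tunable */

        unsigned long max_size = (count - (size + pages_for_io)) / 2
                                 - 2 * DIV_ROUND_UP(reserved_size, page_size);

        printf("max saveable pages left in memory: %lu\n", max_size);
        return 0;
}
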
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8935369d503a..1c41ba215419 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -163,19 +163,13 @@ static int suspend_enter(suspend_state_t state)
163 arch_suspend_disable_irqs(); 163 arch_suspend_disable_irqs();
164 BUG_ON(!irqs_disabled()); 164 BUG_ON(!irqs_disabled());
165 165
166 error = sysdev_suspend(PMSG_SUSPEND); 166 error = syscore_suspend();
167 if (!error) {
168 error = syscore_suspend();
169 if (error)
170 sysdev_resume();
171 }
172 if (!error) { 167 if (!error) {
173 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 168 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
174 error = suspend_ops->enter(state); 169 error = suspend_ops->enter(state);
175 events_check_enabled = false; 170 events_check_enabled = false;
176 } 171 }
177 syscore_resume(); 172 syscore_resume();
178 sysdev_resume();
179 } 173 }
180 174
181 arch_suspend_enable_irqs(); 175 arch_suspend_enable_irqs();
@@ -216,7 +210,6 @@ int suspend_devices_and_enter(suspend_state_t state)
216 goto Close; 210 goto Close;
217 } 211 }
218 suspend_console(); 212 suspend_console();
219 pm_restrict_gfp_mask();
220 suspend_test_start(); 213 suspend_test_start();
221 error = dpm_suspend_start(PMSG_SUSPEND); 214 error = dpm_suspend_start(PMSG_SUSPEND);
222 if (error) { 215 if (error) {
@@ -227,13 +220,12 @@ int suspend_devices_and_enter(suspend_state_t state)
227 if (suspend_test(TEST_DEVICES)) 220 if (suspend_test(TEST_DEVICES))
228 goto Recover_platform; 221 goto Recover_platform;
229 222
230 suspend_enter(state); 223 error = suspend_enter(state);
231 224
232 Resume_devices: 225 Resume_devices:
233 suspend_test_start(); 226 suspend_test_start();
234 dpm_resume_end(PMSG_RESUME); 227 dpm_resume_end(PMSG_RESUME);
235 suspend_test_finish("resume devices"); 228 suspend_test_finish("resume devices");
236 pm_restore_gfp_mask();
237 resume_console(); 229 resume_console();
238 Close: 230 Close:
239 if (suspend_ops->end) 231 if (suspend_ops->end)
@@ -294,7 +286,9 @@ int enter_state(suspend_state_t state)
294 goto Finish; 286 goto Finish;
295 287
296 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 288 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
289 pm_restrict_gfp_mask();
297 error = suspend_devices_and_enter(state); 290 error = suspend_devices_and_enter(state);
291 pm_restore_gfp_mask();
298 292
299 Finish: 293 Finish:
300 pr_debug("PM: Finishing wakeup.\n"); 294 pr_debug("PM: Finishing wakeup.\n");
diff --git a/kernel/power/user.c b/kernel/power/user.c
index c36c3b9e8a84..7d02d33be699 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -135,8 +135,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
135 free_basic_memory_bitmaps(); 135 free_basic_memory_bitmaps();
136 data = filp->private_data; 136 data = filp->private_data;
137 free_all_swap_pages(data->swap); 137 free_all_swap_pages(data->swap);
138 if (data->frozen) 138 if (data->frozen) {
139 pm_restore_gfp_mask();
139 thaw_processes(); 140 thaw_processes();
141 }
140 pm_notifier_call_chain(data->mode == O_RDONLY ? 142 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 143 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 144 atomic_inc(&snapshot_device_available);
@@ -379,6 +381,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
379 * PM_HIBERNATION_PREPARE 381 * PM_HIBERNATION_PREPARE
380 */ 382 */
381 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 383 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
384 data->ready = 0;
382 break; 385 break;
383 386
384 case SNAPSHOT_PLATFORM_SUPPORT: 387 case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0fc1eed28d27..dc7ab65f3b36 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h>
25 26
26 27
27/* 28/*
@@ -879,3 +880,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
879 return ret; 880 return ret;
880} 881}
881#endif /* CONFIG_COMPAT */ 882#endif /* CONFIG_COMPAT */
883
884#ifdef CONFIG_HAVE_HW_BREAKPOINT
885int ptrace_get_breakpoints(struct task_struct *tsk)
886{
887 if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
888 return 0;
889
890 return -1;
891}
892
893void ptrace_put_breakpoints(struct task_struct *tsk)
894{
895 if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
896 flush_ptrace_hw_breakpoint(tsk);
897}
898#endif /* CONFIG_HAVE_HW_BREAKPOINT */
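
ptrace_get_breakpoints()/ptrace_put_breakpoints() let arch ptrace code pin the hardware-breakpoint state while it is inspected. A hedged sketch of a caller: the refcount helpers are the ones added above, while example_peek_debugreg() and arch_read_debugreg() are hypothetical.

#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/sched.h>

static int example_peek_debugreg(struct task_struct *child, int n,
                                 unsigned long *val)
{
        int ret;

        if (ptrace_get_breakpoints(child))
                return -ESRCH;  /* breakpoints already being torn down */

        ret = arch_read_debugreg(child, n, val);        /* hypothetical helper */

        ptrace_put_breakpoints(child);
        return ret;
}
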
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f3240e987928..7784bd216b6a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
142 * Ensure that queued callbacks are all executed. 142 * Ensure that queued callbacks are all executed.
143 * If we detect that we are nested in a RCU read-side critical 143 * If we detect that we are nested in a RCU read-side critical
144 * section, we should simply fail, otherwise we would deadlock. 144 * section, we should simply fail, otherwise we would deadlock.
145 * In !PREEMPT configurations, there is no way to tell if we are
146 * in a RCU read-side critical section or not, so we never
147 * attempt any fixup and just print a warning.
145 */ 148 */
149#ifndef CONFIG_PREEMPT
150 WARN_ON_ONCE(1);
151 return 0;
152#endif
146 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 153 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
147 irqs_disabled()) { 154 irqs_disabled()) {
148 WARN_ON(1); 155 WARN_ON_ONCE(1);
149 return 0; 156 return 0;
150 } 157 }
151 rcu_barrier(); 158 rcu_barrier();
@@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
184 * Ensure that queued callbacks are all executed. 191 * Ensure that queued callbacks are all executed.
185 * If we detect that we are nested in a RCU read-side critical 192 * If we detect that we are nested in a RCU read-side critical
186 * section, we should simply fail, otherwise we would deadlock. 193 * section, we should simply fail, otherwise we would deadlock.
194 * In !PREEMPT configurations, there is no way to tell if we are
195 * in a RCU read-side critical section or not, so we never
196 * attempt any fixup and just print a warning.
187 */ 197 */
198#ifndef CONFIG_PREEMPT
199 WARN_ON_ONCE(1);
200 return 0;
201#endif
188 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 202 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
189 irqs_disabled()) { 203 irqs_disabled()) {
190 WARN_ON(1); 204 WARN_ON_ONCE(1);
191 return 0; 205 return 0;
192 } 206 }
193 rcu_barrier(); 207 rcu_barrier();
@@ -214,15 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 228 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 229 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 230 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether 231 * In !PREEMPT configurations, there is no way to tell if we are
218 * or not we are in an RCU read-side critical section 232 * in a RCU read-side critical section or not, so we never
219 * exists only in the preemptible RCU implementations 233 * attempt any fixup and just print a warning.
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
222 */ 234 */
235#ifndef CONFIG_PREEMPT
236 WARN_ON_ONCE(1);
237 return 0;
238#endif
223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 239 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
224 irqs_disabled()) { 240 irqs_disabled()) {
225 WARN_ON(1); 241 WARN_ON_ONCE(1);
226 return 0; 242 return 0;
227 } 243 }
228 rcu_barrier(); 244 rcu_barrier();
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0c343b9a46d5..421abfd3641d 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -40,10 +40,10 @@
40static struct task_struct *rcu_kthread_task; 40static struct task_struct *rcu_kthread_task;
41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42static unsigned long have_rcu_kthread_work; 42static unsigned long have_rcu_kthread_work;
43static void invoke_rcu_kthread(void);
44 43
45/* Forward declarations for rcutiny_plugin.h. */ 44/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 45struct rcu_ctrlblk;
46static void invoke_rcu_kthread(void);
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg); 48static int rcu_kthread(void *arg);
49static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
@@ -79,36 +79,45 @@ void rcu_exit_nohz(void)
79#endif /* #ifdef CONFIG_NO_HZ */ 79#endif /* #ifdef CONFIG_NO_HZ */
80 80
81/* 81/*
82 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). 82 * Helper function for rcu_sched_qs() and rcu_bh_qs().
83 * Also disable irqs to avoid confusion due to interrupt handlers 83 * Also irqs are disabled to avoid confusion due to interrupt handlers
84 * invoking call_rcu(). 84 * invoking call_rcu().
85 */ 85 */
86static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 86static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
87{ 87{
88 unsigned long flags;
89
90 local_irq_save(flags);
91 if (rcp->rcucblist != NULL && 88 if (rcp->rcucblist != NULL &&
92 rcp->donetail != rcp->curtail) { 89 rcp->donetail != rcp->curtail) {
93 rcp->donetail = rcp->curtail; 90 rcp->donetail = rcp->curtail;
94 local_irq_restore(flags);
95 return 1; 91 return 1;
96 } 92 }
97 local_irq_restore(flags);
98 93
99 return 0; 94 return 0;
100} 95}
101 96
102/* 97/*
98 * Wake up rcu_kthread() to process callbacks now eligible for invocation
99 * or to boost readers.
100 */
101static void invoke_rcu_kthread(void)
102{
103 have_rcu_kthread_work = 1;
104 wake_up(&rcu_kthread_wq);
105}
106
107/*
103 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 108 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
104 * are at it, given that any rcu quiescent state is also an rcu_bh 109 * are at it, given that any rcu quiescent state is also an rcu_bh
105 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 110 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
106 */ 111 */
107void rcu_sched_qs(int cpu) 112void rcu_sched_qs(int cpu)
108{ 113{
114 unsigned long flags;
115
116 local_irq_save(flags);
109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 117 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
110 rcu_qsctr_help(&rcu_bh_ctrlblk)) 118 rcu_qsctr_help(&rcu_bh_ctrlblk))
111 invoke_rcu_kthread(); 119 invoke_rcu_kthread();
120 local_irq_restore(flags);
112} 121}
113 122
114/* 123/*
@@ -116,8 +125,12 @@ void rcu_sched_qs(int cpu)
116 */ 125 */
117void rcu_bh_qs(int cpu) 126void rcu_bh_qs(int cpu)
118{ 127{
128 unsigned long flags;
129
130 local_irq_save(flags);
119 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 131 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 132 invoke_rcu_kthread();
133 local_irq_restore(flags);
121} 134}
122 135
123/* 136/*
@@ -167,7 +180,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
167 prefetch(next); 180 prefetch(next);
168 debug_rcu_head_unqueue(list); 181 debug_rcu_head_unqueue(list);
169 local_bh_disable(); 182 local_bh_disable();
170 list->func(list); 183 __rcu_reclaim(list);
171 local_bh_enable(); 184 local_bh_enable();
172 list = next; 185 list = next;
173 RCU_TRACE(cb_count++); 186 RCU_TRACE(cb_count++);
@@ -208,20 +221,6 @@ static int rcu_kthread(void *arg)
208} 221}
209 222
210/* 223/*
211 * Wake up rcu_kthread() to process callbacks now eligible for invocation
212 * or to boost readers.
213 */
214static void invoke_rcu_kthread(void)
215{
216 unsigned long flags;
217
218 local_irq_save(flags);
219 have_rcu_kthread_work = 1;
220 wake_up(&rcu_kthread_wq);
221 local_irq_restore(flags);
222}
223
224/*
225 * Wait for a grace period to elapse. But it is illegal to invoke 224 * Wait for a grace period to elapse. But it is illegal to invoke
226 * synchronize_sched() from within an RCU read-side critical section. 225 * synchronize_sched() from within an RCU read-side critical section.
227 * Therefore, any legal call to synchronize_sched() is a quiescent 226 * Therefore, any legal call to synchronize_sched() is a quiescent
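
The rcutiny.c changes above hoist interrupt disabling out of rcu_qsctr_help() and into its callers, so each quiescent-state report takes exactly one irq-off section and the helper shrinks to a pointer comparison; invoke_rcu_kthread() likewise loses its private local_irq_save()/restore pair and is simply called with irqs already off. The resulting caller, taken from the new side of the hunk:

        void rcu_sched_qs(int cpu)
        {
                unsigned long flags;

                local_irq_save(flags);          /* one irq-off section per report */
                if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
                    rcu_qsctr_help(&rcu_bh_ctrlblk))
                        invoke_rcu_kthread();   /* safe: irqs already disabled */
                local_irq_restore(flags);
        }
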
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3cb8e362e883..f259c676195f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk {
100 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
101 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST 102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */ 103 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */ 104#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE 105#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods; 106 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST 107#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted; 108 unsigned long n_tasks_boosted;
109 /* Total number of tasks boosted. */
110 unsigned long n_exp_boosts; 110 unsigned long n_exp_boosts;
111 /* Number of tasks boosted for expedited GP. */
111 unsigned long n_normal_boosts; 112 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks; 113 /* Number of tasks boosted for normal GP. */
113 unsigned long n_normal_balk_gp_tasks; 114 unsigned long n_balk_blkd_tasks;
114 unsigned long n_normal_balk_boost_tasks; 115 /* Refused to boost: no blocked tasks. */
115 unsigned long n_normal_balk_boosted; 116 unsigned long n_balk_exp_gp_tasks;
116 unsigned long n_normal_balk_notyet; 117 /* Refused to boost: nothing blocking GP. */
117 unsigned long n_normal_balk_nos; 118 unsigned long n_balk_boost_tasks;
118 unsigned long n_exp_balk_blkd_tasks; 119 /* Refused to boost: already boosting. */
119 unsigned long n_exp_balk_nos; 120 unsigned long n_balk_notyet;
121 /* Refused to boost: not yet time. */
122 unsigned long n_balk_nos;
123 /* Refused to boost: not sure why, though. */
124 /* This can happen due to race conditions. */
120#endif /* #ifdef CONFIG_RCU_BOOST */ 125#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */ 126#endif /* #ifdef CONFIG_RCU_TRACE */
122}; 127};
@@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t)
201 206
202#ifdef CONFIG_RCU_BOOST 207#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void); 208static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */ 209#endif /* #ifdef CONFIG_RCU_BOOST */
206 210
207/* 211/*
@@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m)
219 "N."[!rcu_preempt_ctrlblk.gp_tasks], 223 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]); 224 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST 225#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=", 226 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]); 227 " ",
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) { 228 "B."[!rcu_preempt_ctrlblk.boost_tasks],
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted, 229 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts, 230 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts, 231 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff), 232 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); 233 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", 234 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
247 "normal balk", 235 " balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, 236 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, 237 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, 238 rcu_preempt_ctrlblk.n_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted, 239 rcu_preempt_ctrlblk.n_balk_notyet,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet, 240 rcu_preempt_ctrlblk.n_balk_nos);
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */ 241#endif /* #ifdef CONFIG_RCU_BOOST */
258} 242}
259 243
@@ -271,25 +255,59 @@ static int rcu_boost(void)
271{ 255{
272 unsigned long flags; 256 unsigned long flags;
273 struct rt_mutex mtx; 257 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t; 258 struct task_struct *t;
259 struct list_head *tb;
276 260
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL) 261 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
262 rcu_preempt_ctrlblk.exp_tasks == NULL)
278 return 0; /* Nothing to boost. */ 263 return 0; /* Nothing to boost. */
264
279 raw_local_irq_save(flags); 265 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++; 266
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, 267 /*
282 rcu_node_entry); 268 * Recheck with irqs disabled: all tasks in need of boosting
283 np = rcu_next_node_entry(t); 269 * might exit their RCU read-side critical sections on their own
270 * if we are preempted just before disabling irqs.
271 */
272 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
273 rcu_preempt_ctrlblk.exp_tasks == NULL) {
274 raw_local_irq_restore(flags);
275 return 0;
276 }
277
278 /*
279 * Preferentially boost tasks blocking expedited grace periods.
280 * This cannot starve the normal grace periods because a second
281 * expedited grace period must boost all blocked tasks, including
282 * those blocking the pre-existing normal grace period.
283 */
284 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
285 tb = rcu_preempt_ctrlblk.exp_tasks;
286 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
287 } else {
288 tb = rcu_preempt_ctrlblk.boost_tasks;
289 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
290 }
291 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
292
293 /*
294 * We boost task t by manufacturing an rt_mutex that appears to
295 * be held by task t. We leave a pointer to that rt_mutex where
296 * task t can find it, and task t will release the mutex when it
297 * exits its outermost RCU read-side critical section. Then
298 * simply acquiring this artificial rt_mutex will boost task
299 * t's priority. (Thanks to tglx for suggesting this approach!)
300 */
301 t = container_of(tb, struct task_struct, rcu_node_entry);
284 rt_mutex_init_proxy_locked(&mtx, t); 302 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx; 303 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; 304 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags); 305 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx); 306 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); 307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
290 rcu_preempt_ctrlblk.boosted_this_gp++; 308
291 rt_mutex_unlock(&mtx); 309 return rcu_preempt_ctrlblk.boost_tasks != NULL ||
292 return rcu_preempt_ctrlblk.boost_tasks != NULL; 310 rcu_preempt_ctrlblk.exp_tasks != NULL;
293} 311}
294 312
295/* 313/*
@@ -304,42 +322,25 @@ static int rcu_boost(void)
304 */ 322 */
305static int rcu_initiate_boost(void) 323static int rcu_initiate_boost(void)
306{ 324{
307 if (!rcu_preempt_blocked_readers_cgp()) { 325 if (!rcu_preempt_blocked_readers_cgp() &&
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); 326 rcu_preempt_ctrlblk.exp_tasks == NULL) {
327 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
309 return 0; 328 return 0;
310 } 329 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL && 330 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
312 rcu_preempt_ctrlblk.boost_tasks == NULL && 331 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 && 332 rcu_preempt_ctrlblk.boost_tasks == NULL &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { 333 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; 334 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread(); 337 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else 338 } else
319 RCU_TRACE(rcu_initiate_boost_trace()); 339 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1; 340 return 1;
321} 341}
322 342
323/* 343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
343 344
344/* 345/*
345 * Do priority-boost accounting for the start of a new grace period. 346 * Do priority-boost accounting for the start of a new grace period.
@@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void)
347static void rcu_preempt_boost_start_gp(void) 348static void rcu_preempt_boost_start_gp(void)
348{ 349{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; 350 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352} 351}
353 352
354#else /* #ifdef CONFIG_RCU_BOOST */ 353#else /* #ifdef CONFIG_RCU_BOOST */
@@ -372,13 +371,6 @@ static int rcu_initiate_boost(void)
372} 371}
373 372
374/* 373/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start. 374 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */ 375 */
384static void rcu_preempt_boost_start_gp(void) 376static void rcu_preempt_boost_start_gp(void)
@@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void)
418 if (!rcu_preempt_gp_in_progress()) 410 if (!rcu_preempt_gp_in_progress())
419 return; 411 return;
420 /* 412 /*
421 * Check up on boosting. If there are no readers blocking the 413 * Check up on boosting. If there are readers blocking the
422 * current grace period, leave. 414 * current grace period, leave.
423 */ 415 */
424 if (rcu_initiate_boost()) 416 if (rcu_initiate_boost())
@@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
578 empty = !rcu_preempt_blocked_readers_cgp(); 570 empty = !rcu_preempt_blocked_readers_cgp();
579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 571 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
580 np = rcu_next_node_entry(t); 572 np = rcu_next_node_entry(t);
581 list_del(&t->rcu_node_entry); 573 list_del_init(&t->rcu_node_entry);
582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 574 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
583 rcu_preempt_ctrlblk.gp_tasks = np; 575 rcu_preempt_ctrlblk.gp_tasks = np;
584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 576 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
@@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) 579 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np; 580 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */ 581#endif /* #ifdef CONFIG_RCU_BOOST */
590 INIT_LIST_HEAD(&t->rcu_node_entry);
591 582
592 /* 583 /*
593 * If this was the last task on the current list, and if 584 * If this was the last task on the current list, and if
@@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void)
812 rpcp->exp_tasks = rpcp->blkd_tasks.next; 803 rpcp->exp_tasks = rpcp->blkd_tasks.next;
813 if (rpcp->exp_tasks == &rpcp->blkd_tasks) 804 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
814 rpcp->exp_tasks = NULL; 805 rpcp->exp_tasks = NULL;
815 local_irq_restore(flags);
816 806
817 /* Wait for tail of ->blkd_tasks list to drain. */ 807 /* Wait for tail of ->blkd_tasks list to drain. */
818 if (rcu_preempted_readers_exp()) 808 if (!rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost(); 809 local_irq_restore(flags);
810 else {
811 rcu_initiate_boost();
812 local_irq_restore(flags);
820 wait_event(sync_rcu_preempt_exp_wq, 813 wait_event(sync_rcu_preempt_exp_wq,
821 !rcu_preempted_readers_exp()); 814 !rcu_preempted_readers_exp());
815 }
822 816
823 /* Clean up and exit. */ 817 /* Clean up and exit. */
824 barrier(); /* ensure expedited GP seen before counter increment. */ 818 barrier(); /* ensure expedited GP seen before counter increment. */
@@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void)
931 925
932static void rcu_initiate_boost_trace(void) 926static void rcu_initiate_boost_trace(void)
933{ 927{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL) 928 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; 929 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
930 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
931 rcu_preempt_ctrlblk.exp_tasks == NULL)
932 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL) 933 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; 934 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) 935 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++; 936 rcu_preempt_ctrlblk.n_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else 937 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++; 938 rcu_preempt_ctrlblk.n_balk_nos++;
952} 939}
953 940
954#endif /* #ifdef CONFIG_RCU_BOOST */ 941#endif /* #ifdef CONFIG_RCU_BOOST */
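
Two things change in the boost bookkeeping above: rcu_initiate_boost() now prefers tasks blocking an expedited grace period and boosts them immediately, while tasks blocking only a normal grace period still wait until boost_time; and the separate normal/expedited "balk" counters collapse into a single n_balk_* set. The new decision, condensed from the hunk:

        if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
            (rcu_preempt_ctrlblk.gp_tasks != NULL &&
             rcu_preempt_ctrlblk.boost_tasks == NULL &&
             ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
                if (rcu_preempt_ctrlblk.exp_tasks == NULL)
                        rcu_preempt_ctrlblk.boost_tasks =
                                rcu_preempt_ctrlblk.gp_tasks;
                invoke_rcu_kthread();           /* rcu_boost() picks exp_tasks first */
        } else
                RCU_TRACE(rcu_initiate_boost_trace());  /* bump an n_balk_* counter */
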
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c224da41890c..2e138db03382 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -131,7 +131,7 @@ struct rcu_torture {
131 131
132static LIST_HEAD(rcu_torture_freelist); 132static LIST_HEAD(rcu_torture_freelist);
133static struct rcu_torture __rcu *rcu_torture_current; 133static struct rcu_torture __rcu *rcu_torture_current;
134static long rcu_torture_current_version; 134static unsigned long rcu_torture_current_version;
135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
136static DEFINE_SPINLOCK(rcu_torture_lock); 136static DEFINE_SPINLOCK(rcu_torture_lock);
137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror;
146static atomic_t n_rcu_torture_error; 146static atomic_t n_rcu_torture_error;
147static long n_rcu_torture_boost_ktrerror; 147static long n_rcu_torture_boost_ktrerror;
148static long n_rcu_torture_boost_rterror; 148static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_allocerror;
150static long n_rcu_torture_boost_afferror;
151static long n_rcu_torture_boost_failure; 149static long n_rcu_torture_boost_failure;
152static long n_rcu_torture_boosts; 150static long n_rcu_torture_boosts;
153static long n_rcu_torture_timers; 151static long n_rcu_torture_timers;
@@ -163,11 +161,11 @@ static int stutter_pause_test;
163#endif 161#endif
164int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
165 163
166#ifdef CONFIG_RCU_BOOST 164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
167#define rcu_can_boost() 1 165#define rcu_can_boost() 1
168#else /* #ifdef CONFIG_RCU_BOOST */ 166#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169#define rcu_can_boost() 0 167#define rcu_can_boost() 0
170#endif /* #else #ifdef CONFIG_RCU_BOOST */ 168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
171 169
172static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
173DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -751,6 +749,7 @@ static int rcu_torture_boost(void *arg)
751 n_rcu_torture_boost_rterror++; 749 n_rcu_torture_boost_rterror++;
752 } 750 }
753 751
752 init_rcu_head_on_stack(&rbi.rcu);
754 /* Each pass through the following loop does one boost-test cycle. */ 753 /* Each pass through the following loop does one boost-test cycle. */
755 do { 754 do {
756 /* Wait for the next test interval. */ 755 /* Wait for the next test interval. */
@@ -810,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
810 809
811 /* Clean up and exit. */ 810 /* Clean up and exit. */
812 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 813 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 814 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 815 schedule_timeout_uninterruptible(1);
@@ -886,7 +886,7 @@ rcu_torture_writer(void *arg)
886 old_rp->rtort_pipe_count++; 886 old_rp->rtort_pipe_count++;
887 cur_ops->deferred_free(old_rp); 887 cur_ops->deferred_free(old_rp);
888 } 888 }
889 rcu_torture_current_version++; 889 rcutorture_record_progress(++rcu_torture_current_version);
890 oldbatch = cur_ops->completed(); 890 oldbatch = cur_ops->completed();
891 rcu_stutter_wait("rcu_torture_writer"); 891 rcu_stutter_wait("rcu_torture_writer");
892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -1066,8 +1066,8 @@ rcu_torture_printk(char *page)
1066 } 1066 }
1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1068 cnt += sprintf(&page[cnt], 1068 cnt += sprintf(&page[cnt],
1069 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1069 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1070 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " 1070 "rtmbe: %d rtbke: %ld rtbre: %ld "
1071 "rtbf: %ld rtb: %ld nt: %ld", 1071 "rtbf: %ld rtb: %ld nt: %ld",
1072 rcu_torture_current, 1072 rcu_torture_current,
1073 rcu_torture_current_version, 1073 rcu_torture_current_version,
@@ -1078,16 +1078,12 @@ rcu_torture_printk(char *page)
1078 atomic_read(&n_rcu_torture_mberror), 1078 atomic_read(&n_rcu_torture_mberror),
1079 n_rcu_torture_boost_ktrerror, 1079 n_rcu_torture_boost_ktrerror,
1080 n_rcu_torture_boost_rterror, 1080 n_rcu_torture_boost_rterror,
1081 n_rcu_torture_boost_allocerror,
1082 n_rcu_torture_boost_afferror,
1083 n_rcu_torture_boost_failure, 1081 n_rcu_torture_boost_failure,
1084 n_rcu_torture_boosts, 1082 n_rcu_torture_boosts,
1085 n_rcu_torture_timers); 1083 n_rcu_torture_timers);
1086 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1084 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1087 n_rcu_torture_boost_ktrerror != 0 || 1085 n_rcu_torture_boost_ktrerror != 0 ||
1088 n_rcu_torture_boost_rterror != 0 || 1086 n_rcu_torture_boost_rterror != 0 ||
1089 n_rcu_torture_boost_allocerror != 0 ||
1090 n_rcu_torture_boost_afferror != 0 ||
1091 n_rcu_torture_boost_failure != 0) 1087 n_rcu_torture_boost_failure != 0)
1092 cnt += sprintf(&page[cnt], " !!!"); 1088 cnt += sprintf(&page[cnt], " !!!");
1093 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1089 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
@@ -1331,6 +1327,7 @@ rcu_torture_cleanup(void)
1331 int i; 1327 int i;
1332 1328
1333 mutex_lock(&fullstop_mutex); 1329 mutex_lock(&fullstop_mutex);
1330 rcutorture_record_test_transition();
1334 if (fullstop == FULLSTOP_SHUTDOWN) { 1331 if (fullstop == FULLSTOP_SHUTDOWN) {
1335 printk(KERN_WARNING /* but going down anyway, so... */ 1332 printk(KERN_WARNING /* but going down anyway, so... */
1336 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1333 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
@@ -1486,8 +1483,6 @@ rcu_torture_init(void)
1486 atomic_set(&n_rcu_torture_error, 0); 1483 atomic_set(&n_rcu_torture_error, 0);
1487 n_rcu_torture_boost_ktrerror = 0; 1484 n_rcu_torture_boost_ktrerror = 0;
1488 n_rcu_torture_boost_rterror = 0; 1485 n_rcu_torture_boost_rterror = 0;
1489 n_rcu_torture_boost_allocerror = 0;
1490 n_rcu_torture_boost_afferror = 0;
1491 n_rcu_torture_boost_failure = 0; 1486 n_rcu_torture_boost_failure = 0;
1492 n_rcu_torture_boosts = 0; 1487 n_rcu_torture_boosts = 0;
1493 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1488 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1624,6 +1619,7 @@ rcu_torture_init(void)
1624 } 1619 }
1625 } 1620 }
1626 register_reboot_notifier(&rcutorture_shutdown_nb); 1621 register_reboot_notifier(&rcutorture_shutdown_nb);
1622 rcutorture_record_test_transition();
1627 mutex_unlock(&fullstop_mutex); 1623 mutex_unlock(&fullstop_mutex);
1628 return 0; 1624 return 0;
1629 1625
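
rcutorture now reports its progress to the RCU core so that debugfs tracing output can be lined up with torture-test runs: rcutorture_record_test_transition() is called at module init and cleanup (leaving rcutorture_testseq odd while a test is running), and the writer calls rcutorture_record_progress() on every version bump. The counters themselves are added to rcutree.c below; the call sites, condensed from this diff:

        /* Writer kthread, once per update pass: */
        rcutorture_record_progress(++rcu_torture_current_version);

        /* Module init and cleanup, under fullstop_mutex: */
        rcutorture_record_test_transition();    /* testseq++, vernum reset to 0 */
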
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8e..e486f7c3ffb8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -47,6 +47,8 @@
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h> 49#include <linux/kernel_stat.h>
50#include <linux/wait.h>
51#include <linux/kthread.h>
50 52
51#include "rcutree.h" 53#include "rcutree.h"
52 54
@@ -79,10 +81,41 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
79struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
80DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
81 83
84static struct rcu_state *rcu_state;
85
82int rcu_scheduler_active __read_mostly; 86int rcu_scheduler_active __read_mostly;
83EXPORT_SYMBOL_GPL(rcu_scheduler_active); 87EXPORT_SYMBOL_GPL(rcu_scheduler_active);
84 88
85/* 89/*
90 * Control variables for per-CPU and per-rcu_node kthreads. These
91 * handle all flavors of RCU.
92 */
93static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
94DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
95DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
96DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
97static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
98DEFINE_PER_CPU(char, rcu_cpu_has_work);
99static char rcu_kthreads_spawnable;
100
101static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
102static void invoke_rcu_cpu_kthread(void);
103
104#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
105
106/*
107 * Track the rcutorture test sequence number and the update version
108 * number within a given test. The rcutorture_testseq is incremented
109 * on every rcutorture module load and unload, so has an odd value
110 * when a test is running. The rcutorture_vernum is set to zero
111 * when rcutorture starts and is incremented on each rcutorture update.
112 * These variables enable correlating rcutorture output with the
113 * RCU tracing information.
114 */
115unsigned long rcutorture_testseq;
116unsigned long rcutorture_vernum;
117
118/*
86 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 119 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
87 * permit this function to be invoked without holding the root rcu_node 120 * permit this function to be invoked without holding the root rcu_node
88 * structure's ->lock, but of course results can be subject to change. 121 * structure's ->lock, but of course results can be subject to change.
@@ -124,6 +157,7 @@ void rcu_note_context_switch(int cpu)
124 rcu_sched_qs(cpu); 157 rcu_sched_qs(cpu);
125 rcu_preempt_note_context_switch(cpu); 158 rcu_preempt_note_context_switch(cpu);
126} 159}
160EXPORT_SYMBOL_GPL(rcu_note_context_switch);
127 161
128#ifdef CONFIG_NO_HZ 162#ifdef CONFIG_NO_HZ
129DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 163DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
@@ -140,10 +174,8 @@ module_param(blimit, int, 0);
140module_param(qhimark, int, 0); 174module_param(qhimark, int, 0);
141module_param(qlowmark, int, 0); 175module_param(qlowmark, int, 0);
142 176
143#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 177int rcu_cpu_stall_suppress __read_mostly;
144int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
145module_param(rcu_cpu_stall_suppress, int, 0644); 178module_param(rcu_cpu_stall_suppress, int, 0644);
146#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
147 179
148static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 180static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
149static int rcu_pending(int cpu); 181static int rcu_pending(int cpu);
@@ -176,6 +208,31 @@ void rcu_bh_force_quiescent_state(void)
176EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 208EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
177 209
178/* 210/*
211 * Record the number of times rcutorture tests have been initiated and
212 * terminated. This information allows the debugfs tracing stats to be
213 * correlated to the rcutorture messages, even when the rcutorture module
214 * is being repeatedly loaded and unloaded. In other words, we cannot
215 * store this state in rcutorture itself.
216 */
217void rcutorture_record_test_transition(void)
218{
219 rcutorture_testseq++;
220 rcutorture_vernum = 0;
221}
222EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
223
224/*
225 * Record the number of writer passes through the current rcutorture test.
226 * This is also used to correlate debugfs tracing stats with the rcutorture
227 * messages.
228 */
229void rcutorture_record_progress(unsigned long vernum)
230{
231 rcutorture_vernum++;
232}
233EXPORT_SYMBOL_GPL(rcutorture_record_progress);
234
235/*
179 * Force a quiescent state for RCU-sched. 236 * Force a quiescent state for RCU-sched.
180 */ 237 */
181void rcu_sched_force_quiescent_state(void) 238void rcu_sched_force_quiescent_state(void)
@@ -234,8 +291,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
234 return 1; 291 return 1;
235 } 292 }
236 293
237 /* If preemptable RCU, no point in sending reschedule IPI. */ 294 /* If preemptible RCU, no point in sending reschedule IPI. */
238 if (rdp->preemptable) 295 if (rdp->preemptible)
239 return 0; 296 return 0;
240 297
241 /* The CPU is online, so send it a reschedule IPI. */ 298 /* The CPU is online, so send it a reschedule IPI. */
@@ -450,8 +507,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 507
451#endif /* #else #ifdef CONFIG_NO_HZ */ 508#endif /* #else #ifdef CONFIG_NO_HZ */
452 509
453#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
454
455int rcu_cpu_stall_suppress __read_mostly; 510int rcu_cpu_stall_suppress __read_mostly;
456 511
457static void record_gp_stall_check_time(struct rcu_state *rsp) 512static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -537,21 +592,24 @@ static void print_cpu_stall(struct rcu_state *rsp)
537 592
538static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 593static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 594{
540 long delta; 595 unsigned long j;
596 unsigned long js;
541 struct rcu_node *rnp; 597 struct rcu_node *rnp;
542 598
543 if (rcu_cpu_stall_suppress) 599 if (rcu_cpu_stall_suppress)
544 return; 600 return;
545 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); 601 j = ACCESS_ONCE(jiffies);
602 js = ACCESS_ONCE(rsp->jiffies_stall);
546 rnp = rdp->mynode; 603 rnp = rdp->mynode;
547 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { 604 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
548 605
549 /* We haven't checked in, so go dump stack. */ 606 /* We haven't checked in, so go dump stack. */
550 print_cpu_stall(rsp); 607 print_cpu_stall(rsp);
551 608
552 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { 609 } else if (rcu_gp_in_progress(rsp) &&
610 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
553 611
554 /* They had two time units to dump stack, so complain. */ 612 /* They had a few time units to dump stack, so complain. */
555 print_other_cpu_stall(rsp); 613 print_other_cpu_stall(rsp);
556 } 614 }
557} 615}
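
check_cpu_stall() above drops the signed jiffies delta in favor of ULONG_CMP_GE(), an unsigned modular comparison that stays correct when the jiffies counter wraps. With the qsmask and grace-period-in-progress tests elided, and assuming the usual rcupdate.h definition of the macro (not part of this diff):

        #define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))

        j  = ACCESS_ONCE(jiffies);
        js = ACCESS_ONCE(rsp->jiffies_stall);
        if (ULONG_CMP_GE(j, js))
                print_cpu_stall(rsp);           /* this CPU never checked in */
        else if (ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY))
                print_other_cpu_stall(rsp);     /* some other CPU is stalled */
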
@@ -587,26 +645,6 @@ static void __init check_cpu_stall_init(void)
587 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); 645 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
588} 646}
589 647
590#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
591
592static void record_gp_stall_check_time(struct rcu_state *rsp)
593{
594}
595
596static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
597{
598}
599
600void rcu_cpu_stall_reset(void)
601{
602}
603
604static void __init check_cpu_stall_init(void)
605{
606}
607
608#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
609
610/* 648/*
611 * Update CPU-local rcu_data state to record the newly noticed grace period. 649 * Update CPU-local rcu_data state to record the newly noticed grace period.
612 * This is used both when we started the grace period and when we notice 650 * This is used both when we started the grace period and when we notice
@@ -809,6 +847,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
809 rnp->completed = rsp->completed; 847 rnp->completed = rsp->completed;
810 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 848 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
811 rcu_start_gp_per_cpu(rsp, rnp, rdp); 849 rcu_start_gp_per_cpu(rsp, rnp, rdp);
850 rcu_preempt_boost_start_gp(rnp);
812 raw_spin_unlock_irqrestore(&rnp->lock, flags); 851 raw_spin_unlock_irqrestore(&rnp->lock, flags);
813 return; 852 return;
814 } 853 }
@@ -844,6 +883,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
844 rnp->completed = rsp->completed; 883 rnp->completed = rsp->completed;
845 if (rnp == rdp->mynode) 884 if (rnp == rdp->mynode)
846 rcu_start_gp_per_cpu(rsp, rnp, rdp); 885 rcu_start_gp_per_cpu(rsp, rnp, rdp);
886 rcu_preempt_boost_start_gp(rnp);
847 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 887 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
848 } 888 }
849 889
@@ -864,7 +904,12 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
864static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 904static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
865 __releases(rcu_get_root(rsp)->lock) 905 __releases(rcu_get_root(rsp)->lock)
866{ 906{
907 unsigned long gp_duration;
908
867 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 909 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
910 gp_duration = jiffies - rsp->gp_start;
911 if (gp_duration > rsp->gp_max)
912 rsp->gp_max = gp_duration;
868 rsp->completed = rsp->gpnum; 913 rsp->completed = rsp->gpnum;
869 rsp->signaled = RCU_GP_IDLE; 914 rsp->signaled = RCU_GP_IDLE;
870 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 915 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -894,7 +939,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
894 return; 939 return;
895 } 940 }
896 rnp->qsmask &= ~mask; 941 rnp->qsmask &= ~mask;
897 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 942 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
898 943
899 /* Other bits still set at this level, so done. */ 944 /* Other bits still set at this level, so done. */
900 raw_spin_unlock_irqrestore(&rnp->lock, flags); 945 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1037,6 +1082,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1037/* 1082/*
1038 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1083 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1039 * and move all callbacks from the outgoing CPU to the current one. 1084 * and move all callbacks from the outgoing CPU to the current one.
1085 * There can only be one CPU hotplug operation at a time, so no other
1086 * CPU can be attempting to update rcu_cpu_kthread_task.
1040 */ 1087 */
1041static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1088static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1042{ 1089{
@@ -1045,6 +1092,14 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1045 int need_report = 0; 1092 int need_report = 0;
1046 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1093 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1047 struct rcu_node *rnp; 1094 struct rcu_node *rnp;
1095 struct task_struct *t;
1096
1097 /* Stop the CPU's kthread. */
1098 t = per_cpu(rcu_cpu_kthread_task, cpu);
1099 if (t != NULL) {
1100 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1101 kthread_stop(t);
1102 }
1048 1103
1049 /* Exclude any attempts to start a new grace period. */ 1104 /* Exclude any attempts to start a new grace period. */
1050 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1105 raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -1082,6 +1137,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1082 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1137 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1083 if (need_report & RCU_OFL_TASKS_EXP_GP) 1138 if (need_report & RCU_OFL_TASKS_EXP_GP)
1084 rcu_report_exp_rnp(rsp, rnp); 1139 rcu_report_exp_rnp(rsp, rnp);
1140 rcu_node_kthread_setaffinity(rnp, -1);
1085} 1141}
1086 1142
1087/* 1143/*
@@ -1143,7 +1199,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1143 next = list->next; 1199 next = list->next;
1144 prefetch(next); 1200 prefetch(next);
1145 debug_rcu_head_unqueue(list); 1201 debug_rcu_head_unqueue(list);
1146 list->func(list); 1202 __rcu_reclaim(list);
1147 list = next; 1203 list = next;
1148 if (++count >= rdp->blimit) 1204 if (++count >= rdp->blimit)
1149 break; 1205 break;
@@ -1179,7 +1235,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1179 1235
1180 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1236 /* Re-raise the RCU softirq if there are callbacks remaining. */
1181 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1237 if (cpu_has_callbacks_ready_to_invoke(rdp))
1182 raise_softirq(RCU_SOFTIRQ); 1238 invoke_rcu_cpu_kthread();
1183} 1239}
1184 1240
1185/* 1241/*
@@ -1225,7 +1281,7 @@ void rcu_check_callbacks(int cpu, int user)
1225 } 1281 }
1226 rcu_preempt_check_callbacks(cpu); 1282 rcu_preempt_check_callbacks(cpu);
1227 if (rcu_pending(cpu)) 1283 if (rcu_pending(cpu))
1228 raise_softirq(RCU_SOFTIRQ); 1284 invoke_rcu_cpu_kthread();
1229} 1285}
1230 1286
1231#ifdef CONFIG_SMP 1287#ifdef CONFIG_SMP
@@ -1233,6 +1289,8 @@ void rcu_check_callbacks(int cpu, int user)
1233/* 1289/*
1234 * Scan the leaf rcu_node structures, processing dyntick state for any that 1290 * Scan the leaf rcu_node structures, processing dyntick state for any that
1235 * have not yet encountered a quiescent state, using the function specified. 1291 * have not yet encountered a quiescent state, using the function specified.
1292 * Also initiate boosting for any threads blocked on the root rcu_node.
1293 *
1236 * The caller must have suppressed start of new grace periods. 1294 * The caller must have suppressed start of new grace periods.
1237 */ 1295 */
1238static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 1296static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
@@ -1251,7 +1309,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1251 return; 1309 return;
1252 } 1310 }
1253 if (rnp->qsmask == 0) { 1311 if (rnp->qsmask == 0) {
1254 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1312 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
1255 continue; 1313 continue;
1256 } 1314 }
1257 cpu = rnp->grplo; 1315 cpu = rnp->grplo;
@@ -1269,6 +1327,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1269 } 1327 }
1270 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1328 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1271 } 1329 }
1330 rnp = rcu_get_root(rsp);
1331 if (rnp->qsmask == 0) {
1332 raw_spin_lock_irqsave(&rnp->lock, flags);
1333 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1334 }
1272} 1335}
1273 1336
1274/* 1337/*
@@ -1389,7 +1452,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1389/* 1452/*
1390 * Do softirq processing for the current CPU. 1453 * Do softirq processing for the current CPU.
1391 */ 1454 */
1392static void rcu_process_callbacks(struct softirq_action *unused) 1455static void rcu_process_callbacks(void)
1393{ 1456{
1394 /* 1457 /*
1395 * Memory references from any prior RCU read-side critical sections 1458 * Memory references from any prior RCU read-side critical sections
@@ -1414,6 +1477,347 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1414 rcu_needs_cpu_flush(); 1477 rcu_needs_cpu_flush();
1415} 1478}
1416 1479
1480/*
1481 * Wake up the current CPU's kthread. This replaces raise_softirq()
1482 * in earlier versions of RCU. Note that because we are running on
1483 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1484 * cannot disappear out from under us.
1485 */
1486static void invoke_rcu_cpu_kthread(void)
1487{
1488 unsigned long flags;
1489
1490 local_irq_save(flags);
1491 __this_cpu_write(rcu_cpu_has_work, 1);
1492 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1493 local_irq_restore(flags);
1494 return;
1495 }
1496 wake_up(&__get_cpu_var(rcu_cpu_wq));
1497 local_irq_restore(flags);
1498}
1499
1500/*
1501 * Wake up the specified per-rcu_node-structure kthread.
1502 * Because the per-rcu_node kthreads are immortal, we don't need
1503 * to do anything to keep them alive.
1504 */
1505static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1506{
1507 struct task_struct *t;
1508
1509 t = rnp->node_kthread_task;
1510 if (t != NULL)
1511 wake_up_process(t);
1512}
1513
1514/*
1515 * Set the specified CPU's kthread to run RT or not, as specified by
1516 * the to_rt argument. The CPU-hotplug locks are held, so the task
1517 * is not going away.
1518 */
1519static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1520{
1521 int policy;
1522 struct sched_param sp;
1523 struct task_struct *t;
1524
1525 t = per_cpu(rcu_cpu_kthread_task, cpu);
1526 if (t == NULL)
1527 return;
1528 if (to_rt) {
1529 policy = SCHED_FIFO;
1530 sp.sched_priority = RCU_KTHREAD_PRIO;
1531 } else {
1532 policy = SCHED_NORMAL;
1533 sp.sched_priority = 0;
1534 }
1535 sched_setscheduler_nocheck(t, policy, &sp);
1536}
1537
1538/*
1539 * Timer handler to initiate the waking up of per-CPU kthreads that
1540 * have yielded the CPU due to excess numbers of RCU callbacks.
1541 * We wake up the per-rcu_node kthread, which in turn will wake up
1542 * the booster kthread.
1543 */
1544static void rcu_cpu_kthread_timer(unsigned long arg)
1545{
1546 unsigned long flags;
1547 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1548 struct rcu_node *rnp = rdp->mynode;
1549
1550 raw_spin_lock_irqsave(&rnp->lock, flags);
1551 rnp->wakemask |= rdp->grpmask;
1552 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1553 invoke_rcu_node_kthread(rnp);
1554}
1555
1556/*
1557 * Drop to non-real-time priority and yield, but only after posting a
1558 * timer that will cause us to regain our real-time priority if we
1559 * remain preempted. Either way, we restore our real-time priority
1560 * before returning.
1561 */
1562static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1563{
1564 struct sched_param sp;
1565 struct timer_list yield_timer;
1566
1567 setup_timer_on_stack(&yield_timer, f, arg);
1568 mod_timer(&yield_timer, jiffies + 2);
1569 sp.sched_priority = 0;
1570 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1571 set_user_nice(current, 19);
1572 schedule();
1573 sp.sched_priority = RCU_KTHREAD_PRIO;
1574 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1575 del_timer(&yield_timer);
1576}
1577
1578/*
1579 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1580 * This can happen while the corresponding CPU is either coming online
1581 * or going offline. We cannot wait until the CPU is fully online
1582 * before starting the kthread, because the various notifier functions
1583 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1584 * the corresponding CPU is online.
1585 *
1586 * Return 1 if the kthread needs to stop, 0 otherwise.
1587 *
1588 * Caller must disable bh. This function can momentarily enable it.
1589 */
1590static int rcu_cpu_kthread_should_stop(int cpu)
1591{
1592 while (cpu_is_offline(cpu) ||
1593 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1594 smp_processor_id() != cpu) {
1595 if (kthread_should_stop())
1596 return 1;
1597 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1598 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1599 local_bh_enable();
1600 schedule_timeout_uninterruptible(1);
1601 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1602 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1603 local_bh_disable();
1604 }
1605 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1606 return 0;
1607}
1608
1609/*
1610 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1611 * earlier RCU softirq.
1612 */
1613static int rcu_cpu_kthread(void *arg)
1614{
1615 int cpu = (int)(long)arg;
1616 unsigned long flags;
1617 int spincnt = 0;
1618 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1619 wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
1620 char work;
1621 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1622
1623 for (;;) {
1624 *statusp = RCU_KTHREAD_WAITING;
1625 wait_event_interruptible(*wqp,
1626 *workp != 0 || kthread_should_stop());
1627 local_bh_disable();
1628 if (rcu_cpu_kthread_should_stop(cpu)) {
1629 local_bh_enable();
1630 break;
1631 }
1632 *statusp = RCU_KTHREAD_RUNNING;
1633 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1634 local_irq_save(flags);
1635 work = *workp;
1636 *workp = 0;
1637 local_irq_restore(flags);
1638 if (work)
1639 rcu_process_callbacks();
1640 local_bh_enable();
1641 if (*workp != 0)
1642 spincnt++;
1643 else
1644 spincnt = 0;
1645 if (spincnt > 10) {
1646 *statusp = RCU_KTHREAD_YIELDING;
1647 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1648 spincnt = 0;
1649 }
1650 }
1651 *statusp = RCU_KTHREAD_STOPPED;
1652 return 0;
1653}
1654
1655/*
1656 * Spawn a per-CPU kthread, setting up affinity and priority.
1657 * Because the CPU hotplug lock is held, no other CPU will be attempting
1658 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1659 * attempting to access it during boot, but the locking in kthread_bind()
1660 * will enforce sufficient ordering.
1661 */
1662static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1663{
1664 struct sched_param sp;
1665 struct task_struct *t;
1666
1667 if (!rcu_kthreads_spawnable ||
1668 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1669 return 0;
1670 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1671 if (IS_ERR(t))
1672 return PTR_ERR(t);
1673 kthread_bind(t, cpu);
1674 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1675 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1676 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1677 wake_up_process(t);
1678 sp.sched_priority = RCU_KTHREAD_PRIO;
1679 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1680 return 0;
1681}
1682
1683/*
1684 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1685 * kthreads when needed. We ignore requests to wake up kthreads
1686 * for offline CPUs, which is OK because force_quiescent_state()
1687 * takes care of this case.
1688 */
1689static int rcu_node_kthread(void *arg)
1690{
1691 int cpu;
1692 unsigned long flags;
1693 unsigned long mask;
1694 struct rcu_node *rnp = (struct rcu_node *)arg;
1695 struct sched_param sp;
1696 struct task_struct *t;
1697
1698 for (;;) {
1699 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1700 wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0);
1701 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1702 raw_spin_lock_irqsave(&rnp->lock, flags);
1703 mask = rnp->wakemask;
1704 rnp->wakemask = 0;
1705 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1706 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1707 if ((mask & 0x1) == 0)
1708 continue;
1709 preempt_disable();
1710 t = per_cpu(rcu_cpu_kthread_task, cpu);
1711 if (!cpu_online(cpu) || t == NULL) {
1712 preempt_enable();
1713 continue;
1714 }
1715 per_cpu(rcu_cpu_has_work, cpu) = 1;
1716 sp.sched_priority = RCU_KTHREAD_PRIO;
1717 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1718 preempt_enable();
1719 }
1720 }
1721 /* NOTREACHED */
1722 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1723 return 0;
1724}
1725
1726/*
1727 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1728 * served by the rcu_node in question. The CPU hotplug lock is still
1729 * held, so the value of rnp->qsmaskinit will be stable.
1730 *
1731 * We don't include outgoingcpu in the affinity set, use -1 if there is
1732 * no outgoing CPU. If there are no CPUs left in the affinity set,
1733 * this function allows the kthread to execute on any CPU.
1734 */
1735static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1736{
1737 cpumask_var_t cm;
1738 int cpu;
1739 unsigned long mask = rnp->qsmaskinit;
1740
1741 if (rnp->node_kthread_task == NULL)
1742 return;
1743 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1744 return;
1745 cpumask_clear(cm);
1746 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1747 if ((mask & 0x1) && cpu != outgoingcpu)
1748 cpumask_set_cpu(cpu, cm);
1749 if (cpumask_weight(cm) == 0) {
1750 cpumask_setall(cm);
1751 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1752 cpumask_clear_cpu(cpu, cm);
1753 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1754 }
1755 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1756 rcu_boost_kthread_setaffinity(rnp, cm);
1757 free_cpumask_var(cm);
1758}
1759
1760/*
1761 * Spawn a per-rcu_node kthread, setting priority and affinity.
1762 * Called during boot before online/offline can happen, or, if
1763 * during runtime, with the main CPU-hotplug locks held. So only
1764 * one of these can be executing at a time.
1765 */
1766static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1767 struct rcu_node *rnp)
1768{
1769 unsigned long flags;
1770 int rnp_index = rnp - &rsp->node[0];
1771 struct sched_param sp;
1772 struct task_struct *t;
1773
1774 if (!rcu_kthreads_spawnable ||
1775 rnp->qsmaskinit == 0)
1776 return 0;
1777 if (rnp->node_kthread_task == NULL) {
1778 t = kthread_create(rcu_node_kthread, (void *)rnp,
1779 "rcun%d", rnp_index);
1780 if (IS_ERR(t))
1781 return PTR_ERR(t);
1782 raw_spin_lock_irqsave(&rnp->lock, flags);
1783 rnp->node_kthread_task = t;
1784 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1785 wake_up_process(t);
1786 sp.sched_priority = 99;
1787 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1788 }
1789 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1790}
1791
1792/*
1793 * Spawn all kthreads -- called as soon as the scheduler is running.
1794 */
1795static int __init rcu_spawn_kthreads(void)
1796{
1797 int cpu;
1798 struct rcu_node *rnp;
1799
1800 rcu_kthreads_spawnable = 1;
1801 for_each_possible_cpu(cpu) {
1802 init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
1803 per_cpu(rcu_cpu_has_work, cpu) = 0;
1804 if (cpu_online(cpu))
1805 (void)rcu_spawn_one_cpu_kthread(cpu);
1806 }
1807 rnp = rcu_get_root(rcu_state);
1808 init_waitqueue_head(&rnp->node_wq);
1809 rcu_init_boost_waitqueue(rnp);
1810 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1811 if (NUM_RCU_NODES > 1)
1812 rcu_for_each_leaf_node(rcu_state, rnp) {
1813 init_waitqueue_head(&rnp->node_wq);
1814 rcu_init_boost_waitqueue(rnp);
1815 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1816 }
1817 return 0;
1818}
1819early_initcall(rcu_spawn_kthreads);
1820
1417static void 1821static void
1418__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1822__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1419 struct rcu_state *rsp) 1823 struct rcu_state *rsp)
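
The large hunk above replaces the RCU softirq with per-CPU "rcuc%d" kthreads plus one "rcun%d" kthread per rcu_node: work is requested by setting the per-CPU rcu_cpu_has_work flag and waking the CPU's wait queue instead of raising RCU_SOFTIRQ, and a kthread that spins for more than ten passes temporarily drops to SCHED_NORMAL in rcu_yield(), using a two-jiffy timer to regain its RT priority. The wake-up path, condensed from invoke_rcu_cpu_kthread() above:

        local_irq_save(flags);
        __this_cpu_write(rcu_cpu_has_work, 1);
        if (__this_cpu_read(rcu_cpu_kthread_task) != NULL)
                wake_up(&__get_cpu_var(rcu_cpu_wq));    /* kthread runs rcu_process_callbacks() */
        local_irq_restore(flags);
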
@@ -1439,6 +1843,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1439 /* Add the callback to our list. */ 1843 /* Add the callback to our list. */
1440 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1844 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1845 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1846 rdp->qlen++;
1847
1848 /* If interrupts were disabled, don't dive into RCU core. */
1849 if (irqs_disabled_flags(flags)) {
1850 local_irq_restore(flags);
1851 return;
1852 }
1442 1853
1443 /* 1854 /*
1444 * Force the grace period if too many callbacks or too long waiting. 1855 * Force the grace period if too many callbacks or too long waiting.
@@ -1447,7 +1858,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1447 * invoking force_quiescent_state() if the newly enqueued callback 1858 * invoking force_quiescent_state() if the newly enqueued callback
1448 * is the only one waiting for a grace period to complete. 1859 * is the only one waiting for a grace period to complete.
1449 */ 1860 */
1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1861 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1451 1862
1452 /* Are we ignoring a completed grace period? */ 1863 /* Are we ignoring a completed grace period? */
1453 rcu_process_gp_end(rsp, rdp); 1864 rcu_process_gp_end(rsp, rdp);
@@ -1583,7 +1994,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1583 * or RCU-bh, force a local reschedule. 1994 * or RCU-bh, force a local reschedule.
1584 */ 1995 */
1585 rdp->n_rp_qs_pending++; 1996 rdp->n_rp_qs_pending++;
1586 if (!rdp->preemptable && 1997 if (!rdp->preemptible &&
1587 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1998 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1588 jiffies)) 1999 jiffies))
1589 set_need_resched(); 2000 set_need_resched();
@@ -1760,7 +2171,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1760 * that this CPU cannot possibly have any RCU callbacks in flight yet. 2171 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1761 */ 2172 */
1762static void __cpuinit 2173static void __cpuinit
1763rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 2174rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1764{ 2175{
1765 unsigned long flags; 2176 unsigned long flags;
1766 unsigned long mask; 2177 unsigned long mask;
@@ -1772,7 +2183,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1772 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 2183 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1773 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 2184 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1774 rdp->beenonline = 1; /* We have now been online. */ 2185 rdp->beenonline = 1; /* We have now been online. */
1775 rdp->preemptable = preemptable; 2186 rdp->preemptible = preemptible;
1776 rdp->qlen_last_fqs_check = 0; 2187 rdp->qlen_last_fqs_check = 0;
1777 rdp->n_force_qs_snap = rsp->n_force_qs; 2188 rdp->n_force_qs_snap = rsp->n_force_qs;
1778 rdp->blimit = blimit; 2189 rdp->blimit = blimit;
@@ -1813,6 +2224,19 @@ static void __cpuinit rcu_online_cpu(int cpu)
1813 rcu_preempt_init_percpu_data(cpu); 2224 rcu_preempt_init_percpu_data(cpu);
1814} 2225}
1815 2226
2227static void __cpuinit rcu_online_kthreads(int cpu)
2228{
2229 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2230 struct rcu_node *rnp = rdp->mynode;
2231
2232 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
2233 if (rcu_kthreads_spawnable) {
2234 (void)rcu_spawn_one_cpu_kthread(cpu);
2235 if (rnp->node_kthread_task == NULL)
2236 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
2237 }
2238}
2239
1816/* 2240/*
1817 * Handle CPU online/offline notification events. 2241 * Handle CPU online/offline notification events.
1818 */ 2242 */
@@ -1820,11 +2244,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1820 unsigned long action, void *hcpu) 2244 unsigned long action, void *hcpu)
1821{ 2245{
1822 long cpu = (long)hcpu; 2246 long cpu = (long)hcpu;
2247 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2248 struct rcu_node *rnp = rdp->mynode;
1823 2249
1824 switch (action) { 2250 switch (action) {
1825 case CPU_UP_PREPARE: 2251 case CPU_UP_PREPARE:
1826 case CPU_UP_PREPARE_FROZEN: 2252 case CPU_UP_PREPARE_FROZEN:
1827 rcu_online_cpu(cpu); 2253 rcu_online_cpu(cpu);
2254 rcu_online_kthreads(cpu);
2255 break;
2256 case CPU_ONLINE:
2257 case CPU_DOWN_FAILED:
2258 rcu_node_kthread_setaffinity(rnp, -1);
2259 rcu_cpu_kthread_setrt(cpu, 1);
2260 break;
2261 case CPU_DOWN_PREPARE:
2262 rcu_node_kthread_setaffinity(rnp, cpu);
2263 rcu_cpu_kthread_setrt(cpu, 0);
1828 break; 2264 break;
1829 case CPU_DYING: 2265 case CPU_DYING:
1830 case CPU_DYING_FROZEN: 2266 case CPU_DYING_FROZEN:
@@ -1943,10 +2379,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
1943 j / rsp->levelspread[i - 1]; 2379 j / rsp->levelspread[i - 1];
1944 } 2380 }
1945 rnp->level = i; 2381 rnp->level = i;
1946 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 2382 INIT_LIST_HEAD(&rnp->blkd_tasks);
1947 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1948 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1949 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1950 } 2383 }
1951 } 2384 }
1952 2385
@@ -1968,7 +2401,6 @@ void __init rcu_init(void)
1968 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2401 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1969 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2402 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1970 __rcu_init_preempt(); 2403 __rcu_init_preempt();
1971 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1972 2404
1973 /* 2405 /*
1974 * We don't need protection against CPU-hotplug here because 2406 * We don't need protection against CPU-hotplug here because
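The rcu_cpu_notify() hunk earlier in this file adds kthread management to CPU hotplug: spawn the per-CPU and per-node kthreads at CPU_UP_PREPARE, re-bind them and restore real-time priority on CPU_ONLINE/CPU_DOWN_FAILED, and bind away from (and de-elevate for) a CPU that is about to go down. The standalone sketch below mirrors that switch; the action names and the spawn/affinity/priority stubs are illustrative stand-ins, not the kernel notifier API.

#include <stdio.h>

enum hotplug_action {                    /* illustrative subset of notifier events */
	UP_PREPARE, ONLINE, DOWN_FAILED, DOWN_PREPARE,
};

/* Stand-ins for the helpers the patch calls from the notifier. */
static void spawn_kthreads(int cpu)    { printf("spawn kthreads for cpu %d\n", cpu); }
static void set_affinity(int outgoing) { printf("node kthread affinity, exclude cpu %d\n", outgoing); }
static void set_rt(int cpu, int to_rt) { printf("cpu %d kthread -> %s\n", cpu, to_rt ? "SCHED_FIFO" : "SCHED_NORMAL"); }

static void cpu_notify(enum hotplug_action action, int cpu)
{
	switch (action) {
	case UP_PREPARE:
		spawn_kthreads(cpu);         /* rcu_online_cpu() + rcu_online_kthreads() */
		break;
	case ONLINE:
	case DOWN_FAILED:
		set_affinity(-1);            /* -1: no CPU excluded */
		set_rt(cpu, 1);
		break;
	case DOWN_PREPARE:
		set_affinity(cpu);           /* keep node kthread off the outgoing CPU */
		set_rt(cpu, 0);
		break;
	}
}

int main(void)
{
	cpu_notify(UP_PREPARE, 3);
	cpu_notify(ONLINE, 3);
	cpu_notify(DOWN_PREPARE, 3);
	return 0;
}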
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index e8f057e44e3e..257664815d5d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -91,6 +91,14 @@ struct rcu_dynticks {
91 /* remains even for nmi from irq handler. */ 91 /* remains even for nmi from irq handler. */
92}; 92};
93 93
94/* RCU's kthread states for tracing. */
95#define RCU_KTHREAD_STOPPED 0
96#define RCU_KTHREAD_RUNNING 1
97#define RCU_KTHREAD_WAITING 2
98#define RCU_KTHREAD_OFFCPU 3
99#define RCU_KTHREAD_YIELDING 4
100#define RCU_KTHREAD_MAX 4
101
94/* 102/*
95 * Definition for node within the RCU grace-period-detection hierarchy. 103 * Definition for node within the RCU grace-period-detection hierarchy.
96 */ 104 */
@@ -109,10 +117,11 @@ struct rcu_node {
109 /* an rcu_data structure, otherwise, each */ 117 /* an rcu_data structure, otherwise, each */
110 /* bit corresponds to a child rcu_node */ 118 /* bit corresponds to a child rcu_node */
111 /* structure. */ 119 /* structure. */
112 unsigned long expmask; /* Groups that have ->blocked_tasks[] */ 120 unsigned long expmask; /* Groups that have ->blkd_tasks */
113 /* elements that need to drain to allow the */ 121 /* elements that need to drain to allow the */
114 /* current expedited grace period to */ 122 /* current expedited grace period to */
115 /* complete (only for TREE_PREEMPT_RCU). */ 123 /* complete (only for TREE_PREEMPT_RCU). */
124 unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */
116 unsigned long qsmaskinit; 125 unsigned long qsmaskinit;
117 /* Per-GP initial value for qsmask & expmask. */ 126 /* Per-GP initial value for qsmask & expmask. */
118 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 127 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -122,11 +131,68 @@ struct rcu_node {
122 u8 grpnum; /* CPU/group number for next level up. */ 131 u8 grpnum; /* CPU/group number for next level up. */
123 u8 level; /* root is at level 0. */ 132 u8 level; /* root is at level 0. */
124 struct rcu_node *parent; 133 struct rcu_node *parent;
125 struct list_head blocked_tasks[4]; 134 struct list_head blkd_tasks;
126 /* Tasks blocked in RCU read-side critsect. */ 135 /* Tasks blocked in RCU read-side critical */
127 /* Grace period number (->gpnum) x blocked */ 136 /* section. Tasks are placed at the head */
128 /* by tasks on the (x & 0x1) element of the */ 137 /* of this list and age towards the tail. */
129 /* blocked_tasks[] array. */ 138 struct list_head *gp_tasks;
139 /* Pointer to the first task blocking the */
140 /* current grace period, or NULL if there */
141 /* is no such task. */
142 struct list_head *exp_tasks;
143 /* Pointer to the first task blocking the */
144 /* current expedited grace period, or NULL */
145 /* if there is no such task. If there */
146 /* is no current expedited grace period, */
 147 /* then there cannot be any such task. */
148#ifdef CONFIG_RCU_BOOST
149 struct list_head *boost_tasks;
150 /* Pointer to first task that needs to be */
151 /* priority boosted, or NULL if no priority */
152 /* boosting is needed for this rcu_node */
153 /* structure. If there are no tasks */
154 /* queued on this rcu_node structure that */
155 /* are blocking the current grace period, */
156 /* there can be no such task. */
157 unsigned long boost_time;
158 /* When to start boosting (jiffies). */
159 struct task_struct *boost_kthread_task;
160 /* kthread that takes care of priority */
161 /* boosting for this rcu_node structure. */
162 wait_queue_head_t boost_wq;
163 /* Wait queue on which to park the boost */
164 /* kthread. */
165 unsigned int boost_kthread_status;
166 /* State of boost_kthread_task for tracing. */
167 unsigned long n_tasks_boosted;
168 /* Total number of tasks boosted. */
169 unsigned long n_exp_boosts;
170 /* Number of tasks boosted for expedited GP. */
171 unsigned long n_normal_boosts;
172 /* Number of tasks boosted for normal GP. */
173 unsigned long n_balk_blkd_tasks;
174 /* Refused to boost: no blocked tasks. */
175 unsigned long n_balk_exp_gp_tasks;
176 /* Refused to boost: nothing blocking GP. */
177 unsigned long n_balk_boost_tasks;
178 /* Refused to boost: already boosting. */
179 unsigned long n_balk_notblocked;
180 /* Refused to boost: RCU RS CS still running. */
181 unsigned long n_balk_notyet;
182 /* Refused to boost: not yet time. */
183 unsigned long n_balk_nos;
184 /* Refused to boost: not sure why, though. */
185 /* This can happen due to race conditions. */
186#endif /* #ifdef CONFIG_RCU_BOOST */
187 struct task_struct *node_kthread_task;
188 /* kthread that takes care of this rcu_node */
189 /* structure, for example, awakening the */
190 /* per-CPU kthreads as needed. */
191 wait_queue_head_t node_wq;
192 /* Wait queue on which to park the per-node */
193 /* kthread. */
194 unsigned int node_kthread_status;
195 /* State of node_kthread_task for tracing. */
130} ____cacheline_internodealigned_in_smp; 196} ____cacheline_internodealigned_in_smp;
131 197
132/* 198/*
@@ -175,7 +241,7 @@ struct rcu_data {
175 bool passed_quiesc; /* User-mode/idle loop etc. */ 241 bool passed_quiesc; /* User-mode/idle loop etc. */
176 bool qs_pending; /* Core waits for quiesc state. */ 242 bool qs_pending; /* Core waits for quiesc state. */
177 bool beenonline; /* CPU online at least once. */ 243 bool beenonline; /* CPU online at least once. */
178 bool preemptable; /* Preemptable RCU? */ 244 bool preemptible; /* Preemptible RCU? */
179 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 245 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
180 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 246 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
181 247
@@ -254,7 +320,6 @@ struct rcu_data {
254#endif /* #else #ifdef CONFIG_NO_HZ */ 320#endif /* #else #ifdef CONFIG_NO_HZ */
255 321
256#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 322#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
257#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
258 323
259#ifdef CONFIG_PROVE_RCU 324#ifdef CONFIG_PROVE_RCU
260#define RCU_STALL_DELAY_DELTA (5 * HZ) 325#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -272,13 +337,6 @@ struct rcu_data {
272 /* scheduling clock irq */ 337 /* scheduling clock irq */
273 /* before ratting on them. */ 338 /* before ratting on them. */
274 339
275#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
276#define RCU_CPU_STALL_SUPPRESS_INIT 0
277#else
278#define RCU_CPU_STALL_SUPPRESS_INIT 1
279#endif
280
281#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
282 340
283/* 341/*
284 * RCU global state, including node hierarchy. This hierarchy is 342 * RCU global state, including node hierarchy. This hierarchy is
@@ -325,12 +383,12 @@ struct rcu_state {
325 /* due to lock unavailable. */ 383 /* due to lock unavailable. */
326 unsigned long n_force_qs_ngp; /* Number of calls leaving */ 384 unsigned long n_force_qs_ngp; /* Number of calls leaving */
327 /* due to no GP active. */ 385 /* due to no GP active. */
328#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
329 unsigned long gp_start; /* Time at which GP started, */ 386 unsigned long gp_start; /* Time at which GP started, */
330 /* but in jiffies. */ 387 /* but in jiffies. */
331 unsigned long jiffies_stall; /* Time at which to check */ 388 unsigned long jiffies_stall; /* Time at which to check */
332 /* for CPU stalls. */ 389 /* for CPU stalls. */
333#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 390 unsigned long gp_max; /* Maximum GP duration in */
391 /* jiffies. */
334 char *name; /* Name of structure. */ 392 char *name; /* Name of structure. */
335}; 393};
336 394
@@ -361,16 +419,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
361static void rcu_bootup_announce(void); 419static void rcu_bootup_announce(void);
362long rcu_batches_completed(void); 420long rcu_batches_completed(void);
363static void rcu_preempt_note_context_switch(int cpu); 421static void rcu_preempt_note_context_switch(int cpu);
364static int rcu_preempted_readers(struct rcu_node *rnp); 422static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
365#ifdef CONFIG_HOTPLUG_CPU 423#ifdef CONFIG_HOTPLUG_CPU
366static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 424static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
367 unsigned long flags); 425 unsigned long flags);
368#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 426#endif /* #ifdef CONFIG_HOTPLUG_CPU */
369#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
370static void rcu_print_detail_task_stall(struct rcu_state *rsp); 427static void rcu_print_detail_task_stall(struct rcu_state *rsp);
371static void rcu_print_task_stall(struct rcu_node *rnp); 428static void rcu_print_task_stall(struct rcu_node *rnp);
372static void rcu_preempt_stall_reset(void); 429static void rcu_preempt_stall_reset(void);
373#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
374static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 430static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
375#ifdef CONFIG_HOTPLUG_CPU 431#ifdef CONFIG_HOTPLUG_CPU
376static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 432static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
@@ -390,5 +446,13 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
390static void rcu_preempt_send_cbs_to_online(void); 446static void rcu_preempt_send_cbs_to_online(void);
391static void __init __rcu_init_preempt(void); 447static void __init __rcu_init_preempt(void);
392static void rcu_needs_cpu_flush(void); 448static void rcu_needs_cpu_flush(void);
449static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp);
450static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
451static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
452 cpumask_var_t cm);
453static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
454static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
455 struct rcu_node *rnp,
456 int rnp_index);
393 457
394#endif /* #ifndef RCU_TREE_NONCORE */ 458#endif /* #ifndef RCU_TREE_NONCORE */
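To make the rcutree.h changes above concrete: the four blocked_tasks[] lists collapse into a single ->blkd_tasks list, with ->gp_tasks and ->exp_tasks pointing at the first entry (if any) that blocks the current normal or expedited grace period; entries are added at the head and age toward the tail, so everything from ->gp_tasks to the tail blocks the current grace period. Below is a small userspace sketch of that bookkeeping, using a trimmed-down list type rather than the kernel's list_head; it models only the normal-GP pointer.

#include <stdio.h>

struct node { struct node *prev, *next; int pid; };

static struct node blkd_tasks = { &blkd_tasks, &blkd_tasks, 0 };  /* list head */
static struct node *gp_tasks;   /* first task blocking the current GP, or NULL */

static void list_insert(struct node *n, struct node *before)
{
	n->next = before;
	n->prev = before->prev;
	before->prev->next = n;
	before->prev = n;
}

/* A task blocks: if it blocks the current GP, queue it just before the
 * existing ->gp_tasks entry and make it the new ->gp_tasks; otherwise it
 * goes at the head, where it does not block this grace period. */
static void task_blocks(struct node *t, int blocks_current_gp)
{
	if (blocks_current_gp && gp_tasks) {
		list_insert(t, gp_tasks);
		gp_tasks = t;
	} else {
		list_insert(t, blkd_tasks.next);
		if (blocks_current_gp)
			gp_tasks = t;
	}
}

int main(void)
{
	static struct node a = { .pid = 1 }, b = { .pid = 2 }, c = { .pid = 3 };
	task_blocks(&a, 1);
	task_blocks(&b, 0);   /* blocked, but only after this GP started */
	task_blocks(&c, 1);
	for (struct node *n = blkd_tasks.next; n != &blkd_tasks; n = n->next)
		printf("pid %d%s\n", n->pid, n == gp_tasks ? "  <- gp_tasks" : "");
	return 0;
}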
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a3638710dc67..3f6559a5f5cd 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void)
54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
56#endif 56#endif
57#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
58 printk(KERN_INFO
59 "\tRCU-based detection of stalled CPUs is disabled.\n");
60#endif
61#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
62 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 58 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
63#endif 59#endif
@@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void)
70 66
71struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
72DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state;
73 70
74static int rcu_preempted_readers_exp(struct rcu_node *rnp); 71static int rcu_preempted_readers_exp(struct rcu_node *rnp);
75 72
@@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
78 */ 75 */
79static void __init rcu_bootup_announce(void) 76static void __init rcu_bootup_announce(void)
80{ 77{
81 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); 78 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
82 rcu_bootup_announce_oddness(); 79 rcu_bootup_announce_oddness();
83} 80}
84 81
@@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void)
111EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 108EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
112 109
113/* 110/*
114 * Record a preemptable-RCU quiescent state for the specified CPU. Note 111 * Record a preemptible-RCU quiescent state for the specified CPU. Note
115 * that this just means that the task currently running on the CPU is 112 * that this just means that the task currently running on the CPU is
116 * not in a quiescent state. There might be any number of tasks blocked 113 * not in a quiescent state. There might be any number of tasks blocked
117 * while in an RCU read-side critical section. 114 * while in an RCU read-side critical section.
@@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu)
134 * We have entered the scheduler, and the current task might soon be 131 * We have entered the scheduler, and the current task might soon be
135 * context-switched away from. If this task is in an RCU read-side 132 * context-switched away from. If this task is in an RCU read-side
136 * critical section, we will no longer be able to rely on the CPU to 133 * critical section, we will no longer be able to rely on the CPU to
137 * record that fact, so we enqueue the task on the appropriate entry 134 * record that fact, so we enqueue the task on the blkd_tasks list.
138 * of the blocked_tasks[] array. The task will dequeue itself when 135 * The task will dequeue itself when it exits the outermost enclosing
139 * it exits the outermost enclosing RCU read-side critical section. 136 * RCU read-side critical section. Therefore, the current grace period
140 * Therefore, the current grace period cannot be permitted to complete 137 * cannot be permitted to complete until the blkd_tasks list entries
141 * until the blocked_tasks[] entry indexed by the low-order bit of 138 * predating the current grace period drain, in other words, until
142 * rnp->gpnum empties. 139 * rnp->gp_tasks becomes NULL.
143 * 140 *
144 * Caller must disable preemption. 141 * Caller must disable preemption.
145 */ 142 */
@@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu)
147{ 144{
148 struct task_struct *t = current; 145 struct task_struct *t = current;
149 unsigned long flags; 146 unsigned long flags;
150 int phase;
151 struct rcu_data *rdp; 147 struct rcu_data *rdp;
152 struct rcu_node *rnp; 148 struct rcu_node *rnp;
153 149
@@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu)
169 * (i.e., this CPU has not yet passed through a quiescent 165 * (i.e., this CPU has not yet passed through a quiescent
170 * state for the current grace period), then as long 166 * state for the current grace period), then as long
171 * as that task remains queued, the current grace period 167 * as that task remains queued, the current grace period
172 * cannot end. 168 * cannot end. Note that there is some uncertainty as
169 * to exactly when the current grace period started.
170 * We take a conservative approach, which can result
171 * in unnecessarily waiting on tasks that started very
172 * slightly after the current grace period began. C'est
173 * la vie!!!
173 * 174 *
174 * But first, note that the current CPU must still be 175 * But first, note that the current CPU must still be
175 * on line! 176 * on line!
176 */ 177 */
177 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 178 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
178 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 179 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
179 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 180 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
180 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 181 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
182 rnp->gp_tasks = &t->rcu_node_entry;
183#ifdef CONFIG_RCU_BOOST
184 if (rnp->boost_tasks != NULL)
185 rnp->boost_tasks = rnp->gp_tasks;
186#endif /* #ifdef CONFIG_RCU_BOOST */
187 } else {
188 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
189 if (rnp->qsmask & rdp->grpmask)
190 rnp->gp_tasks = &t->rcu_node_entry;
191 }
181 raw_spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
182 } 193 }
183 194
@@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu)
196} 207}
197 208
198/* 209/*
199 * Tree-preemptable RCU implementation for rcu_read_lock(). 210 * Tree-preemptible RCU implementation for rcu_read_lock().
200 * Just increment ->rcu_read_lock_nesting, shared state will be updated 211 * Just increment ->rcu_read_lock_nesting, shared state will be updated
201 * if we block. 212 * if we block.
202 */ 213 */
@@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
212 * for the specified rcu_node structure. If the caller needs a reliable 223 * for the specified rcu_node structure. If the caller needs a reliable
213 * answer, it must hold the rcu_node's ->lock. 224 * answer, it must hold the rcu_node's ->lock.
214 */ 225 */
215static int rcu_preempted_readers(struct rcu_node *rnp) 226static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
216{ 227{
217 int phase = rnp->gpnum & 0x1; 228 return rnp->gp_tasks != NULL;
218
219 return !list_empty(&rnp->blocked_tasks[phase]) ||
220 !list_empty(&rnp->blocked_tasks[phase + 2]);
221} 229}
222 230
223/* 231/*
@@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
233 unsigned long mask; 241 unsigned long mask;
234 struct rcu_node *rnp_p; 242 struct rcu_node *rnp_p;
235 243
236 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 244 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
237 raw_spin_unlock_irqrestore(&rnp->lock, flags); 245 raw_spin_unlock_irqrestore(&rnp->lock, flags);
238 return; /* Still need more quiescent states! */ 246 return; /* Still need more quiescent states! */
239 } 247 }
@@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
257} 265}
258 266
259/* 267/*
 268 * Advance a ->blkd_tasks-list pointer to the next entry, returning
 269 * NULL instead if at the end of the list.
270 */
271static struct list_head *rcu_next_node_entry(struct task_struct *t,
272 struct rcu_node *rnp)
273{
274 struct list_head *np;
275
276 np = t->rcu_node_entry.next;
277 if (np == &rnp->blkd_tasks)
278 np = NULL;
279 return np;
280}
281
282/*
260 * Handle special cases during rcu_read_unlock(), such as needing to 283 * Handle special cases during rcu_read_unlock(), such as needing to
261 * notify RCU core processing or task having blocked during the RCU 284 * notify RCU core processing or task having blocked during the RCU
262 * read-side critical section. 285 * read-side critical section.
@@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
266 int empty; 289 int empty;
267 int empty_exp; 290 int empty_exp;
268 unsigned long flags; 291 unsigned long flags;
292 struct list_head *np;
269 struct rcu_node *rnp; 293 struct rcu_node *rnp;
270 int special; 294 int special;
271 295
@@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t)
306 break; 330 break;
307 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 331 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
308 } 332 }
309 empty = !rcu_preempted_readers(rnp); 333 empty = !rcu_preempt_blocked_readers_cgp(rnp);
310 empty_exp = !rcu_preempted_readers_exp(rnp); 334 empty_exp = !rcu_preempted_readers_exp(rnp);
311 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 335 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
336 np = rcu_next_node_entry(t, rnp);
312 list_del_init(&t->rcu_node_entry); 337 list_del_init(&t->rcu_node_entry);
338 if (&t->rcu_node_entry == rnp->gp_tasks)
339 rnp->gp_tasks = np;
340 if (&t->rcu_node_entry == rnp->exp_tasks)
341 rnp->exp_tasks = np;
342#ifdef CONFIG_RCU_BOOST
343 if (&t->rcu_node_entry == rnp->boost_tasks)
344 rnp->boost_tasks = np;
345#endif /* #ifdef CONFIG_RCU_BOOST */
313 t->rcu_blocked_node = NULL; 346 t->rcu_blocked_node = NULL;
314 347
315 /* 348 /*
@@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t)
322 else 355 else
323 rcu_report_unblock_qs_rnp(rnp, flags); 356 rcu_report_unblock_qs_rnp(rnp, flags);
324 357
358#ifdef CONFIG_RCU_BOOST
359 /* Unboost if we were boosted. */
360 if (special & RCU_READ_UNLOCK_BOOSTED) {
361 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
362 rt_mutex_unlock(t->rcu_boost_mutex);
363 t->rcu_boost_mutex = NULL;
364 }
365#endif /* #ifdef CONFIG_RCU_BOOST */
366
325 /* 367 /*
326 * If this was the last task on the expedited lists, 368 * If this was the last task on the expedited lists,
327 * then we need to report up the rcu_node hierarchy. 369 * then we need to report up the rcu_node hierarchy.
@@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
334} 376}
335 377
336/* 378/*
337 * Tree-preemptable RCU implementation for rcu_read_unlock(). 379 * Tree-preemptible RCU implementation for rcu_read_unlock().
338 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost 380 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
339 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then 381 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
340 * invoke rcu_read_unlock_special() to clean up after a context switch 382 * invoke rcu_read_unlock_special() to clean up after a context switch
@@ -356,8 +398,6 @@ void __rcu_read_unlock(void)
356} 398}
357EXPORT_SYMBOL_GPL(__rcu_read_unlock); 399EXPORT_SYMBOL_GPL(__rcu_read_unlock);
358 400
359#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
360
361#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 401#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
362 402
363/* 403/*
@@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
367static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 407static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
368{ 408{
369 unsigned long flags; 409 unsigned long flags;
370 struct list_head *lp;
371 int phase;
372 struct task_struct *t; 410 struct task_struct *t;
373 411
374 if (rcu_preempted_readers(rnp)) { 412 if (!rcu_preempt_blocked_readers_cgp(rnp))
375 raw_spin_lock_irqsave(&rnp->lock, flags); 413 return;
376 phase = rnp->gpnum & 0x1; 414 raw_spin_lock_irqsave(&rnp->lock, flags);
377 lp = &rnp->blocked_tasks[phase]; 415 t = list_entry(rnp->gp_tasks,
378 list_for_each_entry(t, lp, rcu_node_entry) 416 struct task_struct, rcu_node_entry);
379 sched_show_task(t); 417 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
380 raw_spin_unlock_irqrestore(&rnp->lock, flags); 418 sched_show_task(t);
381 } 419 raw_spin_unlock_irqrestore(&rnp->lock, flags);
382} 420}
383 421
384/* 422/*
@@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
408 */ 446 */
409static void rcu_print_task_stall(struct rcu_node *rnp) 447static void rcu_print_task_stall(struct rcu_node *rnp)
410{ 448{
411 struct list_head *lp;
412 int phase;
413 struct task_struct *t; 449 struct task_struct *t;
414 450
415 if (rcu_preempted_readers(rnp)) { 451 if (!rcu_preempt_blocked_readers_cgp(rnp))
416 phase = rnp->gpnum & 0x1; 452 return;
417 lp = &rnp->blocked_tasks[phase]; 453 t = list_entry(rnp->gp_tasks,
418 list_for_each_entry(t, lp, rcu_node_entry) 454 struct task_struct, rcu_node_entry);
419 printk(" P%d", t->pid); 455 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
420 } 456 printk(" P%d", t->pid);
421} 457}
422 458
423/* 459/*
@@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void)
430 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; 466 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
431} 467}
432 468
433#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
434
435/* 469/*
436 * Check that the list of blocked tasks for the newly completed grace 470 * Check that the list of blocked tasks for the newly completed grace
437 * period is in fact empty. It is a serious bug to complete a grace 471 * period is in fact empty. It is a serious bug to complete a grace
438 * period that still has RCU readers blocked! This function must be 472 * period that still has RCU readers blocked! This function must be
439 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock 473 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
440 * must be held by the caller. 474 * must be held by the caller.
475 *
476 * Also, if there are blocked tasks on the list, they automatically
477 * block the newly created grace period, so set up ->gp_tasks accordingly.
441 */ 478 */
442static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 479static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
443{ 480{
444 WARN_ON_ONCE(rcu_preempted_readers(rnp)); 481 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
482 if (!list_empty(&rnp->blkd_tasks))
483 rnp->gp_tasks = rnp->blkd_tasks.next;
445 WARN_ON_ONCE(rnp->qsmask); 484 WARN_ON_ONCE(rnp->qsmask);
446} 485}
447 486
@@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
465 struct rcu_node *rnp, 504 struct rcu_node *rnp,
466 struct rcu_data *rdp) 505 struct rcu_data *rdp)
467{ 506{
468 int i;
469 struct list_head *lp; 507 struct list_head *lp;
470 struct list_head *lp_root; 508 struct list_head *lp_root;
471 int retval = 0; 509 int retval = 0;
472 struct rcu_node *rnp_root = rcu_get_root(rsp); 510 struct rcu_node *rnp_root = rcu_get_root(rsp);
473 struct task_struct *tp; 511 struct task_struct *t;
474 512
475 if (rnp == rnp_root) { 513 if (rnp == rnp_root) {
476 WARN_ONCE(1, "Last CPU thought to be offlined?"); 514 WARN_ONCE(1, "Last CPU thought to be offlined?");
477 return 0; /* Shouldn't happen: at least one CPU online. */ 515 return 0; /* Shouldn't happen: at least one CPU online. */
478 } 516 }
479 WARN_ON_ONCE(rnp != rdp->mynode && 517
480 (!list_empty(&rnp->blocked_tasks[0]) || 518 /* If we are on an internal node, complain bitterly. */
481 !list_empty(&rnp->blocked_tasks[1]) || 519 WARN_ON_ONCE(rnp != rdp->mynode);
482 !list_empty(&rnp->blocked_tasks[2]) ||
483 !list_empty(&rnp->blocked_tasks[3])));
484 520
485 /* 521 /*
486 * Move tasks up to root rcu_node. Rely on the fact that the 522 * Move tasks up to root rcu_node. Don't try to get fancy for
487 * root rcu_node can be at most one ahead of the rest of the 523 * this corner-case operation -- just put this node's tasks
488 * rcu_nodes in terms of gp_num value. This fact allows us to 524 * at the head of the root node's list, and update the root node's
489 * move the blocked_tasks[] array directly, element by element. 525 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
526 * if non-NULL. This might result in waiting for more tasks than
527 * absolutely necessary, but this is a good performance/complexity
528 * tradeoff.
490 */ 529 */
491 if (rcu_preempted_readers(rnp)) 530 if (rcu_preempt_blocked_readers_cgp(rnp))
492 retval |= RCU_OFL_TASKS_NORM_GP; 531 retval |= RCU_OFL_TASKS_NORM_GP;
493 if (rcu_preempted_readers_exp(rnp)) 532 if (rcu_preempted_readers_exp(rnp))
494 retval |= RCU_OFL_TASKS_EXP_GP; 533 retval |= RCU_OFL_TASKS_EXP_GP;
495 for (i = 0; i < 4; i++) { 534 lp = &rnp->blkd_tasks;
496 lp = &rnp->blocked_tasks[i]; 535 lp_root = &rnp_root->blkd_tasks;
497 lp_root = &rnp_root->blocked_tasks[i]; 536 while (!list_empty(lp)) {
498 while (!list_empty(lp)) { 537 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
499 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 538 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
500 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 539 list_del(&t->rcu_node_entry);
501 list_del(&tp->rcu_node_entry); 540 t->rcu_blocked_node = rnp_root;
502 tp->rcu_blocked_node = rnp_root; 541 list_add(&t->rcu_node_entry, lp_root);
503 list_add(&tp->rcu_node_entry, lp_root); 542 if (&t->rcu_node_entry == rnp->gp_tasks)
504 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 543 rnp_root->gp_tasks = rnp->gp_tasks;
505 } 544 if (&t->rcu_node_entry == rnp->exp_tasks)
545 rnp_root->exp_tasks = rnp->exp_tasks;
546#ifdef CONFIG_RCU_BOOST
547 if (&t->rcu_node_entry == rnp->boost_tasks)
548 rnp_root->boost_tasks = rnp->boost_tasks;
549#endif /* #ifdef CONFIG_RCU_BOOST */
550 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
506 } 551 }
552
553#ifdef CONFIG_RCU_BOOST
554 /* In case root is being boosted and leaf is not. */
555 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
556 if (rnp_root->boost_tasks != NULL &&
557 rnp_root->boost_tasks != rnp_root->gp_tasks)
558 rnp_root->boost_tasks = rnp_root->gp_tasks;
559 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
560#endif /* #ifdef CONFIG_RCU_BOOST */
561
562 rnp->gp_tasks = NULL;
563 rnp->exp_tasks = NULL;
507 return retval; 564 return retval;
508} 565}
509 566
510/* 567/*
511 * Do CPU-offline processing for preemptable RCU. 568 * Do CPU-offline processing for preemptible RCU.
512 */ 569 */
513static void rcu_preempt_offline_cpu(int cpu) 570static void rcu_preempt_offline_cpu(int cpu)
514{ 571{
@@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu)
537} 594}
538 595
539/* 596/*
540 * Process callbacks for preemptable RCU. 597 * Process callbacks for preemptible RCU.
541 */ 598 */
542static void rcu_preempt_process_callbacks(void) 599static void rcu_preempt_process_callbacks(void)
543{ 600{
@@ -546,7 +603,7 @@ static void rcu_preempt_process_callbacks(void)
546} 603}
547 604
548/* 605/*
549 * Queue a preemptable-RCU callback for invocation after a grace period. 606 * Queue a preemptible-RCU callback for invocation after a grace period.
550 */ 607 */
551void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 608void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
552{ 609{
@@ -594,8 +651,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
594 */ 651 */
595static int rcu_preempted_readers_exp(struct rcu_node *rnp) 652static int rcu_preempted_readers_exp(struct rcu_node *rnp)
596{ 653{
597 return !list_empty(&rnp->blocked_tasks[2]) || 654 return rnp->exp_tasks != NULL;
598 !list_empty(&rnp->blocked_tasks[3]);
599} 655}
600 656
601/* 657/*
@@ -655,13 +711,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
655static void 711static void
656sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 712sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
657{ 713{
658 int must_wait; 714 unsigned long flags;
715 int must_wait = 0;
659 716
660 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 717 raw_spin_lock_irqsave(&rnp->lock, flags);
661 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 718 if (list_empty(&rnp->blkd_tasks))
662 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 719 raw_spin_unlock_irqrestore(&rnp->lock, flags);
663 must_wait = rcu_preempted_readers_exp(rnp); 720 else {
664 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 721 rnp->exp_tasks = rnp->blkd_tasks.next;
722 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
723 must_wait = 1;
724 }
665 if (!must_wait) 725 if (!must_wait)
666 rcu_report_exp_rnp(rsp, rnp); 726 rcu_report_exp_rnp(rsp, rnp);
667} 727}
@@ -669,9 +729,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
669/* 729/*
670 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 730 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
671 * is to invoke synchronize_sched_expedited() to push all the tasks to 731 * is to invoke synchronize_sched_expedited() to push all the tasks to
672 * the ->blocked_tasks[] lists, move all entries from the first set of 732 * the ->blkd_tasks lists and wait for this list to drain.
673 * ->blocked_tasks[] lists to the second set, and finally wait for this
674 * second set to drain.
675 */ 733 */
676void synchronize_rcu_expedited(void) 734void synchronize_rcu_expedited(void)
677{ 735{
@@ -703,7 +761,7 @@ void synchronize_rcu_expedited(void)
703 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 761 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
704 goto unlock_mb_ret; /* Others did our work for us. */ 762 goto unlock_mb_ret; /* Others did our work for us. */
705 763
706 /* force all RCU readers onto blocked_tasks[]. */ 764 /* force all RCU readers onto ->blkd_tasks lists. */
707 synchronize_sched_expedited(); 765 synchronize_sched_expedited();
708 766
709 raw_spin_lock_irqsave(&rsp->onofflock, flags); 767 raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -715,7 +773,7 @@ void synchronize_rcu_expedited(void)
715 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 773 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
716 } 774 }
717 775
718 /* Snapshot current state of ->blocked_tasks[] lists. */ 776 /* Snapshot current state of ->blkd_tasks lists. */
719 rcu_for_each_leaf_node(rsp, rnp) 777 rcu_for_each_leaf_node(rsp, rnp)
720 sync_rcu_preempt_exp_init(rsp, rnp); 778 sync_rcu_preempt_exp_init(rsp, rnp);
721 if (NUM_RCU_NODES > 1) 779 if (NUM_RCU_NODES > 1)
@@ -723,7 +781,7 @@ void synchronize_rcu_expedited(void)
723 781
724 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 782 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
725 783
726 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 784 /* Wait for snapshotted ->blkd_tasks lists to drain. */
727 rnp = rcu_get_root(rsp); 785 rnp = rcu_get_root(rsp);
728 wait_event(sync_rcu_preempt_exp_wq, 786 wait_event(sync_rcu_preempt_exp_wq,
729 sync_rcu_preempt_exp_done(rnp)); 787 sync_rcu_preempt_exp_done(rnp));
@@ -739,7 +797,7 @@ mb_ret:
739EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 797EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
740 798
741/* 799/*
742 * Check to see if there is any immediate preemptable-RCU-related work 800 * Check to see if there is any immediate preemptible-RCU-related work
743 * to be done. 801 * to be done.
744 */ 802 */
745static int rcu_preempt_pending(int cpu) 803static int rcu_preempt_pending(int cpu)
@@ -749,7 +807,7 @@ static int rcu_preempt_pending(int cpu)
749} 807}
750 808
751/* 809/*
752 * Does preemptable RCU need the CPU to stay out of dynticks mode? 810 * Does preemptible RCU need the CPU to stay out of dynticks mode?
753 */ 811 */
754static int rcu_preempt_needs_cpu(int cpu) 812static int rcu_preempt_needs_cpu(int cpu)
755{ 813{
@@ -766,7 +824,7 @@ void rcu_barrier(void)
766EXPORT_SYMBOL_GPL(rcu_barrier); 824EXPORT_SYMBOL_GPL(rcu_barrier);
767 825
768/* 826/*
769 * Initialize preemptable RCU's per-CPU data. 827 * Initialize preemptible RCU's per-CPU data.
770 */ 828 */
771static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 829static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
772{ 830{
@@ -774,7 +832,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
774} 832}
775 833
776/* 834/*
777 * Move preemptable RCU's callbacks from dying CPU to other online CPU. 835 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
778 */ 836 */
779static void rcu_preempt_send_cbs_to_online(void) 837static void rcu_preempt_send_cbs_to_online(void)
780{ 838{
@@ -782,7 +840,7 @@ static void rcu_preempt_send_cbs_to_online(void)
782} 840}
783 841
784/* 842/*
785 * Initialize preemptable RCU's state structures. 843 * Initialize preemptible RCU's state structures.
786 */ 844 */
787static void __init __rcu_init_preempt(void) 845static void __init __rcu_init_preempt(void)
788{ 846{
@@ -790,7 +848,7 @@ static void __init __rcu_init_preempt(void)
790} 848}
791 849
792/* 850/*
793 * Check for a task exiting while in a preemptable-RCU read-side 851 * Check for a task exiting while in a preemptible-RCU read-side
794 * critical section, clean up if so. No need to issue warnings, 852 * critical section, clean up if so. No need to issue warnings,
795 * as debug_check_no_locks_held() already does this if lockdep 853 * as debug_check_no_locks_held() already does this if lockdep
796 * is enabled. 854 * is enabled.
@@ -802,11 +860,13 @@ void exit_rcu(void)
802 if (t->rcu_read_lock_nesting == 0) 860 if (t->rcu_read_lock_nesting == 0)
803 return; 861 return;
804 t->rcu_read_lock_nesting = 1; 862 t->rcu_read_lock_nesting = 1;
805 rcu_read_unlock(); 863 __rcu_read_unlock();
806} 864}
807 865
808#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 866#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
809 867
868static struct rcu_state *rcu_state = &rcu_sched_state;
869
810/* 870/*
811 * Tell them what RCU they are running. 871 * Tell them what RCU they are running.
812 */ 872 */
@@ -836,7 +896,7 @@ void rcu_force_quiescent_state(void)
836EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 896EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
837 897
838/* 898/*
839 * Because preemptable RCU does not exist, we never have to check for 899 * Because preemptible RCU does not exist, we never have to check for
840 * CPUs being in quiescent states. 900 * CPUs being in quiescent states.
841 */ 901 */
842static void rcu_preempt_note_context_switch(int cpu) 902static void rcu_preempt_note_context_switch(int cpu)
@@ -844,10 +904,10 @@ static void rcu_preempt_note_context_switch(int cpu)
844} 904}
845 905
846/* 906/*
847 * Because preemptable RCU does not exist, there are never any preempted 907 * Because preemptible RCU does not exist, there are never any preempted
848 * RCU readers. 908 * RCU readers.
849 */ 909 */
850static int rcu_preempted_readers(struct rcu_node *rnp) 910static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
851{ 911{
852 return 0; 912 return 0;
853} 913}
@@ -862,10 +922,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
862 922
863#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 923#endif /* #ifdef CONFIG_HOTPLUG_CPU */
864 924
865#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
866
867/* 925/*
868 * Because preemptable RCU does not exist, we never have to check for 926 * Because preemptible RCU does not exist, we never have to check for
869 * tasks blocked within RCU read-side critical sections. 927 * tasks blocked within RCU read-side critical sections.
870 */ 928 */
871static void rcu_print_detail_task_stall(struct rcu_state *rsp) 929static void rcu_print_detail_task_stall(struct rcu_state *rsp)
@@ -873,7 +931,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
873} 931}
874 932
875/* 933/*
876 * Because preemptable RCU does not exist, we never have to check for 934 * Because preemptible RCU does not exist, we never have to check for
877 * tasks blocked within RCU read-side critical sections. 935 * tasks blocked within RCU read-side critical sections.
878 */ 936 */
879static void rcu_print_task_stall(struct rcu_node *rnp) 937static void rcu_print_task_stall(struct rcu_node *rnp)
@@ -888,10 +946,8 @@ static void rcu_preempt_stall_reset(void)
888{ 946{
889} 947}
890 948
891#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
892
893/* 949/*
894 * Because there is no preemptable RCU, there can be no readers blocked, 950 * Because there is no preemptible RCU, there can be no readers blocked,
895 * so there is no need to check for blocked tasks. So check only for 951 * so there is no need to check for blocked tasks. So check only for
896 * bogus qsmask values. 952 * bogus qsmask values.
897 */ 953 */
@@ -903,7 +959,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
903#ifdef CONFIG_HOTPLUG_CPU 959#ifdef CONFIG_HOTPLUG_CPU
904 960
905/* 961/*
906 * Because preemptable RCU does not exist, it never needs to migrate 962 * Because preemptible RCU does not exist, it never needs to migrate
907 * tasks that were blocked within RCU read-side critical sections, and 963 * tasks that were blocked within RCU read-side critical sections, and
908 * such non-existent tasks cannot possibly have been blocking the current 964 * such non-existent tasks cannot possibly have been blocking the current
909 * grace period. 965 * grace period.
@@ -916,7 +972,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
916} 972}
917 973
918/* 974/*
919 * Because preemptable RCU does not exist, it never needs CPU-offline 975 * Because preemptible RCU does not exist, it never needs CPU-offline
920 * processing. 976 * processing.
921 */ 977 */
922static void rcu_preempt_offline_cpu(int cpu) 978static void rcu_preempt_offline_cpu(int cpu)
@@ -926,7 +982,7 @@ static void rcu_preempt_offline_cpu(int cpu)
926#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 982#endif /* #ifdef CONFIG_HOTPLUG_CPU */
927 983
928/* 984/*
929 * Because preemptable RCU does not exist, it never has any callbacks 985 * Because preemptible RCU does not exist, it never has any callbacks
930 * to check. 986 * to check.
931 */ 987 */
932static void rcu_preempt_check_callbacks(int cpu) 988static void rcu_preempt_check_callbacks(int cpu)
@@ -934,7 +990,7 @@ static void rcu_preempt_check_callbacks(int cpu)
934} 990}
935 991
936/* 992/*
937 * Because preemptable RCU does not exist, it never has any callbacks 993 * Because preemptible RCU does not exist, it never has any callbacks
938 * to process. 994 * to process.
939 */ 995 */
940static void rcu_preempt_process_callbacks(void) 996static void rcu_preempt_process_callbacks(void)
@@ -943,7 +999,7 @@ static void rcu_preempt_process_callbacks(void)
943 999
944/* 1000/*
945 * Wait for an rcu-preempt grace period, but make it happen quickly. 1001 * Wait for an rcu-preempt grace period, but make it happen quickly.
946 * But because preemptable RCU does not exist, map to rcu-sched. 1002 * But because preemptible RCU does not exist, map to rcu-sched.
947 */ 1003 */
948void synchronize_rcu_expedited(void) 1004void synchronize_rcu_expedited(void)
949{ 1005{
@@ -954,7 +1010,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
954#ifdef CONFIG_HOTPLUG_CPU 1010#ifdef CONFIG_HOTPLUG_CPU
955 1011
956/* 1012/*
957 * Because preemptable RCU does not exist, there is never any need to 1013 * Because preemptible RCU does not exist, there is never any need to
958 * report on tasks preempted in RCU read-side critical sections during 1014 * report on tasks preempted in RCU read-side critical sections during
959 * expedited RCU grace periods. 1015 * expedited RCU grace periods.
960 */ 1016 */
@@ -966,7 +1022,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
966#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1022#endif /* #ifdef CONFIG_HOTPLUG_CPU */
967 1023
968/* 1024/*
969 * Because preemptable RCU does not exist, it never has any work to do. 1025 * Because preemptible RCU does not exist, it never has any work to do.
970 */ 1026 */
971static int rcu_preempt_pending(int cpu) 1027static int rcu_preempt_pending(int cpu)
972{ 1028{
@@ -974,7 +1030,7 @@ static int rcu_preempt_pending(int cpu)
974} 1030}
975 1031
976/* 1032/*
977 * Because preemptable RCU does not exist, it never needs any CPU. 1033 * Because preemptible RCU does not exist, it never needs any CPU.
978 */ 1034 */
979static int rcu_preempt_needs_cpu(int cpu) 1035static int rcu_preempt_needs_cpu(int cpu)
980{ 1036{
@@ -982,7 +1038,7 @@ static int rcu_preempt_needs_cpu(int cpu)
982} 1038}
983 1039
984/* 1040/*
985 * Because preemptable RCU does not exist, rcu_barrier() is just 1041 * Because preemptible RCU does not exist, rcu_barrier() is just
986 * another name for rcu_barrier_sched(). 1042 * another name for rcu_barrier_sched().
987 */ 1043 */
988void rcu_barrier(void) 1044void rcu_barrier(void)
@@ -992,7 +1048,7 @@ void rcu_barrier(void)
992EXPORT_SYMBOL_GPL(rcu_barrier); 1048EXPORT_SYMBOL_GPL(rcu_barrier);
993 1049
994/* 1050/*
995 * Because preemptable RCU does not exist, there is no per-CPU 1051 * Because preemptible RCU does not exist, there is no per-CPU
996 * data to initialize. 1052 * data to initialize.
997 */ 1053 */
998static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 1054static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
@@ -1000,14 +1056,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1000} 1056}
1001 1057
1002/* 1058/*
1003 * Because there is no preemptable RCU, there are no callbacks to move. 1059 * Because there is no preemptible RCU, there are no callbacks to move.
1004 */ 1060 */
1005static void rcu_preempt_send_cbs_to_online(void) 1061static void rcu_preempt_send_cbs_to_online(void)
1006{ 1062{
1007} 1063}
1008 1064
1009/* 1065/*
1010 * Because preemptable RCU does not exist, it need not be initialized. 1066 * Because preemptible RCU does not exist, it need not be initialized.
1011 */ 1067 */
1012static void __init __rcu_init_preempt(void) 1068static void __init __rcu_init_preempt(void)
1013{ 1069{
@@ -1015,6 +1071,276 @@ static void __init __rcu_init_preempt(void)
1015 1071
1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1072#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1017 1073
1074#ifdef CONFIG_RCU_BOOST
1075
1076#include "rtmutex_common.h"
1077
1078#ifdef CONFIG_RCU_TRACE
1079
1080static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1081{
1082 if (list_empty(&rnp->blkd_tasks))
1083 rnp->n_balk_blkd_tasks++;
1084 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1085 rnp->n_balk_exp_gp_tasks++;
1086 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1087 rnp->n_balk_boost_tasks++;
1088 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1089 rnp->n_balk_notblocked++;
1090 else if (rnp->gp_tasks != NULL &&
1091 ULONG_CMP_LT(jiffies, rnp->boost_time))
1092 rnp->n_balk_notyet++;
1093 else
1094 rnp->n_balk_nos++;
1095}
1096
1097#else /* #ifdef CONFIG_RCU_TRACE */
1098
1099static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1100{
1101}
1102
1103#endif /* #else #ifdef CONFIG_RCU_TRACE */
1104
1105/*
1106 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1107 * or ->boost_tasks, advancing the pointer to the next task in the
1108 * ->blkd_tasks list.
1109 *
1110 * Note that irqs must be enabled: boosting the task can block.
1111 * Returns 1 if there are more tasks needing to be boosted.
1112 */
1113static int rcu_boost(struct rcu_node *rnp)
1114{
1115 unsigned long flags;
1116 struct rt_mutex mtx;
1117 struct task_struct *t;
1118 struct list_head *tb;
1119
1120 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1121 return 0; /* Nothing left to boost. */
1122
1123 raw_spin_lock_irqsave(&rnp->lock, flags);
1124
1125 /*
1126 * Recheck under the lock: all tasks in need of boosting
1127 * might exit their RCU read-side critical sections on their own.
1128 */
1129 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1130 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1131 return 0;
1132 }
1133
1134 /*
1135 * Preferentially boost tasks blocking expedited grace periods.
1136 * This cannot starve the normal grace periods because a second
1137 * expedited grace period must boost all blocked tasks, including
1138 * those blocking the pre-existing normal grace period.
1139 */
1140 if (rnp->exp_tasks != NULL) {
1141 tb = rnp->exp_tasks;
1142 rnp->n_exp_boosts++;
1143 } else {
1144 tb = rnp->boost_tasks;
1145 rnp->n_normal_boosts++;
1146 }
1147 rnp->n_tasks_boosted++;
1148
1149 /*
1150 * We boost task t by manufacturing an rt_mutex that appears to
1151 * be held by task t. We leave a pointer to that rt_mutex where
1152 * task t can find it, and task t will release the mutex when it
1153 * exits its outermost RCU read-side critical section. Then
1154 * simply acquiring this artificial rt_mutex will boost task
1155 * t's priority. (Thanks to tglx for suggesting this approach!)
1156 *
1157 * Note that task t must acquire rnp->lock to remove itself from
1158 * the ->blkd_tasks list, which it will do from exit() if from
1159 * nowhere else. We therefore are guaranteed that task t will
1160 * stay around at least until we drop rnp->lock. Note that
1161 * rnp->lock also resolves races between our priority boosting
1162 * and task t's exiting its outermost RCU read-side critical
1163 * section.
1164 */
1165 t = container_of(tb, struct task_struct, rcu_node_entry);
1166 rt_mutex_init_proxy_locked(&mtx, t);
1167 t->rcu_boost_mutex = &mtx;
1168 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
1169 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1170 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1171 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1172
1173 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1174}
1175
1176/*
1177 * Timer handler to initiate waking up of boost kthreads that
1178 * have yielded the CPU due to excessive numbers of tasks to
1179 * boost. We wake up the per-rcu_node kthread, which in turn
1180 * will wake up the booster kthread.
1181 */
1182static void rcu_boost_kthread_timer(unsigned long arg)
1183{
1184 invoke_rcu_node_kthread((struct rcu_node *)arg);
1185}
1186
1187/*
1188 * Priority-boosting kthread. One per leaf rcu_node and one for the
1189 * root rcu_node.
1190 */
1191static int rcu_boost_kthread(void *arg)
1192{
1193 struct rcu_node *rnp = (struct rcu_node *)arg;
1194 int spincnt = 0;
1195 int more2boost;
1196
1197 for (;;) {
1198 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1199 wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks ||
1200 rnp->exp_tasks);
1201 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1202 more2boost = rcu_boost(rnp);
1203 if (more2boost)
1204 spincnt++;
1205 else
1206 spincnt = 0;
1207 if (spincnt > 10) {
1208 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1209 spincnt = 0;
1210 }
1211 }
1212 /* NOTREACHED */
1213 return 0;
1214}
1215
1216/*
1217 * Check to see if it is time to start boosting RCU readers that are
1218 * blocking the current grace period, and, if so, tell the per-rcu_node
1219 * kthread to start boosting them. If there is an expedited grace
1220 * period in progress, it is always time to boost.
1221 *
1222 * The caller must hold rnp->lock, which this function releases,
1223 * but irqs remain disabled. The ->boost_kthread_task is immortal,
1224 * so we don't need to worry about it going away.
1225 */
1226static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1227{
1228 struct task_struct *t;
1229
1230 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1231 rnp->n_balk_exp_gp_tasks++;
1232 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1233 return;
1234 }
1235 if (rnp->exp_tasks != NULL ||
1236 (rnp->gp_tasks != NULL &&
1237 rnp->boost_tasks == NULL &&
1238 rnp->qsmask == 0 &&
1239 ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1240 if (rnp->exp_tasks == NULL)
1241 rnp->boost_tasks = rnp->gp_tasks;
1242 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1243 t = rnp->boost_kthread_task;
1244 if (t != NULL)
1245 wake_up_process(t);
1246 } else {
1247 rcu_initiate_boost_trace(rnp);
1248 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1249 }
1250}
1251
1252/*
1253 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1254 * held, so no one should be messing with the existence of the boost
1255 * kthread.
1256 */
1257static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1258 cpumask_var_t cm)
1259{
1260 struct task_struct *t;
1261
1262 t = rnp->boost_kthread_task;
1263 if (t != NULL)
1264 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1265}
1266
1267#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1268
1269/*
1270 * Do priority-boost accounting for the start of a new grace period.
1271 */
1272static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1273{
1274 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1275}
1276
1277/*
1278 * Initialize the RCU-boost waitqueue.
1279 */
1280static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1281{
1282 init_waitqueue_head(&rnp->boost_wq);
1283}
1284
1285/*
1286 * Create an RCU-boost kthread for the specified node if one does not
1287 * already exist. We only create this kthread for preemptible RCU.
1288 * Returns zero if all is well, a negated errno otherwise.
1289 */
1290static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1291 struct rcu_node *rnp,
1292 int rnp_index)
1293{
1294 unsigned long flags;
1295 struct sched_param sp;
1296 struct task_struct *t;
1297
1298 if (&rcu_preempt_state != rsp)
1299 return 0;
1300 if (rnp->boost_kthread_task != NULL)
1301 return 0;
1302 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1303 "rcub%d", rnp_index);
1304 if (IS_ERR(t))
1305 return PTR_ERR(t);
1306 raw_spin_lock_irqsave(&rnp->lock, flags);
1307 rnp->boost_kthread_task = t;
1308 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1309 wake_up_process(t);
1310 sp.sched_priority = RCU_KTHREAD_PRIO;
1311 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1312 return 0;
1313}
1314
1315#else /* #ifdef CONFIG_RCU_BOOST */
1316
1317static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1318{
1319 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1320}
1321
1322static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1323 cpumask_var_t cm)
1324{
1325}
1326
1327static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1328{
1329}
1330
1331static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1332{
1333}
1334
1335static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1336 struct rcu_node *rnp,
1337 int rnp_index)
1338{
1339 return 0;
1340}
1341
1342#endif /* #else #ifdef CONFIG_RCU_BOOST */
1343
1018#ifndef CONFIG_SMP 1344#ifndef CONFIG_SMP
1019 1345
1020void synchronize_sched_expedited(void) 1346void synchronize_sched_expedited(void)
@@ -1187,8 +1513,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1187 * 1513 *
1188 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1514 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1189 * disabled, we do one pass of force_quiescent_state(), then do a 1515 * disabled, we do one pass of force_quiescent_state(), then do a
1190 * raise_softirq() to cause rcu_process_callbacks() to be invoked later. 1516 * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked
1191 * The per-cpu rcu_dyntick_drain variable controls the sequencing. 1517 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1192 */ 1518 */
1193int rcu_needs_cpu(int cpu) 1519int rcu_needs_cpu(int cpu)
1194{ 1520{
@@ -1239,7 +1565,7 @@ int rcu_needs_cpu(int cpu)
1239 1565
1240 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1566 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1241 if (c) 1567 if (c)
1242 raise_softirq(RCU_SOFTIRQ); 1568 invoke_rcu_cpu_kthread();
1243 return c; 1569 return c;
1244} 1570}
1245 1571
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c8e97853b970..aa0fd72b4bc7 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,18 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
50DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
52DECLARE_PER_CPU(char, rcu_cpu_has_work);
53
54static char convert_kthread_status(unsigned int kthread_status)
55{
56 if (kthread_status > RCU_KTHREAD_MAX)
57 return '?';
58 return "SRWOY"[kthread_status];
59}
60
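convert_kthread_status() maps the per-CPU kthread state to a single character by indexing a string literal. The letters appear to correspond to Stopped/Running/Waiting/Offcpu/Yielding; the enum names in this sketch are assumptions, not the kernel's definitions:

#include <stdio.h>

enum { KT_STOPPED, KT_RUNNING, KT_WAITING, KT_OFFCPU, KT_YIELDING,
       KT_MAX = KT_YIELDING };	/* hypothetical mirror of RCU_KTHREAD_* */

static char status_char(unsigned int s)
{
	if (s > KT_MAX)
		return '?';
	return "SRWOY"[s];	/* a string literal is an array: index it */
}

int main(void)
{
	/* prints: R Y ? */
	printf("%c %c %c\n", status_char(KT_RUNNING),
	       status_char(KT_YIELDING), status_char(42));
	return 0;
}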
49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 61static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 62{
51 if (!rdp->beenonline) 63 if (!rdp->beenonline)
@@ -64,7 +76,21 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
64 rdp->dynticks_fqs); 76 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 77#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 78 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); 79 seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld",
80 rdp->qlen,
81 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
82 rdp->nxttail[RCU_NEXT_TAIL]],
83 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
84 rdp->nxttail[RCU_NEXT_READY_TAIL]],
85 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
86 rdp->nxttail[RCU_WAIT_TAIL]],
87 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
88 per_cpu(rcu_cpu_has_work, rdp->cpu),
89 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
90 rdp->cpu)),
91 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
92 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff,
93 rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 94 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 95 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
70} 96}
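The new qs= field relies on the ".X"[cond] idiom: a two-character string indexed by a 0/1 expression prints '.' when the condition is false and the flag letter when it is true. A two-line illustration:

#include <stdio.h>

int main(void)
{
	int next_ready = 1, waiting = 0;

	/* prints: qs=N. */
	printf("qs=%c%c\n", ".N"[next_ready != 0], ".R"[waiting != 0]);
	return 0;
}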
@@ -121,7 +147,18 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
121 rdp->dynticks_fqs); 147 rdp->dynticks_fqs);
122#endif /* #ifdef CONFIG_NO_HZ */ 148#endif /* #ifdef CONFIG_NO_HZ */
123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 149 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); 150 seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen,
151 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
152 rdp->nxttail[RCU_NEXT_TAIL]],
153 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
154 rdp->nxttail[RCU_NEXT_READY_TAIL]],
155 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
156 rdp->nxttail[RCU_WAIT_TAIL]],
157 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
158 per_cpu(rcu_cpu_has_work, rdp->cpu),
159 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
160 rdp->cpu)),
161 rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n", 162 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 163 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
127} 164}
@@ -157,11 +194,76 @@ static const struct file_operations rcudata_csv_fops = {
157 .release = single_release, 194 .release = single_release,
158}; 195};
159 196
197#ifdef CONFIG_RCU_BOOST
198
199static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
200{
201 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
202 "j=%04x bt=%04x\n",
203 rnp->grplo, rnp->grphi,
204 "T."[list_empty(&rnp->blkd_tasks)],
205 "N."[!rnp->gp_tasks],
206 "E."[!rnp->exp_tasks],
207 "B."[!rnp->boost_tasks],
208 convert_kthread_status(rnp->boost_kthread_status),
209 rnp->n_tasks_boosted, rnp->n_exp_boosts,
210 rnp->n_normal_boosts,
211 (int)(jiffies & 0xffff),
212 (int)(rnp->boost_time & 0xffff));
213 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
214 " balk",
215 rnp->n_balk_blkd_tasks,
216 rnp->n_balk_exp_gp_tasks,
217 rnp->n_balk_boost_tasks,
218 rnp->n_balk_notblocked,
219 rnp->n_balk_notyet,
220 rnp->n_balk_nos);
221}
222
223static int show_rcu_node_boost(struct seq_file *m, void *unused)
224{
225 struct rcu_node *rnp;
226
227 rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
228 print_one_rcu_node_boost(m, rnp);
229 return 0;
230}
231
232static int rcu_node_boost_open(struct inode *inode, struct file *file)
233{
234 return single_open(file, show_rcu_node_boost, NULL);
235}
236
237static const struct file_operations rcu_node_boost_fops = {
238 .owner = THIS_MODULE,
239 .open = rcu_node_boost_open,
240 .read = seq_read,
241 .llseek = seq_lseek,
242 .release = single_release,
243};
244
245/*
246 * Create the rcuboost debugfs entry. Standard error return.
247 */
248static int rcu_boost_trace_create_file(struct dentry *rcudir)
249{
250 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
251 &rcu_node_boost_fops);
252}
253
254#else /* #ifdef CONFIG_RCU_BOOST */
255
256static int rcu_boost_trace_create_file(struct dentry *rcudir)
257{
258 return 0; /* There cannot be an error if we didn't create it! */
259}
260
261#endif /* #else #ifdef CONFIG_RCU_BOOST */
262
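rcu_boost_trace_create_file() gets its "standard error return" by negating the pointer: debugfs_create_file() yields NULL on failure, so the wrapper returns 0 on success and nonzero on failure, which is what the caller's "goto free_out" check expects. A stand-alone sketch of the same inversion (fake_create_file() is a stand-in, not a real API):

#include <stdio.h>

static void *fake_create_file(const char *name, int fail)
{
	static int handle;

	(void)name;
	return fail ? NULL : &handle;	/* NULL models a debugfs failure */
}

static int create_trace_file(int fail)
{
	return !fake_create_file("rcuboost", fail);	/* 0 = ok, 1 = error */
}

int main(void)
{
	/* prints: ok=0 failed=1 */
	printf("ok=%d failed=%d\n", create_trace_file(0), create_trace_file(1));
	return 0;
}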
160static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
161{ 264{
162 unsigned long gpnum; 265 unsigned long gpnum;
163 int level = 0; 266 int level = 0;
164 int phase;
165 struct rcu_node *rnp; 267 struct rcu_node *rnp;
166 268
167 gpnum = rsp->gpnum; 269 gpnum = rsp->gpnum;
@@ -178,13 +280,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
178 seq_puts(m, "\n"); 280 seq_puts(m, "\n");
179 level = rnp->level; 281 level = rnp->level;
180 } 282 }
181 phase = gpnum & 0x1; 283 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
182 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
183 rnp->qsmask, rnp->qsmaskinit, 284 rnp->qsmask, rnp->qsmaskinit,
184 "T."[list_empty(&rnp->blocked_tasks[phase])], 285 ".G"[rnp->gp_tasks != NULL],
185 "E."[list_empty(&rnp->blocked_tasks[phase + 2])], 286 ".E"[rnp->exp_tasks != NULL],
186 "T."[list_empty(&rnp->blocked_tasks[!phase])], 287 ".T"[!list_empty(&rnp->blkd_tasks)],
187 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
188 rnp->grplo, rnp->grphi, rnp->grpnum); 288 rnp->grplo, rnp->grphi, rnp->grpnum);
189 } 289 }
190 seq_puts(m, "\n"); 290 seq_puts(m, "\n");
@@ -216,16 +316,35 @@ static const struct file_operations rcuhier_fops = {
216 .release = single_release, 316 .release = single_release,
217}; 317};
218 318
319static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
320{
321 unsigned long flags;
322 unsigned long completed;
323 unsigned long gpnum;
324 unsigned long gpage;
325 unsigned long gpmax;
326 struct rcu_node *rnp = &rsp->node[0];
327
328 raw_spin_lock_irqsave(&rnp->lock, flags);
329 completed = rsp->completed;
330 gpnum = rsp->gpnum;
331 if (rsp->completed == rsp->gpnum)
332 gpage = 0;
333 else
334 gpage = jiffies - rsp->gp_start;
335 gpmax = rsp->gp_max;
336 raw_spin_unlock_irqrestore(&rnp->lock, flags);
337 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
338 rsp->name, completed, gpnum, gpage, gpmax);
339}
340
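show_one_rcugp() snapshots its fields under the root rcu_node lock and reports the grace period's age as zero when ->completed == ->gpnum (no grace period in flight), otherwise as jiffies since ->gp_start. A tiny model of that computation, ignoring jiffies wraparound:

#include <stdio.h>

static unsigned long gp_age(unsigned long completed, unsigned long gpnum,
			    unsigned long gp_start, unsigned long now)
{
	return completed == gpnum ? 0 : now - gp_start;
}

int main(void)
{
	/* prints: idle=0 running=50 */
	printf("idle=%lu running=%lu\n",
	       gp_age(4, 4, 100, 150), gp_age(4, 5, 100, 150));
	return 0;
}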
219static int show_rcugp(struct seq_file *m, void *unused) 341static int show_rcugp(struct seq_file *m, void *unused)
220{ 342{
221#ifdef CONFIG_TREE_PREEMPT_RCU 343#ifdef CONFIG_TREE_PREEMPT_RCU
222 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", 344 show_one_rcugp(m, &rcu_preempt_state);
223 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
224#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 345#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
225 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", 346 show_one_rcugp(m, &rcu_sched_state);
226 rcu_sched_state.completed, rcu_sched_state.gpnum); 347 show_one_rcugp(m, &rcu_bh_state);
227 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
228 rcu_bh_state.completed, rcu_bh_state.gpnum);
229 return 0; 348 return 0;
230} 349}
231 350
@@ -298,6 +417,29 @@ static const struct file_operations rcu_pending_fops = {
298 .release = single_release, 417 .release = single_release,
299}; 418};
300 419
420static int show_rcutorture(struct seq_file *m, void *unused)
421{
422 seq_printf(m, "rcutorture test sequence: %lu %s\n",
423 rcutorture_testseq >> 1,
424 (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
425 seq_printf(m, "rcutorture update version number: %lu\n",
426 rcutorture_vernum);
427 return 0;
428}
429
430static int rcutorture_open(struct inode *inode, struct file *file)
431{
432 return single_open(file, show_rcutorture, NULL);
433}
434
435static const struct file_operations rcutorture_fops = {
436 .owner = THIS_MODULE,
437 .open = rcutorture_open,
438 .read = seq_read,
439 .llseek = seq_lseek,
440 .release = single_release,
441};
442
301static struct dentry *rcudir; 443static struct dentry *rcudir;
302 444
303static int __init rcutree_trace_init(void) 445static int __init rcutree_trace_init(void)
@@ -318,6 +460,9 @@ static int __init rcutree_trace_init(void)
318 if (!retval) 460 if (!retval)
319 goto free_out; 461 goto free_out;
320 462
463 if (rcu_boost_trace_create_file(rcudir))
464 goto free_out;
465
321 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 466 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
322 if (!retval) 467 if (!retval)
323 goto free_out; 468 goto free_out;
@@ -331,6 +476,11 @@ static int __init rcutree_trace_init(void)
331 NULL, &rcu_pending_fops); 476 NULL, &rcu_pending_fops);
332 if (!retval) 477 if (!retval)
333 goto free_out; 478 goto free_out;
479
480 retval = debugfs_create_file("rcutorture", 0444, rcudir,
481 NULL, &rcutorture_fops);
482 if (!retval)
483 goto free_out;
334 return 0; 484 return 0;
335free_out: 485free_out:
336 debugfs_remove_recursive(rcudir); 486 debugfs_remove_recursive(rcudir);
diff --git a/kernel/sched.c b/kernel/sched.c
index 312f8b95c2d4..c62acf45d3b9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
231#endif 231#endif
232 232
233/* 233/*
234 * sched_domains_mutex serializes calls to arch_init_sched_domains, 234 * sched_domains_mutex serializes calls to init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
236 */ 236 */
237static DEFINE_MUTEX(sched_domains_mutex); 237static DEFINE_MUTEX(sched_domains_mutex);
@@ -312,6 +312,9 @@ struct cfs_rq {
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315#ifndef CONFIG_64BIT
316 u64 min_vruntime_copy;
317#endif
315 318
316 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
317 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
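The min_vruntime_copy field added here lets 32-bit kernels read the 64-bit min_vruntime without taking the rq lock: the writer updates the value, issues a write barrier, then updates the copy; a lockless reader loads the copy, issues a read barrier, loads the value, and retries if the two disagree. A user-space sketch of that protocol with C11 atomics (the kernel's actual accessors live in sched_fair.c; this is illustration only):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct u64_pair {
	_Atomic uint64_t val;	/* ~ min_vruntime */
	_Atomic uint64_t copy;	/* ~ min_vruntime_copy */
};

static void writer_update(struct u64_pair *p, uint64_t v)
{
	atomic_store_explicit(&p->val, v, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */
	atomic_store_explicit(&p->copy, v, memory_order_relaxed);
}

static uint64_t reader_read(struct u64_pair *p)
{
	uint64_t c, v;

	do {
		c = atomic_load_explicit(&p->copy, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
		v = atomic_load_explicit(&p->val, memory_order_relaxed);
	} while (v != c);	/* saw a half-finished update: retry */
	return v;
}

int main(void)
{
	struct u64_pair p = { 0, 0 };

	writer_update(&p, 1ULL << 40);
	printf("%llu\n", (unsigned long long)reader_read(&p));
	return 0;
}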
@@ -325,7 +328,9 @@ struct cfs_rq {
325 */ 328 */
326 struct sched_entity *curr, *next, *last, *skip; 329 struct sched_entity *curr, *next, *last, *skip;
327 330
331#ifdef CONFIG_SCHED_DEBUG
328 unsigned int nr_spread_over; 332 unsigned int nr_spread_over;
333#endif
329 334
330#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
331 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -417,6 +422,7 @@ struct rt_rq {
417 */ 422 */
418struct root_domain { 423struct root_domain {
419 atomic_t refcount; 424 atomic_t refcount;
425 struct rcu_head rcu;
420 cpumask_var_t span; 426 cpumask_var_t span;
421 cpumask_var_t online; 427 cpumask_var_t online;
422 428
@@ -460,7 +466,7 @@ struct rq {
460 u64 nohz_stamp; 466 u64 nohz_stamp;
461 unsigned char nohz_balance_kick; 467 unsigned char nohz_balance_kick;
462#endif 468#endif
463 unsigned int skip_clock_update; 469 int skip_clock_update;
464 470
465 /* capture load from *all* tasks on this cpu: */ 471 /* capture load from *all* tasks on this cpu: */
466 struct load_weight load; 472 struct load_weight load;
@@ -553,6 +559,10 @@ struct rq {
553 unsigned int ttwu_count; 559 unsigned int ttwu_count;
554 unsigned int ttwu_local; 560 unsigned int ttwu_local;
555#endif 561#endif
562
563#ifdef CONFIG_SMP
564 struct task_struct *wake_list;
565#endif
556}; 566};
557 567
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 568static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -571,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
571 581
572#define rcu_dereference_check_sched_domain(p) \ 582#define rcu_dereference_check_sched_domain(p) \
573 rcu_dereference_check((p), \ 583 rcu_dereference_check((p), \
574 rcu_read_lock_sched_held() || \ 584 rcu_read_lock_held() || \
575 lockdep_is_held(&sched_domains_mutex)) 585 lockdep_is_held(&sched_domains_mutex))
576 586
577/* 587/*
@@ -596,7 +606,7 @@ static inline int cpu_of(struct rq *rq)
596 * Return the group to which this tasks belongs. 606 * Return the group to which this tasks belongs.
597 * 607 *
598 * We use task_subsys_state_check() and extend the RCU verification 608 * We use task_subsys_state_check() and extend the RCU verification
599 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 609 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
600 * holds that lock for each task it moves into the cgroup. Therefore 610 * holds that lock for each task it moves into the cgroup. Therefore
601 * by holding that lock, we pin the task to the current cgroup. 611 * by holding that lock, we pin the task to the current cgroup.
602 */ 612 */
@@ -606,7 +616,7 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct cgroup_subsys_state *css; 616 struct cgroup_subsys_state *css;
607 617
608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
609 lockdep_is_held(&task_rq(p)->lock)); 619 lockdep_is_held(&p->pi_lock));
610 tg = container_of(css, struct task_group, css); 620 tg = container_of(css, struct task_group, css);
611 621
612 return autogroup_task_group(p, tg); 622 return autogroup_task_group(p, tg);
@@ -642,7 +652,7 @@ static void update_rq_clock(struct rq *rq)
642{ 652{
643 s64 delta; 653 s64 delta;
644 654
645 if (rq->skip_clock_update) 655 if (rq->skip_clock_update > 0)
646 return; 656 return;
647 657
648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 658 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -838,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
838 return rq->curr == p; 848 return rq->curr == p;
839} 849}
840 850
841#ifndef __ARCH_WANT_UNLOCKED_CTXSW
842static inline int task_running(struct rq *rq, struct task_struct *p) 851static inline int task_running(struct rq *rq, struct task_struct *p)
843{ 852{
853#ifdef CONFIG_SMP
854 return p->on_cpu;
855#else
844 return task_current(rq, p); 856 return task_current(rq, p);
857#endif
845} 858}
846 859
860#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 861static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
848{ 862{
863#ifdef CONFIG_SMP
864 /*
865 * We can optimise this out completely for !SMP, because the
866 * SMP rebalancing from interrupt is the only thing that cares
867 * here.
868 */
869 next->on_cpu = 1;
870#endif
849} 871}
850 872
851static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 873static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
852{ 874{
875#ifdef CONFIG_SMP
876 /*
877 * After ->on_cpu is cleared, the task can be moved to a different CPU.
878 * We must ensure this doesn't happen until the switch is completely
879 * finished.
880 */
881 smp_wmb();
882 prev->on_cpu = 0;
883#endif
853#ifdef CONFIG_DEBUG_SPINLOCK 884#ifdef CONFIG_DEBUG_SPINLOCK
854 /* this is a valid case when another task releases the spinlock */ 885 /* this is a valid case when another task releases the spinlock */
855 rq->lock.owner = current; 886 rq->lock.owner = current;
@@ -865,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
865} 896}
866 897
867#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 898#else /* __ARCH_WANT_UNLOCKED_CTXSW */
868static inline int task_running(struct rq *rq, struct task_struct *p)
869{
870#ifdef CONFIG_SMP
871 return p->oncpu;
872#else
873 return task_current(rq, p);
874#endif
875}
876
877static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 899static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
878{ 900{
879#ifdef CONFIG_SMP 901#ifdef CONFIG_SMP
@@ -882,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
882 * SMP rebalancing from interrupt is the only thing that cares 904 * SMP rebalancing from interrupt is the only thing that cares
883 * here. 905 * here.
884 */ 906 */
885 next->oncpu = 1; 907 next->on_cpu = 1;
886#endif 908#endif
887#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 909#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
888 raw_spin_unlock_irq(&rq->lock); 910 raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
895{ 917{
896#ifdef CONFIG_SMP 918#ifdef CONFIG_SMP
897 /* 919 /*
898 * After ->oncpu is cleared, the task can be moved to a different CPU. 920 * After ->on_cpu is cleared, the task can be moved to a different CPU.
899 * We must ensure this doesn't happen until the switch is completely 921 * We must ensure this doesn't happen until the switch is completely
900 * finished. 922 * finished.
901 */ 923 */
902 smp_wmb(); 924 smp_wmb();
903 prev->oncpu = 0; 925 prev->on_cpu = 0;
904#endif 926#endif
905#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 927#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
906 local_irq_enable(); 928 local_irq_enable();
@@ -909,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
909#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 931#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
910 932
911/* 933/*
912 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 934 * __task_rq_lock - lock the rq @p resides on.
913 * against ttwu().
914 */
915static inline int task_is_waking(struct task_struct *p)
916{
917 return unlikely(p->state == TASK_WAKING);
918}
919
920/*
921 * __task_rq_lock - lock the runqueue a given task resides on.
922 * Must be called interrupts disabled.
923 */ 935 */
924static inline struct rq *__task_rq_lock(struct task_struct *p) 936static inline struct rq *__task_rq_lock(struct task_struct *p)
925 __acquires(rq->lock) 937 __acquires(rq->lock)
926{ 938{
927 struct rq *rq; 939 struct rq *rq;
928 940
941 lockdep_assert_held(&p->pi_lock);
942
929 for (;;) { 943 for (;;) {
930 rq = task_rq(p); 944 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
@@ -936,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936} 950}
937 951
938/* 952/*
939 * task_rq_lock - lock the runqueue a given task resides on and disable 953 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
940 * interrupts. Note the ordering: we can safely lookup the task_rq without
941 * explicitly disabling preemption.
942 */ 954 */
943static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 955static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
956 __acquires(p->pi_lock)
944 __acquires(rq->lock) 957 __acquires(rq->lock)
945{ 958{
946 struct rq *rq; 959 struct rq *rq;
947 960
948 for (;;) { 961 for (;;) {
949 local_irq_save(*flags); 962 raw_spin_lock_irqsave(&p->pi_lock, *flags);
950 rq = task_rq(p); 963 rq = task_rq(p);
951 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 965 if (likely(rq == task_rq(p)))
953 return rq; 966 return rq;
954 raw_spin_unlock_irqrestore(&rq->lock, *flags); 967 raw_spin_unlock(&rq->lock);
968 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
955 } 969 }
956} 970}
957 971
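task_rq_lock() now establishes the p->pi_lock -> rq->lock ordering and re-checks task_rq(p) after taking the runqueue lock, retrying if the task migrated in the unlocked window. A pthread sketch of that lock-then-revalidate loop (user-space mutexes standing in for the raw spinlocks, irqsave handling omitted):

#include <pthread.h>
#include <stdio.h>

struct rq { pthread_mutex_t lock; };
struct task { pthread_mutex_t pi_lock; struct rq *rq; };

static struct rq *task_rq_lock_sketch(struct task *p)
{
	struct rq *rq;

	for (;;) {
		pthread_mutex_lock(&p->pi_lock);	/* outer lock */
		rq = p->rq;				/* unlocked snapshot */
		pthread_mutex_lock(&rq->lock);		/* inner lock */
		if (rq == p->rq)
			return rq;			/* both locks held */
		/* task migrated between snapshot and lock: drop and retry */
		pthread_mutex_unlock(&rq->lock);
		pthread_mutex_unlock(&p->pi_lock);
	}
}

int main(void)
{
	struct rq rq = { PTHREAD_MUTEX_INITIALIZER };
	struct task t = { PTHREAD_MUTEX_INITIALIZER, &rq };
	struct rq *locked = task_rq_lock_sketch(&t);

	printf("locked rq %p\n", (void *)locked);
	pthread_mutex_unlock(&locked->lock);
	pthread_mutex_unlock(&t.pi_lock);
	return 0;
}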
@@ -961,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
961 raw_spin_unlock(&rq->lock); 975 raw_spin_unlock(&rq->lock);
962} 976}
963 977
964static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 978static inline void
979task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
965 __releases(rq->lock) 980 __releases(rq->lock)
981 __releases(p->pi_lock)
966{ 982{
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 983 raw_spin_unlock(&rq->lock);
984 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
968} 985}
969 986
970/* 987/*
@@ -1193,11 +1210,17 @@ int get_nohz_timer_target(void)
1193 int i; 1210 int i;
1194 struct sched_domain *sd; 1211 struct sched_domain *sd;
1195 1212
1213 rcu_read_lock();
1196 for_each_domain(cpu, sd) { 1214 for_each_domain(cpu, sd) {
1197 for_each_cpu(i, sched_domain_span(sd)) 1215 for_each_cpu(i, sched_domain_span(sd)) {
1198 if (!idle_cpu(i)) 1216 if (!idle_cpu(i)) {
1199 return i; 1217 cpu = i;
1218 goto unlock;
1219 }
1220 }
1200 } 1221 }
1222unlock:
1223 rcu_read_unlock();
1201 return cpu; 1224 return cpu;
1202} 1225}
1203/* 1226/*
@@ -1307,15 +1330,15 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1307{ 1330{
1308 u64 tmp; 1331 u64 tmp;
1309 1332
1333 tmp = (u64)delta_exec * weight;
1334
1310 if (!lw->inv_weight) { 1335 if (!lw->inv_weight) {
1311 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1336 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1312 lw->inv_weight = 1; 1337 lw->inv_weight = 1;
1313 else 1338 else
1314 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1339 lw->inv_weight = WMULT_CONST / lw->weight;
1315 / (lw->weight+1);
1316 } 1340 }
1317 1341
1318 tmp = (u64)delta_exec * weight;
1319 /* 1342 /*
1320 * Check whether we'd overflow the 64-bit multiplication: 1343 * Check whether we'd overflow the 64-bit multiplication:
1321 */ 1344 */
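The reworked calc_delta_mine() computes delta_exec * weight / lw->weight as a multiply by a precomputed fixed-point inverse followed by a shift, with inv_weight now simply WMULT_CONST / weight. A sketch of the arithmetic (the WMULT constants follow the kernel's ~0U / 32-bit-shift convention, and the 64-bit overflow branch checked above is omitted here):

#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST	(~0U)	/* ~ 2^32 - 1 */
#define WMULT_SHIFT	32

static uint64_t calc_delta_sketch(uint64_t delta_exec, unsigned long weight,
				  unsigned long lw_weight)
{
	uint32_t inv_weight = WMULT_CONST / lw_weight;	/* ~ 2^32 / lw */
	uint64_t tmp = delta_exec * weight;

	/* (tmp * inv_weight) >> 32 approximates tmp / lw_weight */
	return (tmp * inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
	/* 10 ms of runtime, entity weight 1024, queue weight 2048: ~5 ms */
	printf("%llu\n",
	       (unsigned long long)calc_delta_sketch(10000000, 1024, 2048));
	return 0;
}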
@@ -1773,7 +1796,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1773 update_rq_clock(rq); 1796 update_rq_clock(rq);
1774 sched_info_queued(p); 1797 sched_info_queued(p);
1775 p->sched_class->enqueue_task(rq, p, flags); 1798 p->sched_class->enqueue_task(rq, p, flags);
1776 p->se.on_rq = 1;
1777} 1799}
1778 1800
1779static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1801static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1803,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1781 update_rq_clock(rq); 1803 update_rq_clock(rq);
1782 sched_info_dequeued(p); 1804 sched_info_dequeued(p);
1783 p->sched_class->dequeue_task(rq, p, flags); 1805 p->sched_class->dequeue_task(rq, p, flags);
1784 p->se.on_rq = 0;
1785} 1806}
1786 1807
1787/* 1808/*
@@ -2116,7 +2137,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2116 * A queue event has occurred, and we're going to schedule. In 2137 * A queue event has occurred, and we're going to schedule. In
2117 * this case, we can save a useless back to back clock update. 2138 * this case, we can save a useless back to back clock update.
2118 */ 2139 */
2119 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2140 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2120 rq->skip_clock_update = 1; 2141 rq->skip_clock_update = 1;
2121} 2142}
2122 2143
@@ -2162,6 +2183,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2162 */ 2183 */
2163 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2184 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2164 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2185 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2186
2187#ifdef CONFIG_LOCKDEP
2188 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2189 lockdep_is_held(&task_rq(p)->lock)));
2190#endif
2165#endif 2191#endif
2166 2192
2167 trace_sched_migrate_task(p, new_cpu); 2193 trace_sched_migrate_task(p, new_cpu);
@@ -2182,19 +2208,6 @@ struct migration_arg {
2182static int migration_cpu_stop(void *data); 2208static int migration_cpu_stop(void *data);
2183 2209
2184/* 2210/*
2185 * The task's runqueue lock must be held.
2186 * Returns true if you have to wait for migration thread.
2187 */
2188static bool migrate_task(struct task_struct *p, struct rq *rq)
2189{
2190 /*
2191 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task.
2193 */
2194 return p->se.on_rq || task_running(rq, p);
2195}
2196
2197/*
2198 * wait_task_inactive - wait for a thread to unschedule. 2211 * wait_task_inactive - wait for a thread to unschedule.
2199 * 2212 *
2200 * If @match_state is nonzero, it's the @p->state value just checked and 2213 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2251,11 +2264,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2251 rq = task_rq_lock(p, &flags); 2264 rq = task_rq_lock(p, &flags);
2252 trace_sched_wait_task(p); 2265 trace_sched_wait_task(p);
2253 running = task_running(rq, p); 2266 running = task_running(rq, p);
2254 on_rq = p->se.on_rq; 2267 on_rq = p->on_rq;
2255 ncsw = 0; 2268 ncsw = 0;
2256 if (!match_state || p->state == match_state) 2269 if (!match_state || p->state == match_state)
2257 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2270 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2258 task_rq_unlock(rq, &flags); 2271 task_rq_unlock(rq, p, &flags);
2259 2272
2260 /* 2273 /*
2261 * If it changed from the expected state, bail out now. 2274 * If it changed from the expected state, bail out now.
@@ -2330,7 +2343,7 @@ EXPORT_SYMBOL_GPL(kick_process);
2330 2343
2331#ifdef CONFIG_SMP 2344#ifdef CONFIG_SMP
2332/* 2345/*
2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2346 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2334 */ 2347 */
2335static int select_fallback_rq(int cpu, struct task_struct *p) 2348static int select_fallback_rq(int cpu, struct task_struct *p)
2336{ 2349{
@@ -2363,12 +2376,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2363} 2376}
2364 2377
2365/* 2378/*
2366 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2379 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2367 */ 2380 */
2368static inline 2381static inline
2369int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2382int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2370{ 2383{
2371 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2384 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2372 2385
2373 /* 2386 /*
2374 * In order not to call set_task_cpu() on a blocking task we need 2387 * In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2407,62 @@ static void update_avg(u64 *avg, u64 sample)
2394} 2407}
2395#endif 2408#endif
2396 2409
2397static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2410static void
2398 bool is_sync, bool is_migrate, bool is_local, 2411ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2399 unsigned long en_flags)
2400{ 2412{
2413#ifdef CONFIG_SCHEDSTATS
2414 struct rq *rq = this_rq();
2415
2416#ifdef CONFIG_SMP
2417 int this_cpu = smp_processor_id();
2418
2419 if (cpu == this_cpu) {
2420 schedstat_inc(rq, ttwu_local);
2421 schedstat_inc(p, se.statistics.nr_wakeups_local);
2422 } else {
2423 struct sched_domain *sd;
2424
2425 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2426 rcu_read_lock();
2427 for_each_domain(this_cpu, sd) {
2428 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2429 schedstat_inc(sd, ttwu_wake_remote);
2430 break;
2431 }
2432 }
2433 rcu_read_unlock();
2434 }
2435#endif /* CONFIG_SMP */
2436
2437 schedstat_inc(rq, ttwu_count);
2401 schedstat_inc(p, se.statistics.nr_wakeups); 2438 schedstat_inc(p, se.statistics.nr_wakeups);
2402 if (is_sync) 2439
2440 if (wake_flags & WF_SYNC)
2403 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2441 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2404 if (is_migrate) 2442
2443 if (cpu != task_cpu(p))
2405 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2444 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2406 if (is_local)
2407 schedstat_inc(p, se.statistics.nr_wakeups_local);
2408 else
2409 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2410 2445
2446#endif /* CONFIG_SCHEDSTATS */
2447}
2448
2449static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2450{
2411 activate_task(rq, p, en_flags); 2451 activate_task(rq, p, en_flags);
2452 p->on_rq = 1;
2453
2454 /* if a worker is waking up, notify workqueue */
2455 if (p->flags & PF_WQ_WORKER)
2456 wq_worker_waking_up(p, cpu_of(rq));
2412} 2457}
2413 2458
2414static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2459/*
2415 int wake_flags, bool success) 2460 * Mark the task runnable and perform wakeup-preemption.
2461 */
2462static void
2463ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2416{ 2464{
2417 trace_sched_wakeup(p, success); 2465 trace_sched_wakeup(p, true);
2418 check_preempt_curr(rq, p, wake_flags); 2466 check_preempt_curr(rq, p, wake_flags);
2419 2467
2420 p->state = TASK_RUNNING; 2468 p->state = TASK_RUNNING;
@@ -2433,9 +2481,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2433 rq->idle_stamp = 0; 2481 rq->idle_stamp = 0;
2434 } 2482 }
2435#endif 2483#endif
2436 /* if a worker is waking up, notify workqueue */ 2484}
2437 if ((p->flags & PF_WQ_WORKER) && success) 2485
2438 wq_worker_waking_up(p, cpu_of(rq)); 2486static void
2487ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2488{
2489#ifdef CONFIG_SMP
2490 if (p->sched_contributes_to_load)
2491 rq->nr_uninterruptible--;
2492#endif
2493
2494 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2495 ttwu_do_wakeup(rq, p, wake_flags);
2496}
2497
2498/*
2499 * Called when the task @p isn't fully descheduled from its runqueue;
2500 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2501 * since all we need to do is flip p->state to TASK_RUNNING - the
2502 * task is still ->on_rq.
2503 */
2504static int ttwu_remote(struct task_struct *p, int wake_flags)
2505{
2506 struct rq *rq;
2507 int ret = 0;
2508
2509 rq = __task_rq_lock(p);
2510 if (p->on_rq) {
2511 ttwu_do_wakeup(rq, p, wake_flags);
2512 ret = 1;
2513 }
2514 __task_rq_unlock(rq);
2515
2516 return ret;
2517}
2518
2519#ifdef CONFIG_SMP
2520static void sched_ttwu_pending(void)
2521{
2522 struct rq *rq = this_rq();
2523 struct task_struct *list = xchg(&rq->wake_list, NULL);
2524
2525 if (!list)
2526 return;
2527
2528 raw_spin_lock(&rq->lock);
2529
2530 while (list) {
2531 struct task_struct *p = list;
2532 list = list->wake_entry;
2533 ttwu_do_activate(rq, p, 0);
2534 }
2535
2536 raw_spin_unlock(&rq->lock);
2537}
2538
2539void scheduler_ipi(void)
2540{
2541 sched_ttwu_pending();
2542}
2543
2544static void ttwu_queue_remote(struct task_struct *p, int cpu)
2545{
2546 struct rq *rq = cpu_rq(cpu);
2547 struct task_struct *next = rq->wake_list;
2548
2549 for (;;) {
2550 struct task_struct *old = next;
2551
2552 p->wake_entry = next;
2553 next = cmpxchg(&rq->wake_list, old, p);
2554 if (next == old)
2555 break;
2556 }
2557
2558 if (!next)
2559 smp_send_reschedule(cpu);
2560}
2561#endif
2562
2563static void ttwu_queue(struct task_struct *p, int cpu)
2564{
2565 struct rq *rq = cpu_rq(cpu);
2566
2567#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE)
2568 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2569 ttwu_queue_remote(p, cpu);
2570 return;
2571 }
2572#endif
2573
2574 raw_spin_lock(&rq->lock);
2575 ttwu_do_activate(rq, p, 0);
2576 raw_spin_unlock(&rq->lock);
2439} 2577}
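The remote-wakeup queue above is a per-runqueue lock-free stack: wakers push the task with a cmpxchg loop and send a rescheduling IPI only when the list was previously empty, and the target CPU drains everything with one xchg from scheduler_ipi(). A user-space sketch of the push/drain pair using C11 atomics (printf stands in for the IPI and the activation):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct task { const char *name; struct task *wake_entry; };

static _Atomic(struct task *) wake_list;	/* ~ rq->wake_list */

static void queue_remote(struct task *p)
{
	struct task *next = atomic_load(&wake_list);

	do {
		p->wake_entry = next;		/* link onto current head */
	} while (!atomic_compare_exchange_weak(&wake_list, &next, p));

	if (!next)
		printf("send IPI (list was empty)\n");
}

static void drain_pending(void)			/* runs on the target CPU */
{
	struct task *p = atomic_exchange(&wake_list, NULL);

	for (; p; p = p->wake_entry)
		printf("activate %s\n", p->name);
}

int main(void)
{
	struct task a = { "a", NULL }, b = { "b", NULL };

	queue_remote(&a);	/* first waker raises the IPI */
	queue_remote(&b);	/* list already non-empty: no second IPI */
	drain_pending();	/* activates b, then a */
	return 0;
}

Batching wakeups this way lets a remote waker avoid touching the target runqueue's lock entirely, at the cost of one IPI per burst of wakeups.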
2440 2578
2441/** 2579/**
@@ -2453,92 +2591,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2453 * Returns %true if @p was woken up, %false if it was already running 2591 * Returns %true if @p was woken up, %false if it was already running
2454 * or @state didn't match @p's state. 2592 * or @state didn't match @p's state.
2455 */ 2593 */
2456static int try_to_wake_up(struct task_struct *p, unsigned int state, 2594static int
2457 int wake_flags) 2595try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2458{ 2596{
2459 int cpu, orig_cpu, this_cpu, success = 0;
2460 unsigned long flags; 2597 unsigned long flags;
2461 unsigned long en_flags = ENQUEUE_WAKEUP; 2598 int cpu, success = 0;
2462 struct rq *rq;
2463
2464 this_cpu = get_cpu();
2465 2599
2466 smp_wmb(); 2600 smp_wmb();
2467 rq = task_rq_lock(p, &flags); 2601 raw_spin_lock_irqsave(&p->pi_lock, flags);
2468 if (!(p->state & state)) 2602 if (!(p->state & state))
2469 goto out; 2603 goto out;
2470 2604
2471 if (p->se.on_rq) 2605 success = 1; /* we're going to change ->state */
2472 goto out_running;
2473
2474 cpu = task_cpu(p); 2606 cpu = task_cpu(p);
2475 orig_cpu = cpu;
2476 2607
2477#ifdef CONFIG_SMP 2608 if (p->on_rq && ttwu_remote(p, wake_flags))
2478 if (unlikely(task_running(rq, p))) 2609 goto stat;
2479 goto out_activate;
2480 2610
2611#ifdef CONFIG_SMP
2481 /* 2612 /*
2482 * In order to handle concurrent wakeups and release the rq->lock 2613 * If the owning (remote) cpu is still in the middle of schedule() with
2483 * we put the task in TASK_WAKING state. 2614 * this task as prev, wait until it's done referencing the task.
2484 *
2485 * First fix up the nr_uninterruptible count:
2486 */ 2615 */
2487 if (task_contributes_to_load(p)) { 2616 while (p->on_cpu) {
2488 if (likely(cpu_online(orig_cpu))) 2617#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2489 rq->nr_uninterruptible--; 2618 /*
2490 else 2619 * If called from interrupt context we could have landed in the
2491 this_rq()->nr_uninterruptible--; 2620 * middle of schedule(), in this case we should take care not
2492 } 2621 * to spin on ->on_cpu if p is current, since that would
2493 p->state = TASK_WAKING; 2622 * deadlock.
2494 2623 */
2495 if (p->sched_class->task_waking) { 2624 if (p == current) {
2496 p->sched_class->task_waking(rq, p); 2625 ttwu_queue(p, cpu);
2497 en_flags |= ENQUEUE_WAKING; 2626 goto stat;
2627 }
2628#endif
2629 cpu_relax();
2498 } 2630 }
2499
2500 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2501 if (cpu != orig_cpu)
2502 set_task_cpu(p, cpu);
2503 __task_rq_unlock(rq);
2504
2505 rq = cpu_rq(cpu);
2506 raw_spin_lock(&rq->lock);
2507
2508 /* 2631 /*
2509 * We migrated the task without holding either rq->lock, however 2632 * Pairs with the smp_wmb() in finish_lock_switch().
2510 * since the task is not on the task list itself, nobody else
2511 * will try and migrate the task, hence the rq should match the
2512 * cpu we just moved it to.
2513 */ 2633 */
2514 WARN_ON(task_cpu(p) != cpu); 2634 smp_rmb();
2515 WARN_ON(p->state != TASK_WAKING);
2516 2635
2517#ifdef CONFIG_SCHEDSTATS 2636 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2518 schedstat_inc(rq, ttwu_count); 2637 p->state = TASK_WAKING;
2519 if (cpu == this_cpu) 2638
2520 schedstat_inc(rq, ttwu_local); 2639 if (p->sched_class->task_waking)
2521 else { 2640 p->sched_class->task_waking(p);
2522 struct sched_domain *sd;
2523 for_each_domain(this_cpu, sd) {
2524 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2525 schedstat_inc(sd, ttwu_wake_remote);
2526 break;
2527 }
2528 }
2529 }
2530#endif /* CONFIG_SCHEDSTATS */
2531 2641
2532out_activate: 2642 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2643 if (task_cpu(p) != cpu)
2644 set_task_cpu(p, cpu);
2533#endif /* CONFIG_SMP */ 2645#endif /* CONFIG_SMP */
2534 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2646
2535 cpu == this_cpu, en_flags); 2647 ttwu_queue(p, cpu);
2536 success = 1; 2648stat:
2537out_running: 2649 ttwu_stat(p, cpu, wake_flags);
2538 ttwu_post_activation(p, rq, wake_flags, success);
2539out: 2650out:
2540 task_rq_unlock(rq, &flags); 2651 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2541 put_cpu();
2542 2652
2543 return success; 2653 return success;
2544} 2654}
@@ -2547,31 +2657,34 @@ out:
2547 * try_to_wake_up_local - try to wake up a local task with rq lock held 2657 * try_to_wake_up_local - try to wake up a local task with rq lock held
2548 * @p: the thread to be awakened 2658 * @p: the thread to be awakened
2549 * 2659 *
2550 * Put @p on the run-queue if it's not already there. The caller must 2660 * Put @p on the run-queue if it's not already there. The caller must
2551 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2661 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2552 * the current task. this_rq() stays locked over invocation. 2662 * the current task.
2553 */ 2663 */
2554static void try_to_wake_up_local(struct task_struct *p) 2664static void try_to_wake_up_local(struct task_struct *p)
2555{ 2665{
2556 struct rq *rq = task_rq(p); 2666 struct rq *rq = task_rq(p);
2557 bool success = false;
2558 2667
2559 BUG_ON(rq != this_rq()); 2668 BUG_ON(rq != this_rq());
2560 BUG_ON(p == current); 2669 BUG_ON(p == current);
2561 lockdep_assert_held(&rq->lock); 2670 lockdep_assert_held(&rq->lock);
2562 2671
2672 if (!raw_spin_trylock(&p->pi_lock)) {
2673 raw_spin_unlock(&rq->lock);
2674 raw_spin_lock(&p->pi_lock);
2675 raw_spin_lock(&rq->lock);
2676 }
2677
2563 if (!(p->state & TASK_NORMAL)) 2678 if (!(p->state & TASK_NORMAL))
2564 return; 2679 goto out;
2565 2680
2566 if (!p->se.on_rq) { 2681 if (!p->on_rq)
2567 if (likely(!task_running(rq, p))) { 2682 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2568 schedstat_inc(rq, ttwu_count); 2683
2569 schedstat_inc(rq, ttwu_local); 2684 ttwu_do_wakeup(rq, p, 0);
2570 } 2685 ttwu_stat(p, smp_processor_id(), 0);
2571 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2686out:
2572 success = true; 2687 raw_spin_unlock(&p->pi_lock);
2573 }
2574 ttwu_post_activation(p, rq, 0, success);
2575} 2688}
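try_to_wake_up_local() is called with rq->lock already held, but the new ordering wants p->pi_lock taken first, so it trylocks and, on contention, briefly drops rq->lock to take the two locks in the proper order. A pthread sketch of that back-off (illustration only; the real code also re-validates the task's state afterwards):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller holds rq_lock; returns with both locks held. */
static void lock_pi_with_rq_held(void)
{
	if (pthread_mutex_trylock(&pi_lock) != 0) {
		pthread_mutex_unlock(&rq_lock);	/* back off: honor pi->rq order */
		pthread_mutex_lock(&pi_lock);
		pthread_mutex_lock(&rq_lock);	/* rq state may have changed */
	}
}

int main(void)
{
	pthread_mutex_lock(&rq_lock);
	lock_pi_with_rq_held();
	puts("holding pi_lock and rq_lock");
	pthread_mutex_unlock(&rq_lock);
	pthread_mutex_unlock(&pi_lock);
	return 0;
}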
2576 2689
2577/** 2690/**
@@ -2604,19 +2717,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2604 */ 2717 */
2605static void __sched_fork(struct task_struct *p) 2718static void __sched_fork(struct task_struct *p)
2606{ 2719{
2720 p->on_rq = 0;
2721
2722 p->se.on_rq = 0;
2607 p->se.exec_start = 0; 2723 p->se.exec_start = 0;
2608 p->se.sum_exec_runtime = 0; 2724 p->se.sum_exec_runtime = 0;
2609 p->se.prev_sum_exec_runtime = 0; 2725 p->se.prev_sum_exec_runtime = 0;
2610 p->se.nr_migrations = 0; 2726 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0; 2727 p->se.vruntime = 0;
2728 INIT_LIST_HEAD(&p->se.group_node);
2612 2729
2613#ifdef CONFIG_SCHEDSTATS 2730#ifdef CONFIG_SCHEDSTATS
2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2731 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2615#endif 2732#endif
2616 2733
2617 INIT_LIST_HEAD(&p->rt.run_list); 2734 INIT_LIST_HEAD(&p->rt.run_list);
2618 p->se.on_rq = 0;
2619 INIT_LIST_HEAD(&p->se.group_node);
2620 2735
2621#ifdef CONFIG_PREEMPT_NOTIFIERS 2736#ifdef CONFIG_PREEMPT_NOTIFIERS
2622 INIT_HLIST_HEAD(&p->preempt_notifiers); 2737 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2626,8 +2741,9 @@ static void __sched_fork(struct task_struct *p)
2626/* 2741/*
2627 * fork()/clone()-time setup: 2742 * fork()/clone()-time setup:
2628 */ 2743 */
2629void sched_fork(struct task_struct *p, int clone_flags) 2744void sched_fork(struct task_struct *p)
2630{ 2745{
2746 unsigned long flags;
2631 int cpu = get_cpu(); 2747 int cpu = get_cpu();
2632 2748
2633 __sched_fork(p); 2749 __sched_fork(p);
@@ -2678,16 +2794,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2678 * 2794 *
2679 * Silence PROVE_RCU. 2795 * Silence PROVE_RCU.
2680 */ 2796 */
2681 rcu_read_lock(); 2797 raw_spin_lock_irqsave(&p->pi_lock, flags);
2682 set_task_cpu(p, cpu); 2798 set_task_cpu(p, cpu);
2683 rcu_read_unlock(); 2799 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2684 2800
2685#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2801#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2686 if (likely(sched_info_on())) 2802 if (likely(sched_info_on()))
2687 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2803 memset(&p->sched_info, 0, sizeof(p->sched_info));
2688#endif 2804#endif
2689#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2805#if defined(CONFIG_SMP)
2690 p->oncpu = 0; 2806 p->on_cpu = 0;
2691#endif 2807#endif
2692#ifdef CONFIG_PREEMPT 2808#ifdef CONFIG_PREEMPT
2693 /* Want to start with kernel preemption disabled. */ 2809 /* Want to start with kernel preemption disabled. */
@@ -2707,41 +2823,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2707 * that must be done for every newly created context, then puts the task 2823 * that must be done for every newly created context, then puts the task
2708 * on the runqueue and wakes it. 2824 * on the runqueue and wakes it.
2709 */ 2825 */
2710void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2826void wake_up_new_task(struct task_struct *p)
2711{ 2827{
2712 unsigned long flags; 2828 unsigned long flags;
2713 struct rq *rq; 2829 struct rq *rq;
2714 int cpu __maybe_unused = get_cpu();
2715 2830
2831 raw_spin_lock_irqsave(&p->pi_lock, flags);
2716#ifdef CONFIG_SMP 2832#ifdef CONFIG_SMP
2717 rq = task_rq_lock(p, &flags);
2718 p->state = TASK_WAKING;
2719
2720 /* 2833 /*
2721 * Fork balancing, do it here and not earlier because: 2834 * Fork balancing, do it here and not earlier because:
2722 * - cpus_allowed can change in the fork path 2835 * - cpus_allowed can change in the fork path
2723 * - any previously selected cpu might disappear through hotplug 2836 * - any previously selected cpu might disappear through hotplug
2724 *
2725 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2726 * without people poking at ->cpus_allowed.
2727 */ 2837 */
2728 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2838 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2729 set_task_cpu(p, cpu);
2730
2731 p->state = TASK_RUNNING;
2732 task_rq_unlock(rq, &flags);
2733#endif 2839#endif
2734 2840
2735 rq = task_rq_lock(p, &flags); 2841 rq = __task_rq_lock(p);
2736 activate_task(rq, p, 0); 2842 activate_task(rq, p, 0);
2737 trace_sched_wakeup_new(p, 1); 2843 p->on_rq = 1;
2844 trace_sched_wakeup_new(p, true);
2738 check_preempt_curr(rq, p, WF_FORK); 2845 check_preempt_curr(rq, p, WF_FORK);
2739#ifdef CONFIG_SMP 2846#ifdef CONFIG_SMP
2740 if (p->sched_class->task_woken) 2847 if (p->sched_class->task_woken)
2741 p->sched_class->task_woken(rq, p); 2848 p->sched_class->task_woken(rq, p);
2742#endif 2849#endif
2743 task_rq_unlock(rq, &flags); 2850 task_rq_unlock(rq, p, &flags);
2744 put_cpu();
2745} 2851}
2746 2852
2747#ifdef CONFIG_PREEMPT_NOTIFIERS 2853#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3450,27 +3556,22 @@ void sched_exec(void)
3450{ 3556{
3451 struct task_struct *p = current; 3557 struct task_struct *p = current;
3452 unsigned long flags; 3558 unsigned long flags;
3453 struct rq *rq;
3454 int dest_cpu; 3559 int dest_cpu;
3455 3560
3456 rq = task_rq_lock(p, &flags); 3561 raw_spin_lock_irqsave(&p->pi_lock, flags);
3457 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3562 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3458 if (dest_cpu == smp_processor_id()) 3563 if (dest_cpu == smp_processor_id())
3459 goto unlock; 3564 goto unlock;
3460 3565
3461 /* 3566 if (likely(cpu_active(dest_cpu))) {
3462 * select_task_rq() can race against ->cpus_allowed
3463 */
3464 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3465 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3466 struct migration_arg arg = { p, dest_cpu }; 3567 struct migration_arg arg = { p, dest_cpu };
3467 3568
3468 task_rq_unlock(rq, &flags); 3569 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3469 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3570 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3470 return; 3571 return;
3471 } 3572 }
3472unlock: 3573unlock:
3473 task_rq_unlock(rq, &flags); 3574 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3474} 3575}
3475 3576
3476#endif 3577#endif
@@ -3507,7 +3608,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3507 3608
3508 rq = task_rq_lock(p, &flags); 3609 rq = task_rq_lock(p, &flags);
3509 ns = do_task_delta_exec(p, rq); 3610 ns = do_task_delta_exec(p, rq);
3510 task_rq_unlock(rq, &flags); 3611 task_rq_unlock(rq, p, &flags);
3511 3612
3512 return ns; 3613 return ns;
3513} 3614}
@@ -3525,7 +3626,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3525 3626
3526 rq = task_rq_lock(p, &flags); 3627 rq = task_rq_lock(p, &flags);
3527 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3628 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3528 task_rq_unlock(rq, &flags); 3629 task_rq_unlock(rq, p, &flags);
3529 3630
3530 return ns; 3631 return ns;
3531} 3632}
@@ -3549,7 +3650,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3549 rq = task_rq_lock(p, &flags); 3650 rq = task_rq_lock(p, &flags);
3550 thread_group_cputime(p, &totals); 3651 thread_group_cputime(p, &totals);
3551 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3652 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3552 task_rq_unlock(rq, &flags); 3653 task_rq_unlock(rq, p, &flags);
3553 3654
3554 return ns; 3655 return ns;
3555} 3656}
@@ -3903,9 +4004,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3903/* 4004/*
3904 * This function gets called by the timer code, with HZ frequency. 4005 * This function gets called by the timer code, with HZ frequency.
3905 * We call it with interrupts disabled. 4006 * We call it with interrupts disabled.
3906 *
3907 * It also gets called by the fork code, when changing the parent's
3908 * timeslices.
3909 */ 4007 */
3910void scheduler_tick(void) 4008void scheduler_tick(void)
3911{ 4009{
@@ -4025,17 +4123,11 @@ static inline void schedule_debug(struct task_struct *prev)
4025 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4123 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4026 4124
4027 schedstat_inc(this_rq(), sched_count); 4125 schedstat_inc(this_rq(), sched_count);
4028#ifdef CONFIG_SCHEDSTATS
4029 if (unlikely(prev->lock_depth >= 0)) {
4030 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4031 schedstat_inc(prev, sched_info.bkl_count);
4032 }
4033#endif
4034} 4126}
4035 4127
4036static void put_prev_task(struct rq *rq, struct task_struct *prev) 4128static void put_prev_task(struct rq *rq, struct task_struct *prev)
4037{ 4129{
4038 if (prev->se.on_rq) 4130 if (prev->on_rq || rq->skip_clock_update < 0)
4039 update_rq_clock(rq); 4131 update_rq_clock(rq);
4040 prev->sched_class->put_prev_task(rq, prev); 4132 prev->sched_class->put_prev_task(rq, prev);
4041} 4133}
@@ -4097,11 +4189,13 @@ need_resched:
4097 if (unlikely(signal_pending_state(prev->state, prev))) { 4189 if (unlikely(signal_pending_state(prev->state, prev))) {
4098 prev->state = TASK_RUNNING; 4190 prev->state = TASK_RUNNING;
4099 } else { 4191 } else {
4192 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4193 prev->on_rq = 0;
4194
4100 /* 4195 /*
4101 * If a worker is going to sleep, notify and 4196 * If a worker went to sleep, notify and ask workqueue
4102 * ask workqueue whether it wants to wake up a 4197 * whether it wants to wake up a task to maintain
4103 * task to maintain concurrency. If so, wake 4198 * concurrency.
4104 * up the task.
4105 */ 4199 */
4106 if (prev->flags & PF_WQ_WORKER) { 4200 if (prev->flags & PF_WQ_WORKER) {
4107 struct task_struct *to_wakeup; 4201 struct task_struct *to_wakeup;
@@ -4110,11 +4204,10 @@ need_resched:
4110 if (to_wakeup) 4204 if (to_wakeup)
4111 try_to_wake_up_local(to_wakeup); 4205 try_to_wake_up_local(to_wakeup);
4112 } 4206 }
4113 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4114 4207
4115 /* 4208 /*
4116 * If we are going to sleep and we have plugged IO queued, make 4209 * If we are going to sleep and we have plugged IO
4117 * sure to submit it to avoid deadlocks. 4210 * queued, make sure to submit it to avoid deadlocks.
4118 */ 4211 */
4119 if (blk_needs_flush_plug(prev)) { 4212 if (blk_needs_flush_plug(prev)) {
4120 raw_spin_unlock(&rq->lock); 4213 raw_spin_unlock(&rq->lock);
@@ -4161,70 +4254,53 @@ need_resched:
4161EXPORT_SYMBOL(schedule); 4254EXPORT_SYMBOL(schedule);
4162 4255
4163#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4256#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4164/*
4165 * Look out! "owner" is an entirely speculative pointer
4166 * access and not reliable.
4167 */
4168int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4169{
4170 unsigned int cpu;
4171 struct rq *rq;
4172 4257
4173 if (!sched_feat(OWNER_SPIN)) 4258static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4174 return 0; 4259{
4260 bool ret = false;
4175 4261
4176#ifdef CONFIG_DEBUG_PAGEALLOC 4262 rcu_read_lock();
4177 /* 4263 if (lock->owner != owner)
4178 * Need to access the cpu field knowing that 4264 goto fail;
4179 * DEBUG_PAGEALLOC could have unmapped it if
4180 * the mutex owner just released it and exited.
4181 */
4182 if (probe_kernel_address(&owner->cpu, cpu))
4183 return 0;
4184#else
4185 cpu = owner->cpu;
4186#endif
4187 4265
4188 /* 4266 /*
4189 * Even if the access succeeded (likely case), 4267 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4190 * the cpu field may no longer be valid. 4268 * lock->owner still matches owner, if that fails, owner might
4269 * point to free()d memory, if it still matches, the rcu_read_lock()
4270 * ensures the memory stays valid.
4191 */ 4271 */
4192 if (cpu >= nr_cpumask_bits) 4272 barrier();
4193 return 0;
4194 4273
4195 /* 4274 ret = owner->on_cpu;
4196 * We need to validate that we can do a 4275fail:
4197 * get_cpu() and that we have the percpu area. 4276 rcu_read_unlock();
4198 */
4199 if (!cpu_online(cpu))
4200 return 0;
4201 4277
4202 rq = cpu_rq(cpu); 4278 return ret;
4279}
4203 4280
4204 for (;;) { 4281/*
4205 /* 4282 * Look out! "owner" is an entirely speculative pointer
4206 * Owner changed, break to re-assess state. 4283 * access and not reliable.
4207 */ 4284 */
4208 if (lock->owner != owner) { 4285int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4209 /* 4286{
4210 * If the lock has switched to a different owner, 4287 if (!sched_feat(OWNER_SPIN))
4211 * we likely have heavy contention. Return 0 to quit 4288 return 0;
4212 * optimistic spinning and not contend further:
4213 */
4214 if (lock->owner)
4215 return 0;
4216 break;
4217 }
4218 4289
4219 /* 4290 while (owner_running(lock, owner)) {
4220 * Is that owner really running on that cpu? 4291 if (need_resched())
4221 */
4222 if (task_thread_info(rq->curr) != owner || need_resched())
4223 return 0; 4292 return 0;
4224 4293
4225 arch_mutex_cpu_relax(); 4294 arch_mutex_cpu_relax();
4226 } 4295 }
4227 4296
4297 /*
4298 * If the owner changed to another task there is likely
4299 * heavy contention, stop spinning.
4300 */
4301 if (lock->owner)
4302 return 0;
4303
4228 return 1; 4304 return 1;
4229} 4305}
4230#endif 4306#endif
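The rewritten spin loop keeps optimistic spinning only while the same owner is still on a CPU, and gives up as soon as rescheduling is needed or ownership moves to another task. A single-threaded sketch of that policy with stand-in types (need_resched() here is a stub, and the RCU protection around the owner dereference is not modeled):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct task { _Atomic bool on_cpu; };
struct mutex { _Atomic(struct task *) owner; };

static bool need_resched(void) { return false; }	/* demo stub */

static bool owner_running(struct mutex *lock, struct task *owner)
{
	if (atomic_load(&lock->owner) != owner)
		return false;			/* ownership changed under us */
	return atomic_load(&owner->on_cpu);	/* owner still executing? */
}

static int spin_on_owner(struct mutex *lock, struct task *owner)
{
	while (owner_running(lock, owner)) {
		if (need_resched())
			return 0;
		/* cpu_relax() would sit here */
	}
	if (atomic_load(&lock->owner))
		return 0;	/* new owner: heavy contention, stop spinning */
	return 1;		/* released while we watched: retry acquisition */
}

int main(void)
{
	struct task t = { false };	/* owner has blocked (not on a CPU) */
	struct mutex m = { &t };

	printf("owner blocked -> %d\n", spin_on_owner(&m, &t));	/* 0 */
	atomic_store(&m.owner, NULL);	/* owner dropped the mutex */
	printf("lock released -> %d\n", spin_on_owner(&m, &t));	/* 1 */
	return 0;
}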
@@ -4684,19 +4760,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4684 */ 4760 */
4685void rt_mutex_setprio(struct task_struct *p, int prio) 4761void rt_mutex_setprio(struct task_struct *p, int prio)
4686{ 4762{
4687 unsigned long flags;
4688 int oldprio, on_rq, running; 4763 int oldprio, on_rq, running;
4689 struct rq *rq; 4764 struct rq *rq;
4690 const struct sched_class *prev_class; 4765 const struct sched_class *prev_class;
4691 4766
4692 BUG_ON(prio < 0 || prio > MAX_PRIO); 4767 BUG_ON(prio < 0 || prio > MAX_PRIO);
4693 4768
4694 rq = task_rq_lock(p, &flags); 4769 rq = __task_rq_lock(p);
4695 4770
4696 trace_sched_pi_setprio(p, prio); 4771 trace_sched_pi_setprio(p, prio);
4697 oldprio = p->prio; 4772 oldprio = p->prio;
4698 prev_class = p->sched_class; 4773 prev_class = p->sched_class;
4699 on_rq = p->se.on_rq; 4774 on_rq = p->on_rq;
4700 running = task_current(rq, p); 4775 running = task_current(rq, p);
4701 if (on_rq) 4776 if (on_rq)
4702 dequeue_task(rq, p, 0); 4777 dequeue_task(rq, p, 0);
@@ -4716,7 +4791,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4791 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4717 4792
4718 check_class_changed(rq, p, prev_class, oldprio); 4793 check_class_changed(rq, p, prev_class, oldprio);
4719 task_rq_unlock(rq, &flags); 4794 __task_rq_unlock(rq);
4720} 4795}
4721 4796
4722#endif 4797#endif
@@ -4744,7 +4819,7 @@ void set_user_nice(struct task_struct *p, long nice)
4744 p->static_prio = NICE_TO_PRIO(nice); 4819 p->static_prio = NICE_TO_PRIO(nice);
4745 goto out_unlock; 4820 goto out_unlock;
4746 } 4821 }
4747 on_rq = p->se.on_rq; 4822 on_rq = p->on_rq;
4748 if (on_rq) 4823 if (on_rq)
4749 dequeue_task(rq, p, 0); 4824 dequeue_task(rq, p, 0);
4750 4825
@@ -4764,7 +4839,7 @@ void set_user_nice(struct task_struct *p, long nice)
4764 resched_task(rq->curr); 4839 resched_task(rq->curr);
4765 } 4840 }
4766out_unlock: 4841out_unlock:
4767 task_rq_unlock(rq, &flags); 4842 task_rq_unlock(rq, p, &flags);
4768} 4843}
4769EXPORT_SYMBOL(set_user_nice); 4844EXPORT_SYMBOL(set_user_nice);
4770 4845
@@ -4878,8 +4953,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4878static void 4953static void
4879__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4954__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4880{ 4955{
4881 BUG_ON(p->se.on_rq);
4882
4883 p->policy = policy; 4956 p->policy = policy;
4884 p->rt_priority = prio; 4957 p->rt_priority = prio;
4885 p->normal_prio = normal_prio(p); 4958 p->normal_prio = normal_prio(p);
@@ -4994,20 +5067,17 @@ recheck:
4994 /* 5067 /*
4995 * make sure no PI-waiters arrive (or leave) while we are 5068 * make sure no PI-waiters arrive (or leave) while we are
4996 * changing the priority of the task: 5069 * changing the priority of the task:
4997 */ 5070 *
4998 raw_spin_lock_irqsave(&p->pi_lock, flags);
4999 /*
5000 * To be able to change p->policy safely, the appropriate 5071 * To be able to change p->policy safely, the appropriate
5001 * runqueue lock must be held. 5072 * runqueue lock must be held.
5002 */ 5073 */
5003 rq = __task_rq_lock(p); 5074 rq = task_rq_lock(p, &flags);
5004 5075
5005 /* 5076 /*
5006 * Changing the policy of the stop threads is a very bad idea 5077 * Changing the policy of the stop threads is a very bad idea
5007 */ 5078 */
5008 if (p == rq->stop) { 5079 if (p == rq->stop) {
5009 __task_rq_unlock(rq); 5080 task_rq_unlock(rq, p, &flags);
5010 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5011 return -EINVAL; 5081 return -EINVAL;
5012 } 5082 }
5013 5083
@@ -5031,8 +5101,7 @@ recheck:
5031 if (rt_bandwidth_enabled() && rt_policy(policy) && 5101 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5032 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5102 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5033 !task_group_is_autogroup(task_group(p))) { 5103 !task_group_is_autogroup(task_group(p))) {
5034 __task_rq_unlock(rq); 5104 task_rq_unlock(rq, p, &flags);
5035 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5036 return -EPERM; 5105 return -EPERM;
5037 } 5106 }
5038 } 5107 }
@@ -5041,11 +5110,10 @@ recheck:
5041 /* recheck policy now with rq lock held */ 5110 /* recheck policy now with rq lock held */
5042 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5111 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5043 policy = oldpolicy = -1; 5112 policy = oldpolicy = -1;
5044 __task_rq_unlock(rq); 5113 task_rq_unlock(rq, p, &flags);
5045 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5046 goto recheck; 5114 goto recheck;
5047 } 5115 }
5048 on_rq = p->se.on_rq; 5116 on_rq = p->on_rq;
5049 running = task_current(rq, p); 5117 running = task_current(rq, p);
5050 if (on_rq) 5118 if (on_rq)
5051 deactivate_task(rq, p, 0); 5119 deactivate_task(rq, p, 0);
@@ -5064,8 +5132,7 @@ recheck:
5064 activate_task(rq, p, 0); 5132 activate_task(rq, p, 0);
5065 5133
5066 check_class_changed(rq, p, prev_class, oldprio); 5134 check_class_changed(rq, p, prev_class, oldprio);
5067 __task_rq_unlock(rq); 5135 task_rq_unlock(rq, p, &flags);
5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5069 5136
5070 rt_mutex_adjust_pi(p); 5137 rt_mutex_adjust_pi(p);
5071 5138
@@ -5316,7 +5383,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5316{ 5383{
5317 struct task_struct *p; 5384 struct task_struct *p;
5318 unsigned long flags; 5385 unsigned long flags;
5319 struct rq *rq;
5320 int retval; 5386 int retval;
5321 5387
5322 get_online_cpus(); 5388 get_online_cpus();
@@ -5331,9 +5397,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5331 if (retval) 5397 if (retval)
5332 goto out_unlock; 5398 goto out_unlock;
5333 5399
5334 rq = task_rq_lock(p, &flags); 5400 raw_spin_lock_irqsave(&p->pi_lock, flags);
5335 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5401 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5336 task_rq_unlock(rq, &flags); 5402 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5337 5403
5338out_unlock: 5404out_unlock:
5339 rcu_read_unlock(); 5405 rcu_read_unlock();
@@ -5658,7 +5724,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5658 5724
5659 rq = task_rq_lock(p, &flags); 5725 rq = task_rq_lock(p, &flags);
5660 time_slice = p->sched_class->get_rr_interval(rq, p); 5726 time_slice = p->sched_class->get_rr_interval(rq, p);
5661 task_rq_unlock(rq, &flags); 5727 task_rq_unlock(rq, p, &flags);
5662 5728
5663 rcu_read_unlock(); 5729 rcu_read_unlock();
5664 jiffies_to_timespec(time_slice, &t); 5730 jiffies_to_timespec(time_slice, &t);
@@ -5776,17 +5842,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5776 rcu_read_unlock(); 5842 rcu_read_unlock();
5777 5843
5778 rq->curr = rq->idle = idle; 5844 rq->curr = rq->idle = idle;
5779#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5845#if defined(CONFIG_SMP)
5780 idle->oncpu = 1; 5846 idle->on_cpu = 1;
5781#endif 5847#endif
5782 raw_spin_unlock_irqrestore(&rq->lock, flags); 5848 raw_spin_unlock_irqrestore(&rq->lock, flags);
5783 5849
5784 /* Set the preempt count _outside_ the spinlocks! */ 5850 /* Set the preempt count _outside_ the spinlocks! */
5785#if defined(CONFIG_PREEMPT)
5786 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5787#else
5788 task_thread_info(idle)->preempt_count = 0; 5851 task_thread_info(idle)->preempt_count = 0;
5789#endif 5852
5790 /* 5853 /*
5791 * The idle tasks have their own, simple scheduling class: 5854 * The idle tasks have their own, simple scheduling class:
5792 */ 5855 */
@@ -5881,26 +5944,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5881 unsigned int dest_cpu; 5944 unsigned int dest_cpu;
5882 int ret = 0; 5945 int ret = 0;
5883 5946
5884 /*
5885 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5886 * drop the rq->lock and still rely on ->cpus_allowed.
5887 */
5888again:
5889 while (task_is_waking(p))
5890 cpu_relax();
5891 rq = task_rq_lock(p, &flags); 5947 rq = task_rq_lock(p, &flags);
5892 if (task_is_waking(p)) { 5948
5893 task_rq_unlock(rq, &flags); 5949 if (cpumask_equal(&p->cpus_allowed, new_mask))
5894 goto again; 5950 goto out;
5895 }
5896 5951
5897 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5952 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5898 ret = -EINVAL; 5953 ret = -EINVAL;
5899 goto out; 5954 goto out;
5900 } 5955 }
5901 5956
5902 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5957 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5903 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5904 ret = -EINVAL; 5958 ret = -EINVAL;
5905 goto out; 5959 goto out;
5906 } 5960 }
@@ -5917,16 +5971,16 @@ again:
5917 goto out; 5971 goto out;
5918 5972
5919 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5973 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5920 if (migrate_task(p, rq)) { 5974 if (p->on_rq) {
5921 struct migration_arg arg = { p, dest_cpu }; 5975 struct migration_arg arg = { p, dest_cpu };
5922 /* Need help from migration thread: drop lock and wait. */ 5976 /* Need help from migration thread: drop lock and wait. */
5923 task_rq_unlock(rq, &flags); 5977 task_rq_unlock(rq, p, &flags);
5924 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5978 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5925 tlb_migrate_finish(p->mm); 5979 tlb_migrate_finish(p->mm);
5926 return 0; 5980 return 0;
5927 } 5981 }
5928out: 5982out:
5929 task_rq_unlock(rq, &flags); 5983 task_rq_unlock(rq, p, &flags);
5930 5984
5931 return ret; 5985 return ret;
5932} 5986}
@@ -5954,6 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5954 rq_src = cpu_rq(src_cpu); 6008 rq_src = cpu_rq(src_cpu);
5955 rq_dest = cpu_rq(dest_cpu); 6009 rq_dest = cpu_rq(dest_cpu);
5956 6010
6011 raw_spin_lock(&p->pi_lock);
5957 double_rq_lock(rq_src, rq_dest); 6012 double_rq_lock(rq_src, rq_dest);
5958 /* Already moved. */ 6013 /* Already moved. */
5959 if (task_cpu(p) != src_cpu) 6014 if (task_cpu(p) != src_cpu)
@@ -5966,7 +6021,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5966 * If we're not on a rq, the next wake-up will ensure we're 6021 * If we're not on a rq, the next wake-up will ensure we're
5967 * placed properly. 6022 * placed properly.
5968 */ 6023 */
5969 if (p->se.on_rq) { 6024 if (p->on_rq) {
5970 deactivate_task(rq_src, p, 0); 6025 deactivate_task(rq_src, p, 0);
5971 set_task_cpu(p, dest_cpu); 6026 set_task_cpu(p, dest_cpu);
5972 activate_task(rq_dest, p, 0); 6027 activate_task(rq_dest, p, 0);
@@ -5976,6 +6031,7 @@ done:
5976 ret = 1; 6031 ret = 1;
5977fail: 6032fail:
5978 double_rq_unlock(rq_src, rq_dest); 6033 double_rq_unlock(rq_src, rq_dest);
6034 raw_spin_unlock(&p->pi_lock);
5979 return ret; 6035 return ret;
5980} 6036}
5981 6037
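
The hunk above wraps the double runqueue lock in __migrate_task() with p->pi_lock, so the per-task pi_lock is always taken before any runqueue lock and released after it. A minimal userspace sketch of that fixed lock-ordering discipline, using pthreads; the names pi_lock/rq_lock and the migrate() helper are illustrative stand-ins, not the kernel objects:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;   /* outer lock */
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;   /* inner lock */
static int task_cpu;

/* Every path takes pi_lock before rq_lock, so the two can never deadlock. */
static void migrate(int dest)
{
    pthread_mutex_lock(&pi_lock);
    pthread_mutex_lock(&rq_lock);
    task_cpu = dest;
    pthread_mutex_unlock(&rq_lock);
    pthread_mutex_unlock(&pi_lock);
}

int main(void)
{
    migrate(1);
    printf("task now on cpu %d\n", task_cpu);
    return 0;
}

Build with -pthread; the point is only the consistent outer-before-inner ordering shown in the hunk.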
@@ -6316,6 +6372,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6316 6372
6317#ifdef CONFIG_HOTPLUG_CPU 6373#ifdef CONFIG_HOTPLUG_CPU
6318 case CPU_DYING: 6374 case CPU_DYING:
6375 sched_ttwu_pending();
6319 /* Update our root-domain */ 6376 /* Update our root-domain */
6320 raw_spin_lock_irqsave(&rq->lock, flags); 6377 raw_spin_lock_irqsave(&rq->lock, flags);
6321 if (rq->rd) { 6378 if (rq->rd) {
@@ -6394,6 +6451,8 @@ early_initcall(migration_init);
6394 6451
6395#ifdef CONFIG_SMP 6452#ifdef CONFIG_SMP
6396 6453
6454static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6455
6397#ifdef CONFIG_SCHED_DEBUG 6456#ifdef CONFIG_SCHED_DEBUG
6398 6457
6399static __read_mostly int sched_domain_debug_enabled; 6458static __read_mostly int sched_domain_debug_enabled;
@@ -6489,7 +6548,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6489 6548
6490static void sched_domain_debug(struct sched_domain *sd, int cpu) 6549static void sched_domain_debug(struct sched_domain *sd, int cpu)
6491{ 6550{
6492 cpumask_var_t groupmask;
6493 int level = 0; 6551 int level = 0;
6494 6552
6495 if (!sched_domain_debug_enabled) 6553 if (!sched_domain_debug_enabled)
@@ -6502,20 +6560,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6502 6560
6503 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6561 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6504 6562
6505 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6506 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6507 return;
6508 }
6509
6510 for (;;) { 6563 for (;;) {
6511 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6564 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6512 break; 6565 break;
6513 level++; 6566 level++;
6514 sd = sd->parent; 6567 sd = sd->parent;
6515 if (!sd) 6568 if (!sd)
6516 break; 6569 break;
6517 } 6570 }
6518 free_cpumask_var(groupmask);
6519} 6571}
6520#else /* !CONFIG_SCHED_DEBUG */ 6572#else /* !CONFIG_SCHED_DEBUG */
6521# define sched_domain_debug(sd, cpu) do { } while (0) 6573# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6572,12 +6624,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6572 return 1; 6624 return 1;
6573} 6625}
6574 6626
6575static void free_rootdomain(struct root_domain *rd) 6627static void free_rootdomain(struct rcu_head *rcu)
6576{ 6628{
6577 synchronize_sched(); 6629 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6578 6630
6579 cpupri_cleanup(&rd->cpupri); 6631 cpupri_cleanup(&rd->cpupri);
6580
6581 free_cpumask_var(rd->rto_mask); 6632 free_cpumask_var(rd->rto_mask);
6582 free_cpumask_var(rd->online); 6633 free_cpumask_var(rd->online);
6583 free_cpumask_var(rd->span); 6634 free_cpumask_var(rd->span);
@@ -6618,7 +6669,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6618 raw_spin_unlock_irqrestore(&rq->lock, flags); 6669 raw_spin_unlock_irqrestore(&rq->lock, flags);
6619 6670
6620 if (old_rd) 6671 if (old_rd)
6621 free_rootdomain(old_rd); 6672 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6622} 6673}
6623 6674
6624static int init_rootdomain(struct root_domain *rd) 6675static int init_rootdomain(struct root_domain *rd)
@@ -6669,6 +6720,25 @@ static struct root_domain *alloc_rootdomain(void)
6669 return rd; 6720 return rd;
6670} 6721}
6671 6722
6723static void free_sched_domain(struct rcu_head *rcu)
6724{
6725 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6726 if (atomic_dec_and_test(&sd->groups->ref))
6727 kfree(sd->groups);
6728 kfree(sd);
6729}
6730
6731static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6732{
6733 call_rcu(&sd->rcu, free_sched_domain);
6734}
6735
6736static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6737{
6738 for (; sd; sd = sd->parent)
6739 destroy_sched_domain(sd, cpu);
6740}
6741
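
With this change free_rootdomain() and the new free_sched_domain() are RCU callbacks: they receive only the embedded rcu_head, recover the enclosing object with container_of(), and free it once call_rcu()/call_rcu_sched() has let a grace period elapse. A minimal userspace mock of that callback shape; the rcu_head struct, the simplified container_of macro, and the immediate invocation are stand-ins, since the real call is deferred by the RCU machinery:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct rcu_head { void (*func)(struct rcu_head *head); };
struct root_domain { int refcount; struct rcu_head rcu; };

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

/* The callback only sees the rcu_head; step back to the containing object. */
static void free_rootdomain_cb(struct rcu_head *rcu)
{
    struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
    printf("freeing root_domain, refcount %d\n", rd->refcount);
    free(rd);
}

int main(void)
{
    struct root_domain *rd = calloc(1, sizeof(*rd));
    if (!rd)
        return 1;
    rd->rcu.func = free_rootdomain_cb;
    rd->rcu.func(&rd->rcu);   /* the kernel defers this past a grace period */
    return 0;
}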
6672/* 6742/*
6673 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6743 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6674 * hold the hotplug lock. 6744 * hold the hotplug lock.
@@ -6679,9 +6749,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6679 struct rq *rq = cpu_rq(cpu); 6749 struct rq *rq = cpu_rq(cpu);
6680 struct sched_domain *tmp; 6750 struct sched_domain *tmp;
6681 6751
6682 for (tmp = sd; tmp; tmp = tmp->parent)
6683 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6684
6685 /* Remove the sched domains which do not contribute to scheduling. */ 6752 /* Remove the sched domains which do not contribute to scheduling. */
6686 for (tmp = sd; tmp; ) { 6753 for (tmp = sd; tmp; ) {
6687 struct sched_domain *parent = tmp->parent; 6754 struct sched_domain *parent = tmp->parent;
@@ -6692,12 +6759,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6692 tmp->parent = parent->parent; 6759 tmp->parent = parent->parent;
6693 if (parent->parent) 6760 if (parent->parent)
6694 parent->parent->child = tmp; 6761 parent->parent->child = tmp;
6762 destroy_sched_domain(parent, cpu);
6695 } else 6763 } else
6696 tmp = tmp->parent; 6764 tmp = tmp->parent;
6697 } 6765 }
6698 6766
6699 if (sd && sd_degenerate(sd)) { 6767 if (sd && sd_degenerate(sd)) {
6768 tmp = sd;
6700 sd = sd->parent; 6769 sd = sd->parent;
6770 destroy_sched_domain(tmp, cpu);
6701 if (sd) 6771 if (sd)
6702 sd->child = NULL; 6772 sd->child = NULL;
6703 } 6773 }
@@ -6705,7 +6775,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6705 sched_domain_debug(sd, cpu); 6775 sched_domain_debug(sd, cpu);
6706 6776
6707 rq_attach_root(rq, rd); 6777 rq_attach_root(rq, rd);
6778 tmp = rq->sd;
6708 rcu_assign_pointer(rq->sd, sd); 6779 rcu_assign_pointer(rq->sd, sd);
6780 destroy_sched_domains(tmp, cpu);
6709} 6781}
6710 6782
6711/* cpus with isolated domains */ 6783/* cpus with isolated domains */
@@ -6721,56 +6793,6 @@ static int __init isolated_cpu_setup(char *str)
6721 6793
6722__setup("isolcpus=", isolated_cpu_setup); 6794__setup("isolcpus=", isolated_cpu_setup);
6723 6795
6724/*
6725 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6726 * to a function which identifies what group(along with sched group) a CPU
6727 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6728 * (due to the fact that we keep track of groups covered with a struct cpumask).
6729 *
6730 * init_sched_build_groups will build a circular linked list of the groups
6731 * covered by the given span, and will set each group's ->cpumask correctly,
6732 * and ->cpu_power to 0.
6733 */
6734static void
6735init_sched_build_groups(const struct cpumask *span,
6736 const struct cpumask *cpu_map,
6737 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6738 struct sched_group **sg,
6739 struct cpumask *tmpmask),
6740 struct cpumask *covered, struct cpumask *tmpmask)
6741{
6742 struct sched_group *first = NULL, *last = NULL;
6743 int i;
6744
6745 cpumask_clear(covered);
6746
6747 for_each_cpu(i, span) {
6748 struct sched_group *sg;
6749 int group = group_fn(i, cpu_map, &sg, tmpmask);
6750 int j;
6751
6752 if (cpumask_test_cpu(i, covered))
6753 continue;
6754
6755 cpumask_clear(sched_group_cpus(sg));
6756 sg->cpu_power = 0;
6757
6758 for_each_cpu(j, span) {
6759 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6760 continue;
6761
6762 cpumask_set_cpu(j, covered);
6763 cpumask_set_cpu(j, sched_group_cpus(sg));
6764 }
6765 if (!first)
6766 first = sg;
6767 if (last)
6768 last->next = sg;
6769 last = sg;
6770 }
6771 last->next = first;
6772}
6773
6774#define SD_NODES_PER_DOMAIN 16 6796#define SD_NODES_PER_DOMAIN 16
6775 6797
6776#ifdef CONFIG_NUMA 6798#ifdef CONFIG_NUMA
@@ -6787,7 +6809,7 @@ init_sched_build_groups(const struct cpumask *span,
6787 */ 6809 */
6788static int find_next_best_node(int node, nodemask_t *used_nodes) 6810static int find_next_best_node(int node, nodemask_t *used_nodes)
6789{ 6811{
6790 int i, n, val, min_val, best_node = 0; 6812 int i, n, val, min_val, best_node = -1;
6791 6813
6792 min_val = INT_MAX; 6814 min_val = INT_MAX;
6793 6815
@@ -6811,7 +6833,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6811 } 6833 }
6812 } 6834 }
6813 6835
6814 node_set(best_node, *used_nodes); 6836 if (best_node != -1)
6837 node_set(best_node, *used_nodes);
6815 return best_node; 6838 return best_node;
6816} 6839}
6817 6840
@@ -6837,315 +6860,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6837 6860
6838 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6861 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6839 int next_node = find_next_best_node(node, &used_nodes); 6862 int next_node = find_next_best_node(node, &used_nodes);
6840 6863 if (next_node < 0)
6864 break;
6841 cpumask_or(span, span, cpumask_of_node(next_node)); 6865 cpumask_or(span, span, cpumask_of_node(next_node));
6842 } 6866 }
6843} 6867}
6868
6869static const struct cpumask *cpu_node_mask(int cpu)
6870{
6871 lockdep_assert_held(&sched_domains_mutex);
6872
6873 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6874
6875 return sched_domains_tmpmask;
6876}
6877
6878static const struct cpumask *cpu_allnodes_mask(int cpu)
6879{
6880 return cpu_possible_mask;
6881}
6844#endif /* CONFIG_NUMA */ 6882#endif /* CONFIG_NUMA */
6845 6883
6846int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6884static const struct cpumask *cpu_cpu_mask(int cpu)
6885{
6886 return cpumask_of_node(cpu_to_node(cpu));
6887}
6847 6888
6848/* 6889int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6849 * The cpus mask in sched_group and sched_domain hangs off the end.
6850 *
6851 * ( See the the comments in include/linux/sched.h:struct sched_group
6852 * and struct sched_domain. )
6853 */
6854struct static_sched_group {
6855 struct sched_group sg;
6856 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6857};
6858 6890
6859struct static_sched_domain { 6891struct sd_data {
6860 struct sched_domain sd; 6892 struct sched_domain **__percpu sd;
6861 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6893 struct sched_group **__percpu sg;
6862}; 6894};
6863 6895
6864struct s_data { 6896struct s_data {
6865#ifdef CONFIG_NUMA 6897 struct sched_domain ** __percpu sd;
6866 int sd_allnodes;
6867 cpumask_var_t domainspan;
6868 cpumask_var_t covered;
6869 cpumask_var_t notcovered;
6870#endif
6871 cpumask_var_t nodemask;
6872 cpumask_var_t this_sibling_map;
6873 cpumask_var_t this_core_map;
6874 cpumask_var_t this_book_map;
6875 cpumask_var_t send_covered;
6876 cpumask_var_t tmpmask;
6877 struct sched_group **sched_group_nodes;
6878 struct root_domain *rd; 6898 struct root_domain *rd;
6879}; 6899};
6880 6900
6881enum s_alloc { 6901enum s_alloc {
6882 sa_sched_groups = 0,
6883 sa_rootdomain, 6902 sa_rootdomain,
6884 sa_tmpmask, 6903 sa_sd,
6885 sa_send_covered, 6904 sa_sd_storage,
6886 sa_this_book_map,
6887 sa_this_core_map,
6888 sa_this_sibling_map,
6889 sa_nodemask,
6890 sa_sched_group_nodes,
6891#ifdef CONFIG_NUMA
6892 sa_notcovered,
6893 sa_covered,
6894 sa_domainspan,
6895#endif
6896 sa_none, 6905 sa_none,
6897}; 6906};
6898 6907
6899/* 6908struct sched_domain_topology_level;
6900 * SMT sched-domains:
6901 */
6902#ifdef CONFIG_SCHED_SMT
6903static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6904static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6905 6909
6906static int 6910typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6907cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6911typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6908 struct sched_group **sg, struct cpumask *unused)
6909{
6910 if (sg)
6911 *sg = &per_cpu(sched_groups, cpu).sg;
6912 return cpu;
6913}
6914#endif /* CONFIG_SCHED_SMT */
6915 6912
6916/* 6913struct sched_domain_topology_level {
6917 * multi-core sched-domains: 6914 sched_domain_init_f init;
6918 */ 6915 sched_domain_mask_f mask;
6919#ifdef CONFIG_SCHED_MC 6916 struct sd_data data;
6920static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6917};
6921static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6922
6923static int
6924cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6925 struct sched_group **sg, struct cpumask *mask)
6926{
6927 int group;
6928#ifdef CONFIG_SCHED_SMT
6929 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6930 group = cpumask_first(mask);
6931#else
6932 group = cpu;
6933#endif
6934 if (sg)
6935 *sg = &per_cpu(sched_group_core, group).sg;
6936 return group;
6937}
6938#endif /* CONFIG_SCHED_MC */
6939 6918
6940/* 6919/*
6941 * book sched-domains: 6920 * Assumes the sched_domain tree is fully constructed
6942 */ 6921 */
6943#ifdef CONFIG_SCHED_BOOK 6922static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6944static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6945static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6946
6947static int
6948cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6949 struct sched_group **sg, struct cpumask *mask)
6950{ 6923{
6951 int group = cpu; 6924 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6952#ifdef CONFIG_SCHED_MC 6925 struct sched_domain *child = sd->child;
6953 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6954 group = cpumask_first(mask);
6955#elif defined(CONFIG_SCHED_SMT)
6956 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6957 group = cpumask_first(mask);
6958#endif
6959 if (sg)
6960 *sg = &per_cpu(sched_group_book, group).sg;
6961 return group;
6962}
6963#endif /* CONFIG_SCHED_BOOK */
6964 6926
6965static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6927 if (child)
6966static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6928 cpu = cpumask_first(sched_domain_span(child));
6967 6929
6968static int
6969cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6970 struct sched_group **sg, struct cpumask *mask)
6971{
6972 int group;
6973#ifdef CONFIG_SCHED_BOOK
6974 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6975 group = cpumask_first(mask);
6976#elif defined(CONFIG_SCHED_MC)
6977 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6978 group = cpumask_first(mask);
6979#elif defined(CONFIG_SCHED_SMT)
6980 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6981 group = cpumask_first(mask);
6982#else
6983 group = cpu;
6984#endif
6985 if (sg) 6930 if (sg)
6986 *sg = &per_cpu(sched_group_phys, group).sg; 6931 *sg = *per_cpu_ptr(sdd->sg, cpu);
6987 return group; 6932
6933 return cpu;
6988} 6934}
6989 6935
6990#ifdef CONFIG_NUMA
6991/* 6936/*
6992 * The init_sched_build_groups can't handle what we want to do with node 6937 * build_sched_groups takes the cpumask we wish to span, and a pointer
6993 * groups, so roll our own. Now each node has its own list of groups which 6938 * to a function which identifies what group(along with sched group) a CPU
6994 * gets dynamically allocated. 6939 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6940 * (due to the fact that we keep track of groups covered with a struct cpumask).
6941 *
6942 * build_sched_groups will build a circular linked list of the groups
6943 * covered by the given span, and will set each group's ->cpumask correctly,
6944 * and ->cpu_power to 0.
6995 */ 6945 */
6996static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6946static void
6997static struct sched_group ***sched_group_nodes_bycpu; 6947build_sched_groups(struct sched_domain *sd)
6998
6999static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
7000static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7001
7002static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7003 struct sched_group **sg,
7004 struct cpumask *nodemask)
7005{
7006 int group;
7007
7008 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7009 group = cpumask_first(nodemask);
7010
7011 if (sg)
7012 *sg = &per_cpu(sched_group_allnodes, group).sg;
7013 return group;
7014}
7015
7016static void init_numa_sched_groups_power(struct sched_group *group_head)
7017{
7018 struct sched_group *sg = group_head;
7019 int j;
7020
7021 if (!sg)
7022 return;
7023 do {
7024 for_each_cpu(j, sched_group_cpus(sg)) {
7025 struct sched_domain *sd;
7026
7027 sd = &per_cpu(phys_domains, j).sd;
7028 if (j != group_first_cpu(sd->groups)) {
7029 /*
7030 * Only add "power" once for each
7031 * physical package.
7032 */
7033 continue;
7034 }
7035
7036 sg->cpu_power += sd->groups->cpu_power;
7037 }
7038 sg = sg->next;
7039 } while (sg != group_head);
7040}
7041
7042static int build_numa_sched_groups(struct s_data *d,
7043 const struct cpumask *cpu_map, int num)
7044{ 6948{
7045 struct sched_domain *sd; 6949 struct sched_group *first = NULL, *last = NULL;
7046 struct sched_group *sg, *prev; 6950 struct sd_data *sdd = sd->private;
7047 int n, j; 6951 const struct cpumask *span = sched_domain_span(sd);
7048 6952 struct cpumask *covered;
7049 cpumask_clear(d->covered); 6953 int i;
7050 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
7051 if (cpumask_empty(d->nodemask)) {
7052 d->sched_group_nodes[num] = NULL;
7053 goto out;
7054 }
7055
7056 sched_domain_node_span(num, d->domainspan);
7057 cpumask_and(d->domainspan, d->domainspan, cpu_map);
7058
7059 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7060 GFP_KERNEL, num);
7061 if (!sg) {
7062 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
7063 num);
7064 return -ENOMEM;
7065 }
7066 d->sched_group_nodes[num] = sg;
7067
7068 for_each_cpu(j, d->nodemask) {
7069 sd = &per_cpu(node_domains, j).sd;
7070 sd->groups = sg;
7071 }
7072
7073 sg->cpu_power = 0;
7074 cpumask_copy(sched_group_cpus(sg), d->nodemask);
7075 sg->next = sg;
7076 cpumask_or(d->covered, d->covered, d->nodemask);
7077 6954
7078 prev = sg; 6955 lockdep_assert_held(&sched_domains_mutex);
7079 for (j = 0; j < nr_node_ids; j++) { 6956 covered = sched_domains_tmpmask;
7080 n = (num + j) % nr_node_ids;
7081 cpumask_complement(d->notcovered, d->covered);
7082 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
7083 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
7084 if (cpumask_empty(d->tmpmask))
7085 break;
7086 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
7087 if (cpumask_empty(d->tmpmask))
7088 continue;
7089 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7090 GFP_KERNEL, num);
7091 if (!sg) {
7092 printk(KERN_WARNING
7093 "Can not alloc domain group for node %d\n", j);
7094 return -ENOMEM;
7095 }
7096 sg->cpu_power = 0;
7097 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
7098 sg->next = prev->next;
7099 cpumask_or(d->covered, d->covered, d->tmpmask);
7100 prev->next = sg;
7101 prev = sg;
7102 }
7103out:
7104 return 0;
7105}
7106#endif /* CONFIG_NUMA */
7107 6957
7108#ifdef CONFIG_NUMA 6958 cpumask_clear(covered);
7109/* Free memory allocated for various sched_group structures */
7110static void free_sched_groups(const struct cpumask *cpu_map,
7111 struct cpumask *nodemask)
7112{
7113 int cpu, i;
7114 6959
7115 for_each_cpu(cpu, cpu_map) { 6960 for_each_cpu(i, span) {
7116 struct sched_group **sched_group_nodes 6961 struct sched_group *sg;
7117 = sched_group_nodes_bycpu[cpu]; 6962 int group = get_group(i, sdd, &sg);
6963 int j;
7118 6964
7119 if (!sched_group_nodes) 6965 if (cpumask_test_cpu(i, covered))
7120 continue; 6966 continue;
7121 6967
7122 for (i = 0; i < nr_node_ids; i++) { 6968 cpumask_clear(sched_group_cpus(sg));
7123 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6969 sg->cpu_power = 0;
7124 6970
7125 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6971 for_each_cpu(j, span) {
7126 if (cpumask_empty(nodemask)) 6972 if (get_group(j, sdd, NULL) != group)
7127 continue; 6973 continue;
7128 6974
7129 if (sg == NULL) 6975 cpumask_set_cpu(j, covered);
7130 continue; 6976 cpumask_set_cpu(j, sched_group_cpus(sg));
7131 sg = sg->next;
7132next_sg:
7133 oldsg = sg;
7134 sg = sg->next;
7135 kfree(oldsg);
7136 if (oldsg != sched_group_nodes[i])
7137 goto next_sg;
7138 } 6977 }
7139 kfree(sched_group_nodes); 6978
7140 sched_group_nodes_bycpu[cpu] = NULL; 6979 if (!first)
6980 first = sg;
6981 if (last)
6982 last->next = sg;
6983 last = sg;
7141 } 6984 }
6985 last->next = first;
7142} 6986}
7143#else /* !CONFIG_NUMA */
7144static void free_sched_groups(const struct cpumask *cpu_map,
7145 struct cpumask *nodemask)
7146{
7147}
7148#endif /* CONFIG_NUMA */
7149 6987
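
The rewritten build_sched_groups() walks the domain span, skips CPUs already in the covered mask, fills each group's cpumask via get_group(), and links the groups into a ring with last->next = first. A small userspace sketch of that cover-and-ring construction; a bitmask stands in for the cpumask and a trivial group_of() replaces get_group():

#include <stdio.h>
#include <stdlib.h>

#define NCPU 8

struct group {
    unsigned mask;          /* CPUs in this group */
    struct group *next;     /* circular list */
};

/* Stand-in for get_group(): here, pairs of adjacent CPUs share a group. */
static int group_of(int cpu) { return cpu / 2; }

static struct group *build_groups(unsigned span)
{
    struct group *first = NULL, *last = NULL;
    unsigned covered = 0;

    for (int i = 0; i < NCPU; i++) {
        if (!(span & (1u << i)) || (covered & (1u << i)))
            continue;

        struct group *sg = calloc(1, sizeof(*sg));
        if (!sg)
            return first;
        for (int j = 0; j < NCPU; j++) {
            if ((span & (1u << j)) && group_of(j) == group_of(i)) {
                covered |= 1u << j;
                sg->mask |= 1u << j;
            }
        }
        if (!first)
            first = sg;
        if (last)
            last->next = sg;
        last = sg;
    }
    if (last)
        last->next = first;     /* close the ring */
    return first;
}

int main(void)
{
    struct group *g = build_groups(0xff), *start = g;
    do {
        printf("group mask %#x\n", g->mask);
        g = g->next;
    } while (g != start);
    return 0;
}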
7150/* 6988/*
7151 * Initialize sched groups cpu_power. 6989 * Initialize sched groups cpu_power.
@@ -7159,11 +6997,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
7159 */ 6997 */
7160static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6998static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7161{ 6999{
7162 struct sched_domain *child;
7163 struct sched_group *group;
7164 long power;
7165 int weight;
7166
7167 WARN_ON(!sd || !sd->groups); 7000 WARN_ON(!sd || !sd->groups);
7168 7001
7169 if (cpu != group_first_cpu(sd->groups)) 7002 if (cpu != group_first_cpu(sd->groups))
@@ -7171,36 +7004,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7171 7004
7172 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7005 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7173 7006
7174 child = sd->child; 7007 update_group_power(sd, cpu);
7175
7176 sd->groups->cpu_power = 0;
7177
7178 if (!child) {
7179 power = SCHED_LOAD_SCALE;
7180 weight = cpumask_weight(sched_domain_span(sd));
7181 /*
7182 * SMT siblings share the power of a single core.
7183 * Usually multiple threads get a better yield out of
7184 * that one core than a single thread would have,
7185 * reflect that in sd->smt_gain.
7186 */
7187 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
7188 power *= sd->smt_gain;
7189 power /= weight;
7190 power >>= SCHED_LOAD_SHIFT;
7191 }
7192 sd->groups->cpu_power += power;
7193 return;
7194 }
7195
7196 /*
7197 * Add cpu_power of each child group to this groups cpu_power.
7198 */
7199 group = child->groups;
7200 do {
7201 sd->groups->cpu_power += group->cpu_power;
7202 group = group->next;
7203 } while (group != child->groups);
7204} 7008}
7205 7009
7206/* 7010/*
@@ -7214,15 +7018,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7214# define SD_INIT_NAME(sd, type) do { } while (0) 7018# define SD_INIT_NAME(sd, type) do { } while (0)
7215#endif 7019#endif
7216 7020
7217#define SD_INIT(sd, type) sd_init_##type(sd) 7021#define SD_INIT_FUNC(type) \
7218 7022static noinline struct sched_domain * \
7219#define SD_INIT_FUNC(type) \ 7023sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7220static noinline void sd_init_##type(struct sched_domain *sd) \ 7024{ \
7221{ \ 7025 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7222 memset(sd, 0, sizeof(*sd)); \ 7026 *sd = SD_##type##_INIT; \
7223 *sd = SD_##type##_INIT; \ 7027 SD_INIT_NAME(sd, type); \
7224 sd->level = SD_LV_##type; \ 7028 sd->private = &tl->data; \
7225 SD_INIT_NAME(sd, type); \ 7029 return sd; \
7226} 7030}
7227 7031
7228SD_INIT_FUNC(CPU) 7032SD_INIT_FUNC(CPU)
@@ -7241,13 +7045,14 @@ SD_INIT_FUNC(CPU)
7241#endif 7045#endif
7242 7046
7243static int default_relax_domain_level = -1; 7047static int default_relax_domain_level = -1;
7048int sched_domain_level_max;
7244 7049
7245static int __init setup_relax_domain_level(char *str) 7050static int __init setup_relax_domain_level(char *str)
7246{ 7051{
7247 unsigned long val; 7052 unsigned long val;
7248 7053
7249 val = simple_strtoul(str, NULL, 0); 7054 val = simple_strtoul(str, NULL, 0);
7250 if (val < SD_LV_MAX) 7055 if (val < sched_domain_level_max)
7251 default_relax_domain_level = val; 7056 default_relax_domain_level = val;
7252 7057
7253 return 1; 7058 return 1;
@@ -7275,37 +7080,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7275 } 7080 }
7276} 7081}
7277 7082
7083static void __sdt_free(const struct cpumask *cpu_map);
7084static int __sdt_alloc(const struct cpumask *cpu_map);
7085
7278static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7086static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7279 const struct cpumask *cpu_map) 7087 const struct cpumask *cpu_map)
7280{ 7088{
7281 switch (what) { 7089 switch (what) {
7282 case sa_sched_groups:
7283 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7284 d->sched_group_nodes = NULL;
7285 case sa_rootdomain: 7090 case sa_rootdomain:
7286 free_rootdomain(d->rd); /* fall through */ 7091 if (!atomic_read(&d->rd->refcount))
7287 case sa_tmpmask: 7092 free_rootdomain(&d->rd->rcu); /* fall through */
7288 free_cpumask_var(d->tmpmask); /* fall through */ 7093 case sa_sd:
7289 case sa_send_covered: 7094 free_percpu(d->sd); /* fall through */
7290 free_cpumask_var(d->send_covered); /* fall through */ 7095 case sa_sd_storage:
7291 case sa_this_book_map: 7096 __sdt_free(cpu_map); /* fall through */
7292 free_cpumask_var(d->this_book_map); /* fall through */
7293 case sa_this_core_map:
7294 free_cpumask_var(d->this_core_map); /* fall through */
7295 case sa_this_sibling_map:
7296 free_cpumask_var(d->this_sibling_map); /* fall through */
7297 case sa_nodemask:
7298 free_cpumask_var(d->nodemask); /* fall through */
7299 case sa_sched_group_nodes:
7300#ifdef CONFIG_NUMA
7301 kfree(d->sched_group_nodes); /* fall through */
7302 case sa_notcovered:
7303 free_cpumask_var(d->notcovered); /* fall through */
7304 case sa_covered:
7305 free_cpumask_var(d->covered); /* fall through */
7306 case sa_domainspan:
7307 free_cpumask_var(d->domainspan); /* fall through */
7308#endif
7309 case sa_none: 7097 case sa_none:
7310 break; 7098 break;
7311 } 7099 }
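
With only three allocation stages left, __free_domain_allocs() unwinds whatever __visit_domain_allocation_hell() reached by deliberately falling through the switch from the returned enum value. A compact userspace sketch of that staged-cleanup pattern; the stage names and payloads are illustrative:

#include <stdlib.h>

enum s_alloc { sa_rootdomain, sa_sd, sa_sd_storage, sa_none };

struct s_data { void *storage; void *sd; void *rd; };

static void free_allocs(struct s_data *d, enum s_alloc what)
{
    switch (what) {
    case sa_rootdomain:
        free(d->rd);        /* fall through */
    case sa_sd:
        free(d->sd);        /* fall through */
    case sa_sd_storage:
        free(d->storage);   /* fall through */
    case sa_none:
        break;
    }
}

/* Returns the deepest stage that still needs freeing on the error path. */
static enum s_alloc alloc_all(struct s_data *d)
{
    d->storage = malloc(64);
    if (!d->storage)
        return sa_none;
    d->sd = malloc(64);
    if (!d->sd)
        return sa_sd_storage;
    d->rd = malloc(64);
    if (!d->rd)
        return sa_sd;
    return sa_rootdomain;
}

int main(void)
{
    struct s_data d = { 0 };
    enum s_alloc state = alloc_all(&d);
    free_allocs(&d, state);     /* unwinds exactly what was allocated */
    return 0;
}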
@@ -7314,308 +7102,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7314static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7102static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7315 const struct cpumask *cpu_map) 7103 const struct cpumask *cpu_map)
7316{ 7104{
7317#ifdef CONFIG_NUMA 7105 memset(d, 0, sizeof(*d));
7318 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7106
7319 return sa_none; 7107 if (__sdt_alloc(cpu_map))
7320 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7108 return sa_sd_storage;
7321 return sa_domainspan; 7109 d->sd = alloc_percpu(struct sched_domain *);
7322 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7110 if (!d->sd)
7323 return sa_covered; 7111 return sa_sd_storage;
7324 /* Allocate the per-node list of sched groups */
7325 d->sched_group_nodes = kcalloc(nr_node_ids,
7326 sizeof(struct sched_group *), GFP_KERNEL);
7327 if (!d->sched_group_nodes) {
7328 printk(KERN_WARNING "Can not alloc sched group node list\n");
7329 return sa_notcovered;
7330 }
7331 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7332#endif
7333 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7334 return sa_sched_group_nodes;
7335 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7336 return sa_nodemask;
7337 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7338 return sa_this_sibling_map;
7339 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7340 return sa_this_core_map;
7341 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7342 return sa_this_book_map;
7343 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7344 return sa_send_covered;
7345 d->rd = alloc_rootdomain(); 7112 d->rd = alloc_rootdomain();
7346 if (!d->rd) { 7113 if (!d->rd)
7347 printk(KERN_WARNING "Cannot alloc root domain\n"); 7114 return sa_sd;
7348 return sa_tmpmask;
7349 }
7350 return sa_rootdomain; 7115 return sa_rootdomain;
7351} 7116}
7352 7117
7353static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7118/*
7354 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7119 * NULL the sd_data elements we've used to build the sched_domain and
7120 * sched_group structure so that the subsequent __free_domain_allocs()
7121 * will not free the data we're using.
7122 */
7123static void claim_allocations(int cpu, struct sched_domain *sd)
7355{ 7124{
7356 struct sched_domain *sd = NULL; 7125 struct sd_data *sdd = sd->private;
7357#ifdef CONFIG_NUMA 7126 struct sched_group *sg = sd->groups;
7358 struct sched_domain *parent;
7359
7360 d->sd_allnodes = 0;
7361 if (cpumask_weight(cpu_map) >
7362 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7363 sd = &per_cpu(allnodes_domains, i).sd;
7364 SD_INIT(sd, ALLNODES);
7365 set_domain_attribute(sd, attr);
7366 cpumask_copy(sched_domain_span(sd), cpu_map);
7367 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7368 d->sd_allnodes = 1;
7369 }
7370 parent = sd;
7371
7372 sd = &per_cpu(node_domains, i).sd;
7373 SD_INIT(sd, NODE);
7374 set_domain_attribute(sd, attr);
7375 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7376 sd->parent = parent;
7377 if (parent)
7378 parent->child = sd;
7379 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7380#endif
7381 return sd;
7382}
7383 7127
7384static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7128 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7385 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7129 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7386 struct sched_domain *parent, int i)
7387{
7388 struct sched_domain *sd;
7389 sd = &per_cpu(phys_domains, i).sd;
7390 SD_INIT(sd, CPU);
7391 set_domain_attribute(sd, attr);
7392 cpumask_copy(sched_domain_span(sd), d->nodemask);
7393 sd->parent = parent;
7394 if (parent)
7395 parent->child = sd;
7396 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7397 return sd;
7398}
7399 7130
7400static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7131 if (cpu == cpumask_first(sched_group_cpus(sg))) {
7401 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7132 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7402 struct sched_domain *parent, int i) 7133 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7403{ 7134 }
7404 struct sched_domain *sd = parent;
7405#ifdef CONFIG_SCHED_BOOK
7406 sd = &per_cpu(book_domains, i).sd;
7407 SD_INIT(sd, BOOK);
7408 set_domain_attribute(sd, attr);
7409 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7410 sd->parent = parent;
7411 parent->child = sd;
7412 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7413#endif
7414 return sd;
7415} 7135}
7416 7136
7417static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7137#ifdef CONFIG_SCHED_SMT
7418 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7138static const struct cpumask *cpu_smt_mask(int cpu)
7419 struct sched_domain *parent, int i)
7420{ 7139{
7421 struct sched_domain *sd = parent; 7140 return topology_thread_cpumask(cpu);
7422#ifdef CONFIG_SCHED_MC
7423 sd = &per_cpu(core_domains, i).sd;
7424 SD_INIT(sd, MC);
7425 set_domain_attribute(sd, attr);
7426 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7427 sd->parent = parent;
7428 parent->child = sd;
7429 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7430#endif
7431 return sd;
7432} 7141}
7433
7434static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7435 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7436 struct sched_domain *parent, int i)
7437{
7438 struct sched_domain *sd = parent;
7439#ifdef CONFIG_SCHED_SMT
7440 sd = &per_cpu(cpu_domains, i).sd;
7441 SD_INIT(sd, SIBLING);
7442 set_domain_attribute(sd, attr);
7443 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7444 sd->parent = parent;
7445 parent->child = sd;
7446 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7447#endif 7142#endif
7448 return sd;
7449}
7450 7143
7451static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7144/*
7452 const struct cpumask *cpu_map, int cpu) 7145 * Topology list, bottom-up.
7453{ 7146 */
7454 switch (l) { 7147static struct sched_domain_topology_level default_topology[] = {
7455#ifdef CONFIG_SCHED_SMT 7148#ifdef CONFIG_SCHED_SMT
7456 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7149 { sd_init_SIBLING, cpu_smt_mask, },
7457 cpumask_and(d->this_sibling_map, cpu_map,
7458 topology_thread_cpumask(cpu));
7459 if (cpu == cpumask_first(d->this_sibling_map))
7460 init_sched_build_groups(d->this_sibling_map, cpu_map,
7461 &cpu_to_cpu_group,
7462 d->send_covered, d->tmpmask);
7463 break;
7464#endif 7150#endif
7465#ifdef CONFIG_SCHED_MC 7151#ifdef CONFIG_SCHED_MC
7466 case SD_LV_MC: /* set up multi-core groups */ 7152 { sd_init_MC, cpu_coregroup_mask, },
7467 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7468 if (cpu == cpumask_first(d->this_core_map))
7469 init_sched_build_groups(d->this_core_map, cpu_map,
7470 &cpu_to_core_group,
7471 d->send_covered, d->tmpmask);
7472 break;
7473#endif 7153#endif
7474#ifdef CONFIG_SCHED_BOOK 7154#ifdef CONFIG_SCHED_BOOK
7475 case SD_LV_BOOK: /* set up book groups */ 7155 { sd_init_BOOK, cpu_book_mask, },
7476 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7477 if (cpu == cpumask_first(d->this_book_map))
7478 init_sched_build_groups(d->this_book_map, cpu_map,
7479 &cpu_to_book_group,
7480 d->send_covered, d->tmpmask);
7481 break;
7482#endif 7156#endif
7483 case SD_LV_CPU: /* set up physical groups */ 7157 { sd_init_CPU, cpu_cpu_mask, },
7484 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7485 if (!cpumask_empty(d->nodemask))
7486 init_sched_build_groups(d->nodemask, cpu_map,
7487 &cpu_to_phys_group,
7488 d->send_covered, d->tmpmask);
7489 break;
7490#ifdef CONFIG_NUMA 7158#ifdef CONFIG_NUMA
7491 case SD_LV_ALLNODES: 7159 { sd_init_NODE, cpu_node_mask, },
7492 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7160 { sd_init_ALLNODES, cpu_allnodes_mask, },
7493 d->send_covered, d->tmpmask);
7494 break;
7495#endif 7161#endif
7496 default: 7162 { NULL, },
7497 break; 7163};
7164
7165static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7166
7167static int __sdt_alloc(const struct cpumask *cpu_map)
7168{
7169 struct sched_domain_topology_level *tl;
7170 int j;
7171
7172 for (tl = sched_domain_topology; tl->init; tl++) {
7173 struct sd_data *sdd = &tl->data;
7174
7175 sdd->sd = alloc_percpu(struct sched_domain *);
7176 if (!sdd->sd)
7177 return -ENOMEM;
7178
7179 sdd->sg = alloc_percpu(struct sched_group *);
7180 if (!sdd->sg)
7181 return -ENOMEM;
7182
7183 for_each_cpu(j, cpu_map) {
7184 struct sched_domain *sd;
7185 struct sched_group *sg;
7186
7187 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7188 GFP_KERNEL, cpu_to_node(j));
7189 if (!sd)
7190 return -ENOMEM;
7191
7192 *per_cpu_ptr(sdd->sd, j) = sd;
7193
7194 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7195 GFP_KERNEL, cpu_to_node(j));
7196 if (!sg)
7197 return -ENOMEM;
7198
7199 *per_cpu_ptr(sdd->sg, j) = sg;
7200 }
7201 }
7202
7203 return 0;
7204}
7205
7206static void __sdt_free(const struct cpumask *cpu_map)
7207{
7208 struct sched_domain_topology_level *tl;
7209 int j;
7210
7211 for (tl = sched_domain_topology; tl->init; tl++) {
7212 struct sd_data *sdd = &tl->data;
7213
7214 for_each_cpu(j, cpu_map) {
7215 kfree(*per_cpu_ptr(sdd->sd, j));
7216 kfree(*per_cpu_ptr(sdd->sg, j));
7217 }
7218 free_percpu(sdd->sd);
7219 free_percpu(sdd->sg);
7220 }
7221}
7222
7223struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7224 struct s_data *d, const struct cpumask *cpu_map,
7225 struct sched_domain_attr *attr, struct sched_domain *child,
7226 int cpu)
7227{
7228 struct sched_domain *sd = tl->init(tl, cpu);
7229 if (!sd)
7230 return child;
7231
7232 set_domain_attribute(sd, attr);
7233 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7234 if (child) {
7235 sd->level = child->level + 1;
7236 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7237 child->parent = sd;
7498 } 7238 }
7239 sd->child = child;
7240
7241 return sd;
7499} 7242}
7500 7243
7501/* 7244/*
7502 * Build sched domains for a given set of cpus and attach the sched domains 7245 * Build sched domains for a given set of cpus and attach the sched domains
7503 * to the individual cpus 7246 * to the individual cpus
7504 */ 7247 */
7505static int __build_sched_domains(const struct cpumask *cpu_map, 7248static int build_sched_domains(const struct cpumask *cpu_map,
7506 struct sched_domain_attr *attr) 7249 struct sched_domain_attr *attr)
7507{ 7250{
7508 enum s_alloc alloc_state = sa_none; 7251 enum s_alloc alloc_state = sa_none;
7509 struct s_data d;
7510 struct sched_domain *sd; 7252 struct sched_domain *sd;
7511 int i; 7253 struct s_data d;
7512#ifdef CONFIG_NUMA 7254 int i, ret = -ENOMEM;
7513 d.sd_allnodes = 0;
7514#endif
7515 7255
7516 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7256 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7517 if (alloc_state != sa_rootdomain) 7257 if (alloc_state != sa_rootdomain)
7518 goto error; 7258 goto error;
7519 alloc_state = sa_sched_groups;
7520 7259
7521 /* 7260 /* Set up domains for cpus specified by the cpu_map. */
7522 * Set up domains for cpus specified by the cpu_map.
7523 */
7524 for_each_cpu(i, cpu_map) { 7261 for_each_cpu(i, cpu_map) {
7525 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7262 struct sched_domain_topology_level *tl;
7526 cpu_map);
7527 7263
7528 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7264 sd = NULL;
7529 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7265 for (tl = sched_domain_topology; tl->init; tl++)
7530 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); 7266 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7531 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7532 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7533 }
7534
7535 for_each_cpu(i, cpu_map) {
7536 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7537 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7538 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7539 }
7540
7541 /* Set up physical groups */
7542 for (i = 0; i < nr_node_ids; i++)
7543 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7544 7267
7545#ifdef CONFIG_NUMA 7268 while (sd->child)
7546 /* Set up node groups */ 7269 sd = sd->child;
7547 if (d.sd_allnodes)
7548 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7549
7550 for (i = 0; i < nr_node_ids; i++)
7551 if (build_numa_sched_groups(&d, cpu_map, i))
7552 goto error;
7553#endif
7554 7270
7555 /* Calculate CPU power for physical packages and nodes */ 7271 *per_cpu_ptr(d.sd, i) = sd;
7556#ifdef CONFIG_SCHED_SMT
7557 for_each_cpu(i, cpu_map) {
7558 sd = &per_cpu(cpu_domains, i).sd;
7559 init_sched_groups_power(i, sd);
7560 }
7561#endif
7562#ifdef CONFIG_SCHED_MC
7563 for_each_cpu(i, cpu_map) {
7564 sd = &per_cpu(core_domains, i).sd;
7565 init_sched_groups_power(i, sd);
7566 } 7272 }
7567#endif
7568#ifdef CONFIG_SCHED_BOOK
7569 for_each_cpu(i, cpu_map) {
7570 sd = &per_cpu(book_domains, i).sd;
7571 init_sched_groups_power(i, sd);
7572 }
7573#endif
7574 7273
7274 /* Build the groups for the domains */
7575 for_each_cpu(i, cpu_map) { 7275 for_each_cpu(i, cpu_map) {
7576 sd = &per_cpu(phys_domains, i).sd; 7276 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7577 init_sched_groups_power(i, sd); 7277 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7578 } 7278 get_group(i, sd->private, &sd->groups);
7279 atomic_inc(&sd->groups->ref);
7579 7280
7580#ifdef CONFIG_NUMA 7281 if (i != cpumask_first(sched_domain_span(sd)))
7581 for (i = 0; i < nr_node_ids; i++) 7282 continue;
7582 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7583 7283
7584 if (d.sd_allnodes) { 7284 build_sched_groups(sd);
7585 struct sched_group *sg; 7285 }
7286 }
7586 7287
7587 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7288 /* Calculate CPU power for physical packages and nodes */
7588 d.tmpmask); 7289 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7589 init_numa_sched_groups_power(sg); 7290 if (!cpumask_test_cpu(i, cpu_map))
7291 continue;
7292
7293 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7294 claim_allocations(i, sd);
7295 init_sched_groups_power(i, sd);
7296 }
7590 } 7297 }
7591#endif
7592 7298
7593 /* Attach the domains */ 7299 /* Attach the domains */
7300 rcu_read_lock();
7594 for_each_cpu(i, cpu_map) { 7301 for_each_cpu(i, cpu_map) {
7595#ifdef CONFIG_SCHED_SMT 7302 sd = *per_cpu_ptr(d.sd, i);
7596 sd = &per_cpu(cpu_domains, i).sd;
7597#elif defined(CONFIG_SCHED_MC)
7598 sd = &per_cpu(core_domains, i).sd;
7599#elif defined(CONFIG_SCHED_BOOK)
7600 sd = &per_cpu(book_domains, i).sd;
7601#else
7602 sd = &per_cpu(phys_domains, i).sd;
7603#endif
7604 cpu_attach_domain(sd, d.rd, i); 7303 cpu_attach_domain(sd, d.rd, i);
7605 } 7304 }
7305 rcu_read_unlock();
7606 7306
7607 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7307 ret = 0;
7608 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7609 return 0;
7610
7611error: 7308error:
7612 __free_domain_allocs(&d, alloc_state, cpu_map); 7309 __free_domain_allocs(&d, alloc_state, cpu_map);
7613 return -ENOMEM; 7310 return ret;
7614}
7615
7616static int build_sched_domains(const struct cpumask *cpu_map)
7617{
7618 return __build_sched_domains(cpu_map, NULL);
7619} 7311}
7620 7312
7621static cpumask_var_t *doms_cur; /* current sched domains */ 7313static cpumask_var_t *doms_cur; /* current sched domains */
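
The per-level #ifdef blocks are replaced by default_topology[], a NULL-terminated array of {init, mask} pairs that build_sched_domains() walks bottom-up with for (tl = sched_domain_topology; tl->init; tl++), chaining each new domain above the previous one and tracking sched_domain_level_max. A userspace sketch of that table-driven, bottom-up construction; the level names and the single name field standing in for the init/mask callbacks are illustrative:

#include <stdio.h>
#include <stdlib.h>

struct domain {
    const char *name;
    int level;
    struct domain *parent, *child;
};

struct topology_level {
    const char *name;   /* stands in for the init/mask callbacks */
};

/* Bottom-up list, terminated by an empty entry, like default_topology[]. */
static struct topology_level topology[] = {
    { "SIBLING" },
    { "MC" },
    { "CPU" },
    { NULL },
};

static int domain_level_max;

int main(void)
{
    struct domain *sd = NULL;

    for (struct topology_level *tl = topology; tl->name; tl++) {
        struct domain *d = calloc(1, sizeof(*d));
        if (!d)
            return 1;
        d->name = tl->name;
        if (sd) {
            d->level = sd->level + 1;
            if (d->level > domain_level_max)
                domain_level_max = d->level;
            sd->parent = d;
        }
        d->child = sd;
        sd = d;             /* new topmost level */
    }

    for (struct domain *d = sd; d; d = d->child)
        printf("level %d: %s\n", d->level, d->name);
    printf("max level: %d\n", domain_level_max);
    return 0;
}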
@@ -7670,7 +7362,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7670 * For now this just excludes isolated cpus, but could be used to 7362 * For now this just excludes isolated cpus, but could be used to
7671 * exclude other special cases in the future. 7363 * exclude other special cases in the future.
7672 */ 7364 */
7673static int arch_init_sched_domains(const struct cpumask *cpu_map) 7365static int init_sched_domains(const struct cpumask *cpu_map)
7674{ 7366{
7675 int err; 7367 int err;
7676 7368
@@ -7681,32 +7373,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7681 doms_cur = &fallback_doms; 7373 doms_cur = &fallback_doms;
7682 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7374 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7683 dattr_cur = NULL; 7375 dattr_cur = NULL;
7684 err = build_sched_domains(doms_cur[0]); 7376 err = build_sched_domains(doms_cur[0], NULL);
7685 register_sched_domain_sysctl(); 7377 register_sched_domain_sysctl();
7686 7378
7687 return err; 7379 return err;
7688} 7380}
7689 7381
7690static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7691 struct cpumask *tmpmask)
7692{
7693 free_sched_groups(cpu_map, tmpmask);
7694}
7695
7696/* 7382/*
7697 * Detach sched domains from a group of cpus specified in cpu_map 7383 * Detach sched domains from a group of cpus specified in cpu_map
7698 * These cpus will now be attached to the NULL domain 7384 * These cpus will now be attached to the NULL domain
7699 */ 7385 */
7700static void detach_destroy_domains(const struct cpumask *cpu_map) 7386static void detach_destroy_domains(const struct cpumask *cpu_map)
7701{ 7387{
7702 /* Save because hotplug lock held. */
7703 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7704 int i; 7388 int i;
7705 7389
7390 rcu_read_lock();
7706 for_each_cpu(i, cpu_map) 7391 for_each_cpu(i, cpu_map)
7707 cpu_attach_domain(NULL, &def_root_domain, i); 7392 cpu_attach_domain(NULL, &def_root_domain, i);
7708 synchronize_sched(); 7393 rcu_read_unlock();
7709 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7710} 7394}
7711 7395
7712/* handle null as "default" */ 7396/* handle null as "default" */
@@ -7795,8 +7479,7 @@ match1:
7795 goto match2; 7479 goto match2;
7796 } 7480 }
7797 /* no match - add a new doms_new */ 7481 /* no match - add a new doms_new */
7798 __build_sched_domains(doms_new[i], 7482 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7799 dattr_new ? dattr_new + i : NULL);
7800match2: 7483match2:
7801 ; 7484 ;
7802 } 7485 }
@@ -7815,7 +7498,7 @@ match2:
7815} 7498}
7816 7499
7817#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7500#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7818static void arch_reinit_sched_domains(void) 7501static void reinit_sched_domains(void)
7819{ 7502{
7820 get_online_cpus(); 7503 get_online_cpus();
7821 7504
@@ -7848,7 +7531,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7848 else 7531 else
7849 sched_mc_power_savings = level; 7532 sched_mc_power_savings = level;
7850 7533
7851 arch_reinit_sched_domains(); 7534 reinit_sched_domains();
7852 7535
7853 return count; 7536 return count;
7854} 7537}
@@ -7967,14 +7650,9 @@ void __init sched_init_smp(void)
7967 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7650 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7968 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7651 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7969 7652
7970#if defined(CONFIG_NUMA)
7971 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7972 GFP_KERNEL);
7973 BUG_ON(sched_group_nodes_bycpu == NULL);
7974#endif
7975 get_online_cpus(); 7653 get_online_cpus();
7976 mutex_lock(&sched_domains_mutex); 7654 mutex_lock(&sched_domains_mutex);
7977 arch_init_sched_domains(cpu_active_mask); 7655 init_sched_domains(cpu_active_mask);
7978 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7656 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7979 if (cpumask_empty(non_isolated_cpus)) 7657 if (cpumask_empty(non_isolated_cpus))
7980 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7658 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -8281,6 +7959,7 @@ void __init sched_init(void)
8281 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7959 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8282 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7960 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8283#ifdef CONFIG_SMP 7961#ifdef CONFIG_SMP
7962 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8284#ifdef CONFIG_NO_HZ 7963#ifdef CONFIG_NO_HZ
8285 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 7964 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8286 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 7965 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8340,7 +8019,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8340 int old_prio = p->prio; 8019 int old_prio = p->prio;
8341 int on_rq; 8020 int on_rq;
8342 8021
8343 on_rq = p->se.on_rq; 8022 on_rq = p->on_rq;
8344 if (on_rq) 8023 if (on_rq)
8345 deactivate_task(rq, p, 0); 8024 deactivate_task(rq, p, 0);
8346 __setscheduler(rq, p, SCHED_NORMAL, 0); 8025 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8553,7 +8232,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8553{ 8232{
8554 struct rt_rq *rt_rq; 8233 struct rt_rq *rt_rq;
8555 struct sched_rt_entity *rt_se; 8234 struct sched_rt_entity *rt_se;
8556 struct rq *rq;
8557 int i; 8235 int i;
8558 8236
8559 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8237 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8567,8 +8245,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8567 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8245 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8568 8246
8569 for_each_possible_cpu(i) { 8247 for_each_possible_cpu(i) {
8570 rq = cpu_rq(i);
8571
8572 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8248 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8573 GFP_KERNEL, cpu_to_node(i)); 8249 GFP_KERNEL, cpu_to_node(i));
8574 if (!rt_rq) 8250 if (!rt_rq)
@@ -8683,7 +8359,7 @@ void sched_move_task(struct task_struct *tsk)
8683 rq = task_rq_lock(tsk, &flags); 8359 rq = task_rq_lock(tsk, &flags);
8684 8360
8685 running = task_current(rq, tsk); 8361 running = task_current(rq, tsk);
8686 on_rq = tsk->se.on_rq; 8362 on_rq = tsk->on_rq;
8687 8363
8688 if (on_rq) 8364 if (on_rq)
8689 dequeue_task(rq, tsk, 0); 8365 dequeue_task(rq, tsk, 0);
@@ -8702,7 +8378,7 @@ void sched_move_task(struct task_struct *tsk)
8702 if (on_rq) 8378 if (on_rq)
8703 enqueue_task(rq, tsk, 0); 8379 enqueue_task(rq, tsk, 0);
8704 8380
8705 task_rq_unlock(rq, &flags); 8381 task_rq_unlock(rq, tsk, &flags);
8706} 8382}
8707#endif /* CONFIG_CGROUP_SCHED */ 8383#endif /* CONFIG_CGROUP_SCHED */
8708 8384
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 7bacd83a4158..a6710a112b4f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
152 read_lock_irqsave(&tasklist_lock, flags); 152 read_lock_irqsave(&tasklist_lock, flags);
153 153
154 do_each_thread(g, p) { 154 do_each_thread(g, p) {
155 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 155 if (!p->on_rq || task_cpu(p) != rq_cpu)
156 continue; 156 continue;
157 157
158 print_task(m, rq, p); 158 print_task(m, rq, p);
@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
301
302#undef P 299#undef P
303#undef P64 300#undef P64
304#endif 301#endif
@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
441 P(se.statistics.wait_count); 438 P(se.statistics.wait_count);
442 PN(se.statistics.iowait_sum); 439 PN(se.statistics.iowait_sum);
443 P(se.statistics.iowait_count); 440 P(se.statistics.iowait_count);
444 P(sched_info.bkl_count);
445 P(se.nr_migrations); 441 P(se.nr_migrations);
446 P(se.statistics.nr_migrations_cold); 442 P(se.statistics.nr_migrations_cold);
447 P(se.statistics.nr_failed_migrations_affine); 443 P(se.statistics.nr_failed_migrations_affine);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6fa833ab2cb8..37f22626225e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
358 } 358 }
359 359
360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
361#ifndef CONFIG_64BIT
362 smp_wmb();
363 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
364#endif
361} 365}
362 366
363/* 367/*
@@ -1340,6 +1344,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1340 hrtick_update(rq); 1344 hrtick_update(rq);
1341} 1345}
1342 1346
1347static void set_next_buddy(struct sched_entity *se);
1348
1343/* 1349/*
1344 * The dequeue_task method is called before nr_running is 1350 * The dequeue_task method is called before nr_running is
1345 * decreased. We remove the task from the rbtree and 1351 * decreased. We remove the task from the rbtree and
@@ -1349,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1349{ 1355{
1350 struct cfs_rq *cfs_rq; 1356 struct cfs_rq *cfs_rq;
1351 struct sched_entity *se = &p->se; 1357 struct sched_entity *se = &p->se;
1358 int task_sleep = flags & DEQUEUE_SLEEP;
1352 1359
1353 for_each_sched_entity(se) { 1360 for_each_sched_entity(se) {
1354 cfs_rq = cfs_rq_of(se); 1361 cfs_rq = cfs_rq_of(se);
1355 dequeue_entity(cfs_rq, se, flags); 1362 dequeue_entity(cfs_rq, se, flags);
1356 1363
1357 /* Don't dequeue parent if it has other entities besides us */ 1364 /* Don't dequeue parent if it has other entities besides us */
1358 if (cfs_rq->load.weight) 1365 if (cfs_rq->load.weight) {
1366 /*
1367 * Bias pick_next to pick a task from this cfs_rq, as
1368 * p is sleeping when it is within its sched_slice.
1369 */
1370 if (task_sleep && parent_entity(se))
1371 set_next_buddy(parent_entity(se));
1359 break; 1372 break;
1373 }
1360 flags |= DEQUEUE_SLEEP; 1374 flags |= DEQUEUE_SLEEP;
1361 } 1375 }
1362 1376
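Reviewer note: the dequeue path above marks the parent entity as "next buddy" when a task sleeps inside its slice, biasing the next pick toward this cfs_rq. A standalone sketch of the buddy idea, with invented names (struct entity, pick_next, a plain sorted list standing in for the rbtree) and without the fairness clamp the kernel applies before honouring the hint:

	#include <stddef.h>
	#include <stdio.h>

	struct entity {
		long vruntime;
		struct entity *next_in_queue;	/* stand-in for the rbtree order */
	};

	static struct entity *queue_head;	/* sorted by vruntime, smallest first */
	static struct entity *next_buddy;	/* hint set on wakeup/sleep paths */

	/* cf. set_next_buddy(): remember a preferred candidate */
	static void set_next_buddy(struct entity *se)
	{
		next_buddy = se;
	}

	/*
	 * Normally take the leftmost entity, but if a buddy hint was set and
	 * that entity is still queued, prefer it so the related task runs next.
	 */
	static struct entity *pick_next(void)
	{
		struct entity *leftmost = queue_head;
		struct entity *se;

		if (!leftmost)
			return NULL;

		for (se = queue_head; se; se = se->next_in_queue)
			if (se == next_buddy) {
				next_buddy = NULL;	/* one-shot hint */
				return se;
			}

		return leftmost;
	}

	int main(void)
	{
		struct entity a = { .vruntime = 10 };
		struct entity b = { .vruntime = 20 };

		a.next_in_queue = &b;
		queue_head = &a;

		set_next_buddy(&b);
		printf("picked vruntime %ld\n", pick_next()->vruntime);	/* 20: buddy wins */
		printf("picked vruntime %ld\n", pick_next()->vruntime);	/* 10: back to leftmost */
		return 0;
	}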
@@ -1372,12 +1386,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1372 1386
1373#ifdef CONFIG_SMP 1387#ifdef CONFIG_SMP
1374 1388
1375static void task_waking_fair(struct rq *rq, struct task_struct *p) 1389static void task_waking_fair(struct task_struct *p)
1376{ 1390{
1377 struct sched_entity *se = &p->se; 1391 struct sched_entity *se = &p->se;
1378 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1392 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1393 u64 min_vruntime;
1379 1394
1380 se->vruntime -= cfs_rq->min_vruntime; 1395#ifndef CONFIG_64BIT
1396 u64 min_vruntime_copy;
1397
1398 do {
1399 min_vruntime_copy = cfs_rq->min_vruntime_copy;
1400 smp_rmb();
1401 min_vruntime = cfs_rq->min_vruntime;
1402 } while (min_vruntime != min_vruntime_copy);
1403#else
1404 min_vruntime = cfs_rq->min_vruntime;
1405#endif
1406
1407 se->vruntime -= min_vruntime;
1381} 1408}
1382 1409
1383#ifdef CONFIG_FAIR_GROUP_SCHED 1410#ifdef CONFIG_FAIR_GROUP_SCHED
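Reviewer note: the two hunks above pair an smp_wmb() after the min_vruntime update with a min_vruntime_copy store, and a retry loop on the reader side, so 32-bit kernels can sample the 64-bit value without the rq lock. A minimal userspace analogue of that publish/retry protocol; the names are invented, and the C11 atomics below stand in for the kernel's plain accesses plus smp_wmb()/smp_rmb():

	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	static _Atomic uint64_t value;       /* cf. cfs_rq->min_vruntime      */
	static _Atomic uint64_t value_copy;  /* cf. cfs_rq->min_vruntime_copy */

	/* Writer: publish the value first, then the copy (cf. smp_wmb()). */
	static void publish(uint64_t v)
	{
		atomic_store_explicit(&value, v, memory_order_relaxed);
		atomic_store_explicit(&value_copy, v, memory_order_release);
	}

	/* Reader: load the copy, then the value, retry until the two agree. */
	static uint64_t read_consistent(void)
	{
		uint64_t v, copy;

		do {
			copy = atomic_load_explicit(&value_copy, memory_order_acquire);
			/* cf. smp_rmb(): copy is ordered before the value read */
			v = atomic_load_explicit(&value, memory_order_relaxed);
		} while (v != copy);

		return v;
	}

	int main(void)
	{
		publish(1ULL << 33);
		printf("%llu\n", (unsigned long long)read_consistent());
		return 0;
	}

The kernel needs the retry loop because a plain 64-bit load can tear on 32-bit hardware; the writer ordering guarantees that once the copy matches, the value read is a complete, untorn snapshot.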
@@ -1622,6 +1649,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1622 /* 1649 /*
1623 * Otherwise, iterate the domains and find an elegible idle cpu. 1650 * Otherwise, iterate the domains and find an elegible idle cpu.
1624 */ 1651 */
1652 rcu_read_lock();
1625 for_each_domain(target, sd) { 1653 for_each_domain(target, sd) {
1626 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 1654 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1627 break; 1655 break;
@@ -1641,6 +1669,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1641 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 1669 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1642 break; 1670 break;
1643 } 1671 }
1672 rcu_read_unlock();
1644 1673
1645 return target; 1674 return target;
1646} 1675}
@@ -1657,7 +1686,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1657 * preempt must be disabled. 1686 * preempt must be disabled.
1658 */ 1687 */
1659static int 1688static int
1660select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1689select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1661{ 1690{
1662 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1691 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1663 int cpu = smp_processor_id(); 1692 int cpu = smp_processor_id();
@@ -1673,6 +1702,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1673 new_cpu = prev_cpu; 1702 new_cpu = prev_cpu;
1674 } 1703 }
1675 1704
1705 rcu_read_lock();
1676 for_each_domain(cpu, tmp) { 1706 for_each_domain(cpu, tmp) {
1677 if (!(tmp->flags & SD_LOAD_BALANCE)) 1707 if (!(tmp->flags & SD_LOAD_BALANCE))
1678 continue; 1708 continue;
@@ -1723,9 +1753,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1723 1753
1724 if (affine_sd) { 1754 if (affine_sd) {
1725 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1755 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1726 return select_idle_sibling(p, cpu); 1756 prev_cpu = cpu;
1727 else 1757
1728 return select_idle_sibling(p, prev_cpu); 1758 new_cpu = select_idle_sibling(p, prev_cpu);
1759 goto unlock;
1729 } 1760 }
1730 1761
1731 while (sd) { 1762 while (sd) {
@@ -1766,6 +1797,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1766 } 1797 }
1767 /* while loop will break here if sd == NULL */ 1798 /* while loop will break here if sd == NULL */
1768 } 1799 }
1800unlock:
1801 rcu_read_unlock();
1769 1802
1770 return new_cpu; 1803 return new_cpu;
1771} 1804}
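Reviewer note: several hunks in this file (select_task_rq_fair(), select_idle_sibling(), idle_balance(), rebalance_domains()) wrap the sched-domain walk in rcu_read_lock()/rcu_read_unlock() and turn early returns into a single goto unlock exit so the read-side section is always closed. A schematic of that control-flow discipline; the rcu_* functions below are local no-op stand-ins, not the kernel API, and struct domain is invented:

	#include <stdio.h>

	static void rcu_read_lock(void)   { /* begin read-side critical section */ }
	static void rcu_read_unlock(void) { /* end read-side critical section   */ }

	struct domain {
		int flags;
		struct domain *parent;
	};

	#define FLAG_WANTED 0x1

	/*
	 * Take the read lock once, walk the hierarchy, and route every exit
	 * through "unlock" so the critical section is balanced on all paths.
	 */
	static int walk_domains(struct domain *d, int fallback)
	{
		int result = fallback;

		rcu_read_lock();
		for (; d; d = d->parent) {
			if (!(d->flags & FLAG_WANTED))
				continue;
			result = 1;		/* found what we were looking for */
			goto unlock;
		}
	unlock:
		rcu_read_unlock();

		return result;
	}

	int main(void)
	{
		struct domain top = { .flags = FLAG_WANTED };
		struct domain leaf = { .flags = 0, .parent = &top };

		printf("%d\n", walk_domains(&leaf, 0));
		return 0;
	}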
@@ -1789,10 +1822,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1789 * This is especially important for buddies when the leftmost 1822 * This is especially important for buddies when the leftmost
1790 * task is higher priority than the buddy. 1823 * task is higher priority than the buddy.
1791 */ 1824 */
1792 if (unlikely(se->load.weight != NICE_0_LOAD)) 1825 return calc_delta_fair(gran, se);
1793 gran = calc_delta_fair(gran, se);
1794
1795 return gran;
1796} 1826}
1797 1827
1798/* 1828/*
@@ -1826,26 +1856,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1826 1856
1827static void set_last_buddy(struct sched_entity *se) 1857static void set_last_buddy(struct sched_entity *se)
1828{ 1858{
1829 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1859 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1830 for_each_sched_entity(se) 1860 return;
1831 cfs_rq_of(se)->last = se; 1861
1832 } 1862 for_each_sched_entity(se)
1863 cfs_rq_of(se)->last = se;
1833} 1864}
1834 1865
1835static void set_next_buddy(struct sched_entity *se) 1866static void set_next_buddy(struct sched_entity *se)
1836{ 1867{
1837 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1868 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1838 for_each_sched_entity(se) 1869 return;
1839 cfs_rq_of(se)->next = se; 1870
1840 } 1871 for_each_sched_entity(se)
1872 cfs_rq_of(se)->next = se;
1841} 1873}
1842 1874
1843static void set_skip_buddy(struct sched_entity *se) 1875static void set_skip_buddy(struct sched_entity *se)
1844{ 1876{
1845 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1877 for_each_sched_entity(se)
1846 for_each_sched_entity(se) 1878 cfs_rq_of(se)->skip = se;
1847 cfs_rq_of(se)->skip = se;
1848 }
1849} 1879}
1850 1880
1851/* 1881/*
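Reviewer note: the buddy setters above now return early only when the entity is a task with SCHED_IDLE policy, and they check entity_is_task() before calling task_of(), since group entities have no task behind them. A toy sketch of that "check the kind before downcasting" guard; the struct layout and POLICY_IDLE constant are invented for illustration:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	struct task { int policy; };
	#define POLICY_IDLE 5

	/* An entity is either a task or a group; only task entities embed a task. */
	struct entity {
		struct task *task;	/* NULL for group entities */
	};

	static bool entity_is_task(const struct entity *se)
	{
		return se->task != NULL;
	}

	static struct task *task_of(const struct entity *se)
	{
		/* Only meaningful for task entities -- hence the guard below. */
		return se->task;
	}

	/* cf. set_next_buddy(): skip only if this is a task *and* it is idle-class. */
	static bool buddy_allowed(const struct entity *se)
	{
		if (entity_is_task(se) && task_of(se)->policy == POLICY_IDLE)
			return false;
		return true;
	}

	int main(void)
	{
		struct task idle = { .policy = POLICY_IDLE };
		struct entity task_se  = { .task = &idle };
		struct entity group_se = { .task = NULL };

		printf("%d %d\n", buddy_allowed(&task_se), buddy_allowed(&group_se));
		return 0;
	}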
@@ -1857,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 struct sched_entity *se = &curr->se, *pse = &p->se; 1887 struct sched_entity *se = &curr->se, *pse = &p->se;
1858 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1888 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1859 int scale = cfs_rq->nr_running >= sched_nr_latency; 1889 int scale = cfs_rq->nr_running >= sched_nr_latency;
1890 int next_buddy_marked = 0;
1860 1891
1861 if (unlikely(se == pse)) 1892 if (unlikely(se == pse))
1862 return; 1893 return;
1863 1894
1864 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) 1895 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1865 set_next_buddy(pse); 1896 set_next_buddy(pse);
1897 next_buddy_marked = 1;
1898 }
1866 1899
1867 /* 1900 /*
1868 * We can come here with TIF_NEED_RESCHED already set from new task 1901 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1890,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1890 update_curr(cfs_rq); 1923 update_curr(cfs_rq);
1891 find_matching_se(&se, &pse); 1924 find_matching_se(&se, &pse);
1892 BUG_ON(!pse); 1925 BUG_ON(!pse);
1893 if (wakeup_preempt_entity(se, pse) == 1) 1926 if (wakeup_preempt_entity(se, pse) == 1) {
1927 /*
1928 * Bias pick_next to pick the sched entity that is
1929 * triggering this preemption.
1930 */
1931 if (!next_buddy_marked)
1932 set_next_buddy(pse);
1894 goto preempt; 1933 goto preempt;
1934 }
1895 1935
1896 return; 1936 return;
1897 1937
@@ -2102,7 +2142,7 @@ static unsigned long
2102balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2142balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2103 unsigned long max_load_move, struct sched_domain *sd, 2143 unsigned long max_load_move, struct sched_domain *sd,
2104 enum cpu_idle_type idle, int *all_pinned, 2144 enum cpu_idle_type idle, int *all_pinned,
2105 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2145 struct cfs_rq *busiest_cfs_rq)
2106{ 2146{
2107 int loops = 0, pulled = 0; 2147 int loops = 0, pulled = 0;
2108 long rem_load_move = max_load_move; 2148 long rem_load_move = max_load_move;
@@ -2140,9 +2180,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2140 */ 2180 */
2141 if (rem_load_move <= 0) 2181 if (rem_load_move <= 0)
2142 break; 2182 break;
2143
2144 if (p->prio < *this_best_prio)
2145 *this_best_prio = p->prio;
2146 } 2183 }
2147out: 2184out:
2148 /* 2185 /*
@@ -2202,7 +2239,7 @@ static unsigned long
2202load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2239load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2203 unsigned long max_load_move, 2240 unsigned long max_load_move,
2204 struct sched_domain *sd, enum cpu_idle_type idle, 2241 struct sched_domain *sd, enum cpu_idle_type idle,
2205 int *all_pinned, int *this_best_prio) 2242 int *all_pinned)
2206{ 2243{
2207 long rem_load_move = max_load_move; 2244 long rem_load_move = max_load_move;
2208 int busiest_cpu = cpu_of(busiest); 2245 int busiest_cpu = cpu_of(busiest);
@@ -2227,7 +2264,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2227 rem_load = div_u64(rem_load, busiest_h_load + 1); 2264 rem_load = div_u64(rem_load, busiest_h_load + 1);
2228 2265
2229 moved_load = balance_tasks(this_rq, this_cpu, busiest, 2266 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2230 rem_load, sd, idle, all_pinned, this_best_prio, 2267 rem_load, sd, idle, all_pinned,
2231 busiest_cfs_rq); 2268 busiest_cfs_rq);
2232 2269
2233 if (!moved_load) 2270 if (!moved_load)
@@ -2253,11 +2290,11 @@ static unsigned long
2253load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2290load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2254 unsigned long max_load_move, 2291 unsigned long max_load_move,
2255 struct sched_domain *sd, enum cpu_idle_type idle, 2292 struct sched_domain *sd, enum cpu_idle_type idle,
2256 int *all_pinned, int *this_best_prio) 2293 int *all_pinned)
2257{ 2294{
2258 return balance_tasks(this_rq, this_cpu, busiest, 2295 return balance_tasks(this_rq, this_cpu, busiest,
2259 max_load_move, sd, idle, all_pinned, 2296 max_load_move, sd, idle, all_pinned,
2260 this_best_prio, &busiest->cfs); 2297 &busiest->cfs);
2261} 2298}
2262#endif 2299#endif
2263 2300
@@ -2274,12 +2311,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2274 int *all_pinned) 2311 int *all_pinned)
2275{ 2312{
2276 unsigned long total_load_moved = 0, load_moved; 2313 unsigned long total_load_moved = 0, load_moved;
2277 int this_best_prio = this_rq->curr->prio;
2278 2314
2279 do { 2315 do {
2280 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 2316 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2281 max_load_move - total_load_moved, 2317 max_load_move - total_load_moved,
2282 sd, idle, all_pinned, &this_best_prio); 2318 sd, idle, all_pinned);
2283 2319
2284 total_load_moved += load_moved; 2320 total_load_moved += load_moved;
2285 2321
@@ -2648,7 +2684,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2648 /* 2684 /*
2649 * Only siblings can have significantly less than SCHED_LOAD_SCALE 2685 * Only siblings can have significantly less than SCHED_LOAD_SCALE
2650 */ 2686 */
2651 if (sd->level != SD_LV_SIBLING) 2687 if (!(sd->flags & SD_SHARE_CPUPOWER))
2652 return 0; 2688 return 0;
2653 2689
2654 /* 2690 /*
@@ -3465,6 +3501,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3465 raw_spin_unlock(&this_rq->lock); 3501 raw_spin_unlock(&this_rq->lock);
3466 3502
3467 update_shares(this_cpu); 3503 update_shares(this_cpu);
3504 rcu_read_lock();
3468 for_each_domain(this_cpu, sd) { 3505 for_each_domain(this_cpu, sd) {
3469 unsigned long interval; 3506 unsigned long interval;
3470 int balance = 1; 3507 int balance = 1;
@@ -3486,6 +3523,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3486 break; 3523 break;
3487 } 3524 }
3488 } 3525 }
3526 rcu_read_unlock();
3489 3527
3490 raw_spin_lock(&this_rq->lock); 3528 raw_spin_lock(&this_rq->lock);
3491 3529
@@ -3534,6 +3572,7 @@ static int active_load_balance_cpu_stop(void *data)
3534 double_lock_balance(busiest_rq, target_rq); 3572 double_lock_balance(busiest_rq, target_rq);
3535 3573
3536 /* Search for an sd spanning us and the target CPU. */ 3574 /* Search for an sd spanning us and the target CPU. */
3575 rcu_read_lock();
3537 for_each_domain(target_cpu, sd) { 3576 for_each_domain(target_cpu, sd) {
3538 if ((sd->flags & SD_LOAD_BALANCE) && 3577 if ((sd->flags & SD_LOAD_BALANCE) &&
3539 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 3578 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3549,6 +3588,7 @@ static int active_load_balance_cpu_stop(void *data)
3549 else 3588 else
3550 schedstat_inc(sd, alb_failed); 3589 schedstat_inc(sd, alb_failed);
3551 } 3590 }
3591 rcu_read_unlock();
3552 double_unlock_balance(busiest_rq, target_rq); 3592 double_unlock_balance(busiest_rq, target_rq);
3553out_unlock: 3593out_unlock:
3554 busiest_rq->active_balance = 0; 3594 busiest_rq->active_balance = 0;
@@ -3675,6 +3715,7 @@ static int find_new_ilb(int cpu)
3675{ 3715{
3676 struct sched_domain *sd; 3716 struct sched_domain *sd;
3677 struct sched_group *ilb_group; 3717 struct sched_group *ilb_group;
3718 int ilb = nr_cpu_ids;
3678 3719
3679 /* 3720 /*
3680 * Have idle load balancer selection from semi-idle packages only 3721 * Have idle load balancer selection from semi-idle packages only
@@ -3690,20 +3731,25 @@ static int find_new_ilb(int cpu)
3690 if (cpumask_weight(nohz.idle_cpus_mask) < 2) 3731 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3691 goto out_done; 3732 goto out_done;
3692 3733
3734 rcu_read_lock();
3693 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3735 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3694 ilb_group = sd->groups; 3736 ilb_group = sd->groups;
3695 3737
3696 do { 3738 do {
3697 if (is_semi_idle_group(ilb_group)) 3739 if (is_semi_idle_group(ilb_group)) {
3698 return cpumask_first(nohz.grp_idle_mask); 3740 ilb = cpumask_first(nohz.grp_idle_mask);
3741 goto unlock;
3742 }
3699 3743
3700 ilb_group = ilb_group->next; 3744 ilb_group = ilb_group->next;
3701 3745
3702 } while (ilb_group != sd->groups); 3746 } while (ilb_group != sd->groups);
3703 } 3747 }
3748unlock:
3749 rcu_read_unlock();
3704 3750
3705out_done: 3751out_done:
3706 return nr_cpu_ids; 3752 return ilb;
3707} 3753}
3708#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3754#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3709static inline int find_new_ilb(int call_cpu) 3755static inline int find_new_ilb(int call_cpu)
@@ -3848,6 +3894,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3848 3894
3849 update_shares(cpu); 3895 update_shares(cpu);
3850 3896
3897 rcu_read_lock();
3851 for_each_domain(cpu, sd) { 3898 for_each_domain(cpu, sd) {
3852 if (!(sd->flags & SD_LOAD_BALANCE)) 3899 if (!(sd->flags & SD_LOAD_BALANCE))
3853 continue; 3900 continue;
@@ -3893,6 +3940,7 @@ out:
3893 if (!balance) 3940 if (!balance)
3894 break; 3941 break;
3895 } 3942 }
3943 rcu_read_unlock();
3896 3944
3897 /* 3945 /*
3898 * next_balance will be updated only when there is a need. 3946 * next_balance will be updated only when there is a need.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69acc29b9..be40f7371ee1 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1)
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on irq activity
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONIRQ_POWER, 1)
67
68/*
69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */
72SCHED_FEAT(TTWU_QUEUE, 1)
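Reviewer note: the new TTWU_QUEUE feature bit refers to queueing remote wakeups on the target CPU and draining them from the scheduler IPI instead of taking the remote rq->lock. A userspace sketch of the usual lock-free "push onto a per-CPU list, send the IPI, pop the whole list at once" pattern using a C11 atomic single-linked stack; the kernel has its own wake-list machinery, and the names here are illustrative:

	#include <stdatomic.h>
	#include <stddef.h>
	#include <stdio.h>

	struct wake_node {
		struct wake_node *next;
		int task_id;
	};

	/* One of these per CPU in the real thing; a single list is enough here. */
	static _Atomic(struct wake_node *) wake_list;

	/* Producer side (the waking CPU): lock-free push, then send the IPI. */
	static void queue_wakeup(struct wake_node *node)
	{
		struct wake_node *old = atomic_load_explicit(&wake_list,
							     memory_order_relaxed);
		do {
			node->next = old;
		} while (!atomic_compare_exchange_weak_explicit(&wake_list, &old, node,
								memory_order_release,
								memory_order_relaxed));
		/* here the kernel would send the scheduler IPI to the target CPU */
	}

	/* Consumer side (the IPI handler on the target CPU): grab the whole list. */
	static void process_wakeups(void)
	{
		struct wake_node *node = atomic_exchange_explicit(&wake_list, NULL,
								  memory_order_acquire);
		while (node) {
			printf("activating task %d locally\n", node->task_id);
			node = node->next;
		}
	}

	int main(void)
	{
		struct wake_node a = { .task_id = 1 }, b = { .task_id = 2 };

		queue_wakeup(&a);
		queue_wakeup(&b);
		process_wakeups();	/* prints 2 then 1: LIFO drain of the stack */
		return 0;
	}

The point of the feature is that only the target CPU ever touches its own runqueue lock for these wakeups, so the lock stops bouncing between sockets.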
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a776a6396427..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
11{ 11{
12 return task_cpu(p); /* IDLE tasks as never migrated */ 12 return task_cpu(p); /* IDLE tasks as never migrated */
13} 13}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e7cebdc65f82..64b2a37c07d0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186typedef struct task_group *rt_rq_iter_t;
187
188#define for_each_rt_rq(rt_rq, iter, rq) \
189 for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
190 (&iter->list != &task_groups) && \
191 (rt_rq = iter->rt_rq[cpu_of(rq)]); \
192 iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
193
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 194static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{ 195{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list, 196 list_add_rcu(&rt_rq->leaf_rt_rq_list,
@@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
288 return ktime_to_ns(def_rt_bandwidth.rt_period); 296 return ktime_to_ns(def_rt_bandwidth.rt_period);
289} 297}
290 298
299typedef struct rt_rq *rt_rq_iter_t;
300
301#define for_each_rt_rq(rt_rq, iter, rq) \
302 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
303
291static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 304static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
292{ 305{
293} 306}
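Reviewer note: the two for_each_rt_rq() definitions above give group and non-group builds the same iteration interface; the flat variant runs once over &rq->rt and casts the unused iterator to void. A reduced sketch of the same "one macro name, two config-dependent bodies" technique; GROUPED, struct node and struct container are invented:

	#include <stdio.h>

	struct node { int value; struct node *next; };

	struct container {
		struct node *list;	/* used by the "grouped" variant */
		struct node single;	/* used by the "flat" variant    */
	};

	#define GROUPED 1	/* flip to 0 for the single-element variant */

	#if GROUPED
	typedef struct node *iter_t;

	/* Walk every node hanging off the container. */
	#define for_each_item(pos, iter, c)				\
		for (iter = (c)->list, pos = iter;			\
		     pos;						\
		     iter = iter->next, pos = iter)
	#else
	typedef int iter_t;

	/* Exactly one element: run the body once, keep the iterator unused. */
	#define for_each_item(pos, iter, c)				\
		for ((void)(iter), pos = &(c)->single; pos; pos = NULL)
	#endif

	int main(void)
	{
		struct node b = { .value = 2 };
		struct node a = { .value = 1, .next = &b };
		struct container c = { .list = &a, .single = { .value = 42 } };
		struct node *pos;
		iter_t iter = 0;	/* initialised so the flat variant stays warning-free */

		for_each_item(pos, iter, &c)
			printf("%d\n", pos->value);
		return 0;
	}

Callers are written once against for_each_item(); which body they get is purely a build-time decision, which is exactly what lets __disable_runtime()/__enable_runtime() above drop the leaf-list walk.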
@@ -402,12 +415,13 @@ next:
402static void __disable_runtime(struct rq *rq) 415static void __disable_runtime(struct rq *rq)
403{ 416{
404 struct root_domain *rd = rq->rd; 417 struct root_domain *rd = rq->rd;
418 rt_rq_iter_t iter;
405 struct rt_rq *rt_rq; 419 struct rt_rq *rt_rq;
406 420
407 if (unlikely(!scheduler_running)) 421 if (unlikely(!scheduler_running))
408 return; 422 return;
409 423
410 for_each_leaf_rt_rq(rt_rq, rq) { 424 for_each_rt_rq(rt_rq, iter, rq) {
411 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 425 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
412 s64 want; 426 s64 want;
413 int i; 427 int i;
@@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq)
487 501
488static void __enable_runtime(struct rq *rq) 502static void __enable_runtime(struct rq *rq)
489{ 503{
504 rt_rq_iter_t iter;
490 struct rt_rq *rt_rq; 505 struct rt_rq *rt_rq;
491 506
492 if (unlikely(!scheduler_running)) 507 if (unlikely(!scheduler_running))
@@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq)
495 /* 510 /*
496 * Reset each runqueue's bandwidth settings 511 * Reset each runqueue's bandwidth settings
497 */ 512 */
498 for_each_leaf_rt_rq(rt_rq, rq) { 513 for_each_rt_rq(rt_rq, iter, rq) {
499 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 514 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
500 515
501 raw_spin_lock(&rt_b->rt_runtime_lock); 516 raw_spin_lock(&rt_b->rt_runtime_lock);
@@ -562,6 +577,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
562 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 577 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
563 rt_rq->rt_throttled = 0; 578 rt_rq->rt_throttled = 0;
564 enqueue = 1; 579 enqueue = 1;
580
581 /*
582 * Force a clock update if the CPU was idle,
583 * lest wakeup -> unthrottle time accumulate.
584 */
585 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
586 rq->skip_clock_update = -1;
565 } 587 }
566 if (rt_rq->rt_time || rt_rq->rt_nr_running) 588 if (rt_rq->rt_time || rt_rq->rt_nr_running)
567 idle = 0; 589 idle = 0;
@@ -977,13 +999,23 @@ static void yield_task_rt(struct rq *rq)
977static int find_lowest_rq(struct task_struct *task); 999static int find_lowest_rq(struct task_struct *task);
978 1000
979static int 1001static int
980select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 1002select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
981{ 1003{
1004 struct task_struct *curr;
1005 struct rq *rq;
1006 int cpu;
1007
982 if (sd_flag != SD_BALANCE_WAKE) 1008 if (sd_flag != SD_BALANCE_WAKE)
983 return smp_processor_id(); 1009 return smp_processor_id();
984 1010
1011 cpu = task_cpu(p);
1012 rq = cpu_rq(cpu);
1013
1014 rcu_read_lock();
1015 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1016
985 /* 1017 /*
986 * If the current task is an RT task, then 1018 * If the current task on @p's runqueue is an RT task, then
987 * try to see if we can wake this RT task up on another 1019 * try to see if we can wake this RT task up on another
988 * runqueue. Otherwise simply start this RT task 1020 * runqueue. Otherwise simply start this RT task
989 * on its current runqueue. 1021 * on its current runqueue.
@@ -997,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
997 * lock? 1029 * lock?
998 * 1030 *
999 * For equal prio tasks, we just let the scheduler sort it out. 1031 * For equal prio tasks, we just let the scheduler sort it out.
1032 *
1033 * Otherwise, just let it ride on the affined RQ and the
1034 * post-schedule router will push the preempted task away
1035 *
1036 * This test is optimistic, if we get it wrong the load-balancer
1037 * will have to sort it out.
1000 */ 1038 */
1001 if (unlikely(rt_task(rq->curr)) && 1039 if (curr && unlikely(rt_task(curr)) &&
1002 (rq->curr->rt.nr_cpus_allowed < 2 || 1040 (curr->rt.nr_cpus_allowed < 2 ||
1003 rq->curr->prio < p->prio) && 1041 curr->prio < p->prio) &&
1004 (p->rt.nr_cpus_allowed > 1)) { 1042 (p->rt.nr_cpus_allowed > 1)) {
1005 int cpu = find_lowest_rq(p); 1043 int target = find_lowest_rq(p);
1006 1044
1007 return (cpu == -1) ? task_cpu(p) : cpu; 1045 if (target != -1)
1046 cpu = target;
1008 } 1047 }
1048 rcu_read_unlock();
1009 1049
1010 /* 1050 return cpu;
1011 * Otherwise, just let it ride on the affined RQ and the
1012 * post-schedule router will push the preempted task away
1013 */
1014 return task_cpu(p);
1015} 1051}
1016 1052
1017static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1053static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
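Reviewer note: select_task_rq_rt() above now samples the target runqueue's current task once with ACCESS_ONCE() under rcu_read_lock(), and the added comment makes the test explicitly optimistic: a stale answer only costs a later push/pull. A small sketch of the single-snapshot idiom, with ACCESS_ONCE defined locally via GCC's __typeof__ the way older kernels did, and with invented task/runqueue structs (the rt_task() class check is left out):

	#include <stddef.h>
	#include <stdio.h>

	/* Force exactly one load; the compiler may not re-read the location. */
	#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

	struct task { int prio; int nr_cpus_allowed; };

	struct runqueue { struct task *curr; };

	static struct runqueue rq0;

	/*
	 * Optimistic placement test: snapshot curr once, decide based on that
	 * snapshot, and accept that it may be stale -- a wrong guess is fixed
	 * later by the push/pull logic, not here.
	 */
	static int prefer_other_cpu(struct task *p)
	{
		struct task *curr = ACCESS_ONCE(rq0.curr);	/* unlocked access */

		if (curr &&
		    (curr->nr_cpus_allowed < 2 || curr->prio < p->prio) &&
		    p->nr_cpus_allowed > 1)
			return 1;	/* look for a less loaded CPU */

		return 0;		/* stay where we are */
	}

	int main(void)
	{
		struct task pinned_high = { .prio = 10, .nr_cpus_allowed = 1 };
		struct task waker       = { .prio = 20, .nr_cpus_allowed = 4 };

		rq0.curr = &pinned_high;
		printf("%d\n", prefer_other_cpu(&waker));	/* 1: go elsewhere */
		return 0;
	}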
@@ -1136,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1136 * The previous task needs to be made eligible for pushing 1172 * The previous task needs to be made eligible for pushing
1137 * if it is still active 1173 * if it is still active
1138 */ 1174 */
1139 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1175 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1140 enqueue_pushable_task(rq, p); 1176 enqueue_pushable_task(rq, p);
1141} 1177}
1142 1178
@@ -1287,7 +1323,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1287 !cpumask_test_cpu(lowest_rq->cpu, 1323 !cpumask_test_cpu(lowest_rq->cpu,
1288 &task->cpus_allowed) || 1324 &task->cpus_allowed) ||
1289 task_running(rq, task) || 1325 task_running(rq, task) ||
1290 !task->se.on_rq)) { 1326 !task->on_rq)) {
1291 1327
1292 raw_spin_unlock(&lowest_rq->lock); 1328 raw_spin_unlock(&lowest_rq->lock);
1293 lowest_rq = NULL; 1329 lowest_rq = NULL;
@@ -1321,7 +1357,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1321 BUG_ON(task_current(rq, p)); 1357 BUG_ON(task_current(rq, p));
1322 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1358 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1323 1359
1324 BUG_ON(!p->se.on_rq); 1360 BUG_ON(!p->on_rq);
1325 BUG_ON(!rt_task(p)); 1361 BUG_ON(!rt_task(p));
1326 1362
1327 return p; 1363 return p;
@@ -1467,7 +1503,7 @@ static int pull_rt_task(struct rq *this_rq)
1467 */ 1503 */
1468 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1504 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1469 WARN_ON(p == src_rq->curr); 1505 WARN_ON(p == src_rq->curr);
1470 WARN_ON(!p->se.on_rq); 1506 WARN_ON(!p->on_rq);
1471 1507
1472 /* 1508 /*
1473 * There's a chance that p is higher in priority 1509 * There's a chance that p is higher in priority
@@ -1538,7 +1574,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1538 * Update the migration status of the RQ if we have an RT task 1574 * Update the migration status of the RQ if we have an RT task
1539 * which is running AND changing its weight value. 1575 * which is running AND changing its weight value.
1540 */ 1576 */
1541 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1577 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1542 struct rq *rq = task_rq(p); 1578 struct rq *rq = task_rq(p);
1543 1579
1544 if (!task_current(rq, p)) { 1580 if (!task_current(rq, p)) {
@@ -1608,7 +1644,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1608 * we may need to handle the pulling of RT tasks 1644 * we may need to handle the pulling of RT tasks
1609 * now. 1645 * now.
1610 */ 1646 */
1611 if (p->se.on_rq && !rq->rt.rt_nr_running) 1647 if (p->on_rq && !rq->rt.rt_nr_running)
1612 pull_rt_task(rq); 1648 pull_rt_task(rq);
1613} 1649}
1614 1650
@@ -1638,7 +1674,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1638 * If that current running task is also an RT task 1674 * If that current running task is also an RT task
1639 * then see if we can move to another run queue. 1675 * then see if we can move to another run queue.
1640 */ 1676 */
1641 if (p->se.on_rq && rq->curr != p) { 1677 if (p->on_rq && rq->curr != p) {
1642#ifdef CONFIG_SMP 1678#ifdef CONFIG_SMP
1643 if (rq->rt.overloaded && push_rt_task(rq) && 1679 if (rq->rt.overloaded && push_rt_task(rq) &&
1644 /* Don't resched if we changed runqueues */ 1680 /* Don't resched if we changed runqueues */
@@ -1657,7 +1693,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1657static void 1693static void
1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1694prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1659{ 1695{
1660 if (!p->se.on_rq) 1696 if (!p->on_rq)
1661 return; 1697 return;
1662 1698
1663 if (rq->curr == p) { 1699 if (rq->curr == p) {
@@ -1796,10 +1832,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1796 1832
1797static void print_rt_stats(struct seq_file *m, int cpu) 1833static void print_rt_stats(struct seq_file *m, int cpu)
1798{ 1834{
1835 rt_rq_iter_t iter;
1799 struct rt_rq *rt_rq; 1836 struct rt_rq *rt_rq;
1800 1837
1801 rcu_read_lock(); 1838 rcu_read_lock();
1802 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) 1839 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1803 print_rt_rq(m, cpu, rt_rq); 1840 print_rt_rq(m, cpu, rt_rq);
1804 rcu_read_unlock(); 1841 rcu_read_unlock();
1805} 1842}
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 1ba2bd40fdac..6f437632afab 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -9,8 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p, 12select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
13 int sd_flag, int flags)
14{ 13{
15 return task_cpu(p); /* stop tasks as never migrate */ 14 return task_cpu(p); /* stop tasks as never migrate */
16} 15}
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 25{
27 struct task_struct *stop = rq->stop; 26 struct task_struct *stop = rq->stop;
28 27
29 if (stop && stop->se.on_rq) 28 if (stop && stop->on_rq)
30 return stop; 29 return stop;
31 30
32 return NULL; 31 return NULL;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 174f976c2874..13960170cad4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER"
62}; 62};
63 63
64/* 64/*
diff --git a/kernel/sys.c b/kernel/sys.c
index af468edf096a..e4128b278f23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -314,8 +314,8 @@ void kernel_restart_prepare(char *cmd)
314{ 314{
315 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 315 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
316 system_state = SYSTEM_RESTART; 316 system_state = SYSTEM_RESTART;
317 usermodehelper_disable();
317 device_shutdown(); 318 device_shutdown();
318 sysdev_shutdown();
319 syscore_shutdown(); 319 syscore_shutdown();
320} 320}
321 321
@@ -344,6 +344,7 @@ static void kernel_shutdown_prepare(enum system_states state)
344 blocking_notifier_call_chain(&reboot_notifier_list, 344 blocking_notifier_call_chain(&reboot_notifier_list,
345 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 345 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
346 system_state = state; 346 system_state = state;
347 usermodehelper_disable();
347 device_shutdown(); 348 device_shutdown();
348} 349}
349/** 350/**
@@ -354,7 +355,6 @@ static void kernel_shutdown_prepare(enum system_states state)
354void kernel_halt(void) 355void kernel_halt(void)
355{ 356{
356 kernel_shutdown_prepare(SYSTEM_HALT); 357 kernel_shutdown_prepare(SYSTEM_HALT);
357 sysdev_shutdown();
358 syscore_shutdown(); 358 syscore_shutdown();
359 printk(KERN_EMERG "System halted.\n"); 359 printk(KERN_EMERG "System halted.\n");
360 kmsg_dump(KMSG_DUMP_HALT); 360 kmsg_dump(KMSG_DUMP_HALT);
@@ -374,7 +374,6 @@ void kernel_power_off(void)
374 if (pm_power_off_prepare) 374 if (pm_power_off_prepare)
375 pm_power_off_prepare(); 375 pm_power_off_prepare();
376 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
377 sysdev_shutdown();
378 syscore_shutdown(); 377 syscore_shutdown();
379 printk(KERN_EMERG "Power down.\n"); 378 printk(KERN_EMERG "Power down.\n");
380 kmsg_dump(KMSG_DUMP_POWEROFF); 379 kmsg_dump(KMSG_DUMP_POWEROFF);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 0d74b9ba90c8..c027d4f602f1 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -194,6 +194,70 @@ void clockevents_register_device(struct clock_event_device *dev)
194} 194}
195EXPORT_SYMBOL_GPL(clockevents_register_device); 195EXPORT_SYMBOL_GPL(clockevents_register_device);
196 196
197static void clockevents_config(struct clock_event_device *dev,
198 u32 freq)
199{
200 u64 sec;
201
202 if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
203 return;
204
205 /*
206 * Calculate the maximum number of seconds we can sleep. Limit
207 * to 10 minutes for hardware which can program more than
208 * 32bit ticks so we still get reasonable conversion values.
209 */
210 sec = dev->max_delta_ticks;
211 do_div(sec, freq);
212 if (!sec)
213 sec = 1;
214 else if (sec > 600 && dev->max_delta_ticks > UINT_MAX)
215 sec = 600;
216
217 clockevents_calc_mult_shift(dev, freq, sec);
218 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
219 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
220}
221
222/**
223 * clockevents_config_and_register - Configure and register a clock event device
224 * @dev: device to register
225 * @freq: The clock frequency
226 * @min_delta: The minimum clock ticks to program in oneshot mode
227 * @max_delta: The maximum clock ticks to program in oneshot mode
228 *
229 * min/max_delta can be 0 for devices which do not support oneshot mode.
230 */
231void clockevents_config_and_register(struct clock_event_device *dev,
232 u32 freq, unsigned long min_delta,
233 unsigned long max_delta)
234{
235 dev->min_delta_ticks = min_delta;
236 dev->max_delta_ticks = max_delta;
237 clockevents_config(dev, freq);
238 clockevents_register_device(dev);
239}
240
241/**
242 * clockevents_update_freq - Update frequency and reprogram a clock event device.
243 * @dev: device to modify
244 * @freq: new device frequency
245 *
246 * Reconfigure and reprogram a clock event device in oneshot
247 * mode. Must be called on the cpu for which the device delivers per
248 * cpu timer events with interrupts disabled! Returns 0 on success,
249 * -ETIME when the event is in the past.
250 */
251int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
252{
253 clockevents_config(dev, freq);
254
255 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
256 return 0;
257
258 return clockevents_program_event(dev, dev->next_event, ktime_get());
259}
260
197/* 261/*
198 * Noop handler when we shut down an event device 262 * Noop handler when we shut down an event device
199 */ 263 */
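Reviewer note: clockevents_config() above derives the longest programmable sleep from max_delta_ticks and the frequency, clamping it to 10 minutes for counters wider than 32 bits before computing mult/shift and the min/max delta in nanoseconds. A plain userspace rendition of just that seconds calculation; the device numbers below are made up, and the mult/shift step itself is left out:

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	#define NSEC_PER_SEC 1000000000ULL

	/*
	 * The longest sleep we will ever program is the widest hardware delta,
	 * expressed in seconds and clamped to [1, 600] so wide counters still
	 * get a mult/shift pair with reasonable conversion precision.
	 */
	static uint64_t max_sleep_seconds(uint64_t max_delta_ticks, uint32_t freq)
	{
		uint64_t sec = max_delta_ticks / freq;	/* cf. do_div(sec, freq) */

		if (!sec)
			sec = 1;
		else if (sec > 600 && max_delta_ticks > UINT32_MAX)
			sec = 600;

		return sec;
	}

	int main(void)
	{
		/* A 32-bit timer at 13 MHz vs. a 56-bit timer at 25 MHz. */
		struct { uint64_t max_ticks; uint32_t freq; } dev[] = {
			{ UINT32_MAX,       13000000 },
			{ (1ULL << 56) - 1, 25000000 },
		};

		for (unsigned i = 0; i < 2; i++) {
			uint64_t sec = max_sleep_seconds(dev[i].max_ticks, dev[i].freq);

			printf("dev%u: program at most %" PRIu64 " s (%" PRIu64 " ns)\n",
			       i, sec, sec * NSEC_PER_SEC);
		}
		return 0;
	}

clockevents_config_and_register() then simply records min/max_delta_ticks, runs this configuration step, and registers the device, which is why drivers can stop computing mult/shift by hand.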
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6519cf62d9cd..1c95fd677328 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -626,19 +626,6 @@ static void clocksource_enqueue(struct clocksource *cs)
626 list_add(&cs->list, entry); 626 list_add(&cs->list, entry);
627} 627}
628 628
629
630/*
631 * Maximum time we expect to go between ticks. This includes idle
632 * tickless time. It provides the trade off between selecting a
633 * mult/shift pair that is very precise but can only handle a short
634 * period of time, vs. a mult/shift pair that can handle long periods
635 * of time but isn't as precise.
636 *
637 * This is a subsystem constant, and actual hardware limitations
638 * may override it (ie: clocksources that wrap every 3 seconds).
639 */
640#define MAX_UPDATE_LENGTH 5 /* Seconds */
641
642/** 629/**
643 * __clocksource_updatefreq_scale - Used update clocksource with new freq 630 * __clocksource_updatefreq_scale - Used update clocksource with new freq
644 * @t: clocksource to be registered 631 * @t: clocksource to be registered
@@ -652,15 +639,28 @@ static void clocksource_enqueue(struct clocksource *cs)
652 */ 639 */
653void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 640void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
654{ 641{
642 u64 sec;
643
655 /* 644 /*
656 * Ideally we want to use some of the limits used in 645 * Calc the maximum number of seconds which we can run before
657 * clocksource_max_deferment, to provide a more informed 646 * wrapping around. For clocksources which have a mask > 32bit
658 * MAX_UPDATE_LENGTH. But for now this just gets the 647 * we need to limit the max sleep time to have a good
659 * register interface working properly. 648 * conversion precision. 10 minutes is still a reasonable
649 * amount. That results in a shift value of 24 for a
650 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
651 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
652 * margin as we do in clocksource_max_deferment()
660 */ 653 */
654 sec = (cs->mask - (cs->mask >> 5));
655 do_div(sec, freq);
656 do_div(sec, scale);
657 if (!sec)
658 sec = 1;
659 else if (sec > 600 && cs->mask > UINT_MAX)
660 sec = 600;
661
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 662 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale, 663 NSEC_PER_SEC / scale, sec * scale);
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs); 664 cs->max_idle_ns = clocksource_max_deferment(cs);
665} 665}
666EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 666EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
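Reviewer note: __clocksource_updatefreq_scale() now sizes the mult/shift pair from the counter mask instead of the removed MAX_UPDATE_LENGTH constant: mask minus a small headroom (mask >> 5), divided by freq and scale, clamped to 600 s for counters wider than 32 bits. The same arithmetic in isolation, with hypothetical counter parameters and without the final clocks_calc_mult_shift() step:

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint64_t max_seconds_before_wrap(uint64_t mask, uint32_t freq,
						uint32_t scale)
	{
		uint64_t sec = mask - (mask >> 5);	/* leave a little headroom */

		sec /= freq;				/* cf. do_div(sec, freq)  */
		sec /= scale;				/* cf. do_div(sec, scale) */

		if (!sec)
			sec = 1;
		else if (sec > 600 && mask > UINT32_MAX)
			sec = 600;			/* keep mult/shift precise */

		return sec;
	}

	int main(void)
	{
		/* e.g. a 64-bit counter at 2.5 GHz (freq in Hz, scale 1) */
		printf("%" PRIu64 " s\n",
		       max_seconds_before_wrap(UINT64_MAX, 2500000000U, 1));
		/* e.g. a 32-bit counter at 32.768 kHz */
		printf("%" PRIu64 " s\n",
		       max_seconds_before_wrap(UINT32_MAX, 32768, 1));
		return 0;
	}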
@@ -685,8 +685,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
685 /* Add clocksource to the clcoksource list */ 685 /* Add clocksource to the clcoksource list */
686 mutex_lock(&clocksource_mutex); 686 mutex_lock(&clocksource_mutex);
687 clocksource_enqueue(cs); 687 clocksource_enqueue(cs);
688 clocksource_select();
689 clocksource_enqueue_watchdog(cs); 688 clocksource_enqueue_watchdog(cs);
689 clocksource_select();
690 mutex_unlock(&clocksource_mutex); 690 mutex_unlock(&clocksource_mutex);
691 return 0; 691 return 0;
692} 692}
@@ -706,8 +706,8 @@ int clocksource_register(struct clocksource *cs)
706 706
707 mutex_lock(&clocksource_mutex); 707 mutex_lock(&clocksource_mutex);
708 clocksource_enqueue(cs); 708 clocksource_enqueue(cs);
709 clocksource_select();
710 clocksource_enqueue_watchdog(cs); 709 clocksource_enqueue_watchdog(cs);
710 clocksource_select();
711 mutex_unlock(&clocksource_mutex); 711 mutex_unlock(&clocksource_mutex);
712 return 0; 712 return 0;
713} 713}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 827e0f862da4..c7218d132738 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -524,10 +524,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
524 */ 524 */
525void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 525void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
526{ 526{
527 int cpu = smp_processor_id();
528
527 /* Set it up only once ! */ 529 /* Set it up only once ! */
528 if (bc->event_handler != tick_handle_oneshot_broadcast) { 530 if (bc->event_handler != tick_handle_oneshot_broadcast) {
529 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 531 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
530 int cpu = smp_processor_id();
531 532
532 bc->event_handler = tick_handle_oneshot_broadcast; 533 bc->event_handler = tick_handle_oneshot_broadcast;
533 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 534 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -553,6 +554,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
553 tick_broadcast_set_event(tick_next_period, 1); 554 tick_broadcast_set_event(tick_next_period, 1);
554 } else 555 } else
555 bc->next_event.tv64 = KTIME_MAX; 556 bc->next_event.tv64 = KTIME_MAX;
557 } else {
558 /*
559 * The first cpu which switches to oneshot mode sets
560 * the bit for all other cpus which are in the general
561 * (periodic) broadcast mask. So the bit is set and
562 * would prevent the first broadcast enter after this
563 * to program the bc device.
564 */
565 tick_broadcast_clear_oneshot(cpu);
556 } 566 }
557} 567}
558 568
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ee24fa1935ac..d017c2c82c44 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -39,20 +39,26 @@
39#include "trace_stat.h" 39#include "trace_stat.h"
40 40
41#define FTRACE_WARN_ON(cond) \ 41#define FTRACE_WARN_ON(cond) \
42 do { \ 42 ({ \
43 if (WARN_ON(cond)) \ 43 int ___r = cond; \
44 if (WARN_ON(___r)) \
44 ftrace_kill(); \ 45 ftrace_kill(); \
45 } while (0) 46 ___r; \
47 })
46 48
47#define FTRACE_WARN_ON_ONCE(cond) \ 49#define FTRACE_WARN_ON_ONCE(cond) \
48 do { \ 50 ({ \
49 if (WARN_ON_ONCE(cond)) \ 51 int ___r = cond; \
52 if (WARN_ON_ONCE(___r)) \
50 ftrace_kill(); \ 53 ftrace_kill(); \
51 } while (0) 54 ___r; \
55 })
52 56
53/* hash bits for specific function selection */ 57/* hash bits for specific function selection */
54#define FTRACE_HASH_BITS 7 58#define FTRACE_HASH_BITS 7
55#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) 59#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
60#define FTRACE_HASH_DEFAULT_BITS 10
61#define FTRACE_HASH_MAX_BITS 12
56 62
57/* ftrace_enabled is a method to turn ftrace on or off */ 63/* ftrace_enabled is a method to turn ftrace on or off */
58int ftrace_enabled __read_mostly; 64int ftrace_enabled __read_mostly;
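Reviewer note: the hunk above turns FTRACE_WARN_ON()/FTRACE_WARN_ON_ONCE() from do { } while (0) blocks into GCC statement expressions, so they keep their side effects but now also evaluate to the condition and can sit inside an if (). A standalone illustration of that pattern (GCC/clang extension); the names below and the fprintf standing in for WARN_ON are local to the sketch:

	#include <stdio.h>
	#include <stdlib.h>

	static void shut_down_subsystem(void)
	{
		fprintf(stderr, "subsystem disabled after internal error\n");
	}

	/*
	 * Statement expression: runs the side effects, then yields ___r as the
	 * value of the whole macro, so callers can branch on it directly.
	 */
	#define CHECKED_WARN_ON(cond)						\
		({								\
			int ___r = !!(cond);					\
			if (___r) {						\
				fprintf(stderr, "warning: %s failed at %s:%d\n",\
					#cond, __FILE__, __LINE__);		\
				shut_down_subsystem();				\
			}							\
			___r;							\
		})

	int main(void)
	{
		int count = 0;

		/* The macro is now usable as an expression: */
		if (CHECKED_WARN_ON(count != 0))
			return EXIT_FAILURE;

		printf("count ok\n");
		return EXIT_SUCCESS;
	}

That is what lets later ftrace code write things like if (FTRACE_WARN_ON(...)) return; instead of testing the condition twice.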
@@ -81,23 +87,29 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
81 .func = ftrace_stub, 87 .func = ftrace_stub,
82}; 88};
83 89
84static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 90static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
91static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
85ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 93ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 94ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
95static struct ftrace_ops global_ops;
96
97static void
98ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
88 99
89/* 100/*
90 * Traverse the ftrace_list, invoking all entries. The reason that we 101 * Traverse the ftrace_global_list, invoking all entries. The reason that we
91 * can use rcu_dereference_raw() is that elements removed from this list 102 * can use rcu_dereference_raw() is that elements removed from this list
92 * are simply leaked, so there is no need to interact with a grace-period 103 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle 104 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list. 105 * concurrent insertions into the ftrace_global_list.
95 * 106 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations! 107 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */ 108 */
98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 109static void ftrace_global_list_func(unsigned long ip,
110 unsigned long parent_ip)
99{ 111{
100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ 112 struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/
101 113
102 while (op != &ftrace_list_end) { 114 while (op != &ftrace_list_end) {
103 op->func(ip, parent_ip); 115 op->func(ip, parent_ip);
@@ -147,46 +159,69 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
147} 159}
148#endif 160#endif
149 161
150static int __register_ftrace_function(struct ftrace_ops *ops) 162static void update_global_ops(void)
151{ 163{
152 ops->next = ftrace_list; 164 ftrace_func_t func;
165
153 /* 166 /*
154 * We are entering ops into the ftrace_list but another 167 * If there's only one function registered, then call that
155 * CPU might be walking that list. We need to make sure 168 * function directly. Otherwise, we need to iterate over the
156 * the ops->next pointer is valid before another CPU sees 169 * registered callers.
157 * the ops pointer included into the ftrace_list.
158 */ 170 */
159 rcu_assign_pointer(ftrace_list, ops); 171 if (ftrace_global_list == &ftrace_list_end ||
172 ftrace_global_list->next == &ftrace_list_end)
173 func = ftrace_global_list->func;
174 else
175 func = ftrace_global_list_func;
160 176
161 if (ftrace_enabled) { 177 /* If we filter on pids, update to use the pid function */
162 ftrace_func_t func; 178 if (!list_empty(&ftrace_pids)) {
179 set_ftrace_pid_function(func);
180 func = ftrace_pid_func;
181 }
163 182
164 if (ops->next == &ftrace_list_end) 183 global_ops.func = func;
165 func = ops->func; 184}
166 else
167 func = ftrace_list_func;
168 185
169 if (!list_empty(&ftrace_pids)) { 186static void update_ftrace_function(void)
170 set_ftrace_pid_function(func); 187{
171 func = ftrace_pid_func; 188 ftrace_func_t func;
172 } 189
190 update_global_ops();
191
192 /*
193 * If we are at the end of the list and this ops is
194 * not dynamic, then have the mcount trampoline call
195 * the function directly
196 */
197 if (ftrace_ops_list == &ftrace_list_end ||
198 (ftrace_ops_list->next == &ftrace_list_end &&
199 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
200 func = ftrace_ops_list->func;
201 else
202 func = ftrace_ops_list_func;
173 203
174 /*
175 * For one func, simply call it directly.
176 * For more than one func, call the chain.
177 */
178#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 204#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
179 ftrace_trace_function = func; 205 ftrace_trace_function = func;
180#else 206#else
181 __ftrace_trace_function = func; 207 __ftrace_trace_function = func;
182 ftrace_trace_function = ftrace_test_stop_func; 208 ftrace_trace_function = ftrace_test_stop_func;
183#endif 209#endif
184 } 210}
185 211
186 return 0; 212static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
213{
214 ops->next = *list;
215 /*
216 * We are entering ops into the list but another
217 * CPU might be walking that list. We need to make sure
218 * the ops->next pointer is valid before another CPU sees
219 * the ops pointer included into the list.
220 */
221 rcu_assign_pointer(*list, ops);
187} 222}
188 223
189static int __unregister_ftrace_function(struct ftrace_ops *ops) 224static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
190{ 225{
191 struct ftrace_ops **p; 226 struct ftrace_ops **p;
192 227
@@ -194,13 +229,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
194 * If we are removing the last function, then simply point 229 * If we are removing the last function, then simply point
195 * to the ftrace_stub. 230 * to the ftrace_stub.
196 */ 231 */
197 if (ftrace_list == ops && ops->next == &ftrace_list_end) { 232 if (*list == ops && ops->next == &ftrace_list_end) {
198 ftrace_trace_function = ftrace_stub; 233 *list = &ftrace_list_end;
199 ftrace_list = &ftrace_list_end;
200 return 0; 234 return 0;
201 } 235 }
202 236
203 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) 237 for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
204 if (*p == ops) 238 if (*p == ops)
205 break; 239 break;
206 240
@@ -208,53 +242,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
208 return -1; 242 return -1;
209 243
210 *p = (*p)->next; 244 *p = (*p)->next;
245 return 0;
246}
211 247
212 if (ftrace_enabled) { 248static int __register_ftrace_function(struct ftrace_ops *ops)
213 /* If we only have one func left, then call that directly */ 249{
214 if (ftrace_list->next == &ftrace_list_end) { 250 if (ftrace_disabled)
215 ftrace_func_t func = ftrace_list->func; 251 return -ENODEV;
216 252
217 if (!list_empty(&ftrace_pids)) { 253 if (FTRACE_WARN_ON(ops == &global_ops))
218 set_ftrace_pid_function(func); 254 return -EINVAL;
219 func = ftrace_pid_func; 255
220 } 256 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
221#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 257 return -EBUSY;
222 ftrace_trace_function = func; 258
223#else 259 if (!core_kernel_data((unsigned long)ops))
224 __ftrace_trace_function = func; 260 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
225#endif 261
226 } 262 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
227 } 263 int first = ftrace_global_list == &ftrace_list_end;
264 add_ftrace_ops(&ftrace_global_list, ops);
265 ops->flags |= FTRACE_OPS_FL_ENABLED;
266 if (first)
267 add_ftrace_ops(&ftrace_ops_list, &global_ops);
268 } else
269 add_ftrace_ops(&ftrace_ops_list, ops);
270
271 if (ftrace_enabled)
272 update_ftrace_function();
228 273
229 return 0; 274 return 0;
230} 275}
231 276
232static void ftrace_update_pid_func(void) 277static int __unregister_ftrace_function(struct ftrace_ops *ops)
233{ 278{
234 ftrace_func_t func; 279 int ret;
235 280
236 if (ftrace_trace_function == ftrace_stub) 281 if (ftrace_disabled)
237 return; 282 return -ENODEV;
238 283
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 284 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
240 func = ftrace_trace_function; 285 return -EBUSY;
241#else
242 func = __ftrace_trace_function;
243#endif
244 286
245 if (!list_empty(&ftrace_pids)) { 287 if (FTRACE_WARN_ON(ops == &global_ops))
246 set_ftrace_pid_function(func); 288 return -EINVAL;
247 func = ftrace_pid_func;
248 } else {
249 if (func == ftrace_pid_func)
250 func = ftrace_pid_function;
251 }
252 289
253#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 290 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
254 ftrace_trace_function = func; 291 ret = remove_ftrace_ops(&ftrace_global_list, ops);
255#else 292 if (!ret && ftrace_global_list == &ftrace_list_end)
256 __ftrace_trace_function = func; 293 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
257#endif 294 if (!ret)
295 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
296 } else
297 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
298
299 if (ret < 0)
300 return ret;
301
302 if (ftrace_enabled)
303 update_ftrace_function();
304
305 /*
306 * Dynamic ops may be freed, we must make sure that all
307 * callers are done before leaving this function.
308 */
309 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
310 synchronize_sched();
311
312 return 0;
313}
314
315static void ftrace_update_pid_func(void)
316{
317 /* Only do something if we are tracing something */
318 if (ftrace_trace_function == ftrace_stub)
319 return;
320
321 update_ftrace_function();
258} 322}
259 323
260#ifdef CONFIG_FUNCTION_PROFILER 324#ifdef CONFIG_FUNCTION_PROFILER
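Reviewer note: update_ftrace_function() above installs the lone registered callback directly when possible and falls back to a list-walking dispatcher otherwise, so the common single-tracer case pays no iteration cost. A reduced sketch of that "promote a single handler, fall back to a chained dispatcher" idea; tracer_fn, struct tracer_ops and the registration helper are invented stand-ins:

	#include <stdio.h>

	typedef void (*tracer_fn)(unsigned long ip);

	struct tracer_ops {
		tracer_fn func;
		struct tracer_ops *next;
	};

	/* Sentinel end-of-list element, cf. ftrace_list_end. */
	static void stub(unsigned long ip) { (void)ip; }
	static struct tracer_ops list_end = { .func = stub };

	static struct tracer_ops *ops_list = &list_end;
	static tracer_fn active_tracer = stub;	/* what the hot path actually calls */

	/* Slow path used only when several callbacks are registered. */
	static void list_dispatch(unsigned long ip)
	{
		struct tracer_ops *op;

		for (op = ops_list; op != &list_end; op = op->next)
			op->func(ip);
	}

	/* cf. update_ftrace_function(): a single handler is called directly. */
	static void update_active_tracer(void)
	{
		if (ops_list == &list_end)
			active_tracer = stub;
		else if (ops_list->next == &list_end)
			active_tracer = ops_list->func;
		else
			active_tracer = list_dispatch;
	}

	static void register_tracer(struct tracer_ops *ops)
	{
		ops->next = ops_list;	/* the kernel publishes this with rcu_assign_pointer() */
		ops_list = ops;
		update_active_tracer();
	}

	static void trace_a(unsigned long ip) { printf("a: %#lx\n", ip); }
	static void trace_b(unsigned long ip) { printf("b: %#lx\n", ip); }

	int main(void)
	{
		static struct tracer_ops a = { .func = trace_a };
		static struct tracer_ops b = { .func = trace_b };

		register_tracer(&a);
		active_tracer(0x1000);	/* direct call, no list walk */

		register_tracer(&b);
		active_tracer(0x2000);	/* dispatcher walks both entries */
		return 0;
	}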
@@ -888,8 +952,35 @@ enum {
888 FTRACE_START_FUNC_RET = (1 << 3), 952 FTRACE_START_FUNC_RET = (1 << 3),
889 FTRACE_STOP_FUNC_RET = (1 << 4), 953 FTRACE_STOP_FUNC_RET = (1 << 4),
890}; 954};
955struct ftrace_func_entry {
956 struct hlist_node hlist;
957 unsigned long ip;
958};
891 959
892static int ftrace_filtered; 960struct ftrace_hash {
961 unsigned long size_bits;
962 struct hlist_head *buckets;
963 unsigned long count;
964 struct rcu_head rcu;
965};
966
967/*
968 * We make these constant because no one should touch them,
969 * but they are used as the default "empty hash", to avoid allocating
970 * it all the time. These are in a read only section such that if
971 * anyone does try to modify it, it will cause an exception.
972 */
973static const struct hlist_head empty_buckets[1];
974static const struct ftrace_hash empty_hash = {
975 .buckets = (struct hlist_head *)empty_buckets,
976};
977#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
978
979static struct ftrace_ops global_ops = {
980 .func = ftrace_stub,
981 .notrace_hash = EMPTY_HASH,
982 .filter_hash = EMPTY_HASH,
983};
893 984
894static struct dyn_ftrace *ftrace_new_addrs; 985static struct dyn_ftrace *ftrace_new_addrs;
895 986
@@ -912,6 +1003,269 @@ static struct ftrace_page *ftrace_pages;
912 1003
913static struct dyn_ftrace *ftrace_free_records; 1004static struct dyn_ftrace *ftrace_free_records;
914 1005
1006static struct ftrace_func_entry *
1007ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1008{
1009 unsigned long key;
1010 struct ftrace_func_entry *entry;
1011 struct hlist_head *hhd;
1012 struct hlist_node *n;
1013
1014 if (!hash->count)
1015 return NULL;
1016
1017 if (hash->size_bits > 0)
1018 key = hash_long(ip, hash->size_bits);
1019 else
1020 key = 0;
1021
1022 hhd = &hash->buckets[key];
1023
1024 hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
1025 if (entry->ip == ip)
1026 return entry;
1027 }
1028 return NULL;
1029}
1030
1031static void __add_hash_entry(struct ftrace_hash *hash,
1032 struct ftrace_func_entry *entry)
1033{
1034 struct hlist_head *hhd;
1035 unsigned long key;
1036
1037 if (hash->size_bits)
1038 key = hash_long(entry->ip, hash->size_bits);
1039 else
1040 key = 0;
1041
1042 hhd = &hash->buckets[key];
1043 hlist_add_head(&entry->hlist, hhd);
1044 hash->count++;
1045}
1046
1047static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
1048{
1049 struct ftrace_func_entry *entry;
1050
1051 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1052 if (!entry)
1053 return -ENOMEM;
1054
1055 entry->ip = ip;
1056 __add_hash_entry(hash, entry);
1057
1058 return 0;
1059}
1060
1061static void
1062free_hash_entry(struct ftrace_hash *hash,
1063 struct ftrace_func_entry *entry)
1064{
1065 hlist_del(&entry->hlist);
1066 kfree(entry);
1067 hash->count--;
1068}
1069
1070static void
1071remove_hash_entry(struct ftrace_hash *hash,
1072 struct ftrace_func_entry *entry)
1073{
1074 hlist_del(&entry->hlist);
1075 hash->count--;
1076}
1077
1078static void ftrace_hash_clear(struct ftrace_hash *hash)
1079{
1080 struct hlist_head *hhd;
1081 struct hlist_node *tp, *tn;
1082 struct ftrace_func_entry *entry;
1083 int size = 1 << hash->size_bits;
1084 int i;
1085
1086 if (!hash->count)
1087 return;
1088
1089 for (i = 0; i < size; i++) {
1090 hhd = &hash->buckets[i];
1091 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
1092 free_hash_entry(hash, entry);
1093 }
1094 FTRACE_WARN_ON(hash->count);
1095}
1096
1097static void free_ftrace_hash(struct ftrace_hash *hash)
1098{
1099 if (!hash || hash == EMPTY_HASH)
1100 return;
1101 ftrace_hash_clear(hash);
1102 kfree(hash->buckets);
1103 kfree(hash);
1104}
1105
1106static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
1107{
1108 struct ftrace_hash *hash;
1109
1110 hash = container_of(rcu, struct ftrace_hash, rcu);
1111 free_ftrace_hash(hash);
1112}
1113
1114static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1115{
1116 if (!hash || hash == EMPTY_HASH)
1117 return;
1118 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1119}
1120
1121static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1122{
1123 struct ftrace_hash *hash;
1124 int size;
1125
1126 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
1127 if (!hash)
1128 return NULL;
1129
1130 size = 1 << size_bits;
1131 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
1132
1133 if (!hash->buckets) {
1134 kfree(hash);
1135 return NULL;
1136 }
1137
1138 hash->size_bits = size_bits;
1139
1140 return hash;
1141}
1142
1143static struct ftrace_hash *
1144alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1145{
1146 struct ftrace_func_entry *entry;
1147 struct ftrace_hash *new_hash;
1148 struct hlist_node *tp;
1149 int size;
1150 int ret;
1151 int i;
1152
1153 new_hash = alloc_ftrace_hash(size_bits);
1154 if (!new_hash)
1155 return NULL;
1156
1157 /* Empty hash? */
1158 if (!hash || !hash->count)
1159 return new_hash;
1160
1161 size = 1 << hash->size_bits;
1162 for (i = 0; i < size; i++) {
1163 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
1164 ret = add_hash_entry(new_hash, entry->ip);
1165 if (ret < 0)
1166 goto free_hash;
1167 }
1168 }
1169
1170 FTRACE_WARN_ON(new_hash->count != hash->count);
1171
1172 return new_hash;
1173
1174 free_hash:
1175 free_ftrace_hash(new_hash);
1176 return NULL;
1177}
1178
1179static int
1180ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1181{
1182 struct ftrace_func_entry *entry;
1183 struct hlist_node *tp, *tn;
1184 struct hlist_head *hhd;
1185 struct ftrace_hash *old_hash;
1186 struct ftrace_hash *new_hash;
1187 unsigned long key;
1188 int size = src->count;
1189 int bits = 0;
1190 int i;
1191
1192 /*
1193 * If the new source is empty, just free dst and assign it
1194 * the empty_hash.
1195 */
1196 if (!src->count) {
1197 free_ftrace_hash_rcu(*dst);
1198 rcu_assign_pointer(*dst, EMPTY_HASH);
1199 return 0;
1200 }
1201
1202 /*
1203 * Make the hash size about 1/2 the # found
1204 */
1205 for (size /= 2; size; size >>= 1)
1206 bits++;
1207
1208 /* Don't allocate too much */
1209 if (bits > FTRACE_HASH_MAX_BITS)
1210 bits = FTRACE_HASH_MAX_BITS;
1211
1212 new_hash = alloc_ftrace_hash(bits);
1213 if (!new_hash)
1214 return -ENOMEM;
1215
1216 size = 1 << src->size_bits;
1217 for (i = 0; i < size; i++) {
1218 hhd = &src->buckets[i];
1219 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
1220 if (bits > 0)
1221 key = hash_long(entry->ip, bits);
1222 else
1223 key = 0;
1224 remove_hash_entry(src, entry);
1225 __add_hash_entry(new_hash, entry);
1226 }
1227 }
1228
1229 old_hash = *dst;
1230 rcu_assign_pointer(*dst, new_hash);
1231 free_ftrace_hash_rcu(old_hash);
1232
1233 return 0;
1234}
1235
1236/*
1237 * Test the hashes for this ops to see if we want to call
1238 * the ops->func or not.
1239 *
1240 * It's a match if the ip is in the ops->filter_hash or
1241 * the filter_hash does not exist or is empty,
1242 * AND
1243 * the ip is not in the ops->notrace_hash.
1244 *
1245 * This needs to be called with preemption disabled as
1246 * the hashes are freed with call_rcu_sched().
1247 */
1248static int
1249ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1250{
1251 struct ftrace_hash *filter_hash;
1252 struct ftrace_hash *notrace_hash;
1253 int ret;
1254
1255 filter_hash = rcu_dereference_raw(ops->filter_hash);
1256 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1257
1258 if ((!filter_hash || !filter_hash->count ||
1259 ftrace_lookup_ip(filter_hash, ip)) &&
1260 (!notrace_hash || !notrace_hash->count ||
1261 !ftrace_lookup_ip(notrace_hash, ip)))
1262 ret = 1;
1263 else
1264 ret = 0;
1265
1266 return ret;
1267}
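The predicate above boils down to: call the ops for this ip if the filter hash is empty or contains the ip, and the notrace hash does not contain it. A standalone restatement with a trivial membership helper (set_contains(), should_trace() and the sample addresses are invented for the example):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* trivial stand-in for ftrace_lookup_ip(): linear membership test */
static bool set_contains(const unsigned long *set, size_t n, unsigned long ip)
{
	for (size_t i = 0; i < n; i++)
		if (set[i] == ip)
			return true;
	return false;
}

/* same decision as ftrace_ops_test(): an empty filter means "everything" */
static bool should_trace(const unsigned long *filter, size_t nfilter,
			 const unsigned long *notrace, size_t nnotrace,
			 unsigned long ip)
{
	return (nfilter == 0 || set_contains(filter, nfilter, ip)) &&
	       !set_contains(notrace, nnotrace, ip);
}

int main(void)
{
	unsigned long filter[] = { 0x1000, 0x2000 };
	unsigned long notrace[] = { 0x2000 };

	printf("%d\n", should_trace(filter, 2, notrace, 1, 0x1000)); /* 1 */
	printf("%d\n", should_trace(filter, 2, notrace, 1, 0x2000)); /* 0 */
	printf("%d\n", should_trace(NULL, 0, notrace, 1, 0x3000));   /* 1 */
	return 0;
}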
1268
915/* 1269/*
916 * This is a double for. Do not use 'break' to break out of the loop, 1270 * This is a double for. Do not use 'break' to break out of the loop,
917 * you must use a goto. 1271 * you must use a goto.
@@ -926,6 +1280,105 @@ static struct dyn_ftrace *ftrace_free_records;
926 } \ 1280 } \
927 } 1281 }
928 1282
1283static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1284 int filter_hash,
1285 bool inc)
1286{
1287 struct ftrace_hash *hash;
1288 struct ftrace_hash *other_hash;
1289 struct ftrace_page *pg;
1290 struct dyn_ftrace *rec;
1291 int count = 0;
1292 int all = 0;
1293
1294 /* Only update if the ops has been registered */
1295 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1296 return;
1297
1298 /*
1299 * In the filter_hash case:
1300 * If the count is zero, we update all records.
1301 * Otherwise we just update the items in the hash.
1302 *
1303 * In the notrace_hash case:
1304 * We enable the update in the hash.
1305 * As disabling notrace means enabling the tracing,
1306 * and enabling notrace means disabling, the inc variable
 1307 * gets inverted.
1308 */
1309 if (filter_hash) {
1310 hash = ops->filter_hash;
1311 other_hash = ops->notrace_hash;
1312 if (!hash || !hash->count)
1313 all = 1;
1314 } else {
1315 inc = !inc;
1316 hash = ops->notrace_hash;
1317 other_hash = ops->filter_hash;
1318 /*
1319 * If the notrace hash has no items,
1320 * then there's nothing to do.
1321 */
1322 if (hash && !hash->count)
1323 return;
1324 }
1325
1326 do_for_each_ftrace_rec(pg, rec) {
1327 int in_other_hash = 0;
1328 int in_hash = 0;
1329 int match = 0;
1330
1331 if (all) {
1332 /*
1333 * Only the filter_hash affects all records.
1334 * Update if the record is not in the notrace hash.
1335 */
1336 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1337 match = 1;
1338 } else {
1339 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
1340 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
1341
 1342 /*
 1343 * Otherwise the record must be in this hash to be updated.
 1344 */
1345 if (filter_hash && in_hash && !in_other_hash)
1346 match = 1;
1347 else if (!filter_hash && in_hash &&
1348 (in_other_hash || !other_hash->count))
1349 match = 1;
1350 }
1351 if (!match)
1352 continue;
1353
1354 if (inc) {
1355 rec->flags++;
1356 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1357 return;
1358 } else {
1359 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1360 return;
1361 rec->flags--;
1362 }
1363 count++;
1364 /* Shortcut, if we handled all records, we are done. */
1365 if (!all && count == hash->count)
1366 return;
1367 } while_for_each_ftrace_rec();
1368}
1369
1370static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
1371 int filter_hash)
1372{
1373 __ftrace_hash_rec_update(ops, filter_hash, 0);
1374}
1375
1376static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1377 int filter_hash)
1378{
1379 __ftrace_hash_rec_update(ops, filter_hash, 1);
1380}
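__ftrace_hash_rec_update() walks every record and bumps (or drops, with inc inverted for notrace updates) a reference count kept in the upper bits of rec->flags for each record the ops cares about. A compressed model of just the match decision, covering the same three cases (rec_matches() and its boolean arguments are descriptive stand-ins, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/*
 * Model of the match decision in __ftrace_hash_rec_update():
 *  - an empty filter hash ("all") matches every record not in notrace
 *  - a filter update matches records in filter but not in notrace
 *  - a notrace update matches records in notrace when they are also in
 *    the filter hash or the filter hash is empty (inc is inverted by
 *    the caller for this case)
 */
static bool rec_matches(bool filter_update,
			bool in_filter, bool filter_empty,
			bool in_notrace, bool notrace_empty)
{
	if (filter_update) {
		if (filter_empty)
			return !in_notrace;
		return in_filter && !in_notrace;
	}
	if (notrace_empty)	/* nothing to do for an empty notrace hash */
		return false;
	return in_notrace && (in_filter || filter_empty);
}

int main(void)
{
	/* filter update, record only in the filter hash: gets a reference */
	printf("%d\n", rec_matches(true, true, false, false, true));	/* 1 */
	/* same record also listed in notrace: shadowed, no reference */
	printf("%d\n", rec_matches(true, true, false, true, false));	/* 0 */
	/* empty filter: every record outside notrace is updated */
	printf("%d\n", rec_matches(true, false, true, false, true));	/* 1 */
	return 0;
}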
1381
929static void ftrace_free_rec(struct dyn_ftrace *rec) 1382static void ftrace_free_rec(struct dyn_ftrace *rec)
930{ 1383{
931 rec->freelist = ftrace_free_records; 1384 rec->freelist = ftrace_free_records;
@@ -1047,18 +1500,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1047 ftrace_addr = (unsigned long)FTRACE_ADDR; 1500 ftrace_addr = (unsigned long)FTRACE_ADDR;
1048 1501
1049 /* 1502 /*
1050 * If this record is not to be traced or we want to disable it, 1503 * If we are enabling tracing:
1051 * then disable it. 1504 *
1505 * If the record has a ref count, then we need to enable it
1506 * because someone is using it.
1052 * 1507 *
1053 * If we want to enable it and filtering is off, then enable it. 1508 * Otherwise we make sure it is disabled.
1054 * 1509 *
1055 * If we want to enable it and filtering is on, enable it only if 1510 * If we are disabling tracing, then disable all records that
1056 * it's filtered 1511 * are enabled.
1057 */ 1512 */
1058 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { 1513 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1059 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) 1514 flag = FTRACE_FL_ENABLED;
1060 flag = FTRACE_FL_ENABLED;
1061 }
1062 1515
1063 /* If the state of this record hasn't changed, then do nothing */ 1516 /* If the state of this record hasn't changed, then do nothing */
1064 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1517 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
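With the FILTER/NOTRACE bits gone, everything in rec->flags outside FTRACE_FL_MASK now acts as a reference count, and a call site is patched in only while at least one ops holds a reference. A tiny illustration of that check (the bit positions below are illustrative, not quoted from ftrace.h):

#include <stdio.h>

/* illustrative layout: two state bits on top, the rest is a ref count */
#define FL_ENABLED	(1UL << 30)
#define FL_FREE		(1UL << 31)
#define FL_MASK		(FL_ENABLED | FL_FREE)

static unsigned long want_flag(unsigned long flags, int enable)
{
	/* enable the call site only if someone still holds a reference */
	return (enable && (flags & ~FL_MASK)) ? FL_ENABLED : 0;
}

int main(void)
{
	unsigned long rec = 0;

	rec++;					/* one ops referenced this record */
	printf("%d\n", want_flag(rec, 1) != 0);	/* 1: should be patched in */
	rec--;					/* last ops dropped it */
	printf("%d\n", want_flag(rec, 1) != 0);	/* 0: make sure it is disabled */
	return 0;
}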
@@ -1079,19 +1532,16 @@ static void ftrace_replace_code(int enable)
1079 struct ftrace_page *pg; 1532 struct ftrace_page *pg;
1080 int failed; 1533 int failed;
1081 1534
1535 if (unlikely(ftrace_disabled))
1536 return;
1537
1082 do_for_each_ftrace_rec(pg, rec) { 1538 do_for_each_ftrace_rec(pg, rec) {
1083 /* 1539 /* Skip over free records */
1084 * Skip over free records, records that have 1540 if (rec->flags & FTRACE_FL_FREE)
1085 * failed and not converted.
1086 */
1087 if (rec->flags & FTRACE_FL_FREE ||
1088 rec->flags & FTRACE_FL_FAILED ||
1089 !(rec->flags & FTRACE_FL_CONVERTED))
1090 continue; 1541 continue;
1091 1542
1092 failed = __ftrace_replace_code(rec, enable); 1543 failed = __ftrace_replace_code(rec, enable);
1093 if (failed) { 1544 if (failed) {
1094 rec->flags |= FTRACE_FL_FAILED;
1095 ftrace_bug(failed, rec->ip); 1545 ftrace_bug(failed, rec->ip);
1096 /* Stop processing */ 1546 /* Stop processing */
1097 return; 1547 return;
@@ -1107,10 +1557,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1107 1557
1108 ip = rec->ip; 1558 ip = rec->ip;
1109 1559
1560 if (unlikely(ftrace_disabled))
1561 return 0;
1562
1110 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 1563 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
1111 if (ret) { 1564 if (ret) {
1112 ftrace_bug(ret, ip); 1565 ftrace_bug(ret, ip);
1113 rec->flags |= FTRACE_FL_FAILED;
1114 return 0; 1566 return 0;
1115 } 1567 }
1116 return 1; 1568 return 1;
@@ -1171,6 +1623,7 @@ static void ftrace_run_update_code(int command)
1171 1623
1172static ftrace_func_t saved_ftrace_func; 1624static ftrace_func_t saved_ftrace_func;
1173static int ftrace_start_up; 1625static int ftrace_start_up;
1626static int global_start_up;
1174 1627
1175static void ftrace_startup_enable(int command) 1628static void ftrace_startup_enable(int command)
1176{ 1629{
@@ -1185,19 +1638,36 @@ static void ftrace_startup_enable(int command)
1185 ftrace_run_update_code(command); 1638 ftrace_run_update_code(command);
1186} 1639}
1187 1640
1188static void ftrace_startup(int command) 1641static void ftrace_startup(struct ftrace_ops *ops, int command)
1189{ 1642{
1643 bool hash_enable = true;
1644
1190 if (unlikely(ftrace_disabled)) 1645 if (unlikely(ftrace_disabled))
1191 return; 1646 return;
1192 1647
1193 ftrace_start_up++; 1648 ftrace_start_up++;
1194 command |= FTRACE_ENABLE_CALLS; 1649 command |= FTRACE_ENABLE_CALLS;
1195 1650
1651 /* ops marked global share the filter hashes */
1652 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1653 ops = &global_ops;
1654 /* Don't update hash if global is already set */
1655 if (global_start_up)
1656 hash_enable = false;
1657 global_start_up++;
1658 }
1659
1660 ops->flags |= FTRACE_OPS_FL_ENABLED;
1661 if (hash_enable)
1662 ftrace_hash_rec_enable(ops, 1);
1663
1196 ftrace_startup_enable(command); 1664 ftrace_startup_enable(command);
1197} 1665}
1198 1666
1199static void ftrace_shutdown(int command) 1667static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1200{ 1668{
1669 bool hash_disable = true;
1670
1201 if (unlikely(ftrace_disabled)) 1671 if (unlikely(ftrace_disabled))
1202 return; 1672 return;
1203 1673
@@ -1209,6 +1679,23 @@ static void ftrace_shutdown(int command)
1209 */ 1679 */
1210 WARN_ON_ONCE(ftrace_start_up < 0); 1680 WARN_ON_ONCE(ftrace_start_up < 0);
1211 1681
1682 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1683 ops = &global_ops;
1684 global_start_up--;
1685 WARN_ON_ONCE(global_start_up < 0);
1686 /* Don't update hash if global still has users */
1687 if (global_start_up) {
1688 WARN_ON_ONCE(!ftrace_start_up);
1689 hash_disable = false;
1690 }
1691 }
1692
1693 if (hash_disable)
1694 ftrace_hash_rec_disable(ops, 1);
1695
1696 if (ops != &global_ops || !global_start_up)
1697 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1698
1212 if (!ftrace_start_up) 1699 if (!ftrace_start_up)
1213 command |= FTRACE_DISABLE_CALLS; 1700 command |= FTRACE_DISABLE_CALLS;
1214 1701
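ftrace_startup() and ftrace_shutdown() above keep a separate global_start_up nesting count so that ops sharing the global filter hashes only touch the hash accounting on the first user in and the last user out. A minimal sketch of that nested enable/disable counting (enable_hash()/disable_hash() are placeholder callbacks):

#include <stdio.h>

static int global_users;

static void enable_hash(void)  { puts("hash accounting enabled");  }
static void disable_hash(void) { puts("hash accounting disabled"); }

/* first user of the shared (global) hashes turns accounting on */
static void global_startup(void)
{
	if (global_users++ == 0)
		enable_hash();
}

/* last user turns it back off; intermediate users leave it alone */
static void global_shutdown(void)
{
	if (--global_users == 0)
		disable_hash();
}

int main(void)
{
	global_startup();	/* function tracer registers */
	global_startup();	/* irqsoff tracer registers  */
	global_shutdown();	/* irqsoff leaves: still users */
	global_shutdown();	/* last user: accounting off   */
	return 0;
}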
@@ -1273,10 +1760,10 @@ static int ftrace_update_code(struct module *mod)
1273 */ 1760 */
1274 if (!ftrace_code_disable(mod, p)) { 1761 if (!ftrace_code_disable(mod, p)) {
1275 ftrace_free_rec(p); 1762 ftrace_free_rec(p);
1276 continue; 1763 /* Game over */
1764 break;
1277 } 1765 }
1278 1766
1279 p->flags |= FTRACE_FL_CONVERTED;
1280 ftrace_update_cnt++; 1767 ftrace_update_cnt++;
1281 1768
1282 /* 1769 /*
@@ -1351,9 +1838,9 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1351enum { 1838enum {
1352 FTRACE_ITER_FILTER = (1 << 0), 1839 FTRACE_ITER_FILTER = (1 << 0),
1353 FTRACE_ITER_NOTRACE = (1 << 1), 1840 FTRACE_ITER_NOTRACE = (1 << 1),
1354 FTRACE_ITER_FAILURES = (1 << 2), 1841 FTRACE_ITER_PRINTALL = (1 << 2),
1355 FTRACE_ITER_PRINTALL = (1 << 3), 1842 FTRACE_ITER_HASH = (1 << 3),
1356 FTRACE_ITER_HASH = (1 << 4), 1843 FTRACE_ITER_ENABLED = (1 << 4),
1357}; 1844};
1358 1845
1359#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1846#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1365,6 +1852,8 @@ struct ftrace_iterator {
1365 struct dyn_ftrace *func; 1852 struct dyn_ftrace *func;
1366 struct ftrace_func_probe *probe; 1853 struct ftrace_func_probe *probe;
1367 struct trace_parser parser; 1854 struct trace_parser parser;
1855 struct ftrace_hash *hash;
1856 struct ftrace_ops *ops;
1368 int hidx; 1857 int hidx;
1369 int idx; 1858 int idx;
1370 unsigned flags; 1859 unsigned flags;
@@ -1461,8 +1950,12 @@ static void *
1461t_next(struct seq_file *m, void *v, loff_t *pos) 1950t_next(struct seq_file *m, void *v, loff_t *pos)
1462{ 1951{
1463 struct ftrace_iterator *iter = m->private; 1952 struct ftrace_iterator *iter = m->private;
1953 struct ftrace_ops *ops = &global_ops;
1464 struct dyn_ftrace *rec = NULL; 1954 struct dyn_ftrace *rec = NULL;
1465 1955
1956 if (unlikely(ftrace_disabled))
1957 return NULL;
1958
1466 if (iter->flags & FTRACE_ITER_HASH) 1959 if (iter->flags & FTRACE_ITER_HASH)
1467 return t_hash_next(m, pos); 1960 return t_hash_next(m, pos);
1468 1961
@@ -1483,17 +1976,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1483 rec = &iter->pg->records[iter->idx++]; 1976 rec = &iter->pg->records[iter->idx++];
1484 if ((rec->flags & FTRACE_FL_FREE) || 1977 if ((rec->flags & FTRACE_FL_FREE) ||
1485 1978
1486 (!(iter->flags & FTRACE_ITER_FAILURES) &&
1487 (rec->flags & FTRACE_FL_FAILED)) ||
1488
1489 ((iter->flags & FTRACE_ITER_FAILURES) &&
1490 !(rec->flags & FTRACE_FL_FAILED)) ||
1491
1492 ((iter->flags & FTRACE_ITER_FILTER) && 1979 ((iter->flags & FTRACE_ITER_FILTER) &&
1493 !(rec->flags & FTRACE_FL_FILTER)) || 1980 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
1494 1981
1495 ((iter->flags & FTRACE_ITER_NOTRACE) && 1982 ((iter->flags & FTRACE_ITER_NOTRACE) &&
1496 !(rec->flags & FTRACE_FL_NOTRACE))) { 1983 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
1984
1985 ((iter->flags & FTRACE_ITER_ENABLED) &&
1986 !(rec->flags & ~FTRACE_FL_MASK))) {
1987
1497 rec = NULL; 1988 rec = NULL;
1498 goto retry; 1989 goto retry;
1499 } 1990 }
@@ -1517,10 +2008,15 @@ static void reset_iter_read(struct ftrace_iterator *iter)
1517static void *t_start(struct seq_file *m, loff_t *pos) 2008static void *t_start(struct seq_file *m, loff_t *pos)
1518{ 2009{
1519 struct ftrace_iterator *iter = m->private; 2010 struct ftrace_iterator *iter = m->private;
2011 struct ftrace_ops *ops = &global_ops;
1520 void *p = NULL; 2012 void *p = NULL;
1521 loff_t l; 2013 loff_t l;
1522 2014
1523 mutex_lock(&ftrace_lock); 2015 mutex_lock(&ftrace_lock);
2016
2017 if (unlikely(ftrace_disabled))
2018 return NULL;
2019
1524 /* 2020 /*
1525 * If an lseek was done, then reset and start from beginning. 2021 * If an lseek was done, then reset and start from beginning.
1526 */ 2022 */
@@ -1532,7 +2028,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1532 * off, we can short cut and just print out that all 2028 * off, we can short cut and just print out that all
1533 * functions are enabled. 2029 * functions are enabled.
1534 */ 2030 */
1535 if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { 2031 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
1536 if (*pos > 0) 2032 if (*pos > 0)
1537 return t_hash_start(m, pos); 2033 return t_hash_start(m, pos);
1538 iter->flags |= FTRACE_ITER_PRINTALL; 2034 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -1590,7 +2086,11 @@ static int t_show(struct seq_file *m, void *v)
1590 if (!rec) 2086 if (!rec)
1591 return 0; 2087 return 0;
1592 2088
1593 seq_printf(m, "%ps\n", (void *)rec->ip); 2089 seq_printf(m, "%ps", (void *)rec->ip);
2090 if (iter->flags & FTRACE_ITER_ENABLED)
2091 seq_printf(m, " (%ld)",
2092 rec->flags & ~FTRACE_FL_MASK);
2093 seq_printf(m, "\n");
1594 2094
1595 return 0; 2095 return 0;
1596} 2096}
@@ -1630,44 +2130,46 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1630} 2130}
1631 2131
1632static int 2132static int
1633ftrace_failures_open(struct inode *inode, struct file *file) 2133ftrace_enabled_open(struct inode *inode, struct file *file)
1634{ 2134{
1635 int ret;
1636 struct seq_file *m;
1637 struct ftrace_iterator *iter; 2135 struct ftrace_iterator *iter;
2136 int ret;
2137
2138 if (unlikely(ftrace_disabled))
2139 return -ENODEV;
2140
2141 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2142 if (!iter)
2143 return -ENOMEM;
2144
2145 iter->pg = ftrace_pages_start;
2146 iter->flags = FTRACE_ITER_ENABLED;
1638 2147
1639 ret = ftrace_avail_open(inode, file); 2148 ret = seq_open(file, &show_ftrace_seq_ops);
1640 if (!ret) { 2149 if (!ret) {
1641 m = file->private_data; 2150 struct seq_file *m = file->private_data;
1642 iter = m->private; 2151
1643 iter->flags = FTRACE_ITER_FAILURES; 2152 m->private = iter;
2153 } else {
2154 kfree(iter);
1644 } 2155 }
1645 2156
1646 return ret; 2157 return ret;
1647} 2158}
1648 2159
1649 2160static void ftrace_filter_reset(struct ftrace_hash *hash)
1650static void ftrace_filter_reset(int enable)
1651{ 2161{
1652 struct ftrace_page *pg;
1653 struct dyn_ftrace *rec;
1654 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1655
1656 mutex_lock(&ftrace_lock); 2162 mutex_lock(&ftrace_lock);
1657 if (enable) 2163 ftrace_hash_clear(hash);
1658 ftrace_filtered = 0;
1659 do_for_each_ftrace_rec(pg, rec) {
1660 if (rec->flags & FTRACE_FL_FAILED)
1661 continue;
1662 rec->flags &= ~type;
1663 } while_for_each_ftrace_rec();
1664 mutex_unlock(&ftrace_lock); 2164 mutex_unlock(&ftrace_lock);
1665} 2165}
1666 2166
1667static int 2167static int
1668ftrace_regex_open(struct inode *inode, struct file *file, int enable) 2168ftrace_regex_open(struct ftrace_ops *ops, int flag,
2169 struct inode *inode, struct file *file)
1669{ 2170{
1670 struct ftrace_iterator *iter; 2171 struct ftrace_iterator *iter;
2172 struct ftrace_hash *hash;
1671 int ret = 0; 2173 int ret = 0;
1672 2174
1673 if (unlikely(ftrace_disabled)) 2175 if (unlikely(ftrace_disabled))
@@ -1682,21 +2184,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1682 return -ENOMEM; 2184 return -ENOMEM;
1683 } 2185 }
1684 2186
2187 if (flag & FTRACE_ITER_NOTRACE)
2188 hash = ops->notrace_hash;
2189 else
2190 hash = ops->filter_hash;
2191
2192 iter->ops = ops;
2193 iter->flags = flag;
2194
2195 if (file->f_mode & FMODE_WRITE) {
2196 mutex_lock(&ftrace_lock);
2197 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
2198 mutex_unlock(&ftrace_lock);
2199
2200 if (!iter->hash) {
2201 trace_parser_put(&iter->parser);
2202 kfree(iter);
2203 return -ENOMEM;
2204 }
2205 }
2206
1685 mutex_lock(&ftrace_regex_lock); 2207 mutex_lock(&ftrace_regex_lock);
2208
1686 if ((file->f_mode & FMODE_WRITE) && 2209 if ((file->f_mode & FMODE_WRITE) &&
1687 (file->f_flags & O_TRUNC)) 2210 (file->f_flags & O_TRUNC))
1688 ftrace_filter_reset(enable); 2211 ftrace_filter_reset(iter->hash);
1689 2212
1690 if (file->f_mode & FMODE_READ) { 2213 if (file->f_mode & FMODE_READ) {
1691 iter->pg = ftrace_pages_start; 2214 iter->pg = ftrace_pages_start;
1692 iter->flags = enable ? FTRACE_ITER_FILTER :
1693 FTRACE_ITER_NOTRACE;
1694 2215
1695 ret = seq_open(file, &show_ftrace_seq_ops); 2216 ret = seq_open(file, &show_ftrace_seq_ops);
1696 if (!ret) { 2217 if (!ret) {
1697 struct seq_file *m = file->private_data; 2218 struct seq_file *m = file->private_data;
1698 m->private = iter; 2219 m->private = iter;
1699 } else { 2220 } else {
2221 /* Failed */
2222 free_ftrace_hash(iter->hash);
1700 trace_parser_put(&iter->parser); 2223 trace_parser_put(&iter->parser);
1701 kfree(iter); 2224 kfree(iter);
1702 } 2225 }
@@ -1710,13 +2233,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1710static int 2233static int
1711ftrace_filter_open(struct inode *inode, struct file *file) 2234ftrace_filter_open(struct inode *inode, struct file *file)
1712{ 2235{
1713 return ftrace_regex_open(inode, file, 1); 2236 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
2237 inode, file);
1714} 2238}
1715 2239
1716static int 2240static int
1717ftrace_notrace_open(struct inode *inode, struct file *file) 2241ftrace_notrace_open(struct inode *inode, struct file *file)
1718{ 2242{
1719 return ftrace_regex_open(inode, file, 0); 2243 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
2244 inode, file);
1720} 2245}
1721 2246
1722static loff_t 2247static loff_t
@@ -1761,86 +2286,99 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1761} 2286}
1762 2287
1763static int 2288static int
1764ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) 2289enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
2290{
2291 struct ftrace_func_entry *entry;
2292 int ret = 0;
2293
2294 entry = ftrace_lookup_ip(hash, rec->ip);
2295 if (not) {
2296 /* Do nothing if it doesn't exist */
2297 if (!entry)
2298 return 0;
2299
2300 free_hash_entry(hash, entry);
2301 } else {
2302 /* Do nothing if it exists */
2303 if (entry)
2304 return 0;
2305
2306 ret = add_hash_entry(hash, rec->ip);
2307 }
2308 return ret;
2309}
2310
2311static int
2312ftrace_match_record(struct dyn_ftrace *rec, char *mod,
2313 char *regex, int len, int type)
1765{ 2314{
1766 char str[KSYM_SYMBOL_LEN]; 2315 char str[KSYM_SYMBOL_LEN];
2316 char *modname;
2317
2318 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
2319
2320 if (mod) {
2321 /* module lookup requires matching the module */
2322 if (!modname || strcmp(modname, mod))
2323 return 0;
2324
2325 /* blank search means to match all funcs in the mod */
2326 if (!len)
2327 return 1;
2328 }
1767 2329
1768 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1769 return ftrace_match(str, regex, len, type); 2330 return ftrace_match(str, regex, len, type);
1770} 2331}
1771 2332
1772static int ftrace_match_records(char *buff, int len, int enable) 2333static int
2334match_records(struct ftrace_hash *hash, char *buff,
2335 int len, char *mod, int not)
1773{ 2336{
1774 unsigned int search_len; 2337 unsigned search_len = 0;
1775 struct ftrace_page *pg; 2338 struct ftrace_page *pg;
1776 struct dyn_ftrace *rec; 2339 struct dyn_ftrace *rec;
1777 unsigned long flag; 2340 int type = MATCH_FULL;
1778 char *search; 2341 char *search = buff;
1779 int type;
1780 int not;
1781 int found = 0; 2342 int found = 0;
2343 int ret;
1782 2344
1783 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 2345 if (len) {
1784 type = filter_parse_regex(buff, len, &search, &not); 2346 type = filter_parse_regex(buff, len, &search, &not);
1785 2347 search_len = strlen(search);
1786 search_len = strlen(search); 2348 }
1787 2349
1788 mutex_lock(&ftrace_lock); 2350 mutex_lock(&ftrace_lock);
1789 do_for_each_ftrace_rec(pg, rec) {
1790 2351
1791 if (rec->flags & FTRACE_FL_FAILED) 2352 if (unlikely(ftrace_disabled))
1792 continue; 2353 goto out_unlock;
1793 2354
1794 if (ftrace_match_record(rec, search, search_len, type)) { 2355 do_for_each_ftrace_rec(pg, rec) {
1795 if (not) 2356
1796 rec->flags &= ~flag; 2357 if (ftrace_match_record(rec, mod, search, search_len, type)) {
1797 else 2358 ret = enter_record(hash, rec, not);
1798 rec->flags |= flag; 2359 if (ret < 0) {
2360 found = ret;
2361 goto out_unlock;
2362 }
1799 found = 1; 2363 found = 1;
1800 } 2364 }
1801 /*
1802 * Only enable filtering if we have a function that
1803 * is filtered on.
1804 */
1805 if (enable && (rec->flags & FTRACE_FL_FILTER))
1806 ftrace_filtered = 1;
1807 } while_for_each_ftrace_rec(); 2365 } while_for_each_ftrace_rec();
2366 out_unlock:
1808 mutex_unlock(&ftrace_lock); 2367 mutex_unlock(&ftrace_lock);
1809 2368
1810 return found; 2369 return found;
1811} 2370}
1812 2371
1813static int 2372static int
1814ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, 2373ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
1815 char *regex, int len, int type)
1816{ 2374{
1817 char str[KSYM_SYMBOL_LEN]; 2375 return match_records(hash, buff, len, NULL, 0);
1818 char *modname;
1819
1820 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
1821
1822 if (!modname || strcmp(modname, mod))
1823 return 0;
1824
1825 /* blank search means to match all funcs in the mod */
1826 if (len)
1827 return ftrace_match(str, regex, len, type);
1828 else
1829 return 1;
1830} 2376}
1831 2377
1832static int ftrace_match_module_records(char *buff, char *mod, int enable) 2378static int
2379ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
1833{ 2380{
1834 unsigned search_len = 0;
1835 struct ftrace_page *pg;
1836 struct dyn_ftrace *rec;
1837 int type = MATCH_FULL;
1838 char *search = buff;
1839 unsigned long flag;
1840 int not = 0; 2381 int not = 0;
1841 int found = 0;
1842
1843 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1844 2382
1845 /* blank or '*' mean the same */ 2383 /* blank or '*' mean the same */
1846 if (strcmp(buff, "*") == 0) 2384 if (strcmp(buff, "*") == 0)
@@ -1852,32 +2390,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1852 not = 1; 2390 not = 1;
1853 } 2391 }
1854 2392
1855 if (strlen(buff)) { 2393 return match_records(hash, buff, strlen(buff), mod, not);
1856 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1857 search_len = strlen(search);
1858 }
1859
1860 mutex_lock(&ftrace_lock);
1861 do_for_each_ftrace_rec(pg, rec) {
1862
1863 if (rec->flags & FTRACE_FL_FAILED)
1864 continue;
1865
1866 if (ftrace_match_module_record(rec, mod,
1867 search, search_len, type)) {
1868 if (not)
1869 rec->flags &= ~flag;
1870 else
1871 rec->flags |= flag;
1872 found = 1;
1873 }
1874 if (enable && (rec->flags & FTRACE_FL_FILTER))
1875 ftrace_filtered = 1;
1876
1877 } while_for_each_ftrace_rec();
1878 mutex_unlock(&ftrace_lock);
1879
1880 return found;
1881} 2394}
1882 2395
1883/* 2396/*
@@ -1888,7 +2401,10 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1888static int 2401static int
1889ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2402ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1890{ 2403{
2404 struct ftrace_ops *ops = &global_ops;
2405 struct ftrace_hash *hash;
1891 char *mod; 2406 char *mod;
2407 int ret = -EINVAL;
1892 2408
1893 /* 2409 /*
1894 * cmd == 'mod' because we only registered this func 2410 * cmd == 'mod' because we only registered this func
@@ -1900,15 +2416,24 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1900 2416
1901 /* we must have a module name */ 2417 /* we must have a module name */
1902 if (!param) 2418 if (!param)
1903 return -EINVAL; 2419 return ret;
1904 2420
1905 mod = strsep(&param, ":"); 2421 mod = strsep(&param, ":");
1906 if (!strlen(mod)) 2422 if (!strlen(mod))
1907 return -EINVAL; 2423 return ret;
1908 2424
1909 if (ftrace_match_module_records(func, mod, enable)) 2425 if (enable)
1910 return 0; 2426 hash = ops->filter_hash;
1911 return -EINVAL; 2427 else
2428 hash = ops->notrace_hash;
2429
2430 ret = ftrace_match_module_records(hash, func, mod);
2431 if (!ret)
2432 ret = -EINVAL;
2433 if (ret < 0)
2434 return ret;
2435
2436 return 0;
1912} 2437}
1913 2438
1914static struct ftrace_func_command ftrace_mod_cmd = { 2439static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1959,6 +2484,7 @@ static int ftrace_probe_registered;
1959 2484
1960static void __enable_ftrace_function_probe(void) 2485static void __enable_ftrace_function_probe(void)
1961{ 2486{
2487 int ret;
1962 int i; 2488 int i;
1963 2489
1964 if (ftrace_probe_registered) 2490 if (ftrace_probe_registered)
@@ -1973,13 +2499,16 @@ static void __enable_ftrace_function_probe(void)
1973 if (i == FTRACE_FUNC_HASHSIZE) 2499 if (i == FTRACE_FUNC_HASHSIZE)
1974 return; 2500 return;
1975 2501
1976 __register_ftrace_function(&trace_probe_ops); 2502 ret = __register_ftrace_function(&trace_probe_ops);
1977 ftrace_startup(0); 2503 if (!ret)
2504 ftrace_startup(&trace_probe_ops, 0);
2505
1978 ftrace_probe_registered = 1; 2506 ftrace_probe_registered = 1;
1979} 2507}
1980 2508
1981static void __disable_ftrace_function_probe(void) 2509static void __disable_ftrace_function_probe(void)
1982{ 2510{
2511 int ret;
1983 int i; 2512 int i;
1984 2513
1985 if (!ftrace_probe_registered) 2514 if (!ftrace_probe_registered)
@@ -1992,8 +2521,10 @@ static void __disable_ftrace_function_probe(void)
1992 } 2521 }
1993 2522
1994 /* no more funcs left */ 2523 /* no more funcs left */
1995 __unregister_ftrace_function(&trace_probe_ops); 2524 ret = __unregister_ftrace_function(&trace_probe_ops);
1996 ftrace_shutdown(0); 2525 if (!ret)
2526 ftrace_shutdown(&trace_probe_ops, 0);
2527
1997 ftrace_probe_registered = 0; 2528 ftrace_probe_registered = 0;
1998} 2529}
1999 2530
@@ -2029,12 +2560,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2029 return -EINVAL; 2560 return -EINVAL;
2030 2561
2031 mutex_lock(&ftrace_lock); 2562 mutex_lock(&ftrace_lock);
2032 do_for_each_ftrace_rec(pg, rec) {
2033 2563
2034 if (rec->flags & FTRACE_FL_FAILED) 2564 if (unlikely(ftrace_disabled))
2035 continue; 2565 goto out_unlock;
2566
2567 do_for_each_ftrace_rec(pg, rec) {
2036 2568
2037 if (!ftrace_match_record(rec, search, len, type)) 2569 if (!ftrace_match_record(rec, NULL, search, len, type))
2038 continue; 2570 continue;
2039 2571
2040 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2572 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -2195,18 +2727,22 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)
2195 return ret; 2727 return ret;
2196} 2728}
2197 2729
2198static int ftrace_process_regex(char *buff, int len, int enable) 2730static int ftrace_process_regex(struct ftrace_hash *hash,
2731 char *buff, int len, int enable)
2199{ 2732{
2200 char *func, *command, *next = buff; 2733 char *func, *command, *next = buff;
2201 struct ftrace_func_command *p; 2734 struct ftrace_func_command *p;
2202 int ret = -EINVAL; 2735 int ret;
2203 2736
2204 func = strsep(&next, ":"); 2737 func = strsep(&next, ":");
2205 2738
2206 if (!next) { 2739 if (!next) {
2207 if (ftrace_match_records(func, len, enable)) 2740 ret = ftrace_match_records(hash, func, len);
2208 return 0; 2741 if (!ret)
2209 return ret; 2742 ret = -EINVAL;
2743 if (ret < 0)
2744 return ret;
2745 return 0;
2210 } 2746 }
2211 2747
2212 /* command found */ 2748 /* command found */
@@ -2239,6 +2775,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2239 2775
2240 mutex_lock(&ftrace_regex_lock); 2776 mutex_lock(&ftrace_regex_lock);
2241 2777
2778 ret = -ENODEV;
2779 if (unlikely(ftrace_disabled))
2780 goto out_unlock;
2781
2242 if (file->f_mode & FMODE_READ) { 2782 if (file->f_mode & FMODE_READ) {
2243 struct seq_file *m = file->private_data; 2783 struct seq_file *m = file->private_data;
2244 iter = m->private; 2784 iter = m->private;
@@ -2250,7 +2790,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2250 2790
2251 if (read >= 0 && trace_parser_loaded(parser) && 2791 if (read >= 0 && trace_parser_loaded(parser) &&
2252 !trace_parser_cont(parser)) { 2792 !trace_parser_cont(parser)) {
2253 ret = ftrace_process_regex(parser->buffer, 2793 ret = ftrace_process_regex(iter->hash, parser->buffer,
2254 parser->idx, enable); 2794 parser->idx, enable);
2255 trace_parser_clear(parser); 2795 trace_parser_clear(parser);
2256 if (ret) 2796 if (ret)
@@ -2278,22 +2818,49 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
2278 return ftrace_regex_write(file, ubuf, cnt, ppos, 0); 2818 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
2279} 2819}
2280 2820
2281static void 2821static int
2282ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) 2822ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2823 int reset, int enable)
2283{ 2824{
2825 struct ftrace_hash **orig_hash;
2826 struct ftrace_hash *hash;
2827 int ret;
2828
2829 /* All global ops uses the global ops filters */
2830 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
2831 ops = &global_ops;
2832
2284 if (unlikely(ftrace_disabled)) 2833 if (unlikely(ftrace_disabled))
2285 return; 2834 return -ENODEV;
2835
2836 if (enable)
2837 orig_hash = &ops->filter_hash;
2838 else
2839 orig_hash = &ops->notrace_hash;
2840
2841 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2842 if (!hash)
2843 return -ENOMEM;
2286 2844
2287 mutex_lock(&ftrace_regex_lock); 2845 mutex_lock(&ftrace_regex_lock);
2288 if (reset) 2846 if (reset)
2289 ftrace_filter_reset(enable); 2847 ftrace_filter_reset(hash);
2290 if (buf) 2848 if (buf)
2291 ftrace_match_records(buf, len, enable); 2849 ftrace_match_records(hash, buf, len);
2850
2851 mutex_lock(&ftrace_lock);
2852 ret = ftrace_hash_move(orig_hash, hash);
2853 mutex_unlock(&ftrace_lock);
2854
2292 mutex_unlock(&ftrace_regex_lock); 2855 mutex_unlock(&ftrace_regex_lock);
2856
2857 free_ftrace_hash(hash);
2858 return ret;
2293} 2859}
2294 2860
2295/** 2861/**
2296 * ftrace_set_filter - set a function to filter on in ftrace 2862 * ftrace_set_filter - set a function to filter on in ftrace
2863 * @ops - the ops to set the filter with
2297 * @buf - the string that holds the function filter text. 2864 * @buf - the string that holds the function filter text.
2298 * @len - the length of the string. 2865 * @len - the length of the string.
2299 * @reset - non zero to reset all filters before applying this filter. 2866 * @reset - non zero to reset all filters before applying this filter.
@@ -2301,13 +2868,16 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
2301 * Filters denote which functions should be enabled when tracing is enabled. 2868 * Filters denote which functions should be enabled when tracing is enabled.
2302 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 2869 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2303 */ 2870 */
2304void ftrace_set_filter(unsigned char *buf, int len, int reset) 2871void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
2872 int len, int reset)
2305{ 2873{
2306 ftrace_set_regex(buf, len, reset, 1); 2874 ftrace_set_regex(ops, buf, len, reset, 1);
2307} 2875}
2876EXPORT_SYMBOL_GPL(ftrace_set_filter);
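With the per-ops hashes in place, a caller no longer filters globally but on its own ftrace_ops. A hedged sketch of how a client module might use the reworked API against a kernel tree at this point in the series (my_ops, my_trace_func and the choice of kfree as the filtered function are invented for the example):

#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/string.h>

/* callback prototype at this point in the series: (ip, parent_ip) */
static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* runs for every function left in my_ops' private filter hash */
}

static struct ftrace_ops my_ops = {
	.func = my_trace_func,
};

static int __init my_init(void)
{
	/* trace only kfree(), via my_ops' own hash, not the global one */
	ftrace_set_filter(&my_ops, (unsigned char *)"kfree",
			  strlen("kfree"), 1);
	return register_ftrace_function(&my_ops);
}

static void __exit my_exit(void)
{
	unregister_ftrace_function(&my_ops);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

Because my_ops does not set FTRACE_OPS_FL_GLOBAL, its filter lives in its own filter_hash and leaves the shared global filters alone.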
2308 2877
2309/** 2878/**
2310 * ftrace_set_notrace - set a function to not trace in ftrace 2879 * ftrace_set_notrace - set a function to not trace in ftrace
2880 * @ops - the ops to set the notrace filter with
2311 * @buf - the string that holds the function notrace text. 2881 * @buf - the string that holds the function notrace text.
2312 * @len - the length of the string. 2882 * @len - the length of the string.
2313 * @reset - non zero to reset all filters before applying this filter. 2883 * @reset - non zero to reset all filters before applying this filter.
@@ -2316,10 +2886,44 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset)
2316 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 2886 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2317 * for tracing. 2887 * for tracing.
2318 */ 2888 */
2319void ftrace_set_notrace(unsigned char *buf, int len, int reset) 2889void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
2890 int len, int reset)
2320{ 2891{
2321 ftrace_set_regex(buf, len, reset, 0); 2892 ftrace_set_regex(ops, buf, len, reset, 0);
2322} 2893}
2894EXPORT_SYMBOL_GPL(ftrace_set_notrace);
2895/**
 2896 * ftrace_set_global_filter - set a function to filter on with the global ops
 2897 *
2898 * @buf - the string that holds the function filter text.
2899 * @len - the length of the string.
2900 * @reset - non zero to reset all filters before applying this filter.
2901 *
2902 * Filters denote which functions should be enabled when tracing is enabled.
2903 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2904 */
2905void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
2906{
2907 ftrace_set_regex(&global_ops, buf, len, reset, 1);
2908}
2909EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
2910
2911/**
 2912 * ftrace_set_global_notrace - set a function to not trace with the global ops
 2913 *
2914 * @buf - the string that holds the function notrace text.
2915 * @len - the length of the string.
2916 * @reset - non zero to reset all filters before applying this filter.
2917 *
2918 * Notrace Filters denote which functions should not be enabled when tracing
2919 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2920 * for tracing.
2921 */
2922void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
2923{
2924 ftrace_set_regex(&global_ops, buf, len, reset, 0);
2925}
2926EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
2323 2927
2324/* 2928/*
2325 * command line interface to allow users to set filters on boot up. 2929 * command line interface to allow users to set filters on boot up.
@@ -2370,22 +2974,23 @@ static void __init set_ftrace_early_graph(char *buf)
2370} 2974}
2371#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2975#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2372 2976
2373static void __init set_ftrace_early_filter(char *buf, int enable) 2977static void __init
2978set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
2374{ 2979{
2375 char *func; 2980 char *func;
2376 2981
2377 while (buf) { 2982 while (buf) {
2378 func = strsep(&buf, ","); 2983 func = strsep(&buf, ",");
2379 ftrace_set_regex(func, strlen(func), 0, enable); 2984 ftrace_set_regex(ops, func, strlen(func), 0, enable);
2380 } 2985 }
2381} 2986}
2382 2987
2383static void __init set_ftrace_early_filters(void) 2988static void __init set_ftrace_early_filters(void)
2384{ 2989{
2385 if (ftrace_filter_buf[0]) 2990 if (ftrace_filter_buf[0])
2386 set_ftrace_early_filter(ftrace_filter_buf, 1); 2991 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
2387 if (ftrace_notrace_buf[0]) 2992 if (ftrace_notrace_buf[0])
2388 set_ftrace_early_filter(ftrace_notrace_buf, 0); 2993 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
2389#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2994#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2390 if (ftrace_graph_buf[0]) 2995 if (ftrace_graph_buf[0])
2391 set_ftrace_early_graph(ftrace_graph_buf); 2996 set_ftrace_early_graph(ftrace_graph_buf);
@@ -2393,11 +2998,14 @@ static void __init set_ftrace_early_filters(void)
2393} 2998}
2394 2999
2395static int 3000static int
2396ftrace_regex_release(struct inode *inode, struct file *file, int enable) 3001ftrace_regex_release(struct inode *inode, struct file *file)
2397{ 3002{
2398 struct seq_file *m = (struct seq_file *)file->private_data; 3003 struct seq_file *m = (struct seq_file *)file->private_data;
2399 struct ftrace_iterator *iter; 3004 struct ftrace_iterator *iter;
3005 struct ftrace_hash **orig_hash;
2400 struct trace_parser *parser; 3006 struct trace_parser *parser;
3007 int filter_hash;
3008 int ret;
2401 3009
2402 mutex_lock(&ftrace_regex_lock); 3010 mutex_lock(&ftrace_regex_lock);
2403 if (file->f_mode & FMODE_READ) { 3011 if (file->f_mode & FMODE_READ) {
@@ -2410,33 +3018,41 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2410 parser = &iter->parser; 3018 parser = &iter->parser;
2411 if (trace_parser_loaded(parser)) { 3019 if (trace_parser_loaded(parser)) {
2412 parser->buffer[parser->idx] = 0; 3020 parser->buffer[parser->idx] = 0;
2413 ftrace_match_records(parser->buffer, parser->idx, enable); 3021 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
2414 } 3022 }
2415 3023
2416 mutex_lock(&ftrace_lock);
2417 if (ftrace_start_up && ftrace_enabled)
2418 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2419 mutex_unlock(&ftrace_lock);
2420
2421 trace_parser_put(parser); 3024 trace_parser_put(parser);
3025
3026 if (file->f_mode & FMODE_WRITE) {
3027 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3028
3029 if (filter_hash)
3030 orig_hash = &iter->ops->filter_hash;
3031 else
3032 orig_hash = &iter->ops->notrace_hash;
3033
3034 mutex_lock(&ftrace_lock);
3035 /*
3036 * Remove the current set, update the hash and add
3037 * them back.
3038 */
3039 ftrace_hash_rec_disable(iter->ops, filter_hash);
3040 ret = ftrace_hash_move(orig_hash, iter->hash);
3041 if (!ret) {
3042 ftrace_hash_rec_enable(iter->ops, filter_hash);
3043 if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
3044 && ftrace_enabled)
3045 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3046 }
3047 mutex_unlock(&ftrace_lock);
3048 }
3049 free_ftrace_hash(iter->hash);
2422 kfree(iter); 3050 kfree(iter);
2423 3051
2424 mutex_unlock(&ftrace_regex_lock); 3052 mutex_unlock(&ftrace_regex_lock);
2425 return 0; 3053 return 0;
2426} 3054}
2427 3055
2428static int
2429ftrace_filter_release(struct inode *inode, struct file *file)
2430{
2431 return ftrace_regex_release(inode, file, 1);
2432}
2433
2434static int
2435ftrace_notrace_release(struct inode *inode, struct file *file)
2436{
2437 return ftrace_regex_release(inode, file, 0);
2438}
2439
2440static const struct file_operations ftrace_avail_fops = { 3056static const struct file_operations ftrace_avail_fops = {
2441 .open = ftrace_avail_open, 3057 .open = ftrace_avail_open,
2442 .read = seq_read, 3058 .read = seq_read,
@@ -2444,8 +3060,8 @@ static const struct file_operations ftrace_avail_fops = {
2444 .release = seq_release_private, 3060 .release = seq_release_private,
2445}; 3061};
2446 3062
2447static const struct file_operations ftrace_failures_fops = { 3063static const struct file_operations ftrace_enabled_fops = {
2448 .open = ftrace_failures_open, 3064 .open = ftrace_enabled_open,
2449 .read = seq_read, 3065 .read = seq_read,
2450 .llseek = seq_lseek, 3066 .llseek = seq_lseek,
2451 .release = seq_release_private, 3067 .release = seq_release_private,
@@ -2456,7 +3072,7 @@ static const struct file_operations ftrace_filter_fops = {
2456 .read = seq_read, 3072 .read = seq_read,
2457 .write = ftrace_filter_write, 3073 .write = ftrace_filter_write,
2458 .llseek = ftrace_regex_lseek, 3074 .llseek = ftrace_regex_lseek,
2459 .release = ftrace_filter_release, 3075 .release = ftrace_regex_release,
2460}; 3076};
2461 3077
2462static const struct file_operations ftrace_notrace_fops = { 3078static const struct file_operations ftrace_notrace_fops = {
@@ -2464,7 +3080,7 @@ static const struct file_operations ftrace_notrace_fops = {
2464 .read = seq_read, 3080 .read = seq_read,
2465 .write = ftrace_notrace_write, 3081 .write = ftrace_notrace_write,
2466 .llseek = ftrace_regex_lseek, 3082 .llseek = ftrace_regex_lseek,
2467 .release = ftrace_notrace_release, 3083 .release = ftrace_regex_release,
2468}; 3084};
2469 3085
2470#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3086#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -2573,9 +3189,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2573 bool exists; 3189 bool exists;
2574 int i; 3190 int i;
2575 3191
2576 if (ftrace_disabled)
2577 return -ENODEV;
2578
2579 /* decode regex */ 3192 /* decode regex */
2580 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3193 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2581 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3194 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
@@ -2584,12 +3197,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2584 search_len = strlen(search); 3197 search_len = strlen(search);
2585 3198
2586 mutex_lock(&ftrace_lock); 3199 mutex_lock(&ftrace_lock);
3200
3201 if (unlikely(ftrace_disabled)) {
3202 mutex_unlock(&ftrace_lock);
3203 return -ENODEV;
3204 }
3205
2587 do_for_each_ftrace_rec(pg, rec) { 3206 do_for_each_ftrace_rec(pg, rec) {
2588 3207
2589 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 3208 if (rec->flags & FTRACE_FL_FREE)
2590 continue; 3209 continue;
2591 3210
2592 if (ftrace_match_record(rec, search, search_len, type)) { 3211 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
2593 /* if it is in the array */ 3212 /* if it is in the array */
2594 exists = false; 3213 exists = false;
2595 for (i = 0; i < *idx; i++) { 3214 for (i = 0; i < *idx; i++) {
@@ -2679,8 +3298,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2679 trace_create_file("available_filter_functions", 0444, 3298 trace_create_file("available_filter_functions", 0444,
2680 d_tracer, NULL, &ftrace_avail_fops); 3299 d_tracer, NULL, &ftrace_avail_fops);
2681 3300
2682 trace_create_file("failures", 0444, 3301 trace_create_file("enabled_functions", 0444,
2683 d_tracer, NULL, &ftrace_failures_fops); 3302 d_tracer, NULL, &ftrace_enabled_fops);
2684 3303
2685 trace_create_file("set_ftrace_filter", 0644, d_tracer, 3304 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2686 NULL, &ftrace_filter_fops); 3305 NULL, &ftrace_filter_fops);
@@ -2703,7 +3322,6 @@ static int ftrace_process_locs(struct module *mod,
2703{ 3322{
2704 unsigned long *p; 3323 unsigned long *p;
2705 unsigned long addr; 3324 unsigned long addr;
2706 unsigned long flags;
2707 3325
2708 mutex_lock(&ftrace_lock); 3326 mutex_lock(&ftrace_lock);
2709 p = start; 3327 p = start;
@@ -2720,10 +3338,7 @@ static int ftrace_process_locs(struct module *mod,
2720 ftrace_record_ip(addr); 3338 ftrace_record_ip(addr);
2721 } 3339 }
2722 3340
2723 /* disable interrupts to prevent kstop machine */
2724 local_irq_save(flags);
2725 ftrace_update_code(mod); 3341 ftrace_update_code(mod);
2726 local_irq_restore(flags);
2727 mutex_unlock(&ftrace_lock); 3342 mutex_unlock(&ftrace_lock);
2728 3343
2729 return 0; 3344 return 0;
@@ -2735,10 +3350,11 @@ void ftrace_release_mod(struct module *mod)
2735 struct dyn_ftrace *rec; 3350 struct dyn_ftrace *rec;
2736 struct ftrace_page *pg; 3351 struct ftrace_page *pg;
2737 3352
3353 mutex_lock(&ftrace_lock);
3354
2738 if (ftrace_disabled) 3355 if (ftrace_disabled)
2739 return; 3356 goto out_unlock;
2740 3357
2741 mutex_lock(&ftrace_lock);
2742 do_for_each_ftrace_rec(pg, rec) { 3358 do_for_each_ftrace_rec(pg, rec) {
2743 if (within_module_core(rec->ip, mod)) { 3359 if (within_module_core(rec->ip, mod)) {
2744 /* 3360 /*
@@ -2749,6 +3365,7 @@ void ftrace_release_mod(struct module *mod)
2749 ftrace_free_rec(rec); 3365 ftrace_free_rec(rec);
2750 } 3366 }
2751 } while_for_each_ftrace_rec(); 3367 } while_for_each_ftrace_rec();
3368 out_unlock:
2752 mutex_unlock(&ftrace_lock); 3369 mutex_unlock(&ftrace_lock);
2753} 3370}
2754 3371
@@ -2835,6 +3452,10 @@ void __init ftrace_init(void)
2835 3452
2836#else 3453#else
2837 3454
3455static struct ftrace_ops global_ops = {
3456 .func = ftrace_stub,
3457};
3458
2838static int __init ftrace_nodyn_init(void) 3459static int __init ftrace_nodyn_init(void)
2839{ 3460{
2840 ftrace_enabled = 1; 3461 ftrace_enabled = 1;
@@ -2845,12 +3466,38 @@ device_initcall(ftrace_nodyn_init);
2845static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3466static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
2846static inline void ftrace_startup_enable(int command) { } 3467static inline void ftrace_startup_enable(int command) { }
2847/* Keep as macros so we do not need to define the commands */ 3468/* Keep as macros so we do not need to define the commands */
2848# define ftrace_startup(command) do { } while (0) 3469# define ftrace_startup(ops, command) do { } while (0)
2849# define ftrace_shutdown(command) do { } while (0) 3470# define ftrace_shutdown(ops, command) do { } while (0)
2850# define ftrace_startup_sysctl() do { } while (0) 3471# define ftrace_startup_sysctl() do { } while (0)
2851# define ftrace_shutdown_sysctl() do { } while (0) 3472# define ftrace_shutdown_sysctl() do { } while (0)
3473
3474static inline int
3475ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3476{
3477 return 1;
3478}
3479
2852#endif /* CONFIG_DYNAMIC_FTRACE */ 3480#endif /* CONFIG_DYNAMIC_FTRACE */
2853 3481
3482static void
3483ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3484{
3485 struct ftrace_ops *op;
3486
3487 /*
3488 * Some of the ops may be dynamically allocated,
3489 * they must be freed after a synchronize_sched().
3490 */
3491 preempt_disable_notrace();
3492 op = rcu_dereference_raw(ftrace_ops_list);
3493 while (op != &ftrace_list_end) {
3494 if (ftrace_ops_test(op, ip))
3495 op->func(ip, parent_ip);
3496 op = rcu_dereference_raw(op->next);
 3497 }
3498 preempt_enable_notrace();
3499}
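ftrace_ops_list_func() is the multiplexing entry point: when more than one ops is registered, every traced call walks the RCU-protected list and invokes each callback whose hashes accept the ip. A userspace sketch of that dispatch shape (the fixed ops_list array, accepts() and the two callbacks are all invented; the kernel iterates an RCU list under preempt_disable_notrace() instead):

#include <stdbool.h>
#include <stdio.h>

struct ops {
	void (*func)(unsigned long ip, unsigned long parent_ip);
	bool (*accepts)(unsigned long ip);	/* stand-in for ftrace_ops_test() */
};

static bool only_even(unsigned long ip) { return (ip & 1) == 0; }
static bool any_ip(unsigned long ip)    { (void)ip; return true; }

static void cb_a(unsigned long ip, unsigned long pip)
{ printf("A: ip=%#lx from %#lx\n", ip, pip); }
static void cb_b(unsigned long ip, unsigned long pip)
{ printf("B: ip=%#lx from %#lx\n", ip, pip); }

static struct ops ops_list[] = {
	{ cb_a, only_even },
	{ cb_b, any_ip },
};

/* every traced call fans out to each ops whose hashes accept the ip */
static void list_func(unsigned long ip, unsigned long parent_ip)
{
	for (unsigned i = 0; i < sizeof(ops_list) / sizeof(ops_list[0]); i++)
		if (ops_list[i].accepts(ip))
			ops_list[i].func(ip, parent_ip);
}

int main(void)
{
	list_func(0x1000, 0x2000);	/* both callbacks fire */
	list_func(0x1001, 0x2000);	/* only B fires */
	return 0;
}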
3500
2854static void clear_ftrace_swapper(void) 3501static void clear_ftrace_swapper(void)
2855{ 3502{
2856 struct task_struct *p; 3503 struct task_struct *p;
@@ -3143,19 +3790,23 @@ void ftrace_kill(void)
3143 */ 3790 */
3144int register_ftrace_function(struct ftrace_ops *ops) 3791int register_ftrace_function(struct ftrace_ops *ops)
3145{ 3792{
3146 int ret; 3793 int ret = -1;
3147
3148 if (unlikely(ftrace_disabled))
3149 return -1;
3150 3794
3151 mutex_lock(&ftrace_lock); 3795 mutex_lock(&ftrace_lock);
3152 3796
3797 if (unlikely(ftrace_disabled))
3798 goto out_unlock;
3799
3153 ret = __register_ftrace_function(ops); 3800 ret = __register_ftrace_function(ops);
3154 ftrace_startup(0); 3801 if (!ret)
3802 ftrace_startup(ops, 0);
3155 3803
3804
3805 out_unlock:
3156 mutex_unlock(&ftrace_lock); 3806 mutex_unlock(&ftrace_lock);
3157 return ret; 3807 return ret;
3158} 3808}
3809EXPORT_SYMBOL_GPL(register_ftrace_function);
3159 3810
3160/** 3811/**
3161 * unregister_ftrace_function - unregister a function for profiling. 3812 * unregister_ftrace_function - unregister a function for profiling.
@@ -3169,25 +3820,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3169 3820
3170 mutex_lock(&ftrace_lock); 3821 mutex_lock(&ftrace_lock);
3171 ret = __unregister_ftrace_function(ops); 3822 ret = __unregister_ftrace_function(ops);
3172 ftrace_shutdown(0); 3823 if (!ret)
3824 ftrace_shutdown(ops, 0);
3173 mutex_unlock(&ftrace_lock); 3825 mutex_unlock(&ftrace_lock);
3174 3826
3175 return ret; 3827 return ret;
3176} 3828}
3829EXPORT_SYMBOL_GPL(unregister_ftrace_function);
3177 3830
3178int 3831int
3179ftrace_enable_sysctl(struct ctl_table *table, int write, 3832ftrace_enable_sysctl(struct ctl_table *table, int write,
3180 void __user *buffer, size_t *lenp, 3833 void __user *buffer, size_t *lenp,
3181 loff_t *ppos) 3834 loff_t *ppos)
3182{ 3835{
3183 int ret; 3836 int ret = -ENODEV;
3184
3185 if (unlikely(ftrace_disabled))
3186 return -ENODEV;
3187 3837
3188 mutex_lock(&ftrace_lock); 3838 mutex_lock(&ftrace_lock);
3189 3839
3190 ret = proc_dointvec(table, write, buffer, lenp, ppos); 3840 if (unlikely(ftrace_disabled))
3841 goto out;
3842
3843 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3191 3844
3192 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3845 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3193 goto out; 3846 goto out;
@@ -3199,11 +3852,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3199 ftrace_startup_sysctl(); 3852 ftrace_startup_sysctl();
3200 3853
3201 /* we are starting ftrace again */ 3854 /* we are starting ftrace again */
3202 if (ftrace_list != &ftrace_list_end) { 3855 if (ftrace_ops_list != &ftrace_list_end) {
3203 if (ftrace_list->next == &ftrace_list_end) 3856 if (ftrace_ops_list->next == &ftrace_list_end)
3204 ftrace_trace_function = ftrace_list->func; 3857 ftrace_trace_function = ftrace_ops_list->func;
3205 else 3858 else
3206 ftrace_trace_function = ftrace_list_func; 3859 ftrace_trace_function = ftrace_ops_list_func;
3207 } 3860 }
3208 3861
3209 } else { 3862 } else {
@@ -3392,7 +4045,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
3392 ftrace_graph_return = retfunc; 4045 ftrace_graph_return = retfunc;
3393 ftrace_graph_entry = entryfunc; 4046 ftrace_graph_entry = entryfunc;
3394 4047
3395 ftrace_startup(FTRACE_START_FUNC_RET); 4048 ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
3396 4049
3397out: 4050out:
3398 mutex_unlock(&ftrace_lock); 4051 mutex_unlock(&ftrace_lock);
@@ -3409,7 +4062,7 @@ void unregister_ftrace_graph(void)
3409 ftrace_graph_active--; 4062 ftrace_graph_active--;
3410 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 4063 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3411 ftrace_graph_entry = ftrace_graph_entry_stub; 4064 ftrace_graph_entry = ftrace_graph_entry_stub;
3412 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 4065 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
3413 unregister_pm_notifier(&ftrace_suspend_notifier); 4066 unregister_pm_notifier(&ftrace_suspend_notifier);
3414 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4067 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3415 4068
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d38c16a06a6f..ee9c921d7f21 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1110,6 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1110 1110
1111 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1112 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1113 entry->padding = 0;
1113 entry->flags = 1114 entry->flags =
1114#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1115#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1115 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1116 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -2013,9 +2014,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2013{ 2014{
2014 enum print_line_t ret; 2015 enum print_line_t ret;
2015 2016
2016 if (iter->lost_events) 2017 if (iter->lost_events &&
2017 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2018 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2018 iter->cpu, iter->lost_events); 2019 iter->cpu, iter->lost_events))
2020 return TRACE_TYPE_PARTIAL_LINE;
2019 2021
2020 if (iter->trace && iter->trace->print_line) { 2022 if (iter->trace && iter->trace->print_line) {
2021 ret = iter->trace->print_line(iter); 2023 ret = iter->trace->print_line(iter);
@@ -3229,6 +3231,14 @@ waitagain:
3229 3231
3230 if (iter->seq.len >= cnt) 3232 if (iter->seq.len >= cnt)
3231 break; 3233 break;
3234
3235 /*
3236 * Setting the full flag means we reached the trace_seq buffer
3237 * size and we should leave by partial output condition above.
3238 * One of the trace_seq_* functions is not used properly.
3239 */
3240 WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
3241 iter->ent->type);
3232 } 3242 }
3233 trace_access_unlock(iter->cpu_file); 3243 trace_access_unlock(iter->cpu_file);
3234 trace_event_read_unlock(); 3244 trace_event_read_unlock();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5e9dfc6286dd..6b69c4bd306f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -419,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]);
419extern unsigned long ftrace_update_tot_cnt; 419extern unsigned long ftrace_update_tot_cnt;
420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
421extern int DYN_FTRACE_TEST_NAME(void); 421extern int DYN_FTRACE_TEST_NAME(void);
422#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
423extern int DYN_FTRACE_TEST_NAME2(void);
422#endif 424#endif
423 425
424extern int ring_buffer_expanded; 426extern int ring_buffer_expanded;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e88f74fe1d4c..2fe110341359 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,6 +116,7 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, padding);
119 120
120 return ret; 121 return ret;
121} 122}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 16aee4d44e8f..8d0e1cc4e974 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
149static struct ftrace_ops trace_ops __read_mostly = 149static struct ftrace_ops trace_ops __read_mostly =
150{ 150{
151 .func = function_trace_call, 151 .func = function_trace_call,
152 .flags = FTRACE_OPS_FL_GLOBAL,
152}; 153};
153 154
154static struct ftrace_ops trace_stack_ops __read_mostly = 155static struct ftrace_ops trace_stack_ops __read_mostly =
155{ 156{
156 .func = function_stack_trace_call, 157 .func = function_stack_trace_call,
158 .flags = FTRACE_OPS_FL_GLOBAL,
157}; 159};
158 160
159/* Our two options */ 161/* Our two options */
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a4969b47afc1..c77424be284d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
153static struct ftrace_ops trace_ops __read_mostly = 153static struct ftrace_ops trace_ops __read_mostly =
154{ 154{
155 .func = irqsoff_tracer_call, 155 .func = irqsoff_tracer_call,
156 .flags = FTRACE_OPS_FL_GLOBAL,
156}; 157};
157#endif /* CONFIG_FUNCTION_TRACER */ 158#endif /* CONFIG_FUNCTION_TRACER */
158 159
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 35d55a386145..f925c45f0afa 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -53,7 +53,6 @@ const char *reserved_field_names[] = {
53 "common_preempt_count", 53 "common_preempt_count",
54 "common_pid", 54 "common_pid",
55 "common_tgid", 55 "common_tgid",
56 "common_lock_depth",
57 FIELD_STRING_IP, 56 FIELD_STRING_IP,
58 FIELD_STRING_RETIP, 57 FIELD_STRING_RETIP,
59 FIELD_STRING_FUNC, 58 FIELD_STRING_FUNC,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 456be9063c2d..cf535ccedc86 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -830,6 +830,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
830enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 830enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
831 struct trace_event *event) 831 struct trace_event *event)
832{ 832{
833 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
834 return TRACE_TYPE_PARTIAL_LINE;
835
833 return TRACE_TYPE_HANDLED; 836 return TRACE_TYPE_HANDLED;
834} 837}
835 838
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf0..dff763b7baf1 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex);
32 32
33struct trace_bprintk_fmt { 33struct trace_bprintk_fmt {
34 struct list_head list; 34 struct list_head list;
35 char fmt[0]; 35 const char *fmt;
36}; 36};
37 37
38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) 38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
@@ -49,6 +49,7 @@ static
49void hold_module_trace_bprintk_format(const char **start, const char **end) 49void hold_module_trace_bprintk_format(const char **start, const char **end)
50{ 50{
51 const char **iter; 51 const char **iter;
52 char *fmt;
52 53
53 mutex_lock(&btrace_mutex); 54 mutex_lock(&btrace_mutex);
54 for (iter = start; iter < end; iter++) { 55 for (iter = start; iter < end; iter++) {
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
58 continue; 59 continue;
59 } 60 }
60 61
61 tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) 62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
62 + strlen(*iter) + 1, GFP_KERNEL); 63 if (tb_fmt)
63 if (tb_fmt) { 64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) {
64 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
65 strcpy(tb_fmt->fmt, *iter); 67 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt;
66 *iter = tb_fmt->fmt; 69 *iter = tb_fmt->fmt;
67 } else 70 } else {
71 kfree(tb_fmt);
68 *iter = NULL; 72 *iter = NULL;
73 }
69 } 74 }
70 mutex_unlock(&btrace_mutex); 75 mutex_unlock(&btrace_mutex);
71} 76}
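struct trace_bprintk_fmt loses its flexible array member in favour of a separately allocated string, so hold_module_trace_bprintk_format() now performs two allocations and has to undo the first when the second fails; because kfree(NULL) is a no-op, a single kfree() in the error path covers both failure cases. A generic sketch of the same allocate-pair-or-roll-back pattern (names are illustrative, and kstrdup() would be an equally idiomatic way to copy the string):

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/string.h>

struct name_entry {
	struct list_head list;
	const char *name;
};

static struct name_entry *name_entry_alloc(const char *src)
{
	struct name_entry *e;
	char *copy;

	e = kmalloc(sizeof(*e), GFP_KERNEL);
	copy = e ? kmalloc(strlen(src) + 1, GFP_KERNEL) : NULL;
	if (!e || !copy) {
		kfree(e);	/* kfree(NULL) is a no-op, so this handles both failures */
		return NULL;
	}

	strcpy(copy, src);
	e->name = copy;
	return e;
}
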
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,
84 return 0; 89 return 0;
85} 90}
86 91
92/*
93 * The debugfs/tracing/printk_formats file maps the addresses with
94 * the ASCII formats that are used in the bprintk events in the
95 * buffer. For userspace tools to be able to decode the events from
96 * the buffer, they need to be able to map the address with the format.
97 *
98 * The addresses of the bprintk formats are in their own section
99 * __trace_printk_fmt. But for modules we copy them into a link list.
100 * The code to print the formats and their addresses passes around the
101 * address of the fmt string. If the fmt address passed into the seq
102 * functions is within the kernel core __trace_printk_fmt section, then
103 * it simply uses the next pointer in the list.
104 *
105 * When the fmt pointer is outside the kernel core __trace_printk_fmt
106 * section, then we need to read the link list pointers. The trick is
107 * we pass the address of the string to the seq function just like
108 * we do for the kernel core formats. To get back the structure that
109 * holds the format, we simply use containerof() and then go to the
110 * next format in the list.
111 */
112static const char **
113find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
114{
115 struct trace_bprintk_fmt *mod_fmt;
116
117 if (list_empty(&trace_bprintk_fmt_list))
118 return NULL;
119
120 /*
121 * v will point to the address of the fmt record from t_next
122 * v will be NULL from t_start.
123 * If this is the first pointer or called from start
124 * then we need to walk the list.
125 */
126 if (!v || start_index == *pos) {
127 struct trace_bprintk_fmt *p;
128
129 /* search the module list */
130 list_for_each_entry(p, &trace_bprintk_fmt_list, list) {
131 if (start_index == *pos)
132 return &p->fmt;
133 start_index++;
134 }
135 /* pos > index */
136 return NULL;
137 }
138
139 /*
140 * v points to the address of the fmt field in the mod list
141 * structure that holds the module print format.
142 */
143 mod_fmt = container_of(v, typeof(*mod_fmt), fmt);
144 if (mod_fmt->list.next == &trace_bprintk_fmt_list)
145 return NULL;
146
147 mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list);
148
149 return &mod_fmt->fmt;
150}
151
152static void format_mod_start(void)
153{
154 mutex_lock(&btrace_mutex);
155}
156
157static void format_mod_stop(void)
158{
159 mutex_unlock(&btrace_mutex);
160}
161
87#else /* !CONFIG_MODULES */ 162#else /* !CONFIG_MODULES */
88__init static int 163__init static int
89module_trace_bprintk_format_notify(struct notifier_block *self, 164module_trace_bprintk_format_notify(struct notifier_block *self,
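The comment block above documents the trick this hunk relies on: the seq iterator only ever passes around the address of a format string; when that address lives inside a struct trace_bprintk_fmt rather than in the core kernel's __trace_printk_fmt section, container_of() recovers the list node from the address of its fmt member and the embedded list_head leads to the next module format. A self-contained illustration of that step, with the types renamed:

#include <linux/kernel.h>
#include <linux/list.h>

struct fmt_node {
	struct list_head list;
	const char *fmt;
};

/* Given the address of one node's fmt member, return the next node's. */
static const char **next_fmt(const char **cur, struct list_head *head)
{
	struct fmt_node *node = container_of(cur, struct fmt_node, fmt);

	if (node->list.next == head)	/* ran off the end of the list */
		return NULL;

	node = list_entry(node->list.next, struct fmt_node, list);
	return &node->fmt;
}
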
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self,
91{ 166{
92 return 0; 167 return 0;
93} 168}
169static inline const char **
170find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
171{
172 return NULL;
173}
174static inline void format_mod_start(void) { }
175static inline void format_mod_stop(void) { }
94#endif /* CONFIG_MODULES */ 176#endif /* CONFIG_MODULES */
95 177
96 178
@@ -153,20 +235,33 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
153} 235}
154EXPORT_SYMBOL_GPL(__ftrace_vprintk); 236EXPORT_SYMBOL_GPL(__ftrace_vprintk);
155 237
238static const char **find_next(void *v, loff_t *pos)
239{
240 const char **fmt = v;
241 int start_index;
242
243 if (!fmt)
244 fmt = __start___trace_bprintk_fmt + *pos;
245
246 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
247
248 if (*pos < start_index)
249 return fmt;
250
251 return find_next_mod_format(start_index, v, fmt, pos);
252}
253
156static void * 254static void *
157t_start(struct seq_file *m, loff_t *pos) 255t_start(struct seq_file *m, loff_t *pos)
158{ 256{
159 const char **fmt = __start___trace_bprintk_fmt + *pos; 257 format_mod_start();
160 258 return find_next(NULL, pos);
161 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
162 return NULL;
163 return fmt;
164} 259}
165 260
166static void *t_next(struct seq_file *m, void * v, loff_t *pos) 261static void *t_next(struct seq_file *m, void * v, loff_t *pos)
167{ 262{
168 (*pos)++; 263 (*pos)++;
169 return t_start(m, pos); 264 return find_next(v, pos);
170} 265}
171 266
172static int t_show(struct seq_file *m, void *v) 267static int t_show(struct seq_file *m, void *v)
@@ -205,6 +300,7 @@ static int t_show(struct seq_file *m, void *v)
205 300
206static void t_stop(struct seq_file *m, void *p) 301static void t_stop(struct seq_file *m, void *p)
207{ 302{
303 format_mod_stop();
208} 304}
209 305
210static const struct seq_operations show_format_seq_ops = { 306static const struct seq_operations show_format_seq_ops = {
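The printk_formats iterator is reworked so that t_start() takes btrace_mutex via format_mod_start() and t_stop() releases it via format_mod_stop(); since the seq_file core always calls ->stop() at the end of a read pass, the module format list cannot change while it is being walked, and find_next() only falls through to find_next_mod_format() once *pos has passed the built-in __start/__stop___trace_bprintk_fmt section. A minimal, self-contained sketch of the lock-in-start, unlock-in-stop pattern (names and data are illustrative, not the tracer code):

#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>

static DEFINE_MUTEX(example_lock);

static const char *example_fmts[] = { "first format\n", "second format\n" };

static void *example_find(void *v, loff_t *pos)
{
	if (*pos >= (loff_t)ARRAY_SIZE(example_fmts))
		return NULL;
	return &example_fmts[*pos];
}

static void *ex_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&example_lock);	/* held across the whole read pass */
	return example_find(NULL, pos);
}

static void *ex_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return example_find(v, pos);
}

static void ex_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&example_lock);	/* ->stop() runs even when ->start() returned NULL */
}

static int ex_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s", *(const char **)v);
	return 0;
}

static const struct seq_operations example_seq_ops = {
	.start	= ex_start,
	.next	= ex_next,
	.stop	= ex_stop,
	.show	= ex_show,
};
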
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7319559ed59f..f029dd4fd2ca 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
129static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
130{ 130{
131 .func = wakeup_tracer_call, 131 .func = wakeup_tracer_call,
132 .flags = FTRACE_OPS_FL_GLOBAL,
132}; 133};
133#endif /* CONFIG_FUNCTION_TRACER */ 134#endif /* CONFIG_FUNCTION_TRACER */
134 135
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 659732eba07c..288541f977fb 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
101 101
102#ifdef CONFIG_DYNAMIC_FTRACE 102#ifdef CONFIG_DYNAMIC_FTRACE
103 103
104static int trace_selftest_test_probe1_cnt;
105static void trace_selftest_test_probe1_func(unsigned long ip,
106 unsigned long pip)
107{
108 trace_selftest_test_probe1_cnt++;
109}
110
111static int trace_selftest_test_probe2_cnt;
112static void trace_selftest_test_probe2_func(unsigned long ip,
113 unsigned long pip)
114{
115 trace_selftest_test_probe2_cnt++;
116}
117
118static int trace_selftest_test_probe3_cnt;
119static void trace_selftest_test_probe3_func(unsigned long ip,
120 unsigned long pip)
121{
122 trace_selftest_test_probe3_cnt++;
123}
124
125static int trace_selftest_test_global_cnt;
126static void trace_selftest_test_global_func(unsigned long ip,
127 unsigned long pip)
128{
129 trace_selftest_test_global_cnt++;
130}
131
132static int trace_selftest_test_dyn_cnt;
133static void trace_selftest_test_dyn_func(unsigned long ip,
134 unsigned long pip)
135{
136 trace_selftest_test_dyn_cnt++;
137}
138
139static struct ftrace_ops test_probe1 = {
140 .func = trace_selftest_test_probe1_func,
141};
142
143static struct ftrace_ops test_probe2 = {
144 .func = trace_selftest_test_probe2_func,
145};
146
147static struct ftrace_ops test_probe3 = {
148 .func = trace_selftest_test_probe3_func,
149};
150
151static struct ftrace_ops test_global = {
152 .func = trace_selftest_test_global_func,
153 .flags = FTRACE_OPS_FL_GLOBAL,
154};
155
156static void print_counts(void)
157{
158 printk("(%d %d %d %d %d) ",
159 trace_selftest_test_probe1_cnt,
160 trace_selftest_test_probe2_cnt,
161 trace_selftest_test_probe3_cnt,
162 trace_selftest_test_global_cnt,
163 trace_selftest_test_dyn_cnt);
164}
165
166static void reset_counts(void)
167{
168 trace_selftest_test_probe1_cnt = 0;
169 trace_selftest_test_probe2_cnt = 0;
170 trace_selftest_test_probe3_cnt = 0;
171 trace_selftest_test_global_cnt = 0;
172 trace_selftest_test_dyn_cnt = 0;
173}
174
175static int trace_selftest_ops(int cnt)
176{
177 int save_ftrace_enabled = ftrace_enabled;
178 struct ftrace_ops *dyn_ops;
179 char *func1_name;
180 char *func2_name;
181 int len1;
182 int len2;
183 int ret = -1;
184
185 printk(KERN_CONT "PASSED\n");
186 pr_info("Testing dynamic ftrace ops #%d: ", cnt);
187
188 ftrace_enabled = 1;
189 reset_counts();
190
191 /* Handle PPC64 '.' name */
192 func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
193 func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2);
194 len1 = strlen(func1_name);
195 len2 = strlen(func2_name);
196
197 /*
198 * Probe 1 will trace function 1.
199 * Probe 2 will trace function 2.
200 * Probe 3 will trace functions 1 and 2.
201 */
202 ftrace_set_filter(&test_probe1, func1_name, len1, 1);
203 ftrace_set_filter(&test_probe2, func2_name, len2, 1);
204 ftrace_set_filter(&test_probe3, func1_name, len1, 1);
205 ftrace_set_filter(&test_probe3, func2_name, len2, 0);
206
207 register_ftrace_function(&test_probe1);
208 register_ftrace_function(&test_probe2);
209 register_ftrace_function(&test_probe3);
210 register_ftrace_function(&test_global);
211
212 DYN_FTRACE_TEST_NAME();
213
214 print_counts();
215
216 if (trace_selftest_test_probe1_cnt != 1)
217 goto out;
218 if (trace_selftest_test_probe2_cnt != 0)
219 goto out;
220 if (trace_selftest_test_probe3_cnt != 1)
221 goto out;
222 if (trace_selftest_test_global_cnt == 0)
223 goto out;
224
225 DYN_FTRACE_TEST_NAME2();
226
227 print_counts();
228
229 if (trace_selftest_test_probe1_cnt != 1)
230 goto out;
231 if (trace_selftest_test_probe2_cnt != 1)
232 goto out;
233 if (trace_selftest_test_probe3_cnt != 2)
234 goto out;
235
236 /* Add a dynamic probe */
237 dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL);
238 if (!dyn_ops) {
239 printk("MEMORY ERROR ");
240 goto out;
241 }
242
243 dyn_ops->func = trace_selftest_test_dyn_func;
244
245 register_ftrace_function(dyn_ops);
246
247 trace_selftest_test_global_cnt = 0;
248
249 DYN_FTRACE_TEST_NAME();
250
251 print_counts();
252
253 if (trace_selftest_test_probe1_cnt != 2)
254 goto out_free;
255 if (trace_selftest_test_probe2_cnt != 1)
256 goto out_free;
257 if (trace_selftest_test_probe3_cnt != 3)
258 goto out_free;
259 if (trace_selftest_test_global_cnt == 0)
260 goto out;
261 if (trace_selftest_test_dyn_cnt == 0)
262 goto out_free;
263
264 DYN_FTRACE_TEST_NAME2();
265
266 print_counts();
267
268 if (trace_selftest_test_probe1_cnt != 2)
269 goto out_free;
270 if (trace_selftest_test_probe2_cnt != 2)
271 goto out_free;
272 if (trace_selftest_test_probe3_cnt != 4)
273 goto out_free;
274
275 ret = 0;
276 out_free:
277 unregister_ftrace_function(dyn_ops);
278 kfree(dyn_ops);
279
280 out:
281 /* Purposely unregister in the same order */
282 unregister_ftrace_function(&test_probe1);
283 unregister_ftrace_function(&test_probe2);
284 unregister_ftrace_function(&test_probe3);
285 unregister_ftrace_function(&test_global);
286
287 /* Make sure everything is off */
288 reset_counts();
289 DYN_FTRACE_TEST_NAME();
290 DYN_FTRACE_TEST_NAME();
291
292 if (trace_selftest_test_probe1_cnt ||
293 trace_selftest_test_probe2_cnt ||
294 trace_selftest_test_probe3_cnt ||
295 trace_selftest_test_global_cnt ||
296 trace_selftest_test_dyn_cnt)
297 ret = -1;
298
299 ftrace_enabled = save_ftrace_enabled;
300
301 return ret;
302}
303
104/* Test dynamic code modification and ftrace filters */ 304/* Test dynamic code modification and ftrace filters */
105int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 305int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
106 struct trace_array *tr, 306 struct trace_array *tr,
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
131 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 331 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
132 332
133 /* filter only on our function */ 333 /* filter only on our function */
134 ftrace_set_filter(func_name, strlen(func_name), 1); 334 ftrace_set_global_filter(func_name, strlen(func_name), 1);
135 335
136 /* enable tracing */ 336 /* enable tracing */
137 ret = tracer_init(trace, tr); 337 ret = tracer_init(trace, tr);
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
166 366
167 /* check the trace buffer */ 367 /* check the trace buffer */
168 ret = trace_test_buffer(tr, &count); 368 ret = trace_test_buffer(tr, &count);
169 trace->reset(tr);
170 tracing_start(); 369 tracing_start();
171 370
172 /* we should only have one item */ 371 /* we should only have one item */
173 if (!ret && count != 1) { 372 if (!ret && count != 1) {
373 trace->reset(tr);
174 printk(KERN_CONT ".. filter failed count=%ld ..", count); 374 printk(KERN_CONT ".. filter failed count=%ld ..", count);
175 ret = -1; 375 ret = -1;
176 goto out; 376 goto out;
177 } 377 }
178 378
379 /* Test the ops with global tracing running */
380 ret = trace_selftest_ops(1);
381 trace->reset(tr);
382
179 out: 383 out:
180 ftrace_enabled = save_ftrace_enabled; 384 ftrace_enabled = save_ftrace_enabled;
181 tracer_enabled = save_tracer_enabled; 385 tracer_enabled = save_tracer_enabled;
182 386
183 /* Enable tracing on all functions again */ 387 /* Enable tracing on all functions again */
184 ftrace_set_filter(NULL, 0, 1); 388 ftrace_set_global_filter(NULL, 0, 1);
389
390 /* Test the ops with global tracing off */
391 if (!ret)
392 ret = trace_selftest_ops(2);
185 393
186 return ret; 394 return ret;
187} 395}
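The selftest additions exercise the new split between per-ops and global filtering: ftrace_set_filter() now takes the ftrace_ops whose private filter is being set, while the old behaviour of filtering the shared set is kept as ftrace_set_global_filter(), and trace_selftest_ops() is run twice, once with global tracing active and once with it off. A sketch of the two calls as they appear in this tree (illustrative ops and callback; DYN_FTRACE_TEST_NAME comes from the selftest's trace.h, and the leading "*" matches PPC64's dot-prefixed symbols, as the selftest comment notes):

#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/stringify.h>

static void my_trace_call(unsigned long ip, unsigned long parent_ip)
{
}

static struct ftrace_ops my_ops = {
	.func	= my_trace_call,
};

static void setup_filters(void)
{
	char *name = "*" __stringify(DYN_FTRACE_TEST_NAME);

	/* Private filter: only my_ops is restricted to this pattern. */
	ftrace_set_filter(&my_ops, name, strlen(name), 1);

	/* Shared filter: what ftrace_set_filter() used to mean; it applies to
	 * FTRACE_OPS_FL_GLOBAL ops such as the built-in tracers above. */
	ftrace_set_global_filter(name, strlen(name), 1);
}
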
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 54dd77cce5bf..b4c475a0a48b 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void)
5 /* used to call mcount */ 5 /* used to call mcount */
6 return 0; 6 return 0;
7} 7}
8
9int DYN_FTRACE_TEST_NAME2(void)
10{
11 /* used to call mcount */
12 return 0;
13}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4c5dead0c239..b0b53b8e4c25 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
134{ 134{
135 .func = stack_trace_call, 135 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_GLOBAL,
136}; 137};
137 138
138static ssize_t 139static ssize_t
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 68187af4889e..b219f1449c54 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
251{ 251{
252 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 252 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
253 253
254 if (elem->regfunc && !elem->state && active) 254 if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
255 elem->regfunc(); 255 elem->regfunc();
256 else if (elem->unregfunc && elem->state && !active) 256 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
257 elem->unregfunc(); 257 elem->unregfunc();
258 258
259 /* 259 /*
@@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
264 * is used. 264 * is used.
265 */ 265 */
266 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
267 if (!elem->state && active) { 267 if (active && !jump_label_enabled(&elem->key))
268 jump_label_enable(&elem->state); 268 jump_label_inc(&elem->key);
269 elem->state = active; 269 else if (!active && jump_label_enabled(&elem->key))
270 } else if (elem->state && !active) { 270 jump_label_dec(&elem->key);
271 jump_label_disable(&elem->state);
272 elem->state = active;
273 }
274} 271}
275 272
276/* 273/*
@@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
281 */ 278 */
282static void disable_tracepoint(struct tracepoint *elem) 279static void disable_tracepoint(struct tracepoint *elem)
283{ 280{
284 if (elem->unregfunc && elem->state) 281 if (elem->unregfunc && jump_label_enabled(&elem->key))
285 elem->unregfunc(); 282 elem->unregfunc();
286 283
287 if (elem->state) { 284 if (jump_label_enabled(&elem->key))
288 jump_label_disable(&elem->state); 285 jump_label_dec(&elem->key);
289 elem->state = 0;
290 }
291 rcu_assign_pointer(elem->funcs, NULL); 286 rcu_assign_pointer(elem->funcs, NULL);
292} 287}
293 288
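The tracepoint code drops its private elem->state flag and drives enablement entirely through the jump label key embedded in each tracepoint: jump_label_enabled() reports the current state, and jump_label_inc()/jump_label_dec() take or drop a reference while patching the call sites, replacing the old jump_label_enable()/jump_label_disable() pair. A hedged sketch of that interface with an illustrative key; static_branch() is assumed to be this tree's reader-side test, and later kernels renamed all of this machinery to static_key_*:

#include <linux/jump_label.h>

static struct jump_label_key my_key;	/* illustrative key, not from the patch */
static unsigned long slow_hits;

static void my_fast_path(void)
{
	if (static_branch(&my_key))	/* patched jump, not a load-and-test, when enabled */
		slow_hits++;
}

static void my_enable(void)
{
	if (!jump_label_enabled(&my_key))
		jump_label_inc(&my_key);	/* refcounted enable; patches the call sites */
}

static void my_disable(void)
{
	if (jump_label_enabled(&my_key))
		jump_label_dec(&my_key);
}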