Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/capability.c | 12
-rw-r--r--  kernel/cgroup.c | 29
-rw-r--r--  kernel/cpu.c | 2
-rw-r--r--  kernel/cpuset.c | 2
-rw-r--r--  kernel/cred.c | 12
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 6
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 2
-rw-r--r--  kernel/events/Makefile | 6
-rw-r--r--  kernel/events/core.c (renamed from kernel/perf_event.c) | 76
-rw-r--r--  kernel/events/hw_breakpoint.c (renamed from kernel/hw_breakpoint.c) | 0
-rw-r--r--  kernel/exit.c | 4
-rw-r--r--  kernel/extable.c | 8
-rw-r--r--  kernel/fork.c | 5
-rw-r--r--  kernel/freezer.c | 4
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/hrtimer.c | 10
-rw-r--r--  kernel/hung_task.c | 2
-rw-r--r--  kernel/irq/Kconfig | 4
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/chip.c | 5
-rw-r--r--  kernel/irq/debug.h | 1
-rw-r--r--  kernel/irq/generic-chip.c | 354
-rw-r--r--  kernel/irq/irqdesc.c | 22
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/migration.c | 2
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/irq/settings.h | 17
-rw-r--r--  kernel/jump_label.c | 539
-rw-r--r--  kernel/kexec.c | 12
-rw-r--r--  kernel/kmod.c | 16
-rw-r--r--  kernel/ksysfs.c | 10
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/latencytop.c | 2
-rw-r--r--  kernel/lockdep.c | 210
-rw-r--r--  kernel/module.c | 111
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex-debug.h | 2
-rw-r--r--  kernel/mutex.c | 11
-rw-r--r--  kernel/mutex.h | 2
-rw-r--r--  kernel/padata.c | 8
-rw-r--r--  kernel/params.c | 25
-rw-r--r--  kernel/pid.c | 5
-rw-r--r--  kernel/posix-cpu-timers.c | 2
-rw-r--r--  kernel/posix-timers.c | 2
-rw-r--r--  kernel/power/Kconfig | 16
-rw-r--r--  kernel/power/hibernate.c | 52
-rw-r--r--  kernel/power/main.c | 3
-rw-r--r--  kernel/power/power.h | 4
-rw-r--r--  kernel/power/snapshot.c | 33
-rw-r--r--  kernel/power/suspend.c | 11
-rw-r--r--  kernel/power/user.c | 5
-rw-r--r--  kernel/ptrace.c | 17
-rw-r--r--  kernel/rcupdate.c | 32
-rw-r--r--  kernel/rcutiny.c | 46
-rw-r--r--  kernel/rcutiny_plugin.h | 203
-rw-r--r--  kernel/rcutorture.c | 26
-rw-r--r--  kernel/rcutree.c | 527
-rw-r--r--  kernel/rcutree.h | 104
-rw-r--r--  kernel/rcutree_plugin.h | 568
-rw-r--r--  kernel/rcutree_trace.c | 180
-rw-r--r--  kernel/sched.c | 1683
-rw-r--r--  kernel/sched_autogroup.c | 2
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 158
-rw-r--r--  kernel/sched_features.h | 6
-rw-r--r--  kernel/sched_idletask.c | 2
-rw-r--r--  kernel/sched_rt.c | 87
-rw-r--r--  kernel/sched_stoptask.c | 5
-rw-r--r--  kernel/signal.c | 6
-rw-r--r--  kernel/softirq.c | 4
-rw-r--r--  kernel/sys.c | 5
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 694
-rw-r--r--  kernel/time/clockevents.c | 64
-rw-r--r--  kernel/time/clocksource.c | 42
-rw-r--r--  kernel/time/jiffies.c | 2
-rw-r--r--  kernel/time/posix-clock.c | 24
-rw-r--r--  kernel/time/tick-broadcast.c | 12
-rw-r--r--  kernel/time/timekeeping.c | 56
-rw-r--r--  kernel/time/timer_stats.c | 2
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/blktrace.c | 33
-rw-r--r--  kernel/trace/ftrace.c | 1265
-rw-r--r--  kernel/trace/ring_buffer.c | 4
-rw-r--r--  kernel/trace/trace.c | 18
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_clock.c | 2
-rw-r--r--  kernel/trace/trace_entries.h | 2
-rw-r--r--  kernel/trace/trace_events.c | 1
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_irqsoff.c | 3
-rw-r--r--  kernel/trace/trace_kprobe.c | 3
-rw-r--r--  kernel/trace/trace_output.c | 3
-rw-r--r--  kernel/trace/trace_printk.c | 120
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 1
-rw-r--r--  kernel/trace/trace_selftest.c | 214
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 6
-rw-r--r--  kernel/trace/trace_stack.c | 1
-rw-r--r--  kernel/tracepoint.c | 23
-rw-r--r--  kernel/user-return-notifier.c | 2
-rw-r--r--  kernel/wait.c | 2
-rw-r--r--  kernel/watchdog.c | 5
-rw-r--r--  kernel/workqueue.c | 8
108 files changed, 5540 insertions, 2440 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb31e73e..e9cf19155b46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_perf_event.o = -pg
25CFLAGS_REMOVE_irq_work.o = -pg 24CFLAGS_REMOVE_irq_work.o = -pg
26endif 25endif
27 26
@@ -103,8 +102,9 @@ obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/ 102obj-$(CONFIG_TRACEPOINTS) += trace/
104obj-$(CONFIG_SMP) += sched_cpupri.o 103obj-$(CONFIG_SMP) += sched_cpupri.o
105obj-$(CONFIG_IRQ_WORK) += irq_work.o 104obj-$(CONFIG_IRQ_WORK) += irq_work.o
106obj-$(CONFIG_PERF_EVENTS) += perf_event.o 105
107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 106obj-$(CONFIG_PERF_EVENTS) += events/
107
108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
109obj-$(CONFIG_PADATA) += padata.o 109obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 37b2bea170c8..e99dda04b126 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -607,7 +607,7 @@ void audit_trim_trees(void)
607 spin_lock(&hash_lock); 607 spin_lock(&hash_lock);
608 list_for_each_entry(node, &tree->chunks, list) { 608 list_for_each_entry(node, &tree->chunks, list) {
609 struct audit_chunk *chunk = find_chunk(node); 609 struct audit_chunk *chunk = find_chunk(node);
610 /* this could be NULL if the watch is dieing else where... */ 610 /* this could be NULL if the watch is dying else where... */
611 struct inode *inode = chunk->mark.i.inode; 611 struct inode *inode = chunk->mark.i.inode;
612 node->index |= 1U<<31; 612 node->index |= 1U<<31;
613 if (iterate_mounts(compare_root, inode, root_mnt)) 613 if (iterate_mounts(compare_root, inode, root_mnt))
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f49a0318c2ed..b33513a08beb 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1011,7 +1011,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1011/* 1011/*
1012 * to_send and len_sent accounting are very loose estimates. We aren't 1012 * to_send and len_sent accounting are very loose estimates. We aren't
1013 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being 1013 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
1014 * within about 500 bytes (next page boundry) 1014 * within about 500 bytes (next page boundary)
1015 * 1015 *
1016 * why snprintf? an int is up to 12 digits long. if we just assumed when 1016 * why snprintf? an int is up to 12 digits long. if we just assumed when
1017 * logging that a[%d]= was going to be 16 characters long we would be wasting 1017 * logging that a[%d]= was going to be 16 characters long we would be wasting
diff --git a/kernel/capability.c b/kernel/capability.c
index bf0c734d0c12..32a80e08ff4b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -399,3 +399,15 @@ bool task_ns_capable(struct task_struct *t, int cap)
399 return ns_capable(task_cred_xxx(t, user)->user_ns, cap); 399 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
400} 400}
401EXPORT_SYMBOL(task_ns_capable); 401EXPORT_SYMBOL(task_ns_capable);
402
403/**
404 * nsown_capable - Check superior capability to one's own user_ns
405 * @cap: The capability in question
406 *
407 * Return true if the current task has the given superior capability
408 * targeted at its own user namespace.
409 */
410bool nsown_capable(int cap)
411{
412 return ns_capable(current_user_ns(), cap);
413}
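
The hunk above adds nsown_capable(), a shorthand for ns_capable() against the caller's own user namespace. A minimal sketch of the intended call pattern; the helper name and the CAP_SETUID check are illustrative, not taken from this series:

static int may_change_own_uid(void)
{
	/* equivalent to ns_capable(current_user_ns(), CAP_SETUID) */
	if (!nsown_capable(CAP_SETUID))
		return -EPERM;
	return 0;
}
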
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e31b220a743d..909a35510af5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -157,7 +157,7 @@ struct css_id {
157}; 157};
158 158
159/* 159/*
160 * cgroup_event represents events which userspace want to recieve. 160 * cgroup_event represents events which userspace want to receive.
161 */ 161 */
162struct cgroup_event { 162struct cgroup_event {
163 /* 163 /*
@@ -326,12 +326,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
326 return &css_set_table[index]; 326 return &css_set_table[index];
327} 327}
328 328
329static void free_css_set_rcu(struct rcu_head *obj)
330{
331 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
332 kfree(cg);
333}
334
335/* We don't maintain the lists running through each css_set to its 329/* We don't maintain the lists running through each css_set to its
336 * task until after the first call to cgroup_iter_start(). This 330 * task until after the first call to cgroup_iter_start(). This
337 * reduces the fork()/exit() overhead for people who have cgroups 331 * reduces the fork()/exit() overhead for people who have cgroups
@@ -375,7 +369,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
375 } 369 }
376 370
377 write_unlock(&css_set_lock); 371 write_unlock(&css_set_lock);
378 call_rcu(&cg->rcu_head, free_css_set_rcu); 372 kfree_rcu(cg, rcu_head);
379} 373}
380 374
381/* 375/*
@@ -812,13 +806,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
812 return ret; 806 return ret;
813} 807}
814 808
815static void free_cgroup_rcu(struct rcu_head *obj)
816{
817 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
818
819 kfree(cgrp);
820}
821
822static void cgroup_diput(struct dentry *dentry, struct inode *inode) 809static void cgroup_diput(struct dentry *dentry, struct inode *inode)
823{ 810{
824 /* is dentry a directory ? if so, kfree() associated cgroup */ 811 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -856,7 +843,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
856 */ 843 */
857 BUG_ON(!list_empty(&cgrp->pidlists)); 844 BUG_ON(!list_empty(&cgrp->pidlists));
858 845
859 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 846 kfree_rcu(cgrp, rcu_head);
860 } 847 }
861 iput(inode); 848 iput(inode);
862} 849}
@@ -4623,14 +4610,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
4623 return ret; 4610 return ret;
4624} 4611}
4625 4612
4626static void __free_css_id_cb(struct rcu_head *head)
4627{
4628 struct css_id *id;
4629
4630 id = container_of(head, struct css_id, rcu_head);
4631 kfree(id);
4632}
4633
4634void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4613void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4635{ 4614{
4636 struct css_id *id = css->id; 4615 struct css_id *id = css->id;
@@ -4645,7 +4624,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4645 spin_lock(&ss->id_lock); 4624 spin_lock(&ss->id_lock);
4646 idr_remove(&ss->idr, id->id); 4625 idr_remove(&ss->idr, id->id);
4647 spin_unlock(&ss->id_lock); 4626 spin_unlock(&ss->id_lock);
4648 call_rcu(&id->rcu_head, __free_css_id_cb); 4627 kfree_rcu(id, rcu_head);
4649} 4628}
4650EXPORT_SYMBOL_GPL(free_css_id); 4629EXPORT_SYMBOL_GPL(free_css_id);
4651 4630
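
The cgroup hunks above all follow the same conversion: an RCU callback whose only job was kfree() is replaced by kfree_rcu(), which takes the object and the name of its embedded rcu_head member. A minimal sketch of the pattern, with an illustrative structure:

struct foo {
	int data;
	struct rcu_head rcu_head;
};

static void foo_release(struct foo *f)
{
	/*
	 * Before: call_rcu(&f->rcu_head, foo_free_rcu) plus a callback
	 * doing container_of() and kfree().  After:
	 */
	kfree_rcu(f, rcu_head);		/* frees f after a grace period */
}
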
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c95fc4df0faa..12b7458f23b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -126,7 +126,7 @@ static void cpu_hotplug_done(void)
126#else /* #if CONFIG_HOTPLUG_CPU */ 126#else /* #if CONFIG_HOTPLUG_CPU */
127static void cpu_hotplug_begin(void) {} 127static void cpu_hotplug_begin(void) {}
128static void cpu_hotplug_done(void) {} 128static void cpu_hotplug_done(void) {}
129#endif /* #esle #if CONFIG_HOTPLUG_CPU */ 129#endif /* #else #if CONFIG_HOTPLUG_CPU */
130 130
131/* Need to know about CPUs going up/down? */ 131/* Need to know about CPUs going up/down? */
132int __ref register_cpu_notifier(struct notifier_block *nb) 132int __ref register_cpu_notifier(struct notifier_block *nb)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 33eee16addb8..2bb8c2e98fff 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
1159static int update_relax_domain_level(struct cpuset *cs, s64 val) 1159static int update_relax_domain_level(struct cpuset *cs, s64 val)
1160{ 1160{
1161#ifdef CONFIG_SMP 1161#ifdef CONFIG_SMP
1162 if (val < -1 || val >= SD_LV_MAX) 1162 if (val < -1 || val >= sched_domain_level_max)
1163 return -EINVAL; 1163 return -EINVAL;
1164#endif 1164#endif
1165 1165
diff --git a/kernel/cred.c b/kernel/cred.c
index 5557b55048df..8093c16b84b1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -54,6 +54,7 @@ struct cred init_cred = {
54 .cap_effective = CAP_INIT_EFF_SET, 54 .cap_effective = CAP_INIT_EFF_SET,
55 .cap_bset = CAP_INIT_BSET, 55 .cap_bset = CAP_INIT_BSET,
56 .user = INIT_USER, 56 .user = INIT_USER,
57 .user_ns = &init_user_ns,
57 .group_info = &init_groups, 58 .group_info = &init_groups,
58#ifdef CONFIG_KEYS 59#ifdef CONFIG_KEYS
59 .tgcred = &init_tgcred, 60 .tgcred = &init_tgcred,
@@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
410 goto error_put; 411 goto error_put;
411 } 412 }
412 413
414 /* cache user_ns in cred. Doesn't need a refcount because it will
415 * stay pinned by cred->user
416 */
417 new->user_ns = new->user->user_ns;
418
413#ifdef CONFIG_KEYS 419#ifdef CONFIG_KEYS
414 /* new threads get their own thread keyrings if their parent already 420 /* new threads get their own thread keyrings if their parent already
415 * had one */ 421 * had one */
@@ -741,12 +747,6 @@ int set_create_files_as(struct cred *new, struct inode *inode)
741} 747}
742EXPORT_SYMBOL(set_create_files_as); 748EXPORT_SYMBOL(set_create_files_as);
743 749
744struct user_namespace *current_user_ns(void)
745{
746 return _current_user_ns();
747}
748EXPORT_SYMBOL(current_user_ns);
749
750#ifdef CONFIG_DEBUG_CREDENTIALS 750#ifdef CONFIG_DEBUG_CREDENTIALS
751 751
752bool creds_are_invalid(const struct cred *cred) 752bool creds_are_invalid(const struct cred *cred)
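
Caching user_ns in struct cred lets credential-based checks read the namespace directly instead of chasing cred->user, so the out-of-line current_user_ns() above becomes unnecessary (presumably replaced by an inline accessor in the headers, which this diff does not show). Roughly:

static struct user_namespace *get_current_ns_old(void)
{
	return current_cred()->user->user_ns;	/* extra dereference via user_struct */
}

static struct user_namespace *get_current_ns_new(void)
{
	return current_cred()->user_ns;		/* cached at copy_creds() time */
}
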
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index cefd4a11f6d9..bad6786dee88 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -538,7 +538,7 @@ return_normal:
538 538
539 /* 539 /*
540 * For single stepping, try to only enter on the processor 540 * For single stepping, try to only enter on the processor
541 * that was single stepping. To gaurd against a deadlock, the 541 * that was single stepping. To guard against a deadlock, the
542 * kernel will only try for the value of sstep_tries before 542 * kernel will only try for the value of sstep_tries before
543 * giving up and continuing on. 543 * giving up and continuing on.
544 */ 544 */
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 6bc6e3bc4f9c..be14779bcef6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -441,9 +441,9 @@ static int kdb_check_regs(void)
441 * symbol name, and offset to the caller. 441 * symbol name, and offset to the caller.
442 * 442 *
443 * The argument may consist of a numeric value (decimal or 443 * The argument may consist of a numeric value (decimal or
444 * hexidecimal), a symbol name, a register name (preceeded by the 444 * hexidecimal), a symbol name, a register name (preceded by the
445 * percent sign), an environment variable with a numeric value 445 * percent sign), an environment variable with a numeric value
446 * (preceeded by a dollar sign) or a simple arithmetic expression 446 * (preceded by a dollar sign) or a simple arithmetic expression
447 * consisting of a symbol name, +/-, and a numeric constant value 447 * consisting of a symbol name, +/-, and a numeric constant value
448 * (offset). 448 * (offset).
449 * Parameters: 449 * Parameters:
@@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value)
1335 * error The hardware-defined error code 1335 * error The hardware-defined error code
1336 * reason2 kdb's current reason code. 1336 * reason2 kdb's current reason code.
1337 * Initially error but can change 1337 * Initially error but can change
1338 * acording to kdb state. 1338 * according to kdb state.
1339 * db_result Result code from break or debug point. 1339 * db_result Result code from break or debug point.
1340 * regs The exception frame at time of fault/breakpoint. 1340 * regs The exception frame at time of fault/breakpoint.
1341 * should always be valid. 1341 * should always be valid.
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6b2485dcb050..5532dd37aa86 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
545 * Mask for process state. 545 * Mask for process state.
546 * Notes: 546 * Notes:
547 * The mask folds data from several sources into a single long value, so 547 * The mask folds data from several sources into a single long value, so
548 * be carefull not to overlap the bits. TASK_* bits are in the LSB, 548 * be careful not to overlap the bits. TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there 549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be 550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in 551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000000..1ce23d3d8394
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,6 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg
3endif
4
5obj-y := core.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/perf_event.c b/kernel/events/core.c
index 27960f114efd..c09767f7db3e 100644
--- a/kernel/perf_event.c
+++ b/kernel/events/core.c
@@ -2,8 +2,8 @@
2 * Performance events core code: 2 * Performance events core code:
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
@@ -39,10 +39,10 @@
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41struct remote_function_call { 41struct remote_function_call {
42 struct task_struct *p; 42 struct task_struct *p;
43 int (*func)(void *info); 43 int (*func)(void *info);
44 void *info; 44 void *info;
45 int ret; 45 int ret;
46}; 46};
47 47
48static void remote_function(void *data) 48static void remote_function(void *data)
@@ -76,10 +76,10 @@ static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info) 76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{ 77{
78 struct remote_function_call data = { 78 struct remote_function_call data = {
79 .p = p, 79 .p = p,
80 .func = func, 80 .func = func,
81 .info = info, 81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */ 82 .ret = -ESRCH, /* No such (running) process */
83 }; 83 };
84 84
85 if (task_curr(p)) 85 if (task_curr(p))
@@ -100,10 +100,10 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info) 100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{ 101{
102 struct remote_function_call data = { 102 struct remote_function_call data = {
103 .p = NULL, 103 .p = NULL,
104 .func = func, 104 .func = func,
105 .info = info, 105 .info = info,
106 .ret = -ENXIO, /* No such CPU */ 106 .ret = -ENXIO, /* No such CPU */
107 }; 107 };
108 108
109 smp_call_function_single(cpu, remote_function, &data, 1); 109 smp_call_function_single(cpu, remote_function, &data, 1);
@@ -125,7 +125,7 @@ enum event_type_t {
125 * perf_sched_events : >0 events exist 125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */ 127 */
128atomic_t perf_sched_events __read_mostly; 128struct jump_label_key perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130 130
131static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
@@ -364,6 +364,7 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
364 } 364 }
365 365
366 if (mode & PERF_CGROUP_SWIN) { 366 if (mode & PERF_CGROUP_SWIN) {
367 WARN_ON_ONCE(cpuctx->cgrp);
367 /* set cgrp before ctxsw in to 368 /* set cgrp before ctxsw in to
368 * allow event_filter_match() to not 369 * allow event_filter_match() to not
369 * have to pass task around 370 * have to pass task around
@@ -585,14 +586,6 @@ static void get_ctx(struct perf_event_context *ctx)
585 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 586 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
586} 587}
587 588
588static void free_ctx(struct rcu_head *head)
589{
590 struct perf_event_context *ctx;
591
592 ctx = container_of(head, struct perf_event_context, rcu_head);
593 kfree(ctx);
594}
595
596static void put_ctx(struct perf_event_context *ctx) 589static void put_ctx(struct perf_event_context *ctx)
597{ 590{
598 if (atomic_dec_and_test(&ctx->refcount)) { 591 if (atomic_dec_and_test(&ctx->refcount)) {
@@ -600,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx)
600 put_ctx(ctx->parent_ctx); 593 put_ctx(ctx->parent_ctx);
601 if (ctx->task) 594 if (ctx->task)
602 put_task_struct(ctx->task); 595 put_task_struct(ctx->task);
603 call_rcu(&ctx->rcu_head, free_ctx); 596 kfree_rcu(ctx, rcu_head);
604 } 597 }
605} 598}
606 599
@@ -2423,6 +2416,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2423 if (!ctx || !ctx->nr_events) 2416 if (!ctx || !ctx->nr_events)
2424 goto out; 2417 goto out;
2425 2418
2419 /*
2420 * We must ctxsw out cgroup events to avoid conflict
2421 * when invoking perf_task_event_sched_in() later on
2422 * in this function. Otherwise we end up trying to
2423 * ctxswin cgroup events which are already scheduled
2424 * in.
2425 */
2426 perf_cgroup_sched_out(current);
2426 task_ctx_sched_out(ctx, EVENT_ALL); 2427 task_ctx_sched_out(ctx, EVENT_ALL);
2427 2428
2428 raw_spin_lock(&ctx->lock); 2429 raw_spin_lock(&ctx->lock);
@@ -2447,6 +2448,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2447 2448
2448 raw_spin_unlock(&ctx->lock); 2449 raw_spin_unlock(&ctx->lock);
2449 2450
2451 /*
2452 * Also calls ctxswin for cgroup events, if any:
2453 */
2450 perf_event_context_sched_in(ctx, ctx->task); 2454 perf_event_context_sched_in(ctx, ctx->task);
2451out: 2455out:
2452 local_irq_restore(flags); 2456 local_irq_restore(flags);
@@ -5319,14 +5323,6 @@ swevent_hlist_deref(struct swevent_htable *swhash)
5319 lockdep_is_held(&swhash->hlist_mutex)); 5323 lockdep_is_held(&swhash->hlist_mutex));
5320} 5324}
5321 5325
5322static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
5323{
5324 struct swevent_hlist *hlist;
5325
5326 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
5327 kfree(hlist);
5328}
5329
5330static void swevent_hlist_release(struct swevent_htable *swhash) 5326static void swevent_hlist_release(struct swevent_htable *swhash)
5331{ 5327{
5332 struct swevent_hlist *hlist = swevent_hlist_deref(swhash); 5328 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
@@ -5335,7 +5331,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
5335 return; 5331 return;
5336 5332
5337 rcu_assign_pointer(swhash->swevent_hlist, NULL); 5333 rcu_assign_pointer(swhash->swevent_hlist, NULL);
5338 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 5334 kfree_rcu(hlist, rcu_head);
5339} 5335}
5340 5336
5341static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 5337static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
@@ -5417,7 +5413,7 @@ fail:
5417 return err; 5413 return err;
5418} 5414}
5419 5415
5420atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5416struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5421 5417
5422static void sw_perf_event_destroy(struct perf_event *event) 5418static void sw_perf_event_destroy(struct perf_event *event)
5423{ 5419{
@@ -7433,11 +7429,11 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7433} 7429}
7434 7430
7435struct cgroup_subsys perf_subsys = { 7431struct cgroup_subsys perf_subsys = {
7436 .name = "perf_event", 7432 .name = "perf_event",
7437 .subsys_id = perf_subsys_id, 7433 .subsys_id = perf_subsys_id,
7438 .create = perf_cgroup_create, 7434 .create = perf_cgroup_create,
7439 .destroy = perf_cgroup_destroy, 7435 .destroy = perf_cgroup_destroy,
7440 .exit = perf_cgroup_exit, 7436 .exit = perf_cgroup_exit,
7441 .attach = perf_cgroup_attach, 7437 .attach = perf_cgroup_attach,
7442}; 7438};
7443#endif /* CONFIG_CGROUP_PERF */ 7439#endif /* CONFIG_CGROUP_PERF */
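
Alongside the move to kernel/events/, perf_sched_events and perf_swevent_enabled[] change from atomic_t counters to struct jump_label_key, the type introduced by the reworked jump label code later in this series. A hedged sketch of how such a key is driven; the feature name is a placeholder, and the matching fast-path test lives in the header changes, which are not part of this diff:

static struct jump_label_key my_feature_key;	/* zero-initialized: disabled */

static void my_feature_get(void)
{
	jump_label_inc(&my_feature_key);	/* first user enables the branch */
}

static void my_feature_put(void)
{
	jump_label_dec(&my_feature_key);	/* last user disables it again */
}
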
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
diff --git a/kernel/exit.c b/kernel/exit.c
index 33837936b98c..20a406471525 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
841 /* Let father know we died 841 /* Let father know we died
842 * 842 *
843 * Thread signals are configurable, but you aren't going to use 843 * Thread signals are configurable, but you aren't going to use
844 * that to send signals to arbitary processes. 844 * that to send signals to arbitrary processes.
845 * That stops right now. 845 * That stops right now.
846 * 846 *
847 * If the parent exec id doesn't match the exec id we saved 847 * If the parent exec id doesn't match the exec id we saved
@@ -1016,7 +1016,7 @@ NORET_TYPE void do_exit(long code)
1016 /* 1016 /*
1017 * FIXME: do that only when needed, using sched_exit tracepoint 1017 * FIXME: do that only when needed, using sched_exit tracepoint
1018 */ 1018 */
1019 flush_ptrace_hw_breakpoint(tsk); 1019 ptrace_put_breakpoints(tsk);
1020 1020
1021 exit_notify(tsk, group_dead); 1021 exit_notify(tsk, group_dead);
1022#ifdef CONFIG_NUMA 1022#ifdef CONFIG_NUMA
diff --git a/kernel/extable.c b/kernel/extable.c
index 7f8f263f8524..c2d625fcda77 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -72,6 +72,14 @@ int core_kernel_text(unsigned long addr)
72 return 0; 72 return 0;
73} 73}
74 74
75int core_kernel_data(unsigned long addr)
76{
77 if (addr >= (unsigned long)_sdata &&
78 addr < (unsigned long)_edata)
79 return 1;
80 return 0;
81}
82
75int __kernel_text_address(unsigned long addr) 83int __kernel_text_address(unsigned long addr)
76{ 84{
77 if (core_kernel_text(addr)) 85 if (core_kernel_text(addr))
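
core_kernel_data() reports whether an address lies in the kernel image's static data (between _sdata and _edata). A plausible use, shown purely as an illustration, is telling statically allocated objects apart from heap allocations:

static bool object_is_static(const void *obj)
{
	/* true if obj lives in the built-in .data section */
	return core_kernel_data((unsigned long)obj);
}
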
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548dee636b..2b44d82b8237 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1103,7 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1103 1103
1104 posix_cpu_timers_init(p); 1104 posix_cpu_timers_init(p);
1105 1105
1106 p->lock_depth = -1; /* -1 = no lock */
1107 do_posix_clock_monotonic_gettime(&p->start_time); 1106 do_posix_clock_monotonic_gettime(&p->start_time);
1108 p->real_start_time = p->start_time; 1107 p->real_start_time = p->start_time;
1109 monotonic_to_bootbased(&p->real_start_time); 1108 monotonic_to_bootbased(&p->real_start_time);
@@ -1153,7 +1152,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1153#endif 1152#endif
1154 1153
1155 /* Perform scheduler related setup. Assign this task to a CPU. */ 1154 /* Perform scheduler related setup. Assign this task to a CPU. */
1156 sched_fork(p, clone_flags); 1155 sched_fork(p);
1157 1156
1158 retval = perf_event_init_task(p); 1157 retval = perf_event_init_task(p);
1159 if (retval) 1158 if (retval)
@@ -1464,7 +1463,7 @@ long do_fork(unsigned long clone_flags,
1464 */ 1463 */
1465 p->flags &= ~PF_STARTING; 1464 p->flags &= ~PF_STARTING;
1466 1465
1467 wake_up_new_task(p, clone_flags); 1466 wake_up_new_task(p);
1468 1467
1469 tracehook_report_clone_complete(trace, regs, 1468 tracehook_report_clone_complete(trace, regs,
1470 clone_flags, nr, p); 1469 clone_flags, nr, p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 66ecd2ead215..7b01de98bb6a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -17,7 +17,7 @@ static inline void frozen_process(void)
17{ 17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN; 19 current->flags |= PF_FROZEN;
20 wmb(); 20 smp_wmb();
21 } 21 }
22 clear_freeze_flag(current); 22 clear_freeze_flag(current);
23} 23}
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only)
93 * the task as frozen and next clears its TIF_FREEZE. 93 * the task as frozen and next clears its TIF_FREEZE.
94 */ 94 */
95 if (!freezing(p)) { 95 if (!freezing(p)) {
96 rmb(); 96 smp_rmb();
97 if (frozen(p)) 97 if (frozen(p))
98 return false; 98 return false;
99 99
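
The freezer only needs ordering against other CPUs here, so the mandatory wmb()/rmb() pair becomes smp_wmb()/smp_rmb(). The intended pairing, restated as a sketch of the two sides already visible above:

/* Writer side (frozen_process): make PF_FROZEN visible before the
 * freeze flag is cleared. */
current->flags |= PF_FROZEN;
smp_wmb();
clear_freeze_flag(current);

/* Reader side (freeze_task): pairs with the writer, so a task seen as
 * no longer freezing is also seen as frozen. */
if (!freezing(p)) {
	smp_rmb();
	if (frozen(p))
		return false;
}
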
diff --git a/kernel/futex.c b/kernel/futex.c
index dfb924ffe65b..fe28dc282eae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1886,7 +1886,7 @@ retry:
1886 restart->futex.val = val; 1886 restart->futex.val = val;
1887 restart->futex.time = abs_time->tv64; 1887 restart->futex.time = abs_time->tv64;
1888 restart->futex.bitset = bitset; 1888 restart->futex.bitset = bitset;
1889 restart->futex.flags = flags; 1889 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
1890 1890
1891 ret = -ERESTART_RESTARTBLOCK; 1891 ret = -ERESTART_RESTARTBLOCK;
1892 1892
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9017478c5d4c..dbbbf7d43080 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -81,7 +81,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
81 } 81 }
82}; 82};
83 83
84static int hrtimer_clock_to_base_table[MAX_CLOCKS]; 84static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
85 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
86 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
87 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
88};
85 89
86static inline int hrtimer_clockid_to_base(clockid_t clock_id) 90static inline int hrtimer_clockid_to_base(clockid_t clock_id)
87{ 91{
@@ -1722,10 +1726,6 @@ static struct notifier_block __cpuinitdata hrtimers_nb = {
1722 1726
1723void __init hrtimers_init(void) 1727void __init hrtimers_init(void)
1724{ 1728{
1725 hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME;
1726 hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC;
1727 hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME;
1728
1729 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1729 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1730 (void *)(long)smp_processor_id()); 1730 (void *)(long)smp_processor_id());
1731 register_cpu_notifier(&hrtimers_nb); 1731 register_cpu_notifier(&hrtimers_nb);
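
Turning the clockid-to-base table into a const array with designated initializers fixes the mapping at build time and removes the open-coded assignments from hrtimers_init(). The lookup itself stays a plain index; roughly (the helper body is paraphrased from context, not quoted from this diff):

static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
	return hrtimer_clock_to_base_table[clock_id];
}
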
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 53ead174da2f..ea640120ab86 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
33/* 33/*
34 * Zero means infinite timeout - no checking done: 34 * Zero means infinite timeout - no checking done:
35 */ 35 */
36unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 36unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
37 37
38unsigned long __read_mostly sysctl_hung_task_warnings = 10; 38unsigned long __read_mostly sysctl_hung_task_warnings = 10;
39 39
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index c574f9a12c48..d1d051b38e0b 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -48,6 +48,10 @@ config IRQ_PREFLOW_FASTEOI
48config IRQ_EDGE_EOI_HANDLER 48config IRQ_EDGE_EOI_HANDLER
49 bool 49 bool
50 50
51# Generic configurable interrupt chip implementation
52config GENERIC_IRQ_CHIP
53 bool
54
51# Support forced irq threading 55# Support forced irq threading
52config IRQ_FORCED_THREADING 56config IRQ_FORCED_THREADING
53 bool 57 bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 54329cd7b3ee..73290056cfb6 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,6 @@
1 1
2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 5obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 6obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 1dafc8652bd8..d5a3009da71a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -310,6 +310,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
310out_unlock: 310out_unlock:
311 raw_spin_unlock(&desc->lock); 311 raw_spin_unlock(&desc->lock);
312} 312}
313EXPORT_SYMBOL_GPL(handle_simple_irq);
313 314
314/** 315/**
315 * handle_level_irq - Level type irq handler 316 * handle_level_irq - Level type irq handler
@@ -415,7 +416,7 @@ out:
415 * @desc: the interrupt description structure for this irq 416 * @desc: the interrupt description structure for this irq
416 * 417 *
417 * Interrupt occures on the falling and/or rising edge of a hardware 418 * Interrupt occures on the falling and/or rising edge of a hardware
418 * signal. The occurence is latched into the irq controller hardware 419 * signal. The occurrence is latched into the irq controller hardware
419 * and must be acked in order to be reenabled. After the ack another 420 * and must be acked in order to be reenabled. After the ack another
420 * interrupt can happen on the same source even before the first one 421 * interrupt can happen on the same source even before the first one
421 * is handled by the associated event handler. If this happens it 422 * is handled by the associated event handler. If this happens it
@@ -573,6 +574,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
573 if (handle != handle_bad_irq && is_chained) { 574 if (handle != handle_bad_irq && is_chained) {
574 irq_settings_set_noprobe(desc); 575 irq_settings_set_noprobe(desc);
575 irq_settings_set_norequest(desc); 576 irq_settings_set_norequest(desc);
577 irq_settings_set_nothread(desc);
576 irq_startup(desc); 578 irq_startup(desc);
577 } 579 }
578out: 580out:
@@ -612,6 +614,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
612 614
613 irq_put_desc_unlock(desc, flags); 615 irq_put_desc_unlock(desc, flags);
614} 616}
617EXPORT_SYMBOL_GPL(irq_modify_status);
615 618
616/** 619/**
617 * irq_cpu_online - Invoke all irq_cpu_online functions. 620 * irq_cpu_online - Invoke all irq_cpu_online functions.
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 306cba37e9a5..97a8bfadc88a 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -27,6 +27,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
27 P(IRQ_PER_CPU); 27 P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE); 28 P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST); 29 P(IRQ_NOREQUEST);
30 P(IRQ_NOTHREAD);
30 P(IRQ_NOAUTOEN); 31 P(IRQ_NOAUTOEN);
31 32
32 PS(IRQS_AUTODETECT); 33 PS(IRQS_AUTODETECT);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
new file mode 100644
index 000000000000..31a9db711906
--- /dev/null
+++ b/kernel/irq/generic-chip.c
@@ -0,0 +1,354 @@
1/*
2 * Library implementing the most common irq chip callback functions
3 *
4 * Copyright (C) 2011, Thomas Gleixner
5 */
6#include <linux/io.h>
7#include <linux/irq.h>
8#include <linux/slab.h>
9#include <linux/interrupt.h>
10#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h>
12
13#include "internals.h"
14
15static LIST_HEAD(gc_list);
16static DEFINE_RAW_SPINLOCK(gc_lock);
17
18static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
19{
20 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
21}
22
23/**
24 * irq_gc_noop - NOOP function
25 * @d: irq_data
26 */
27void irq_gc_noop(struct irq_data *d)
28{
29}
30
31/**
32 * irq_gc_mask_disable_reg - Mask chip via disable register
33 * @d: irq_data
34 *
35 * Chip has separate enable/disable registers instead of a single mask
36 * register.
37 */
38void irq_gc_mask_disable_reg(struct irq_data *d)
39{
40 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
41 u32 mask = 1 << (d->irq - gc->irq_base);
42
43 irq_gc_lock(gc);
44 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable);
45 gc->mask_cache &= ~mask;
46 irq_gc_unlock(gc);
47}
48
49/**
50 * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register
51 * @d: irq_data
52 *
53 * Chip has a single mask register. Values of this register are cached
54 * and protected by gc->lock
55 */
56void irq_gc_mask_set_bit(struct irq_data *d)
57{
58 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
59 u32 mask = 1 << (d->irq - gc->irq_base);
60
61 irq_gc_lock(gc);
62 gc->mask_cache |= mask;
63 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
64 irq_gc_unlock(gc);
65}
66
67/**
68 * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register
69 * @d: irq_data
70 *
71 * Chip has a single mask register. Values of this register are cached
72 * and protected by gc->lock
73 */
74void irq_gc_mask_clr_bit(struct irq_data *d)
75{
76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
77 u32 mask = 1 << (d->irq - gc->irq_base);
78
79 irq_gc_lock(gc);
80 gc->mask_cache &= ~mask;
81 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
82 irq_gc_unlock(gc);
83}
84
85/**
86 * irq_gc_unmask_enable_reg - Unmask chip via enable register
87 * @d: irq_data
88 *
89 * Chip has separate enable/disable registers instead of a single mask
90 * register.
91 */
92void irq_gc_unmask_enable_reg(struct irq_data *d)
93{
94 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
95 u32 mask = 1 << (d->irq - gc->irq_base);
96
97 irq_gc_lock(gc);
98 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable);
99 gc->mask_cache |= mask;
100 irq_gc_unlock(gc);
101}
102
103/**
104 * irq_gc_ack - Ack pending interrupt
105 * @d: irq_data
106 */
107void irq_gc_ack(struct irq_data *d)
108{
109 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
110 u32 mask = 1 << (d->irq - gc->irq_base);
111
112 irq_gc_lock(gc);
113 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
114 irq_gc_unlock(gc);
115}
116
117/**
118 * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
119 * @d: irq_data
120 */
121void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
122{
123 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
124 u32 mask = 1 << (d->irq - gc->irq_base);
125
126 irq_gc_lock(gc);
127 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask);
128 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
129 irq_gc_unlock(gc);
130}
131
132/**
133 * irq_gc_eoi - EOI interrupt
134 * @d: irq_data
135 */
136void irq_gc_eoi(struct irq_data *d)
137{
138 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
139 u32 mask = 1 << (d->irq - gc->irq_base);
140
141 irq_gc_lock(gc);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi);
143 irq_gc_unlock(gc);
144}
145
146/**
147 * irq_gc_set_wake - Set/clr wake bit for an interrupt
148 * @d: irq_data
149 *
150 * For chips where the wake from suspend functionality is not
151 * configured in a separate register and the wakeup active state is
152 * just stored in a bitmask.
153 */
154int irq_gc_set_wake(struct irq_data *d, unsigned int on)
155{
156 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
157 u32 mask = 1 << (d->irq - gc->irq_base);
158
159 if (!(mask & gc->wake_enabled))
160 return -EINVAL;
161
162 irq_gc_lock(gc);
163 if (on)
164 gc->wake_active |= mask;
165 else
166 gc->wake_active &= ~mask;
167 irq_gc_unlock(gc);
168 return 0;
169}
170
171/**
172 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
173 * @name: Name of the irq chip
174 * @num_ct: Number of irq_chip_type instances associated with this
175 * @irq_base: Interrupt base nr for this chip
176 * @reg_base: Register base address (virtual)
177 * @handler: Default flow handler associated with this chip
178 *
179 * Returns an initialized irq_chip_generic structure. The chip defaults
180 * to the primary (index 0) irq_chip_type and @handler
181 */
182struct irq_chip_generic *
183irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
184 void __iomem *reg_base, irq_flow_handler_t handler)
185{
186 struct irq_chip_generic *gc;
187 unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
188
189 gc = kzalloc(sz, GFP_KERNEL);
190 if (gc) {
191 raw_spin_lock_init(&gc->lock);
192 gc->num_ct = num_ct;
193 gc->irq_base = irq_base;
194 gc->reg_base = reg_base;
195 gc->chip_types->chip.name = name;
196 gc->chip_types->handler = handler;
197 }
198 return gc;
199}
200
201/*
202 * Separate lockdep class for interrupt chip which can nest irq_desc
203 * lock.
204 */
205static struct lock_class_key irq_nested_lock_class;
206
207/**
208 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
209 * @gc: Generic irq chip holding all data
210 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
211 * @flags: Flags for initialization
212 * @clr: IRQ_* bits to clear
213 * @set: IRQ_* bits to set
214 *
215 * Set up max. 32 interrupts starting from gc->irq_base. Note, this
216 * initializes all interrupts to the primary irq_chip_type and its
217 * associated handler.
218 */
219void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
220 enum irq_gc_flags flags, unsigned int clr,
221 unsigned int set)
222{
223 struct irq_chip_type *ct = gc->chip_types;
224 unsigned int i;
225
226 raw_spin_lock(&gc_lock);
227 list_add_tail(&gc->list, &gc_list);
228 raw_spin_unlock(&gc_lock);
229
230 /* Init mask cache ? */
231 if (flags & IRQ_GC_INIT_MASK_CACHE)
232 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
233
234 for (i = gc->irq_base; msk; msk >>= 1, i++) {
235 if (!msk & 0x01)
236 continue;
237
238 if (flags & IRQ_GC_INIT_NESTED_LOCK)
239 irq_set_lockdep_class(i, &irq_nested_lock_class);
240
241 irq_set_chip_and_handler(i, &ct->chip, ct->handler);
242 irq_set_chip_data(i, gc);
243 irq_modify_status(i, clr, set);
244 }
245 gc->irq_cnt = i - gc->irq_base;
246}
247
248/**
249 * irq_setup_alt_chip - Switch to alternative chip
250 * @d: irq_data for this interrupt
251 * @type Flow type to be initialized
252 *
253 * Only to be called from chip->irq_set_type() callbacks.
254 */
255int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
256{
257 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
258 struct irq_chip_type *ct = gc->chip_types;
259 unsigned int i;
260
261 for (i = 0; i < gc->num_ct; i++, ct++) {
262 if (ct->type & type) {
263 d->chip = &ct->chip;
264 irq_data_to_desc(d)->handle_irq = ct->handler;
265 return 0;
266 }
267 }
268 return -EINVAL;
269}
270
271/**
272 * irq_remove_generic_chip - Remove a chip
273 * @gc: Generic irq chip holding all data
274 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
275 * @clr: IRQ_* bits to clear
276 * @set: IRQ_* bits to set
277 *
278 * Remove up to 32 interrupts starting from gc->irq_base.
279 */
280void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
281 unsigned int clr, unsigned int set)
282{
283 unsigned int i = gc->irq_base;
284
285 raw_spin_lock(&gc_lock);
286 list_del(&gc->list);
287 raw_spin_unlock(&gc_lock);
288
289 for (; msk; msk >>= 1, i++) {
290 if (!msk & 0x01)
291 continue;
292
293 /* Remove handler first. That will mask the irq line */
294 irq_set_handler(i, NULL);
295 irq_set_chip(i, &no_irq_chip);
296 irq_set_chip_data(i, NULL);
297 irq_modify_status(i, clr, set);
298 }
299}
300
301#ifdef CONFIG_PM
302static int irq_gc_suspend(void)
303{
304 struct irq_chip_generic *gc;
305
306 list_for_each_entry(gc, &gc_list, list) {
307 struct irq_chip_type *ct = gc->chip_types;
308
309 if (ct->chip.irq_suspend)
310 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base));
311 }
312 return 0;
313}
314
315static void irq_gc_resume(void)
316{
317 struct irq_chip_generic *gc;
318
319 list_for_each_entry(gc, &gc_list, list) {
320 struct irq_chip_type *ct = gc->chip_types;
321
322 if (ct->chip.irq_resume)
323 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base));
324 }
325}
326#else
327#define irq_gc_suspend NULL
328#define irq_gc_resume NULL
329#endif
330
331static void irq_gc_shutdown(void)
332{
333 struct irq_chip_generic *gc;
334
335 list_for_each_entry(gc, &gc_list, list) {
336 struct irq_chip_type *ct = gc->chip_types;
337
338 if (ct->chip.irq_pm_shutdown)
339 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base));
340 }
341}
342
343static struct syscore_ops irq_gc_syscore_ops = {
344 .suspend = irq_gc_suspend,
345 .resume = irq_gc_resume,
346 .shutdown = irq_gc_shutdown,
347};
348
349static int __init irq_gc_init_ops(void)
350{
351 register_syscore_ops(&irq_gc_syscore_ops);
352 return 0;
353}
354device_initcall(irq_gc_init_ops);
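
For irqchip drivers, the new generic chip library replaces hand-rolled mask/unmask/ack callbacks with the helpers above. A hedged usage sketch for a fictional controller with separate enable/disable registers; the register offsets, names, and interrupt count are invented for illustration:

static void __init foo_init_irq(void __iomem *base, unsigned int irq_base)
{
	struct irq_chip_generic *gc;
	struct irq_chip_type *ct;

	gc = irq_alloc_generic_chip("FOO", 1, irq_base, base, handle_level_irq);
	if (!gc)
		return;

	ct = gc->chip_types;
	ct->chip.irq_mask = irq_gc_mask_disable_reg;
	ct->chip.irq_unmask = irq_gc_unmask_enable_reg;
	ct->chip.irq_ack = irq_gc_ack;
	ct->regs.enable = 0x08;		/* hypothetical register offsets */
	ct->regs.disable = 0x0c;
	ct->regs.ack = 0x10;

	/* 16 interrupts starting at irq_base, make them requestable */
	irq_setup_generic_chip(gc, 0xffff, 0, IRQ_NOREQUEST, 0);
}
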
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2c039c9b9383..886e80347b32 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -22,7 +22,7 @@
22 */ 22 */
23static struct lock_class_key irq_desc_lock_class; 23static struct lock_class_key irq_desc_lock_class;
24 24
25#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) 25#if defined(CONFIG_SMP)
26static void __init init_irq_default_affinity(void) 26static void __init init_irq_default_affinity(void)
27{ 27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); 28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
@@ -290,6 +290,22 @@ static int irq_expand_nr_irqs(unsigned int nr)
290 290
291#endif /* !CONFIG_SPARSE_IRQ */ 291#endif /* !CONFIG_SPARSE_IRQ */
292 292
293/**
294 * generic_handle_irq - Invoke the handler for a particular irq
295 * @irq: The irq number to handle
296 *
297 */
298int generic_handle_irq(unsigned int irq)
299{
300 struct irq_desc *desc = irq_to_desc(irq);
301
302 if (!desc)
303 return -EINVAL;
304 generic_handle_irq_desc(irq, desc);
305 return 0;
306}
307EXPORT_SYMBOL_GPL(generic_handle_irq);
308
293/* Dynamic interrupt handling */ 309/* Dynamic interrupt handling */
294 310
295/** 311/**
@@ -311,6 +327,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
311 bitmap_clear(allocated_irqs, from, cnt); 327 bitmap_clear(allocated_irqs, from, cnt);
312 mutex_unlock(&sparse_irq_lock); 328 mutex_unlock(&sparse_irq_lock);
313} 329}
330EXPORT_SYMBOL_GPL(irq_free_descs);
314 331
315/** 332/**
316 * irq_alloc_descs - allocate and initialize a range of irq descriptors 333 * irq_alloc_descs - allocate and initialize a range of irq descriptors
@@ -351,6 +368,7 @@ err:
351 mutex_unlock(&sparse_irq_lock); 368 mutex_unlock(&sparse_irq_lock);
352 return ret; 369 return ret;
353} 370}
371EXPORT_SYMBOL_GPL(irq_alloc_descs);
354 372
355/** 373/**
356 * irq_reserve_irqs - mark irqs allocated 374 * irq_reserve_irqs - mark irqs allocated
@@ -430,7 +448,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
430 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 448 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
431} 449}
432 450
433#ifdef CONFIG_GENERIC_HARDIRQS
434unsigned int kstat_irqs(unsigned int irq) 451unsigned int kstat_irqs(unsigned int irq)
435{ 452{
436 struct irq_desc *desc = irq_to_desc(irq); 453 struct irq_desc *desc = irq_to_desc(irq);
@@ -443,4 +460,3 @@ unsigned int kstat_irqs(unsigned int irq)
443 sum += *per_cpu_ptr(desc->kstat_irqs, cpu); 460 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
444 return sum; 461 return sum;
445} 462}
446#endif /* CONFIG_GENERIC_HARDIRQS */
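
With generic_handle_irq() and the descriptor allocators exported, a (possibly modular) interrupt controller driver can demultiplex a cascaded interrupt without touching irq_desc internals. A minimal demux sketch; the driver structure, register offset, and handler are hypothetical:

struct foo_chip {
	void __iomem	*base;
	unsigned int	irq_base;
};

#define FOO_PENDING	0x04	/* hypothetical pending-status register */

static irqreturn_t foo_cascade_handler(int irq, void *data)
{
	struct foo_chip *chip = data;
	u32 pending = readl(chip->base + FOO_PENDING);

	while (pending) {
		unsigned int bit = __ffs(pending);

		generic_handle_irq(chip->irq_base + bit);	/* run child handler */
		pending &= ~(1U << bit);
	}
	return IRQ_HANDLED;
}
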
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 12a80fdae11c..f7ce0021e1c4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -900,7 +900,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
900 */ 900 */
901 new->handler = irq_nested_primary_handler; 901 new->handler = irq_nested_primary_handler;
902 } else { 902 } else {
903 irq_setup_forced_threading(new); 903 if (irq_settings_can_thread(desc))
904 irq_setup_forced_threading(new);
904 } 905 }
905 906
906 /* 907 /*
@@ -1051,6 +1052,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1051 register_irq_proc(irq, desc); 1052 register_irq_proc(irq, desc);
1052 new->dir = NULL; 1053 new->dir = NULL;
1053 register_handler_proc(irq, new); 1054 register_handler_proc(irq, new);
1055 free_cpumask_var(mask);
1054 1056
1055 return 0; 1057 return 0;
1056 1058
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index bc6194698dfd..47420908fba0 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -35,7 +35,7 @@ void irq_move_masked_irq(struct irq_data *idata)
35 * do the disable, re-program, enable sequence. 35 * do the disable, re-program, enable sequence.
36 * This is *not* particularly important for level triggered 36 * This is *not* particularly important for level triggered
37 * but in a edge trigger case, we might be setting rte 37 * but in a edge trigger case, we might be setting rte
38 * when an active trigger is comming in. This could 38 * when an active trigger is coming in. This could
39 * cause some ioapics to mal-function. 39 * cause some ioapics to mal-function.
40 * Being paranoid i guess! 40 * Being paranoid i guess!
41 * 41 *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index dd201bd35103..834899f2500f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -419,7 +419,7 @@ int show_interrupts(struct seq_file *p, void *v)
419 } else { 419 } else {
420 seq_printf(p, " %8s", "None"); 420 seq_printf(p, " %8s", "None");
421 } 421 }
422#ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL 422#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
423 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); 423 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
424#endif 424#endif
425 if (desc->name) 425 if (desc->name)
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 0d91730b6330..f1667833d444 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -8,6 +8,7 @@ enum {
8 _IRQ_LEVEL = IRQ_LEVEL, 8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE, 9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST, 10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOTHREAD = IRQ_NOTHREAD,
11 _IRQ_NOAUTOEN = IRQ_NOAUTOEN, 12 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
12 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, 13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
13 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
@@ -20,6 +21,7 @@ enum {
20#define IRQ_LEVEL GOT_YOU_MORON 21#define IRQ_LEVEL GOT_YOU_MORON
21#define IRQ_NOPROBE GOT_YOU_MORON 22#define IRQ_NOPROBE GOT_YOU_MORON
22#define IRQ_NOREQUEST GOT_YOU_MORON 23#define IRQ_NOREQUEST GOT_YOU_MORON
24#define IRQ_NOTHREAD GOT_YOU_MORON
23#define IRQ_NOAUTOEN GOT_YOU_MORON 25#define IRQ_NOAUTOEN GOT_YOU_MORON
24#define IRQ_NESTED_THREAD GOT_YOU_MORON 26#define IRQ_NESTED_THREAD GOT_YOU_MORON
25#undef IRQF_MODIFY_MASK 27#undef IRQF_MODIFY_MASK
@@ -94,6 +96,21 @@ static inline void irq_settings_set_norequest(struct irq_desc *desc)
94 desc->status_use_accessors |= _IRQ_NOREQUEST; 96 desc->status_use_accessors |= _IRQ_NOREQUEST;
95} 97}
96 98
99static inline bool irq_settings_can_thread(struct irq_desc *desc)
100{
101 return !(desc->status_use_accessors & _IRQ_NOTHREAD);
102}
103
104static inline void irq_settings_clr_nothread(struct irq_desc *desc)
105{
106 desc->status_use_accessors &= ~_IRQ_NOTHREAD;
107}
108
109static inline void irq_settings_set_nothread(struct irq_desc *desc)
110{
111 desc->status_use_accessors |= _IRQ_NOTHREAD;
112}
113
97static inline bool irq_settings_can_probe(struct irq_desc *desc) 114static inline bool irq_settings_can_probe(struct irq_desc *desc)
98{ 115{
99 return !(desc->status_use_accessors & _IRQ_NOPROBE); 116 return !(desc->status_use_accessors & _IRQ_NOPROBE);
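
IRQ_NOTHREAD lets an interrupt opt out of forced threading: __setup_irq() now consults irq_settings_can_thread() before calling irq_setup_forced_threading(), and chained interrupts are marked non-threadable automatically in __irq_set_handler(). A driver-side sketch using irq_modify_status(), which this series also exports; the timer handler is a placeholder:

static irqreturn_t foo_timer_interrupt(int irq, void *dev_id)
{
	/* hypothetical per-tick work */
	return IRQ_HANDLED;
}

static void __init foo_timer_irq_init(unsigned int irq)
{
	/* never force-thread this interrupt, even with threadirqs */
	irq_modify_status(irq, 0, IRQ_NOTHREAD);
	if (request_irq(irq, foo_timer_interrupt, IRQF_TIMER, "foo-timer", NULL))
		pr_err("foo: failed to request timer irq %u\n", irq);
}
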
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 3b79bd938330..74d1c099fbd1 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -2,43 +2,23 @@
2 * jump label support 2 * jump label support
3 * 3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> 4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
5 * 6 *
6 */ 7 */
7#include <linux/jump_label.h>
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
14#include <linux/sort.h> 13#include <linux/sort.h>
15#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/jump_label.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
23/* mutex to protect coming/going of the the jump_label table */ 19/* mutex to protect coming/going of the the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex); 20static DEFINE_MUTEX(jump_label_mutex);
25 21
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42void jump_label_lock(void) 22void jump_label_lock(void)
43{ 23{
44 mutex_lock(&jump_label_mutex); 24 mutex_lock(&jump_label_mutex);
@@ -49,6 +29,11 @@ void jump_label_unlock(void)
49 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
50} 30}
51 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
52static int jump_label_cmp(const void *a, const void *b) 37static int jump_label_cmp(const void *a, const void *b)
53{ 38{
54 const struct jump_entry *jea = a; 39 const struct jump_entry *jea = a;
@@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b)
64} 49}
65 50
66static void 51static void
67sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) 52jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
68{ 53{
69 unsigned long size; 54 unsigned long size;
70 55
@@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
73 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
74} 59}
75 60
76static struct jump_label_entry *get_jump_label_entry(jump_label_t key) 61static void jump_label_update(struct jump_label_key *key, int enable);
77{
78 struct hlist_head *head;
79 struct hlist_node *node;
80 struct jump_label_entry *e;
81 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
82
83 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
84 hlist_for_each_entry(e, node, head, hlist) {
85 if (key == e->key)
86 return e;
87 }
88 return NULL;
89}
90 62
91static struct jump_label_entry * 63void jump_label_inc(struct jump_label_key *key)
92add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
93{ 64{
94 struct hlist_head *head; 65 if (atomic_inc_not_zero(&key->enabled))
95 struct jump_label_entry *e; 66 return;
96 u32 hash;
97
98 e = get_jump_label_entry(key);
99 if (e)
100 return ERR_PTR(-EEXIST);
101
102 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
103 if (!e)
104 return ERR_PTR(-ENOMEM);
105
106 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
107 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
108 e->key = key;
109 e->table = table;
110 e->nr_entries = nr_entries;
111 INIT_HLIST_HEAD(&(e->modules));
112 hlist_add_head(&e->hlist, head);
113 return e;
114}
115 67
116static int 68 jump_label_lock();
117build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) 69 if (atomic_add_return(1, &key->enabled) == 1)
118{ 70 jump_label_update(key, JUMP_LABEL_ENABLE);
119 struct jump_entry *iter, *iter_begin; 71 jump_label_unlock();
120 struct jump_label_entry *entry;
121 int count;
122
123 sort_jump_label_entries(start, stop);
124 iter = start;
125 while (iter < stop) {
126 entry = get_jump_label_entry(iter->key);
127 if (!entry) {
128 iter_begin = iter;
129 count = 0;
130 while ((iter < stop) &&
131 (iter->key == iter_begin->key)) {
132 iter++;
133 count++;
134 }
135 entry = add_jump_label_entry(iter_begin->key,
136 count, iter_begin);
137 if (IS_ERR(entry))
138 return PTR_ERR(entry);
139 } else {
140 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
141 return -1;
142 }
143 }
144 return 0;
145} 72}
146 73
147/*** 74void jump_label_dec(struct jump_label_key *key)
148 * jump_label_update - update jump label text
149 * @key - key value associated with a jump label
150 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
151 *
152 * Will enable/disable the jump for jump label @key, depending on the
153 * value of @type.
154 *
155 */
156
157void jump_label_update(unsigned long key, enum jump_label_type type)
158{ 75{
159 struct jump_entry *iter; 76 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
160 struct jump_label_entry *entry; 77 return;
161 struct hlist_node *module_node;
162 struct jump_label_module_entry *e_module;
163 int count;
164 78
165 jump_label_lock(); 79 jump_label_update(key, JUMP_LABEL_DISABLE);
166 entry = get_jump_label_entry((jump_label_t)key);
167 if (entry) {
168 count = entry->nr_entries;
169 iter = entry->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 /* enable/disable jump labels in modules */
176 hlist_for_each_entry(e_module, module_node, &(entry->modules),
177 hlist) {
178 count = e_module->nr_entries;
179 iter = e_module->table;
180 while (count--) {
181 if (iter->key &&
182 kernel_text_address(iter->code))
183 arch_jump_label_transform(iter, type);
184 iter++;
185 }
186 }
187 }
188 jump_label_unlock(); 80 jump_label_unlock();
189} 81}
190 82
@@ -197,77 +89,33 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end)
197 return 0; 89 return 0;
198} 90}
199 91
200#ifdef CONFIG_MODULES 92static int __jump_label_text_reserved(struct jump_entry *iter_start,
201 93 struct jump_entry *iter_stop, void *start, void *end)
202static int module_conflict(void *start, void *end)
203{ 94{
204 struct hlist_head *head;
205 struct hlist_node *node, *node_next, *module_node, *module_node_next;
206 struct jump_label_entry *e;
207 struct jump_label_module_entry *e_module;
208 struct jump_entry *iter; 95 struct jump_entry *iter;
209 int i, count;
210 int conflict = 0;
211
212 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
213 head = &jump_label_table[i];
214 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
215 hlist_for_each_entry_safe(e_module, module_node,
216 module_node_next,
217 &(e->modules), hlist) {
218 count = e_module->nr_entries;
219 iter = e_module->table;
220 while (count--) {
221 if (addr_conflict(iter, start, end)) {
222 conflict = 1;
223 goto out;
224 }
225 iter++;
226 }
227 }
228 }
229 }
230out:
231 return conflict;
232}
233
234#endif
235
236/***
237 * jump_label_text_reserved - check if addr range is reserved
238 * @start: start text addr
239 * @end: end text addr
240 *
241 * checks if the text addr located between @start and @end
242 * overlaps with any of the jump label patch addresses. Code
243 * that wants to modify kernel text should first verify that
244 * it does not overlap with any of the jump label addresses.
245 * Caller must hold jump_label_mutex.
246 *
247 * returns 1 if there is an overlap, 0 otherwise
248 */
249int jump_label_text_reserved(void *start, void *end)
250{
251 struct jump_entry *iter;
252 struct jump_entry *iter_start = __start___jump_table;
253 struct jump_entry *iter_stop = __start___jump_table;
254 int conflict = 0;
255 96
256 iter = iter_start; 97 iter = iter_start;
257 while (iter < iter_stop) { 98 while (iter < iter_stop) {
258 if (addr_conflict(iter, start, end)) { 99 if (addr_conflict(iter, start, end))
259 conflict = 1; 100 return 1;
260 goto out;
261 }
262 iter++; 101 iter++;
263 } 102 }
264 103
265 /* now check modules */ 104 return 0;
266#ifdef CONFIG_MODULES 105}
267 conflict = module_conflict(start, end); 106
268#endif 107static void __jump_label_update(struct jump_label_key *key,
269out: 108 struct jump_entry *entry, int enable)
270 return conflict; 109{
110 for (; entry->key == (jump_label_t)(unsigned long)key; entry++) {
111 /*
112 * entry->code set to 0 invalidates module init text sections
113 * kernel_text_address() verifies we are not in core kernel
114 * init code, see jump_label_invalidate_module_init().
115 */
116 if (entry->code && kernel_text_address(entry->code))
117 arch_jump_label_transform(entry, enable);
118 }
271} 119}
272 120
273/* 121/*
@@ -277,142 +125,173 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr)
277{ 125{
278} 126}
279 127
280static __init int init_jump_label(void) 128static __init int jump_label_init(void)
281{ 129{
282 int ret;
283 struct jump_entry *iter_start = __start___jump_table; 130 struct jump_entry *iter_start = __start___jump_table;
284 struct jump_entry *iter_stop = __stop___jump_table; 131 struct jump_entry *iter_stop = __stop___jump_table;
132 struct jump_label_key *key = NULL;
285 struct jump_entry *iter; 133 struct jump_entry *iter;
286 134
287 jump_label_lock(); 135 jump_label_lock();
288 ret = build_jump_label_hashtable(__start___jump_table, 136 jump_label_sort_entries(iter_start, iter_stop);
289 __stop___jump_table); 137
290 iter = iter_start; 138 for (iter = iter_start; iter < iter_stop; iter++) {
291 while (iter < iter_stop) {
292 arch_jump_label_text_poke_early(iter->code); 139 arch_jump_label_text_poke_early(iter->code);
293 iter++; 140 if (iter->key == (jump_label_t)(unsigned long)key)
141 continue;
142
143 key = (struct jump_label_key *)(unsigned long)iter->key;
144 atomic_set(&key->enabled, 0);
145 key->entries = iter;
146#ifdef CONFIG_MODULES
147 key->next = NULL;
148#endif
294 } 149 }
295 jump_label_unlock(); 150 jump_label_unlock();
296 return ret; 151
152 return 0;
297} 153}
298early_initcall(init_jump_label); 154early_initcall(jump_label_init);
299 155
300#ifdef CONFIG_MODULES 156#ifdef CONFIG_MODULES
301 157
302static struct jump_label_module_entry * 158struct jump_label_mod {
303add_jump_label_module_entry(struct jump_label_entry *entry, 159 struct jump_label_mod *next;
304 struct jump_entry *iter_begin, 160 struct jump_entry *entries;
305 int count, struct module *mod) 161 struct module *mod;
162};
163
164static int __jump_label_mod_text_reserved(void *start, void *end)
165{
166 struct module *mod;
167
168 mod = __module_text_address((unsigned long)start);
169 if (!mod)
170 return 0;
171
172 WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
173
174 return __jump_label_text_reserved(mod->jump_entries,
175 mod->jump_entries + mod->num_jump_entries,
176 start, end);
177}
178
179static void __jump_label_mod_update(struct jump_label_key *key, int enable)
180{
181 struct jump_label_mod *mod = key->next;
182
183 while (mod) {
184 __jump_label_update(key, mod->entries, enable);
185 mod = mod->next;
186 }
187}
188
189/***
190 * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop()
191 * @mod: module to patch
192 *
193 * Allow for run-time selection of the optimal nops. Before the module
194 * loads, patch these with arch_get_jump_label_nop(), which is specified by
195 * the arch specific jump label code.
196 */
197void jump_label_apply_nops(struct module *mod)
306{ 198{
307 struct jump_label_module_entry *e; 199 struct jump_entry *iter_start = mod->jump_entries;
308 200 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
309 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); 201 struct jump_entry *iter;
310 if (!e) 202
311 return ERR_PTR(-ENOMEM); 203 /* if the module doesn't have jump label entries, just return */
312 e->mod = mod; 204 if (iter_start == iter_stop)
313 e->nr_entries = count; 205 return;
314 e->table = iter_begin; 206
315 hlist_add_head(&e->hlist, &entry->modules); 207 for (iter = iter_start; iter < iter_stop; iter++)
316 return e; 208 arch_jump_label_text_poke_early(iter->code);
317} 209}
318 210
319static int add_jump_label_module(struct module *mod) 211static int jump_label_add_module(struct module *mod)
320{ 212{
321 struct jump_entry *iter, *iter_begin; 213 struct jump_entry *iter_start = mod->jump_entries;
322 struct jump_label_entry *entry; 214 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
323 struct jump_label_module_entry *module_entry; 215 struct jump_entry *iter;
324 int count; 216 struct jump_label_key *key = NULL;
217 struct jump_label_mod *jlm;
325 218
326 /* if the module doesn't have jump label entries, just return */ 219 /* if the module doesn't have jump label entries, just return */
327 if (!mod->num_jump_entries) 220 if (iter_start == iter_stop)
328 return 0; 221 return 0;
329 222
330 sort_jump_label_entries(mod->jump_entries, 223 jump_label_sort_entries(iter_start, iter_stop);
331 mod->jump_entries + mod->num_jump_entries); 224
332 iter = mod->jump_entries; 225 for (iter = iter_start; iter < iter_stop; iter++) {
333 while (iter < mod->jump_entries + mod->num_jump_entries) { 226 if (iter->key == (jump_label_t)(unsigned long)key)
334 entry = get_jump_label_entry(iter->key); 227 continue;
335 iter_begin = iter; 228
336 count = 0; 229 key = (struct jump_label_key *)(unsigned long)iter->key;
337 while ((iter < mod->jump_entries + mod->num_jump_entries) && 230
338 (iter->key == iter_begin->key)) { 231 if (__module_address(iter->key) == mod) {
339 iter++; 232 atomic_set(&key->enabled, 0);
340 count++; 233 key->entries = iter;
341 } 234 key->next = NULL;
342 if (!entry) { 235 continue;
343 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
344 if (IS_ERR(entry))
345 return PTR_ERR(entry);
346 } 236 }
347 module_entry = add_jump_label_module_entry(entry, iter_begin, 237
348 count, mod); 238 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
349 if (IS_ERR(module_entry)) 239 if (!jlm)
350 return PTR_ERR(module_entry); 240 return -ENOMEM;
241
242 jlm->mod = mod;
243 jlm->entries = iter;
244 jlm->next = key->next;
245 key->next = jlm;
246
247 if (jump_label_enabled(key))
248 __jump_label_update(key, iter, JUMP_LABEL_ENABLE);
351 } 249 }
250
352 return 0; 251 return 0;
353} 252}
354 253
355static void remove_jump_label_module(struct module *mod) 254static void jump_label_del_module(struct module *mod)
356{ 255{
357 struct hlist_head *head; 256 struct jump_entry *iter_start = mod->jump_entries;
358 struct hlist_node *node, *node_next, *module_node, *module_node_next; 257 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
359 struct jump_label_entry *e; 258 struct jump_entry *iter;
360 struct jump_label_module_entry *e_module; 259 struct jump_label_key *key = NULL;
361 int i; 260 struct jump_label_mod *jlm, **prev;
362 261
363 /* if the module doesn't have jump label entries, just return */ 262 for (iter = iter_start; iter < iter_stop; iter++) {
364 if (!mod->num_jump_entries) 263 if (iter->key == (jump_label_t)(unsigned long)key)
365 return; 264 continue;
265
266 key = (struct jump_label_key *)(unsigned long)iter->key;
267
268 if (__module_address(iter->key) == mod)
269 continue;
270
271 prev = &key->next;
272 jlm = key->next;
366 273
367 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 274 while (jlm && jlm->mod != mod) {
368 head = &jump_label_table[i]; 275 prev = &jlm->next;
369 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 276 jlm = jlm->next;
370 hlist_for_each_entry_safe(e_module, module_node, 277 }
371 module_node_next, 278
372 &(e->modules), hlist) { 279 if (jlm) {
373 if (e_module->mod == mod) { 280 *prev = jlm->next;
374 hlist_del(&e_module->hlist); 281 kfree(jlm);
375 kfree(e_module);
376 }
377 }
378 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
379 hlist_del(&e->hlist);
380 kfree(e);
381 }
382 } 282 }
383 } 283 }
384} 284}
385 285
386static void remove_jump_label_module_init(struct module *mod) 286static void jump_label_invalidate_module_init(struct module *mod)
387{ 287{
388 struct hlist_head *head; 288 struct jump_entry *iter_start = mod->jump_entries;
389 struct hlist_node *node, *node_next, *module_node, *module_node_next; 289 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
390 struct jump_label_entry *e;
391 struct jump_label_module_entry *e_module;
392 struct jump_entry *iter; 290 struct jump_entry *iter;
393 int i, count;
394
395 /* if the module doesn't have jump label entries, just return */
396 if (!mod->num_jump_entries)
397 return;
398 291
399 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 292 for (iter = iter_start; iter < iter_stop; iter++) {
400 head = &jump_label_table[i]; 293 if (within_module_init(iter->code, mod))
401 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 294 iter->code = 0;
402 hlist_for_each_entry_safe(e_module, module_node,
403 module_node_next,
404 &(e->modules), hlist) {
405 if (e_module->mod != mod)
406 continue;
407 count = e_module->nr_entries;
408 iter = e_module->table;
409 while (count--) {
410 if (within_module_init(iter->code, mod))
411 iter->key = 0;
412 iter++;
413 }
414 }
415 }
416 } 295 }
417} 296}
418 297
@@ -426,59 +305,77 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
426 switch (val) { 305 switch (val) {
427 case MODULE_STATE_COMING: 306 case MODULE_STATE_COMING:
428 jump_label_lock(); 307 jump_label_lock();
429 ret = add_jump_label_module(mod); 308 ret = jump_label_add_module(mod);
430 if (ret) 309 if (ret)
431 remove_jump_label_module(mod); 310 jump_label_del_module(mod);
432 jump_label_unlock(); 311 jump_label_unlock();
433 break; 312 break;
434 case MODULE_STATE_GOING: 313 case MODULE_STATE_GOING:
435 jump_label_lock(); 314 jump_label_lock();
436 remove_jump_label_module(mod); 315 jump_label_del_module(mod);
437 jump_label_unlock(); 316 jump_label_unlock();
438 break; 317 break;
439 case MODULE_STATE_LIVE: 318 case MODULE_STATE_LIVE:
440 jump_label_lock(); 319 jump_label_lock();
441 remove_jump_label_module_init(mod); 320 jump_label_invalidate_module_init(mod);
442 jump_label_unlock(); 321 jump_label_unlock();
443 break; 322 break;
444 } 323 }
445 return ret;
446}
447 324
448/*** 325 return notifier_from_errno(ret);
449 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
450 * @mod: module to patch
451 *
452 * Allow for run-time selection of the optimal nops. Before the module
453 * loads patch these with arch_get_jump_label_nop(), which is specified by
454 * the arch specific jump label code.
455 */
456void jump_label_apply_nops(struct module *mod)
457{
458 struct jump_entry *iter;
459
460 /* if the module doesn't have jump label entries, just return */
461 if (!mod->num_jump_entries)
462 return;
463
464 iter = mod->jump_entries;
465 while (iter < mod->jump_entries + mod->num_jump_entries) {
466 arch_jump_label_text_poke_early(iter->code);
467 iter++;
468 }
469} 326}
470 327
471struct notifier_block jump_label_module_nb = { 328struct notifier_block jump_label_module_nb = {
472 .notifier_call = jump_label_module_notify, 329 .notifier_call = jump_label_module_notify,
473 .priority = 0, 330 .priority = 1, /* higher than tracepoints */
474}; 331};
475 332
476static __init int init_jump_label_module(void) 333static __init int jump_label_init_module(void)
477{ 334{
478 return register_module_notifier(&jump_label_module_nb); 335 return register_module_notifier(&jump_label_module_nb);
479} 336}
480early_initcall(init_jump_label_module); 337early_initcall(jump_label_init_module);
481 338
482#endif /* CONFIG_MODULES */ 339#endif /* CONFIG_MODULES */
483 340
341/***
342 * jump_label_text_reserved - check if addr range is reserved
343 * @start: start text addr
344 * @end: end text addr
345 *
346 * checks if the text addr located between @start and @end
347 * overlaps with any of the jump label patch addresses. Code
348 * that wants to modify kernel text should first verify that
349 * it does not overlap with any of the jump label addresses.
350 * Caller must hold jump_label_mutex.
351 *
352 * returns 1 if there is an overlap, 0 otherwise
353 */
354int jump_label_text_reserved(void *start, void *end)
355{
356 int ret = __jump_label_text_reserved(__start___jump_table,
357 __stop___jump_table, start, end);
358
359 if (ret)
360 return ret;
361
362#ifdef CONFIG_MODULES
363 ret = __jump_label_mod_text_reserved(start, end);
364#endif
365 return ret;
366}
367
368static void jump_label_update(struct jump_label_key *key, int enable)
369{
370 struct jump_entry *entry = key->entries;
371
372 /* if there are no users, entry can be NULL */
373 if (entry)
374 __jump_label_update(key, entry, enable);
375
376#ifdef CONFIG_MODULES
377 __jump_label_mod_update(key, enable);
378#endif
379}
380
484#endif 381#endif
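
The jump_label_inc()/jump_label_dec() pair above turns branch patching into a reference count: only the 0 -> 1 and 1 -> 0 transitions take jump_label_mutex and call into the arch patching code, while nested enables just bump the counter. A user-space sketch of that pattern (hypothetical names, C11 atomics standing in for the kernel's atomic_t; the kernel's dec path additionally avoids the mutex unless the count is about to reach zero):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_key {
	atomic_int enabled;
};

static pthread_mutex_t demo_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for arch_jump_label_transform(): the expensive step. */
static void patch_branches(struct demo_key *key, bool enable)
{
	printf("patching branches: %s\n", enable ? "enable" : "disable");
}

static void demo_key_inc(struct demo_key *key)
{
	int old = atomic_load(&key->enabled);

	/* Fast path, like atomic_inc_not_zero(): already enabled. */
	while (old > 0)
		if (atomic_compare_exchange_weak(&key->enabled, &old, old + 1))
			return;

	/* Slow path: we may be the 0 -> 1 transition, serialize it. */
	pthread_mutex_lock(&demo_mutex);
	if (atomic_fetch_add(&key->enabled, 1) == 0)
		patch_branches(key, true);
	pthread_mutex_unlock(&demo_mutex);
}

static void demo_key_dec(struct demo_key *key)
{
	/* Simplified: always lock; the kernel only locks on the 1 -> 0 drop. */
	pthread_mutex_lock(&demo_mutex);
	if (atomic_fetch_sub(&key->enabled, 1) == 1)
		patch_branches(key, false);
	pthread_mutex_unlock(&demo_mutex);
}

int main(void)
{
	struct demo_key key = { .enabled = 0 };

	demo_key_inc(&key);	/* patches: enable */
	demo_key_inc(&key);	/* counter only */
	demo_key_dec(&key);	/* counter only */
	demo_key_dec(&key);	/* patches: disable */
	return 0;
}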
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4e240a378df6..8d814cbc8109 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h> 35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h>
36 37
37#include <asm/page.h> 38#include <asm/page.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
@@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
144 /* Initialize the list of destination pages */ 145 /* Initialize the list of destination pages */
145 INIT_LIST_HEAD(&image->dest_pages); 146 INIT_LIST_HEAD(&image->dest_pages);
146 147
147 /* Initialize the list of unuseable pages */ 148 /* Initialize the list of unusable pages */
148 INIT_LIST_HEAD(&image->unuseable_pages); 149 INIT_LIST_HEAD(&image->unuseable_pages);
149 150
150 /* Read in the segments */ 151 /* Read in the segments */
@@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
454 /* Deal with the destination pages I have inadvertently allocated. 455 /* Deal with the destination pages I have inadvertently allocated.
455 * 456 *
456 * Ideally I would convert multi-page allocations into single 457 * Ideally I would convert multi-page allocations into single
457 * page allocations, and add everyting to image->dest_pages. 458 * page allocations, and add everything to image->dest_pages.
458 * 459 *
459 * For now it is simpler to just free the pages. 460 * For now it is simpler to just free the pages.
460 */ 461 */
@@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image)
602 /* Walk through and free any extra destination pages I may have */ 603 /* Walk through and free any extra destination pages I may have */
603 kimage_free_page_list(&image->dest_pages); 604 kimage_free_page_list(&image->dest_pages);
604 605
605 /* Walk through and free any unuseable pages I have cached */ 606 /* Walk through and free any unusable pages I have cached */
606 kimage_free_page_list(&image->unuseable_pages); 607 kimage_free_page_list(&image->unuseable_pages);
607 608
608} 609}
@@ -1530,8 +1531,7 @@ int kernel_kexec(void)
1530 if (error) 1531 if (error)
1531 goto Enable_cpus; 1532 goto Enable_cpus;
1532 local_irq_disable(); 1533 local_irq_disable();
1533 /* Suspend system devices */ 1534 error = syscore_suspend();
1534 error = sysdev_suspend(PMSG_FREEZE);
1535 if (error) 1535 if (error)
1536 goto Enable_irqs; 1536 goto Enable_irqs;
1537 } else 1537 } else
@@ -1546,7 +1546,7 @@ int kernel_kexec(void)
1546 1546
1547#ifdef CONFIG_KEXEC_JUMP 1547#ifdef CONFIG_KEXEC_JUMP
1548 if (kexec_image->preserve_context) { 1548 if (kexec_image->preserve_context) {
1549 sysdev_resume(); 1549 syscore_resume();
1550 Enable_irqs: 1550 Enable_irqs:
1551 local_irq_enable(); 1551 local_irq_enable();
1552 Enable_cpus: 1552 Enable_cpus:
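
kernel_kexec() above keeps the usual goto-style unwind while swapping sysdev_suspend()/sysdev_resume() for syscore_suspend()/syscore_resume(): whichever step fails jumps to the label that undoes the steps already taken. The shape of that pattern in isolation (user-space sketch, step names made up):

#include <stdio.h>

static int do_step(const char *name, int should_fail)
{
	printf("%s\n", name);
	return should_fail ? -1 : 0;
}

static int transition(void)
{
	int error;

	error = do_step("freeze devices", 0);
	if (error)
		goto out;

	error = do_step("disable irqs", 0);
	if (error)
		goto thaw_devices;

	error = do_step("suspend syscore", 1);	/* pretend this step fails */
	if (error)
		goto enable_irqs;

	/* the real work would run here, then fall through the unwind */
	do_step("resume syscore", 0);

enable_irqs:
	do_step("enable irqs", 0);
thaw_devices:
	do_step("thaw devices", 0);
out:
	return error;
}

int main(void)
{
	return transition() ? 1 : 0;
}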
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9cd0591c96a2..5ae0ff38425f 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -245,7 +245,6 @@ static void __call_usermodehelper(struct work_struct *work)
245 } 245 }
246} 246}
247 247
248#ifdef CONFIG_PM_SLEEP
249/* 248/*
250 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 249 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
251 * (used for preventing user land processes from being created after the user 250 * (used for preventing user land processes from being created after the user
@@ -301,6 +300,15 @@ void usermodehelper_enable(void)
301 usermodehelper_disabled = 0; 300 usermodehelper_disabled = 0;
302} 301}
303 302
303/**
304 * usermodehelper_is_disabled - check if new helpers are allowed to be started
305 */
306bool usermodehelper_is_disabled(void)
307{
308 return usermodehelper_disabled;
309}
310EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
311
304static void helper_lock(void) 312static void helper_lock(void)
305{ 313{
306 atomic_inc(&running_helpers); 314 atomic_inc(&running_helpers);
@@ -312,12 +320,6 @@ static void helper_unlock(void)
312 if (atomic_dec_and_test(&running_helpers)) 320 if (atomic_dec_and_test(&running_helpers))
313 wake_up(&running_helpers_waitq); 321 wake_up(&running_helpers_waitq);
314} 322}
315#else /* CONFIG_PM_SLEEP */
316#define usermodehelper_disabled 0
317
318static inline void helper_lock(void) {}
319static inline void helper_unlock(void) {}
320#endif /* CONFIG_PM_SLEEP */
321 323
322/** 324/**
323 * call_usermodehelper_setup - prepare to call a usermode helper 325 * call_usermodehelper_setup - prepare to call a usermode helper
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0b624e791805..3b053c04dd86 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -16,6 +16,7 @@
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/capability.h>
19 20
20#define KERNEL_ATTR_RO(_name) \ 21#define KERNEL_ATTR_RO(_name) \
21static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 22static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo);
131 132
132#endif /* CONFIG_KEXEC */ 133#endif /* CONFIG_KEXEC */
133 134
135/* whether file capabilities are enabled */
136static ssize_t fscaps_show(struct kobject *kobj,
137 struct kobj_attribute *attr, char *buf)
138{
139 return sprintf(buf, "%d\n", file_caps_enabled);
140}
141KERNEL_ATTR_RO(fscaps);
142
134/* 143/*
135 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 144 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
136 */ 145 */
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj;
158EXPORT_SYMBOL_GPL(kernel_kobj); 167EXPORT_SYMBOL_GPL(kernel_kobj);
159 168
160static struct attribute * kernel_attrs[] = { 169static struct attribute * kernel_attrs[] = {
170 &fscaps_attr.attr,
161#if defined(CONFIG_HOTPLUG) 171#if defined(CONFIG_HOTPLUG)
162 &uevent_seqnum_attr.attr, 172 &uevent_seqnum_attr.attr,
163 &uevent_helper_attr.attr, 173 &uevent_helper_attr.attr,
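
With the fscaps attribute wired into kernel_attrs[], the value of file_caps_enabled becomes readable from user space at /sys/kernel/fscaps. A minimal reader, assuming a kernel that already carries this change:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/fscaps", "r");
	int enabled;

	if (!f) {
		perror("/sys/kernel/fscaps");	/* older kernel: attribute absent */
		return 1;
	}
	if (fscanf(f, "%d", &enabled) == 1)
		printf("file capabilities %s\n", enabled ? "enabled" : "disabled");
	fclose(f);
	return 0;
}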
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 684ab3f7dd72..3b34d2732bce 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -139,7 +139,7 @@ static void create_kthread(struct kthread_create_info *create)
139 * in @node, to get NUMA affinity for kthread stack, or else give -1. 139 * in @node, to get NUMA affinity for kthread stack, or else give -1.
140 * When woken, the thread will run @threadfn() with @data as its 140 * When woken, the thread will run @threadfn() with @data as its
141 * argument. @threadfn() can either call do_exit() directly if it is a 141 * argument. @threadfn() can either call do_exit() directly if it is a
142 * standalone thread for which noone will call kthread_stop(), or 142 * standalone thread for which no one will call kthread_stop(), or
143 * return when 'kthread_should_stop()' is true (which means 143 * return when 'kthread_should_stop()' is true (which means
144 * kthread_stop() has been called). The return value should be zero 144 * kthread_stop() has been called). The return value should be zero
145 * or a negative error number; it will be passed to kthread_stop(). 145 * or a negative error number; it will be passed to kthread_stop().
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ee74b35e528d..376066e10413 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk,
153} 153}
154 154
155/** 155/**
156 * __account_scheduler_latency - record an occured latency 156 * __account_scheduler_latency - record an occurred latency
157 * @tsk - the task struct of the task hitting the latency 157 * @tsk - the task struct of the task hitting the latency
158 * @usecs - the duration of the latency in microseconds 158 * @usecs - the duration of the latency in microseconds
159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible 159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0d2058da80f5..63437d065ac8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
490 usage[i] = '\0'; 490 usage[i] = '\0';
491} 491}
492 492
493static int __print_lock_name(struct lock_class *class)
494{
495 char str[KSYM_NAME_LEN];
496 const char *name;
497
498 name = class->name;
499 if (!name)
500 name = __get_key_name(class->key, str);
501
502 return printk("%s", name);
503}
504
493static void print_lock_name(struct lock_class *class) 505static void print_lock_name(struct lock_class *class)
494{ 506{
495 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; 507 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
@@ -1053,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth)
1053 return 0; 1065 return 0;
1054} 1066}
1055 1067
1068static void
1069print_circular_lock_scenario(struct held_lock *src,
1070 struct held_lock *tgt,
1071 struct lock_list *prt)
1072{
1073 struct lock_class *source = hlock_class(src);
1074 struct lock_class *target = hlock_class(tgt);
1075 struct lock_class *parent = prt->class;
1076
1077 /*
1078 * A direct locking problem where unsafe_class lock is taken
1079 * directly by safe_class lock, then all we need to show
1080 * is the deadlock scenario, as it is obvious that the
1081 * unsafe lock is taken under the safe lock.
1082 *
1083 * But if there is a chain instead, where the safe lock takes
1084 * an intermediate lock (middle_class) where this lock is
1085 * not the same as the safe lock, then the lock chain is
1086 * used to describe the problem. Otherwise we would need
1087 * to show a different CPU case for each link in the chain
1088 * from the safe_class lock to the unsafe_class lock.
1089 */
1090 if (parent != source) {
1091 printk("Chain exists of:\n ");
1092 __print_lock_name(source);
1093 printk(" --> ");
1094 __print_lock_name(parent);
1095 printk(" --> ");
1096 __print_lock_name(target);
1097 printk("\n\n");
1098 }
1099
1100 printk(" Possible unsafe locking scenario:\n\n");
1101 printk(" CPU0 CPU1\n");
1102 printk(" ---- ----\n");
1103 printk(" lock(");
1104 __print_lock_name(target);
1105 printk(");\n");
1106 printk(" lock(");
1107 __print_lock_name(parent);
1108 printk(");\n");
1109 printk(" lock(");
1110 __print_lock_name(target);
1111 printk(");\n");
1112 printk(" lock(");
1113 __print_lock_name(source);
1114 printk(");\n");
1115 printk("\n *** DEADLOCK ***\n\n");
1116}
1117
1056/* 1118/*
1057 * When a circular dependency is detected, print the 1119 * When a circular dependency is detected, print the
1058 * header first: 1120 * header first:
@@ -1096,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1096{ 1158{
1097 struct task_struct *curr = current; 1159 struct task_struct *curr = current;
1098 struct lock_list *parent; 1160 struct lock_list *parent;
1161 struct lock_list *first_parent;
1099 int depth; 1162 int depth;
1100 1163
1101 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1164 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
@@ -1109,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1109 print_circular_bug_header(target, depth, check_src, check_tgt); 1172 print_circular_bug_header(target, depth, check_src, check_tgt);
1110 1173
1111 parent = get_lock_parent(target); 1174 parent = get_lock_parent(target);
1175 first_parent = parent;
1112 1176
1113 while (parent) { 1177 while (parent) {
1114 print_circular_bug_entry(parent, --depth); 1178 print_circular_bug_entry(parent, --depth);
@@ -1116,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this,
1116 } 1180 }
1117 1181
1118 printk("\nother info that might help us debug this:\n\n"); 1182 printk("\nother info that might help us debug this:\n\n");
1183 print_circular_lock_scenario(check_src, check_tgt,
1184 first_parent);
1185
1119 lockdep_print_held_locks(curr); 1186 lockdep_print_held_locks(curr);
1120 1187
1121 printk("\nstack backtrace:\n"); 1188 printk("\nstack backtrace:\n");
@@ -1314,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1314 printk("\n"); 1381 printk("\n");
1315 1382
1316 if (depth == 0 && (entry != root)) { 1383 if (depth == 0 && (entry != root)) {
1317 printk("lockdep:%s bad BFS generated tree\n", __func__); 1384 printk("lockdep:%s bad path found in chain graph\n", __func__);
1318 break; 1385 break;
1319 } 1386 }
1320 1387
@@ -1325,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1325 return; 1392 return;
1326} 1393}
1327 1394
1395static void
1396print_irq_lock_scenario(struct lock_list *safe_entry,
1397 struct lock_list *unsafe_entry,
1398 struct lock_class *prev_class,
1399 struct lock_class *next_class)
1400{
1401 struct lock_class *safe_class = safe_entry->class;
1402 struct lock_class *unsafe_class = unsafe_entry->class;
1403 struct lock_class *middle_class = prev_class;
1404
1405 if (middle_class == safe_class)
1406 middle_class = next_class;
1407
1408 /*
1409 * A direct locking problem where unsafe_class lock is taken
1410 * directly by safe_class lock, then all we need to show
1411 * is the deadlock scenario, as it is obvious that the
1412 * unsafe lock is taken under the safe lock.
1413 *
1414 * But if there is a chain instead, where the safe lock takes
1415 * an intermediate lock (middle_class) where this lock is
1416 * not the same as the safe lock, then the lock chain is
1417 * used to describe the problem. Otherwise we would need
1418 * to show a different CPU case for each link in the chain
1419 * from the safe_class lock to the unsafe_class lock.
1420 */
1421 if (middle_class != unsafe_class) {
1422 printk("Chain exists of:\n ");
1423 __print_lock_name(safe_class);
1424 printk(" --> ");
1425 __print_lock_name(middle_class);
1426 printk(" --> ");
1427 __print_lock_name(unsafe_class);
1428 printk("\n\n");
1429 }
1430
1431 printk(" Possible interrupt unsafe locking scenario:\n\n");
1432 printk(" CPU0 CPU1\n");
1433 printk(" ---- ----\n");
1434 printk(" lock(");
1435 __print_lock_name(unsafe_class);
1436 printk(");\n");
1437 printk(" local_irq_disable();\n");
1438 printk(" lock(");
1439 __print_lock_name(safe_class);
1440 printk(");\n");
1441 printk(" lock(");
1442 __print_lock_name(middle_class);
1443 printk(");\n");
1444 printk(" <Interrupt>\n");
1445 printk(" lock(");
1446 __print_lock_name(safe_class);
1447 printk(");\n");
1448 printk("\n *** DEADLOCK ***\n\n");
1449}
1450
1328static int 1451static int
1329print_bad_irq_dependency(struct task_struct *curr, 1452print_bad_irq_dependency(struct task_struct *curr,
1330 struct lock_list *prev_root, 1453 struct lock_list *prev_root,
@@ -1376,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr,
1376 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); 1499 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1377 1500
1378 printk("\nother info that might help us debug this:\n\n"); 1501 printk("\nother info that might help us debug this:\n\n");
1502 print_irq_lock_scenario(backwards_entry, forwards_entry,
1503 hlock_class(prev), hlock_class(next));
1504
1379 lockdep_print_held_locks(curr); 1505 lockdep_print_held_locks(curr);
1380 1506
1381 printk("\nthe dependencies between %s-irq-safe lock", irqclass); 1507 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
@@ -1539,6 +1665,26 @@ static inline void inc_chains(void)
1539 1665
1540#endif 1666#endif
1541 1667
1668static void
1669print_deadlock_scenario(struct held_lock *nxt,
1670 struct held_lock *prv)
1671{
1672 struct lock_class *next = hlock_class(nxt);
1673 struct lock_class *prev = hlock_class(prv);
1674
1675 printk(" Possible unsafe locking scenario:\n\n");
1676 printk(" CPU0\n");
1677 printk(" ----\n");
1678 printk(" lock(");
1679 __print_lock_name(prev);
1680 printk(");\n");
1681 printk(" lock(");
1682 __print_lock_name(next);
1683 printk(");\n");
1684 printk("\n *** DEADLOCK ***\n\n");
1685 printk(" May be due to missing lock nesting notation\n\n");
1686}
1687
1542static int 1688static int
1543print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 1689print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1544 struct held_lock *next) 1690 struct held_lock *next)
@@ -1557,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1557 print_lock(prev); 1703 print_lock(prev);
1558 1704
1559 printk("\nother info that might help us debug this:\n"); 1705 printk("\nother info that might help us debug this:\n");
1706 print_deadlock_scenario(next, prev);
1560 lockdep_print_held_locks(curr); 1707 lockdep_print_held_locks(curr);
1561 1708
1562 printk("\nstack backtrace:\n"); 1709 printk("\nstack backtrace:\n");
@@ -1826,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1826 struct list_head *hash_head = chainhashentry(chain_key); 1973 struct list_head *hash_head = chainhashentry(chain_key);
1827 struct lock_chain *chain; 1974 struct lock_chain *chain;
1828 struct held_lock *hlock_curr, *hlock_next; 1975 struct held_lock *hlock_curr, *hlock_next;
1829 int i, j, n, cn; 1976 int i, j;
1830 1977
1831 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1832 return 0; 1979 return 0;
@@ -1886,15 +2033,9 @@ cache_hit:
1886 } 2033 }
1887 i++; 2034 i++;
1888 chain->depth = curr->lockdep_depth + 1 - i; 2035 chain->depth = curr->lockdep_depth + 1 - i;
1889 cn = nr_chain_hlocks; 2036 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1890 while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { 2037 chain->base = nr_chain_hlocks;
1891 n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); 2038 nr_chain_hlocks += chain->depth;
1892 if (n == cn)
1893 break;
1894 cn = n;
1895 }
1896 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1897 chain->base = cn;
1898 for (j = 0; j < chain->depth - 1; j++, i++) { 2039 for (j = 0; j < chain->depth - 1; j++, i++) {
1899 int lock_id = curr->held_locks[i].class_idx - 1; 2040 int lock_id = curr->held_locks[i].class_idx - 1;
1900 chain_hlocks[chain->base + j] = lock_id; 2041 chain_hlocks[chain->base + j] = lock_id;
@@ -2011,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr)
2011#endif 2152#endif
2012} 2153}
2013 2154
2155static void
2156print_usage_bug_scenario(struct held_lock *lock)
2157{
2158 struct lock_class *class = hlock_class(lock);
2159
2160 printk(" Possible unsafe locking scenario:\n\n");
2161 printk(" CPU0\n");
2162 printk(" ----\n");
2163 printk(" lock(");
2164 __print_lock_name(class);
2165 printk(");\n");
2166 printk(" <Interrupt>\n");
2167 printk(" lock(");
2168 __print_lock_name(class);
2169 printk(");\n");
2170 printk("\n *** DEADLOCK ***\n\n");
2171}
2172
2014static int 2173static int
2015print_usage_bug(struct task_struct *curr, struct held_lock *this, 2174print_usage_bug(struct task_struct *curr, struct held_lock *this,
2016 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 2175 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
@@ -2039,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2039 2198
2040 print_irqtrace_events(curr); 2199 print_irqtrace_events(curr);
2041 printk("\nother info that might help us debug this:\n"); 2200 printk("\nother info that might help us debug this:\n");
2201 print_usage_bug_scenario(this);
2202
2042 lockdep_print_held_locks(curr); 2203 lockdep_print_held_locks(curr);
2043 2204
2044 printk("\nstack backtrace:\n"); 2205 printk("\nstack backtrace:\n");
@@ -2073,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2073 struct held_lock *this, int forwards, 2234 struct held_lock *this, int forwards,
2074 const char *irqclass) 2235 const char *irqclass)
2075{ 2236{
2237 struct lock_list *entry = other;
2238 struct lock_list *middle = NULL;
2239 int depth;
2240
2076 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2077 return 0; 2242 return 0;
2078 2243
@@ -2091,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr,
2091 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2256 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
2092 2257
2093 printk("\nother info that might help us debug this:\n"); 2258 printk("\nother info that might help us debug this:\n");
2259
2260 /* Find a middle lock (if one exists) */
2261 depth = get_lock_depth(other);
2262 do {
2263 if (depth == 0 && (entry != root)) {
2264 printk("lockdep:%s bad path found in chain graph\n", __func__);
2265 break;
2266 }
2267 middle = entry;
2268 entry = get_lock_parent(entry);
2269 depth--;
2270 } while (entry && entry != root && (depth >= 0));
2271 if (forwards)
2272 print_irq_lock_scenario(root, other,
2273 middle ? middle->class : root->class, other->class);
2274 else
2275 print_irq_lock_scenario(other, root,
2276 middle ? middle->class : other->class, root->class);
2277
2094 lockdep_print_held_locks(curr); 2278 lockdep_print_held_locks(curr);
2095 2279
2096 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); 2280 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
@@ -2309,7 +2493,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2309 if (unlikely(curr->hardirqs_enabled)) { 2493 if (unlikely(curr->hardirqs_enabled)) {
2310 /* 2494 /*
2311 * Neither irq nor preemption are disabled here 2495 * Neither irq nor preemption are disabled here
2312 * so this is racy by nature but loosing one hit 2496 * so this is racy by nature but losing one hit
2313 * in a stat is not a big deal. 2497 * in a stat is not a big deal.
2314 */ 2498 */
2315 __debug_atomic_inc(redundant_hardirqs_on); 2499 __debug_atomic_inc(redundant_hardirqs_on);
@@ -2620,7 +2804,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2620 if (!graph_lock()) 2804 if (!graph_lock())
2621 return 0; 2805 return 0;
2622 /* 2806 /*
2623 * Make sure we didnt race: 2807 * Make sure we didn't race:
2624 */ 2808 */
2625 if (unlikely(hlock_class(this)->usage_mask & new_mask)) { 2809 if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
2626 graph_unlock(); 2810 graph_unlock();
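
The scenario printers added above (print_circular_lock_scenario() and friends) draw the classic lock-order inversion as a two-CPU diagram. The code shape they describe looks like this user-space analogue (lock names invented); if the two paths ran concurrently in opposite order they could deadlock, which is what the CPU0/CPU1 columns spell out:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* "CPU0": takes A then B. */
static void *path_one(void *arg)
{
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	puts("path one: A -> B");
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}

/* "CPU1": takes B then A -- the inverse order lockdep would flag. */
static void *path_two(void *arg)
{
	pthread_mutex_lock(&lock_b);
	pthread_mutex_lock(&lock_a);
	puts("path two: B -> A");
	pthread_mutex_unlock(&lock_a);
	pthread_mutex_unlock(&lock_b);
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* Run the two orderings one after another so the demo terminates;
	 * interleaved on two CPUs they can deadlock, which is the scenario
	 * the lockdep output draws. */
	pthread_create(&t, NULL, path_one, NULL);
	pthread_join(t, NULL);
	pthread_create(&t, NULL, path_two, NULL);
	pthread_join(t, NULL);
	return 0;
}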
diff --git a/kernel/module.c b/kernel/module.c
index 1f9f7bc56ca1..22879725678d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -57,6 +57,7 @@
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h> 59#include <linux/pfn.h>
60#include <linux/bsearch.h>
60 61
61#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
62#include <trace/events/module.h> 63#include <trace/events/module.h>
@@ -240,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr,
240 struct module *owner, 241 struct module *owner,
241 bool (*fn)(const struct symsearch *syms, 242 bool (*fn)(const struct symsearch *syms,
242 struct module *owner, 243 struct module *owner,
243 unsigned int symnum, void *data), 244 void *data),
244 void *data) 245 void *data)
245{ 246{
246 unsigned int i, j; 247 unsigned int j;
247 248
248 for (j = 0; j < arrsize; j++) { 249 for (j = 0; j < arrsize; j++) {
249 for (i = 0; i < arr[j].stop - arr[j].start; i++) 250 if (fn(&arr[j], owner, data))
250 if (fn(&arr[j], owner, i, data)) 251 return true;
251 return true;
252 } 252 }
253 253
254 return false; 254 return false;
255} 255}
256 256
257/* Returns true as soon as fn returns true, otherwise false. */ 257/* Returns true as soon as fn returns true, otherwise false. */
258bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, 258bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
259 unsigned int symnum, void *data), void *data) 259 struct module *owner,
260 void *data),
261 void *data)
260{ 262{
261 struct module *mod; 263 struct module *mod;
262 static const struct symsearch arr[] = { 264 static const struct symsearch arr[] = {
@@ -309,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
309 } 311 }
310 return false; 312 return false;
311} 313}
312EXPORT_SYMBOL_GPL(each_symbol); 314EXPORT_SYMBOL_GPL(each_symbol_section);
313 315
314struct find_symbol_arg { 316struct find_symbol_arg {
315 /* Input */ 317 /* Input */
@@ -323,15 +325,12 @@ struct find_symbol_arg {
323 const struct kernel_symbol *sym; 325 const struct kernel_symbol *sym;
324}; 326};
325 327
326static bool find_symbol_in_section(const struct symsearch *syms, 328static bool check_symbol(const struct symsearch *syms,
327 struct module *owner, 329 struct module *owner,
328 unsigned int symnum, void *data) 330 unsigned int symnum, void *data)
329{ 331{
330 struct find_symbol_arg *fsa = data; 332 struct find_symbol_arg *fsa = data;
331 333
332 if (strcmp(syms->start[symnum].name, fsa->name) != 0)
333 return false;
334
335 if (!fsa->gplok) { 334 if (!fsa->gplok) {
336 if (syms->licence == GPL_ONLY) 335 if (syms->licence == GPL_ONLY)
337 return false; 336 return false;
@@ -365,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms,
365 return true; 364 return true;
366} 365}
367 366
367static int cmp_name(const void *va, const void *vb)
368{
369 const char *a;
370 const struct kernel_symbol *b;
371 a = va; b = vb;
372 return strcmp(a, b->name);
373}
374
375static bool find_symbol_in_section(const struct symsearch *syms,
376 struct module *owner,
377 void *data)
378{
379 struct find_symbol_arg *fsa = data;
380 struct kernel_symbol *sym;
381
382 sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
383 sizeof(struct kernel_symbol), cmp_name);
384
385 if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
386 return true;
387
388 return false;
389}
390
368/* Find a symbol and return it, along with, (optional) crc and 391/* Find a symbol and return it, along with, (optional) crc and
369 * (optional) module which owns it. Needs preempt disabled or module_mutex. */ 392 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
370const struct kernel_symbol *find_symbol(const char *name, 393const struct kernel_symbol *find_symbol(const char *name,
@@ -379,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name,
379 fsa.gplok = gplok; 402 fsa.gplok = gplok;
380 fsa.warn = warn; 403 fsa.warn = warn;
381 404
382 if (each_symbol(find_symbol_in_section, &fsa)) { 405 if (each_symbol_section(find_symbol_in_section, &fsa)) {
383 if (owner) 406 if (owner)
384 *owner = fsa.owner; 407 *owner = fsa.owner;
385 if (crc) 408 if (crc)
@@ -809,7 +832,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
809 wait_for_zero_refcount(mod); 832 wait_for_zero_refcount(mod);
810 833
811 mutex_unlock(&module_mutex); 834 mutex_unlock(&module_mutex);
812 /* Final destruction now noone is using it. */ 835 /* Final destruction now no one is using it. */
813 if (mod->exit != NULL) 836 if (mod->exit != NULL)
814 mod->exit(); 837 mod->exit();
815 blocking_notifier_call_chain(&module_notify_list, 838 blocking_notifier_call_chain(&module_notify_list,
@@ -1607,27 +1630,28 @@ static void set_section_ro_nx(void *base,
1607 } 1630 }
1608} 1631}
1609 1632
1610/* Setting memory back to RW+NX before releasing it */ 1633static void unset_module_core_ro_nx(struct module *mod)
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{ 1634{
1613 unsigned long total_pages; 1635 set_page_attributes(mod->module_core + mod->core_text_size,
1614 1636 mod->module_core + mod->core_size,
1615 if (mod->module_core == module_region) { 1637 set_memory_x);
1616 /* Set core as NX+RW */ 1638 set_page_attributes(mod->module_core,
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); 1639 mod->module_core + mod->core_ro_size,
1618 set_memory_nx((unsigned long)mod->module_core, total_pages); 1640 set_memory_rw);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages); 1641}
1620 1642
1621 } else if (mod->module_init == module_region) { 1643static void unset_module_init_ro_nx(struct module *mod)
1622 /* Set init as NX+RW */ 1644{
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); 1645 set_page_attributes(mod->module_init + mod->init_text_size,
1624 set_memory_nx((unsigned long)mod->module_init, total_pages); 1646 mod->module_init + mod->init_size,
1625 set_memory_rw((unsigned long)mod->module_init, total_pages); 1647 set_memory_x);
1626 } 1648 set_page_attributes(mod->module_init,
1649 mod->module_init + mod->init_ro_size,
1650 set_memory_rw);
1627} 1651}
1628 1652
1629/* Iterate through all modules and set each module's text as RW */ 1653/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw() 1654void set_all_modules_text_rw(void)
1631{ 1655{
1632 struct module *mod; 1656 struct module *mod;
1633 1657
@@ -1648,7 +1672,7 @@ void set_all_modules_text_rw()
1648} 1672}
1649 1673
1650/* Iterate through all modules and set each module's text as RO */ 1674/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro() 1675void set_all_modules_text_ro(void)
1652{ 1676{
1653 struct module *mod; 1677 struct module *mod;
1654 1678
@@ -1669,7 +1693,8 @@ void set_all_modules_text_ro()
1669} 1693}
1670#else 1694#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } 1695static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } 1696static void unset_module_core_ro_nx(struct module *mod) { }
1697static void unset_module_init_ro_nx(struct module *mod) { }
1673#endif 1698#endif
1674 1699
1675/* Free a module, remove from lists, etc. */ 1700/* Free a module, remove from lists, etc. */
@@ -1696,7 +1721,7 @@ static void free_module(struct module *mod)
1696 destroy_params(mod->kp, mod->num_kp); 1721 destroy_params(mod->kp, mod->num_kp);
1697 1722
1698 /* This may be NULL, but that's OK */ 1723 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init); 1724 unset_module_init_ro_nx(mod);
1700 module_free(mod, mod->module_init); 1725 module_free(mod, mod->module_init);
1701 kfree(mod->args); 1726 kfree(mod->args);
1702 percpu_modfree(mod); 1727 percpu_modfree(mod);
@@ -1705,7 +1730,7 @@ static void free_module(struct module *mod)
1705 lockdep_free_key_range(mod->module_core, mod->core_size); 1730 lockdep_free_key_range(mod->module_core, mod->core_size);
1706 1731
1707 /* Finally, free the core (containing the module structure) */ 1732 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core); 1733 unset_module_core_ro_nx(mod);
1709 module_free(mod, mod->module_core); 1734 module_free(mod, mod->module_core);
1710 1735
1711#ifdef CONFIG_MPU 1736#ifdef CONFIG_MPU
@@ -2030,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
2030 const struct kernel_symbol *start, 2055 const struct kernel_symbol *start,
2031 const struct kernel_symbol *stop) 2056 const struct kernel_symbol *stop)
2032{ 2057{
2033 const struct kernel_symbol *ks = start; 2058 return bsearch(name, start, stop - start,
2034 for (; ks < stop; ks++) 2059 sizeof(struct kernel_symbol), cmp_name);
2035 if (strcmp(ks->name, name) == 0)
2036 return ks;
2037 return NULL;
2038} 2060}
2039 2061
2040static int is_exported(const char *name, unsigned long value, 2062static int is_exported(const char *name, unsigned long value,
@@ -2777,7 +2799,7 @@ static struct module *load_module(void __user *umod,
2777 mod->state = MODULE_STATE_COMING; 2799 mod->state = MODULE_STATE_COMING;
2778 2800
2779 /* Now sew it into the lists so we can get lockdep and oops 2801 /* Now sew it into the lists so we can get lockdep and oops
2780 * info during argument parsing. Noone should access us, since 2802 * info during argument parsing. No one should access us, since
2781 * strong_try_module_get() will fail. 2803 * strong_try_module_get() will fail.
2782 * lockdep/oops can run asynchronous, so use the RCU list insertion 2804 * lockdep/oops can run asynchronous, so use the RCU list insertion
2783 * function to insert in a way safe to concurrent readers. 2805 * function to insert in a way safe to concurrent readers.
@@ -2931,10 +2953,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2931 mod->symtab = mod->core_symtab; 2953 mod->symtab = mod->core_symtab;
2932 mod->strtab = mod->core_strtab; 2954 mod->strtab = mod->core_strtab;
2933#endif 2955#endif
2934 unset_section_ro_nx(mod, mod->module_init); 2956 unset_module_init_ro_nx(mod);
2935 module_free(mod, mod->module_init); 2957 module_free(mod, mod->module_init);
2936 mod->module_init = NULL; 2958 mod->module_init = NULL;
2937 mod->init_size = 0; 2959 mod->init_size = 0;
2960 mod->init_ro_size = 0;
2938 mod->init_text_size = 0; 2961 mod->init_text_size = 0;
2939 mutex_unlock(&module_mutex); 2962 mutex_unlock(&module_mutex);
2940 2963
@@ -2971,7 +2994,7 @@ static const char *get_ksymbol(struct module *mod,
2971 else 2994 else
2972 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2995 nextval = (unsigned long)mod->module_core+mod->core_text_size;
2973 2996
2974 /* Scan for closest preceeding symbol, and next symbol. (ELF 2997 /* Scan for closest preceding symbol, and next symbol. (ELF
2975 starts real symbols at 1). */ 2998 starts real symbols at 1). */
2976 for (i = 1; i < mod->num_symtab; i++) { 2999 for (i = 1; i < mod->num_symtab; i++) {
2977 if (mod->symtab[i].st_shndx == SHN_UNDEF) 3000 if (mod->symtab[i].st_shndx == SHN_UNDEF)
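
The module.c changes replace the linear symbol walks with bsearch() over name-sorted export tables; cmp_name() compares the search key (a plain string) with a table element, and both lookup_symbol() and find_symbol_in_section() reuse it. The same pattern in a self-contained user-space form (table contents invented):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct symbol {
	const char *name;
	unsigned long value;
};

/* The table must already be sorted by name, as the kernel's export
 * sections are after this change. */
static const struct symbol symtab[] = {
	{ "memcpy",   0x1000 },
	{ "printk",   0x2000 },
	{ "schedule", 0x3000 },
};

/* Same shape as cmp_name(): key string vs. table element. */
static int cmp_name(const void *va, const void *vb)
{
	const char *a = va;
	const struct symbol *b = vb;

	return strcmp(a, b->name);
}

static const struct symbol *lookup(const char *name)
{
	return bsearch(name, symtab, sizeof(symtab) / sizeof(symtab[0]),
		       sizeof(struct symbol), cmp_name);
}

int main(void)
{
	const struct symbol *s = lookup("printk");

	if (s)
		printf("%s = %#lx\n", s->name, s->value);
	return 0;
}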
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 78 DEBUG_LOCKS_WARN_ON(lock->owner != current);
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 80 mutex_clear_owner(lock);
81} 81}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current_thread_info(); 32 lock->owner = current;
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a5889fb28ecf..2c938e2337cd 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
160 */ 160 */
161 161
162 for (;;) { 162 for (;;) {
163 struct thread_info *owner; 163 struct task_struct *owner;
164
165 /*
166 * If we own the BKL, then don't spin. The owner of
167 * the mutex might be waiting on us to release the BKL.
168 */
169 if (unlikely(current->lock_depth >= 0))
170 break;
171 164
172 /* 165 /*
173 * If there's an owner, wait for it to either 166 * If there's an owner, wait for it to either
@@ -245,7 +238,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
245 } 238 }
246 __set_task_state(task, state); 239 __set_task_state(task, state);
247 240
248 /* didnt get the lock, go to sleep: */ 241 /* didn't get the lock, go to sleep: */
249 spin_unlock_mutex(&lock->wait_lock, flags); 242 spin_unlock_mutex(&lock->wait_lock, flags);
250 preempt_enable_no_resched(); 243 preempt_enable_no_resched();
251 schedule(); 244 schedule();
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
19#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current_thread_info(); 22 lock->owner = current;
23} 23}
24 24
25static inline void mutex_clear_owner(struct mutex *lock) 25static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/padata.c b/kernel/padata.c
index 751019415d23..b91941df5e63 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd)
262 /* 262 /*
263 * This cpu has to do the parallel processing of the next 263 * This cpu has to do the parallel processing of the next
264 * object. It's waiting in the cpu's parallelization queue, 264 * object. It's waiting in the cpu's parallelization queue,
265 * so exit imediately. 265 * so exit immediately.
266 */ 266 */
267 if (PTR_ERR(padata) == -ENODATA) { 267 if (PTR_ERR(padata) == -ENODATA) {
268 del_timer(&pd->timer); 268 del_timer(&pd->timer);
@@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd)
284 /* 284 /*
285 * The next object that needs serialization might have arrived to 285 * The next object that needs serialization might have arrived to
286 * the reorder queues in the meantime, we will be called again 286 * the reorder queues in the meantime, we will be called again
287 * from the timer function if noone else cares for it. 287 * from the timer function if no one else cares for it.
288 */ 288 */
289 if (atomic_read(&pd->reorder_objects) 289 if (atomic_read(&pd->reorder_objects)
290 && !(pinst->flags & PADATA_RESET)) 290 && !(pinst->flags & PADATA_RESET))
@@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst)
515 put_online_cpus(); 515 put_online_cpus();
516} 516}
517 517
518/* Replace the internal control stucture with a new one. */ 518/* Replace the internal control structure with a new one. */
519static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
520 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
521{ 521{
@@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
768} 768}
769 769
770 /** 770 /**
771 * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) 771 * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
772 * padata cpumasks. 772 * padata cpumasks.
773 * 773 *
774 * @pinst: padata instance 774 * @pinst: padata instance
diff --git a/kernel/params.c b/kernel/params.c
index 0da1411222b9..ed72e1330862 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -95,7 +95,7 @@ static int parse_one(char *param,
95 /* Find parameter */ 95 /* Find parameter */
96 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
97 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
98 /* Noone handled NULL, so do it here. */ 98 /* No one handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool) 99 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL; 100 return -EINVAL;
101 DEBUGP("They are equal! Calling %p\n", 101 DEBUGP("They are equal! Calling %p\n",
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp);
297int param_set_bool(const char *val, const struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
298{ 298{
299 bool v; 299 bool v;
300 int ret;
300 301
301 /* No equals means "set"... */ 302 /* No equals means "set"... */
302 if (!val) val = "1"; 303 if (!val) val = "1";
303 304
304 /* One of =[yYnN01] */ 305 /* One of =[yYnN01] */
305 switch (val[0]) { 306 ret = strtobool(val, &v);
306 case 'y': case 'Y': case '1': 307 if (ret)
307 v = true; 308 return ret;
308 break;
309 case 'n': case 'N': case '0':
310 v = false;
311 break;
312 default:
313 return -EINVAL;
314 }
315 309
316 if (kp->flags & KPARAM_ISBOOL) 310 if (kp->flags & KPARAM_ISBOOL)
317 *(bool *)kp->arg = v; 311 *(bool *)kp->arg = v;
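[Editor's note] param_set_bool() now delegates the [yYnN01] parsing to strtobool() instead of open-coding a switch. For reference, a user-space sketch of the parsing contract described above (hypothetical helper name; the NULL-means-"1" rule belongs to the module-parameter code, not to strtobool() itself):

    #include <stdio.h>
    #include <stdbool.h>

    /* Sketch of strtobool()-style parsing: accept [yY1] as true, [nN0] as
     * false, return 0 on success or -EINVAL (-22 here) otherwise. */
    static int parse_bool(const char *val, bool *res)
    {
        if (!val)
            val = "1";    /* "no equals" means "set", as in parse_one() */

        switch (val[0]) {
        case 'y': case 'Y': case '1':
            *res = true;
            return 0;
        case 'n': case 'N': case '0':
            *res = false;
            return 0;
        default:
            return -22;   /* -EINVAL */
        }
    }

    int main(void)
    {
        bool v;
        const char *samples[] = { "Y", "0", "maybe", NULL };

        for (int i = 0; i < 4; i++) {
            int ret = parse_bool(samples[i], &v);
            printf("%-6s -> ret=%d v=%d\n",
                   samples[i] ? samples[i] : "(null)", ret, ret ? 0 : v);
        }
        return 0;
    }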
@@ -821,15 +815,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
821 return sprintf(buf, "%s\n", vattr->version); 815 return sprintf(buf, "%s\n", vattr->version);
822} 816}
823 817
824extern struct module_version_attribute __start___modver[], __stop___modver[]; 818extern const struct module_version_attribute *__start___modver[];
819extern const struct module_version_attribute *__stop___modver[];
825 820
826static void __init version_sysfs_builtin(void) 821static void __init version_sysfs_builtin(void)
827{ 822{
828 const struct module_version_attribute *vattr; 823 const struct module_version_attribute **p;
829 struct module_kobject *mk; 824 struct module_kobject *mk;
830 int err; 825 int err;
831 826
832 for (vattr = __start___modver; vattr < __stop___modver; vattr++) { 827 for (p = __start___modver; p < __stop___modver; p++) {
828 const struct module_version_attribute *vattr = *p;
829
833 mk = locate_module_kobject(vattr->module_name); 830 mk = locate_module_kobject(vattr->module_name);
834 if (mk) { 831 if (mk) {
835 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); 832 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
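[Editor's note] version_sysfs_builtin() now walks a linker section of pointers to module_version_attribute rather than an array of the structures themselves, which avoids depending on the structures being packed contiguously with no alignment padding. A hedged user-space sketch of the pointer-walk pattern, with made-up names standing in for the __start___modver/__stop___modver section bounds:

    #include <stdio.h>

    struct version_attr {
        const char *module_name;
        const char *version;
    };

    static const struct version_attr v1 = { "alpha", "1.0" };
    static const struct version_attr v2 = { "beta",  "2.3" };

    /* In the kernel these bounds come from the linker script; here the
     * "section" is just a hand-built array of pointers. */
    static const struct version_attr *start_modver[] = { &v1, &v2 };
    static const struct version_attr **stop_modver =
        start_modver + sizeof(start_modver) / sizeof(start_modver[0]);

    int main(void)
    {
        const struct version_attr **p;

        for (p = start_modver; p < stop_modver; p++) {
            const struct version_attr *vattr = *p;    /* one extra deref */

            printf("%s: %s\n", vattr->module_name, vattr->version);
        }
        return 0;
    }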
diff --git a/kernel/pid.c b/kernel/pid.c
index 02f221274265..57a8346a270e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
217 return -1; 217 return -1;
218} 218}
219 219
220int next_pidmap(struct pid_namespace *pid_ns, int last) 220int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
221{ 221{
222 int offset; 222 int offset;
223 struct pidmap *map, *end; 223 struct pidmap *map, *end;
224 224
225 if (last >= PID_MAX_LIMIT)
226 return -1;
227
225 offset = (last + 1) & BITS_PER_PAGE_MASK; 228 offset = (last + 1) & BITS_PER_PAGE_MASK;
226 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; 229 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
227 end = &pid_ns->pidmap[PIDMAP_ENTRIES]; 230 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
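[Editor's note] next_pidmap() now takes an unsigned last and bails out early when last >= PID_MAX_LIMIT, so a huge value coming in (for example from readdir() on /proc) can no longer index past the pidmap array. A small sketch of the same guard pattern, with arbitrary toy constants:

    #include <stdio.h>

    #define TOY_PID_MAX_LIMIT 4096u
    #define TOY_BITS_PER_PAGE 1024u

    /* Return the pidmap page index for the pid after 'last', or -1 if the
     * value is out of range. The early bound check keeps (last + 1) from
     * walking off the end of the map. */
    static int toy_next_pidmap_page(unsigned int last)
    {
        if (last >= TOY_PID_MAX_LIMIT)
            return -1;

        return (int)((last + 1) / TOY_BITS_PER_PAGE);
    }

    int main(void)
    {
        printf("%d\n", toy_next_pidmap_page(10));          /* page 0 */
        printf("%d\n", toy_next_pidmap_page(1023));        /* page 1 */
        printf("%d\n", toy_next_pidmap_page(0xffffffffu)); /* -1, rejected */
        return 0;
    }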
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 67fea9d25d55..0791b13df7bf 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1347,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1347 1347
1348 /* 1348 /*
1349 * Now that all the timers on our list have the firing flag, 1349 * Now that all the timers on our list have the firing flag,
1350 * noone will touch their list entries but us. We'll take 1350 * no one will touch their list entries but us. We'll take
1351 * each timer's lock before clearing its firing flag, so no 1351 * each timer's lock before clearing its firing flag, so no
1352 * timer call will interfere. 1352 * timer call will interfere.
1353 */ 1353 */
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4c0124919f9a..e5498d7405c3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -313,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr)
313 * restarted (i.e. we have flagged this in the sys_private entry of the 313 * restarted (i.e. we have flagged this in the sys_private entry of the
314 * info block). 314 * info block).
315 * 315 *
316 * To protect aginst the timer going away while the interrupt is queued, 316 * To protect against the timer going away while the interrupt is queued,
317 * we require that the it_requeue_pending flag be set. 317 * we require that the it_requeue_pending flag be set.
318 */ 318 */
319void do_schedule_next_timer(struct siginfo *info) 319void do_schedule_next_timer(struct siginfo *info)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 4603f08dc47b..87f4d24b55b0 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,9 +18,13 @@ config SUSPEND_FREEZER
18 18
19 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
20 20
21config HIBERNATE_CALLBACKS
22 bool
23
21config HIBERNATION 24config HIBERNATION
22 bool "Hibernation (aka 'suspend to disk')" 25 bool "Hibernation (aka 'suspend to disk')"
23 depends on SWAP && ARCH_HIBERNATION_POSSIBLE 26 depends on SWAP && ARCH_HIBERNATION_POSSIBLE
27 select HIBERNATE_CALLBACKS
24 select LZO_COMPRESS 28 select LZO_COMPRESS
25 select LZO_DECOMPRESS 29 select LZO_DECOMPRESS
26 ---help--- 30 ---help---
@@ -85,7 +89,7 @@ config PM_STD_PARTITION
85 89
86config PM_SLEEP 90config PM_SLEEP
87 def_bool y 91 def_bool y
88 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 92 depends on SUSPEND || HIBERNATE_CALLBACKS
89 93
90config PM_SLEEP_SMP 94config PM_SLEEP_SMP
91 def_bool y 95 def_bool y
@@ -121,12 +125,6 @@ config PM_DEBUG
121 code. This is helpful when debugging and reporting PM bugs, like 125 code. This is helpful when debugging and reporting PM bugs, like
122 suspend support. 126 suspend support.
123 127
124config PM_VERBOSE
125 bool "Verbose Power Management debugging"
126 depends on PM_DEBUG
127 ---help---
128 This option enables verbose messages from the Power Management code.
129
130config PM_ADVANCED_DEBUG 128config PM_ADVANCED_DEBUG
131 bool "Extra PM attributes in sysfs for low-level debugging/testing" 129 bool "Extra PM attributes in sysfs for low-level debugging/testing"
132 depends on PM_DEBUG 130 depends on PM_DEBUG
@@ -225,3 +223,7 @@ config PM_OPP
225 representing individual voltage domains and provides SOC 223 representing individual voltage domains and provides SOC
226 implementations a ready to use framework to manage OPPs. 224 implementations a ready to use framework to manage OPPs.
227 For more information, read <file:Documentation/power/opp.txt> 225 For more information, read <file:Documentation/power/opp.txt>
226
227config PM_RUNTIME_CLK
228 def_bool y
229 depends on PM_RUNTIME && HAVE_CLK
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aeabd26e3342..f9bec56d8825 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -272,9 +272,7 @@ static int create_image(int platform_mode)
272 272
273 local_irq_disable(); 273 local_irq_disable();
274 274
275 error = sysdev_suspend(PMSG_FREEZE); 275 error = syscore_suspend();
276 if (!error)
277 error = syscore_suspend();
278 if (error) { 276 if (error) {
279 printk(KERN_ERR "PM: Some system devices failed to power down, " 277 printk(KERN_ERR "PM: Some system devices failed to power down, "
280 "aborting hibernation\n"); 278 "aborting hibernation\n");
@@ -299,7 +297,6 @@ static int create_image(int platform_mode)
299 297
300 Power_up: 298 Power_up:
301 syscore_resume(); 299 syscore_resume();
302 sysdev_resume();
303 /* NOTE: dpm_resume_noirq() is just a resume() for devices 300 /* NOTE: dpm_resume_noirq() is just a resume() for devices
304 * that suspended with irqs off ... no overall powerup. 301 * that suspended with irqs off ... no overall powerup.
305 */ 302 */
@@ -330,20 +327,25 @@ static int create_image(int platform_mode)
330 327
331int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
332{ 329{
330 pm_message_t msg = PMSG_RECOVER;
333 int error; 331 int error;
334 332
335 error = platform_begin(platform_mode); 333 error = platform_begin(platform_mode);
336 if (error) 334 if (error)
337 goto Close; 335 goto Close;
338 336
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
339 /* Preallocate image memory before shutting down devices. */ 341 /* Preallocate image memory before shutting down devices. */
340 error = hibernate_preallocate_memory(); 342 error = hibernate_preallocate_memory();
341 if (error) 343 if (error)
342 goto Close; 344 goto Complete_devices;
343 345
344 suspend_console(); 346 suspend_console();
345 pm_restrict_gfp_mask(); 347 pm_restrict_gfp_mask();
346 error = dpm_suspend_start(PMSG_FREEZE); 348 error = dpm_suspend(PMSG_FREEZE);
347 if (error) 349 if (error)
348 goto Recover_platform; 350 goto Recover_platform;
349 351
@@ -361,13 +363,17 @@ int hibernation_snapshot(int platform_mode)
361 if (error || !in_suspend) 363 if (error || !in_suspend)
362 swsusp_free(); 364 swsusp_free();
363 365
364 dpm_resume_end(in_suspend ? 366 msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE;
365 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 367 dpm_resume(msg);
366 368
367 if (error || !in_suspend) 369 if (error || !in_suspend)
368 pm_restore_gfp_mask(); 370 pm_restore_gfp_mask();
369 371
370 resume_console(); 372 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg);
376
371 Close: 377 Close:
372 platform_end(platform_mode); 378 platform_end(platform_mode);
373 return error; 379 return error;
@@ -406,9 +412,7 @@ static int resume_target_kernel(bool platform_mode)
406 412
407 local_irq_disable(); 413 local_irq_disable();
408 414
409 error = sysdev_suspend(PMSG_QUIESCE); 415 error = syscore_suspend();
410 if (!error)
411 error = syscore_suspend();
412 if (error) 416 if (error)
413 goto Enable_irqs; 417 goto Enable_irqs;
414 418
@@ -436,7 +440,6 @@ static int resume_target_kernel(bool platform_mode)
436 touch_softlockup_watchdog(); 440 touch_softlockup_watchdog();
437 441
438 syscore_resume(); 442 syscore_resume();
439 sysdev_resume();
440 443
441 Enable_irqs: 444 Enable_irqs:
442 local_irq_enable(); 445 local_irq_enable();
@@ -522,7 +525,6 @@ int hibernation_platform_enter(void)
522 goto Platform_finish; 525 goto Platform_finish;
523 526
524 local_irq_disable(); 527 local_irq_disable();
525 sysdev_suspend(PMSG_HIBERNATE);
526 syscore_suspend(); 528 syscore_suspend();
527 if (pm_wakeup_pending()) { 529 if (pm_wakeup_pending()) {
528 error = -EAGAIN; 530 error = -EAGAIN;
@@ -535,7 +537,6 @@ int hibernation_platform_enter(void)
535 537
536 Power_up: 538 Power_up:
537 syscore_resume(); 539 syscore_resume();
538 sysdev_resume();
539 local_irq_enable(); 540 local_irq_enable();
540 enable_nonboot_cpus(); 541 enable_nonboot_cpus();
541 542
@@ -976,10 +977,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att
976 977
977power_attr(image_size); 978power_attr(image_size);
978 979
980static ssize_t reserved_size_show(struct kobject *kobj,
981 struct kobj_attribute *attr, char *buf)
982{
983 return sprintf(buf, "%lu\n", reserved_size);
984}
985
986static ssize_t reserved_size_store(struct kobject *kobj,
987 struct kobj_attribute *attr,
988 const char *buf, size_t n)
989{
990 unsigned long size;
991
992 if (sscanf(buf, "%lu", &size) == 1) {
993 reserved_size = size;
994 return n;
995 }
996
997 return -EINVAL;
998}
999
1000power_attr(reserved_size);
1001
979static struct attribute * g[] = { 1002static struct attribute * g[] = {
980 &disk_attr.attr, 1003 &disk_attr.attr,
981 &resume_attr.attr, 1004 &resume_attr.attr,
982 &image_size_attr.attr, 1005 &image_size_attr.attr,
1006 &reserved_size_attr.attr,
983 NULL, 1007 NULL,
984}; 1008};
985 1009
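[Editor's note] The new attribute pair above exposes the driver-reservation size through /sys/power/reserved_size with the usual show/store handlers. A hedged user-space example of tuning it (the value is arbitrary, and the file only exists on kernels with this patch and CONFIG_HIBERNATION; writing it requires root):

    #include <stdio.h>

    int main(void)
    {
        const char *path = "/sys/power/reserved_size";
        unsigned long size = 16UL * 1024 * 1024;    /* e.g. 16 MB, in bytes */
        FILE *f = fopen(path, "w");

        if (!f) {
            perror(path);    /* missing attribute or insufficient privilege */
            return 1;
        }
        fprintf(f, "%lu\n", size);
        fclose(f);
        return 0;
    }

Reading the file back returns the current value in bytes, matching reserved_size_show().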
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 8eaba5f27b10..2981af4ce7cb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -224,7 +224,7 @@ power_attr(state);
224 * writing to 'state'. It first should read from 'wakeup_count' and store 224 * writing to 'state'. It first should read from 'wakeup_count' and store
225 * the read value. Then, after carrying out its own preparations for the system 225 * the read value. Then, after carrying out its own preparations for the system
226 * transition to a sleep state, it should write the stored value to 226 * transition to a sleep state, it should write the stored value to
227 * 'wakeup_count'. If that fails, at least one wakeup event has occured since 227 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
228 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it 228 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
229 * is allowed to write to 'state', but the transition will be aborted if there 229 * is allowed to write to 'state', but the transition will be aborted if there
230 * are any wakeup events detected after 'wakeup_count' was written to. 230 * are any wakeup events detected after 'wakeup_count' was written to.
@@ -337,6 +337,7 @@ static int __init pm_init(void)
337 if (error) 337 if (error)
338 return error; 338 return error;
339 hibernate_image_size_init(); 339 hibernate_image_size_init();
340 hibernate_reserved_size_init();
340 power_kobj = kobject_create_and_add("power", NULL); 341 power_kobj = kobject_create_and_add("power", NULL);
341 if (!power_kobj) 342 if (!power_kobj)
342 return -ENOMEM; 343 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 03634be55f62..9a00a0a26280 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -15,6 +15,7 @@ struct swsusp_info {
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */ 17/* kernel/power/snapshot.c */
18extern void __init hibernate_reserved_size_init(void);
18extern void __init hibernate_image_size_init(void); 19extern void __init hibernate_image_size_init(void);
19 20
20#ifdef CONFIG_ARCH_HIBERNATION_HEADER 21#ifdef CONFIG_ARCH_HIBERNATION_HEADER
@@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void);
55 56
56#else /* !CONFIG_HIBERNATION */ 57#else /* !CONFIG_HIBERNATION */
57 58
59static inline void hibernate_reserved_size_init(void) {}
58static inline void hibernate_image_size_init(void) {} 60static inline void hibernate_image_size_init(void) {}
59#endif /* !CONFIG_HIBERNATION */ 61#endif /* !CONFIG_HIBERNATION */
60 62
@@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \
72 74
73/* Preferred image size in bytes (default 500 MB) */ 75/* Preferred image size in bytes (default 500 MB) */
74extern unsigned long image_size; 76extern unsigned long image_size;
77/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
78extern unsigned long reserved_size;
75extern int in_suspend; 79extern int in_suspend;
76extern dev_t swsusp_resume_device; 80extern dev_t swsusp_resume_device;
77extern sector_t swsusp_resume_block; 81extern sector_t swsusp_resume_block;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ca0aacc24874..ace55889f702 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -41,16 +41,28 @@ static void swsusp_set_page_forbidden(struct page *);
41static void swsusp_unset_page_forbidden(struct page *); 41static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Number of bytes to reserve for memory allocations made by device drivers
45 * from their ->freeze() and ->freeze_noirq() callbacks so that they don't
46 * cause image creation to fail (tunable via /sys/power/reserved_size).
47 */
48unsigned long reserved_size;
49
50void __init hibernate_reserved_size_init(void)
51{
52 reserved_size = SPARE_PAGES * PAGE_SIZE;
53}
54
55/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 56 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, the image creating code will do its best to 57 * When it is set to N, swsusp will do its best to ensure the image
46 * ensure the image size will not exceed N bytes, but if that is 58 * size will not exceed N bytes, but if that is impossible, it will
47 * impossible, it will try to create the smallest image possible. 59 * try to create the smallest image possible.
48 */ 60 */
49unsigned long image_size; 61unsigned long image_size;
50 62
51void __init hibernate_image_size_init(void) 63void __init hibernate_image_size_init(void)
52{ 64{
53 image_size = (totalram_pages / 3) * PAGE_SIZE; 65 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
54} 66}
55 67
56/* List of PBEs needed for restoring the pages that were allocated before 68/* List of PBEs needed for restoring the pages that were allocated before
@@ -1263,11 +1275,13 @@ static unsigned long minimum_image_size(unsigned long saveable)
1263 * frame in use. We also need a number of page frames to be free during 1275 * frame in use. We also need a number of page frames to be free during
1264 * hibernation for allocations made while saving the image and for device 1276 * hibernation for allocations made while saving the image and for device
1265 * drivers, in case they need to allocate memory from their hibernation 1277 * drivers, in case they need to allocate memory from their hibernation
1266 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, 1278 * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
1267 * respectively, both of which are rough estimates). To make this happen, we 1279 * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through
1268 * compute the total number of available page frames and allocate at least 1280 * /sys/power/reserved_size, respectively). To make this happen, we compute the
1281 * total number of available page frames and allocate at least
1269 * 1282 *
1270 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES 1283 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
1284 * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
1271 * 1285 *
1272 * of them, which corresponds to the maximum size of a hibernation image. 1286 * of them, which corresponds to the maximum size of a hibernation image.
1273 * 1287 *
@@ -1322,7 +1336,8 @@ int hibernate_preallocate_memory(void)
1322 count -= totalreserve_pages; 1336 count -= totalreserve_pages;
1323 1337
1324 /* Compute the maximum number of saveable pages to leave in memory. */ 1338 /* Compute the maximum number of saveable pages to leave in memory. */
1325 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1339 max_size = (count - (size + PAGES_FOR_IO)) / 2
1340 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
1326 /* Compute the desired number of image pages specified by image_size. */ 1341 /* Compute the desired number of image pages specified by image_size. */
1327 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1342 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1328 if (size > max_size) 1343 if (size > max_size)
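[Editor's note] With reserved_size replacing the fixed SPARE_PAGES term, the preallocation ceiling computed in hibernate_preallocate_memory() becomes a function of the tunable. A standalone sketch of the arithmetic, using made-up example values for the page counts:

    #include <stdio.h>

    #define TOY_PAGE_SIZE 4096UL
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* Mirrors: max_size = (count - (size + PAGES_FOR_IO)) / 2
     *                     - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) */
    static unsigned long toy_max_image_pages(unsigned long count,
                                             unsigned long meta_pages,
                                             unsigned long pages_for_io,
                                             unsigned long reserved_size)
    {
        return (count - (meta_pages + pages_for_io)) / 2
               - 2 * DIV_ROUND_UP(reserved_size, TOY_PAGE_SIZE);
    }

    int main(void)
    {
        /* 2 GiB worth of page frames, 1024 metadata pages, 1024 I/O pages,
         * 4 MiB reserved for driver allocations during hibernation. */
        unsigned long max = toy_max_image_pages(524288UL, 1024UL, 1024UL,
                                                4UL * 1024 * 1024);

        printf("max image pages: %lu\n", max);  /* (524288-2048)/2 - 2*1024 */
        return 0;
    }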
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 2814c32aed51..1c41ba215419 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -163,16 +163,13 @@ static int suspend_enter(suspend_state_t state)
163 arch_suspend_disable_irqs(); 163 arch_suspend_disable_irqs();
164 BUG_ON(!irqs_disabled()); 164 BUG_ON(!irqs_disabled());
165 165
166 error = sysdev_suspend(PMSG_SUSPEND); 166 error = syscore_suspend();
167 if (!error)
168 error = syscore_suspend();
169 if (!error) { 167 if (!error) {
170 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 168 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
171 error = suspend_ops->enter(state); 169 error = suspend_ops->enter(state);
172 events_check_enabled = false; 170 events_check_enabled = false;
173 } 171 }
174 syscore_resume(); 172 syscore_resume();
175 sysdev_resume();
176 } 173 }
177 174
178 arch_suspend_enable_irqs(); 175 arch_suspend_enable_irqs();
@@ -213,7 +210,6 @@ int suspend_devices_and_enter(suspend_state_t state)
213 goto Close; 210 goto Close;
214 } 211 }
215 suspend_console(); 212 suspend_console();
216 pm_restrict_gfp_mask();
217 suspend_test_start(); 213 suspend_test_start();
218 error = dpm_suspend_start(PMSG_SUSPEND); 214 error = dpm_suspend_start(PMSG_SUSPEND);
219 if (error) { 215 if (error) {
@@ -224,13 +220,12 @@ int suspend_devices_and_enter(suspend_state_t state)
224 if (suspend_test(TEST_DEVICES)) 220 if (suspend_test(TEST_DEVICES))
225 goto Recover_platform; 221 goto Recover_platform;
226 222
227 suspend_enter(state); 223 error = suspend_enter(state);
228 224
229 Resume_devices: 225 Resume_devices:
230 suspend_test_start(); 226 suspend_test_start();
231 dpm_resume_end(PMSG_RESUME); 227 dpm_resume_end(PMSG_RESUME);
232 suspend_test_finish("resume devices"); 228 suspend_test_finish("resume devices");
233 pm_restore_gfp_mask();
234 resume_console(); 229 resume_console();
235 Close: 230 Close:
236 if (suspend_ops->end) 231 if (suspend_ops->end)
@@ -291,7 +286,9 @@ int enter_state(suspend_state_t state)
291 goto Finish; 286 goto Finish;
292 287
293 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 288 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
289 pm_restrict_gfp_mask();
294 error = suspend_devices_and_enter(state); 290 error = suspend_devices_and_enter(state);
291 pm_restore_gfp_mask();
295 292
296 Finish: 293 Finish:
297 pr_debug("PM: Finishing wakeup.\n"); 294 pr_debug("PM: Finishing wakeup.\n");
diff --git a/kernel/power/user.c b/kernel/power/user.c
index c36c3b9e8a84..7d02d33be699 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -135,8 +135,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
135 free_basic_memory_bitmaps(); 135 free_basic_memory_bitmaps();
136 data = filp->private_data; 136 data = filp->private_data;
137 free_all_swap_pages(data->swap); 137 free_all_swap_pages(data->swap);
138 if (data->frozen) 138 if (data->frozen) {
139 pm_restore_gfp_mask();
139 thaw_processes(); 140 thaw_processes();
141 }
140 pm_notifier_call_chain(data->mode == O_RDONLY ? 142 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 143 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 144 atomic_inc(&snapshot_device_available);
@@ -379,6 +381,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
379 * PM_HIBERNATION_PREPARE 381 * PM_HIBERNATION_PREPARE
380 */ 382 */
381 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 383 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
384 data->ready = 0;
382 break; 385 break;
383 386
384 case SNAPSHOT_PLATFORM_SUPPORT: 387 case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 512bd017218d..7a81fc071344 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h>
25 26
26 27
27/* 28/*
@@ -923,3 +924,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
923 return ret; 924 return ret;
924} 925}
925#endif /* CONFIG_COMPAT */ 926#endif /* CONFIG_COMPAT */
927
928#ifdef CONFIG_HAVE_HW_BREAKPOINT
929int ptrace_get_breakpoints(struct task_struct *tsk)
930{
931 if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
932 return 0;
933
934 return -1;
935}
936
937void ptrace_put_breakpoints(struct task_struct *tsk)
938{
939 if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
940 flush_ptrace_hw_breakpoint(tsk);
941}
942#endif /* CONFIG_HAVE_HW_BREAKPOINT */
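[Editor's note] ptrace_get_breakpoints()/ptrace_put_breakpoints() pin the tracee's breakpoint state with a reference count, so flush_ptrace_hw_breakpoint() runs only when the last user drops it. A user-space sketch of the same inc-not-zero / dec-and-test pattern (plain ints instead of atomics, so not concurrency-safe; illustrative only):

    #include <stdio.h>

    struct toy_task {
        int bp_refcnt;    /* starts at 1 while breakpoints are live */
    };

    static void flush_breakpoints(struct toy_task *t)
    {
        (void)t;
        printf("breakpoints flushed\n");
    }

    /* Grab a reference unless teardown already dropped the last one. */
    static int get_breakpoints(struct toy_task *t)
    {
        if (t->bp_refcnt == 0)
            return -1;
        t->bp_refcnt++;
        return 0;
    }

    /* Drop a reference; the final put flushes the hardware breakpoints. */
    static void put_breakpoints(struct toy_task *t)
    {
        if (--t->bp_refcnt == 0)
            flush_breakpoints(t);
    }

    int main(void)
    {
        struct toy_task t = { .bp_refcnt = 1 };

        if (get_breakpoints(&t) == 0) {   /* ptrace request path */
            /* ... poke at debug registers ... */
            put_breakpoints(&t);
        }
        put_breakpoints(&t);              /* task exit drops the initial ref */
        return 0;
    }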
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f3240e987928..7784bd216b6a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
142 * Ensure that queued callbacks are all executed. 142 * Ensure that queued callbacks are all executed.
143 * If we detect that we are nested in a RCU read-side critical 143 * If we detect that we are nested in a RCU read-side critical
144 * section, we should simply fail, otherwise we would deadlock. 144 * section, we should simply fail, otherwise we would deadlock.
145 * In !PREEMPT configurations, there is no way to tell if we are
146 * in a RCU read-side critical section or not, so we never
147 * attempt any fixup and just print a warning.
145 */ 148 */
149#ifndef CONFIG_PREEMPT
150 WARN_ON_ONCE(1);
151 return 0;
152#endif
146 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 153 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
147 irqs_disabled()) { 154 irqs_disabled()) {
148 WARN_ON(1); 155 WARN_ON_ONCE(1);
149 return 0; 156 return 0;
150 } 157 }
151 rcu_barrier(); 158 rcu_barrier();
@@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
184 * Ensure that queued callbacks are all executed. 191 * Ensure that queued callbacks are all executed.
185 * If we detect that we are nested in a RCU read-side critical 192 * If we detect that we are nested in a RCU read-side critical
186 * section, we should simply fail, otherwise we would deadlock. 193 * section, we should simply fail, otherwise we would deadlock.
194 * In !PREEMPT configurations, there is no way to tell if we are
195 * in a RCU read-side critical section or not, so we never
196 * attempt any fixup and just print a warning.
187 */ 197 */
198#ifndef CONFIG_PREEMPT
199 WARN_ON_ONCE(1);
200 return 0;
201#endif
188 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 202 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
189 irqs_disabled()) { 203 irqs_disabled()) {
190 WARN_ON(1); 204 WARN_ON_ONCE(1);
191 return 0; 205 return 0;
192 } 206 }
193 rcu_barrier(); 207 rcu_barrier();
@@ -214,15 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 228 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 229 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 230 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether 231 * In !PREEMPT configurations, there is no way to tell if we are
218 * or not we are in an RCU read-side critical section 232 * in a RCU read-side critical section or not, so we never
219 * exists only in the preemptible RCU implementations 233 * attempt any fixup and just print a warning.
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
222 */ 234 */
235#ifndef CONFIG_PREEMPT
236 WARN_ON_ONCE(1);
237 return 0;
238#endif
223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 239 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
224 irqs_disabled()) { 240 irqs_disabled()) {
225 WARN_ON(1); 241 WARN_ON_ONCE(1);
226 return 0; 242 return 0;
227 } 243 }
228 rcu_barrier(); 244 rcu_barrier();
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0c343b9a46d5..7bbac7d0f5ab 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -35,15 +35,16 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/prefetch.h>
38 39
39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ 40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40static struct task_struct *rcu_kthread_task; 41static struct task_struct *rcu_kthread_task;
41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42static unsigned long have_rcu_kthread_work; 43static unsigned long have_rcu_kthread_work;
43static void invoke_rcu_kthread(void);
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void);
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg); 49static int rcu_kthread(void *arg);
49static void __call_rcu(struct rcu_head *head, 50static void __call_rcu(struct rcu_head *head,
@@ -79,36 +80,45 @@ void rcu_exit_nohz(void)
79#endif /* #ifdef CONFIG_NO_HZ */ 80#endif /* #ifdef CONFIG_NO_HZ */
80 81
81/* 82/*
82 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). 83 * Helper function for rcu_sched_qs() and rcu_bh_qs().
83 * Also disable irqs to avoid confusion due to interrupt handlers 84 * Also irqs are disabled to avoid confusion due to interrupt handlers
84 * invoking call_rcu(). 85 * invoking call_rcu().
85 */ 86 */
86static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 87static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
87{ 88{
88 unsigned long flags;
89
90 local_irq_save(flags);
91 if (rcp->rcucblist != NULL && 89 if (rcp->rcucblist != NULL &&
92 rcp->donetail != rcp->curtail) { 90 rcp->donetail != rcp->curtail) {
93 rcp->donetail = rcp->curtail; 91 rcp->donetail = rcp->curtail;
94 local_irq_restore(flags);
95 return 1; 92 return 1;
96 } 93 }
97 local_irq_restore(flags);
98 94
99 return 0; 95 return 0;
100} 96}
101 97
102/* 98/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
103 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
104 * are at it, given that any rcu quiescent state is also an rcu_bh 110 * are at it, given that any rcu quiescent state is also an rcu_bh
105 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 111 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
106 */ 112 */
107void rcu_sched_qs(int cpu) 113void rcu_sched_qs(int cpu)
108{ 114{
115 unsigned long flags;
116
117 local_irq_save(flags);
109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
110 rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 rcu_qsctr_help(&rcu_bh_ctrlblk))
111 invoke_rcu_kthread(); 120 invoke_rcu_kthread();
121 local_irq_restore(flags);
112} 122}
113 123
114/* 124/*
@@ -116,8 +126,12 @@ void rcu_sched_qs(int cpu)
116 */ 126 */
117void rcu_bh_qs(int cpu) 127void rcu_bh_qs(int cpu)
118{ 128{
129 unsigned long flags;
130
131 local_irq_save(flags);
119 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 132 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 133 invoke_rcu_kthread();
134 local_irq_restore(flags);
121} 135}
122 136
123/* 137/*
@@ -167,7 +181,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
167 prefetch(next); 181 prefetch(next);
168 debug_rcu_head_unqueue(list); 182 debug_rcu_head_unqueue(list);
169 local_bh_disable(); 183 local_bh_disable();
170 list->func(list); 184 __rcu_reclaim(list);
171 local_bh_enable(); 185 local_bh_enable();
172 list = next; 186 list = next;
173 RCU_TRACE(cb_count++); 187 RCU_TRACE(cb_count++);
@@ -208,20 +222,6 @@ static int rcu_kthread(void *arg)
208} 222}
209 223
210/* 224/*
211 * Wake up rcu_kthread() to process callbacks now eligible for invocation
212 * or to boost readers.
213 */
214static void invoke_rcu_kthread(void)
215{
216 unsigned long flags;
217
218 local_irq_save(flags);
219 have_rcu_kthread_work = 1;
220 wake_up(&rcu_kthread_wq);
221 local_irq_restore(flags);
222}
223
224/*
225 * Wait for a grace period to elapse. But it is illegal to invoke 225 * Wait for a grace period to elapse. But it is illegal to invoke
226 * synchronize_sched() from within an RCU read-side critical section. 226 * synchronize_sched() from within an RCU read-side critical section.
227 * Therefore, any legal call to synchronize_sched() is a quiescent 227 * Therefore, any legal call to synchronize_sched() is a quiescent
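[Editor's note] The rcu_sched_qs()/rcu_bh_qs() hunks above hoist the local_irq_save()/local_irq_restore() pair out of rcu_qsctr_help(), and invoke_rcu_kthread() likewise drops its own irq handling because it is now only called with interrupts already off. A toy sketch of that refactor shape, with user-space stubs standing in for the irq primitives (names are illustrative):

    #include <stdio.h>

    /* Stubs standing in for local_irq_save()/local_irq_restore(). */
    static void irq_save(unsigned long *flags)   { *flags = 1; }
    static void irq_restore(unsigned long flags) { (void)flags; }

    static int done, cur = 1;

    /* Caller must have interrupts disabled; the helper no longer toggles them. */
    static int qs_help(void)
    {
        if (done != cur) {
            done = cur;
            return 1;    /* callbacks became ready for invocation */
        }
        return 0;
    }

    /* The caller owns the single save/restore pair around the helper calls. */
    static void toy_sched_qs(void)
    {
        unsigned long flags;

        irq_save(&flags);
        if (qs_help())
            printf("wake callback kthread\n");
        irq_restore(flags);
    }

    int main(void)
    {
        toy_sched_qs();    /* first call advances donetail: wakes the kthread */
        toy_sched_qs();    /* nothing new: stays quiet */
        return 0;
    }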
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3cb8e362e883..f259c676195f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk {
100 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
101 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST 102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */ 103 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */ 104#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE 105#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods; 106 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST 107#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted; 108 unsigned long n_tasks_boosted;
109 /* Total number of tasks boosted. */
110 unsigned long n_exp_boosts; 110 unsigned long n_exp_boosts;
111 /* Number of tasks boosted for expedited GP. */
111 unsigned long n_normal_boosts; 112 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks; 113 /* Number of tasks boosted for normal GP. */
113 unsigned long n_normal_balk_gp_tasks; 114 unsigned long n_balk_blkd_tasks;
114 unsigned long n_normal_balk_boost_tasks; 115 /* Refused to boost: no blocked tasks. */
115 unsigned long n_normal_balk_boosted; 116 unsigned long n_balk_exp_gp_tasks;
116 unsigned long n_normal_balk_notyet; 117 /* Refused to boost: nothing blocking GP. */
117 unsigned long n_normal_balk_nos; 118 unsigned long n_balk_boost_tasks;
118 unsigned long n_exp_balk_blkd_tasks; 119 /* Refused to boost: already boosting. */
119 unsigned long n_exp_balk_nos; 120 unsigned long n_balk_notyet;
121 /* Refused to boost: not yet time. */
122 unsigned long n_balk_nos;
123 /* Refused to boost: not sure why, though. */
124 /* This can happen due to race conditions. */
120#endif /* #ifdef CONFIG_RCU_BOOST */ 125#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */ 126#endif /* #ifdef CONFIG_RCU_TRACE */
122}; 127};
@@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t)
201 206
202#ifdef CONFIG_RCU_BOOST 207#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void); 208static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */ 209#endif /* #ifdef CONFIG_RCU_BOOST */
206 210
207/* 211/*
@@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m)
219 "N."[!rcu_preempt_ctrlblk.gp_tasks], 223 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]); 224 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST 225#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=", 226 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]); 227 " ",
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) { 228 "B."[!rcu_preempt_ctrlblk.boost_tasks],
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted, 229 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts, 230 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts, 231 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff), 232 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); 233 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", 234 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
247 "normal balk", 235 " balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, 236 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, 237 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, 238 rcu_preempt_ctrlblk.n_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted, 239 rcu_preempt_ctrlblk.n_balk_notyet,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet, 240 rcu_preempt_ctrlblk.n_balk_nos);
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */ 241#endif /* #ifdef CONFIG_RCU_BOOST */
258} 242}
259 243
@@ -271,25 +255,59 @@ static int rcu_boost(void)
271{ 255{
272 unsigned long flags; 256 unsigned long flags;
273 struct rt_mutex mtx; 257 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t; 258 struct task_struct *t;
259 struct list_head *tb;
276 260
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL) 261 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
262 rcu_preempt_ctrlblk.exp_tasks == NULL)
278 return 0; /* Nothing to boost. */ 263 return 0; /* Nothing to boost. */
264
279 raw_local_irq_save(flags); 265 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++; 266
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, 267 /*
282 rcu_node_entry); 268 * Recheck with irqs disabled: all tasks in need of boosting
283 np = rcu_next_node_entry(t); 269 * might exit their RCU read-side critical sections on their own
270 * if we are preempted just before disabling irqs.
271 */
272 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
273 rcu_preempt_ctrlblk.exp_tasks == NULL) {
274 raw_local_irq_restore(flags);
275 return 0;
276 }
277
278 /*
279 * Preferentially boost tasks blocking expedited grace periods.
280 * This cannot starve the normal grace periods because a second
281 * expedited grace period must boost all blocked tasks, including
282 * those blocking the pre-existing normal grace period.
283 */
284 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
285 tb = rcu_preempt_ctrlblk.exp_tasks;
286 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
287 } else {
288 tb = rcu_preempt_ctrlblk.boost_tasks;
289 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
290 }
291 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
292
293 /*
294 * We boost task t by manufacturing an rt_mutex that appears to
295 * be held by task t. We leave a pointer to that rt_mutex where
296 * task t can find it, and task t will release the mutex when it
297 * exits its outermost RCU read-side critical section. Then
298 * simply acquiring this artificial rt_mutex will boost task
299 * t's priority. (Thanks to tglx for suggesting this approach!)
300 */
301 t = container_of(tb, struct task_struct, rcu_node_entry);
284 rt_mutex_init_proxy_locked(&mtx, t); 302 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx; 303 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; 304 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags); 305 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx); 306 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); 307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
290 rcu_preempt_ctrlblk.boosted_this_gp++; 308
291 rt_mutex_unlock(&mtx); 309 return rcu_preempt_ctrlblk.boost_tasks != NULL ||
292 return rcu_preempt_ctrlblk.boost_tasks != NULL; 310 rcu_preempt_ctrlblk.exp_tasks != NULL;
293} 311}
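[Editor's note] The rewritten rcu_boost() explains the trick in its new comment: manufacture an rt_mutex that appears to be held by the blocked reader, then have the booster acquire it so that priority inheritance lifts the reader until it leaves its read-side critical section and releases the mutex. A heavily simplified user-space sketch of that proxy-lock shape (no real scheduler or PI machinery, all names illustrative):

    #include <stdio.h>

    struct toy_task {
        const char *name;
        int prio;                   /* larger = higher priority */
    };

    struct toy_rtmutex {
        struct toy_task *owner;     /* who currently "holds" it */
        int boosted_to;             /* highest priority donated by a waiter */
    };

    /* rt_mutex_init_proxy_locked(): pretend the reader already owns the mutex. */
    static void proxy_init_locked(struct toy_rtmutex *m, struct toy_task *reader)
    {
        m->owner = reader;
        m->boosted_to = reader->prio;
    }

    /* The booster "blocks" on the mutex and donates its priority to the owner. */
    static void booster_lock(struct toy_rtmutex *m, struct toy_task *booster)
    {
        if (booster->prio > m->boosted_to) {
            m->boosted_to = booster->prio;
            m->owner->prio = booster->prio;    /* priority inheritance */
        }
    }

    /* The reader releases the mutex on leaving its RCU read-side section. */
    static void reader_unlock(struct toy_rtmutex *m, int base_prio)
    {
        m->owner->prio = base_prio;            /* deboost */
        m->owner = NULL;
    }

    int main(void)
    {
        struct toy_task reader  = { "reader",  10 };
        struct toy_task booster = { "booster", 90 };
        struct toy_rtmutex mtx;

        proxy_init_locked(&mtx, &reader);
        booster_lock(&mtx, &booster);
        printf("reader boosted to %d\n", reader.prio);
        reader_unlock(&mtx, 10);
        printf("reader back at %d\n", reader.prio);
        return 0;
    }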
294 312
295/* 313/*
@@ -304,42 +322,25 @@ static int rcu_boost(void)
304 */ 322 */
305static int rcu_initiate_boost(void) 323static int rcu_initiate_boost(void)
306{ 324{
307 if (!rcu_preempt_blocked_readers_cgp()) { 325 if (!rcu_preempt_blocked_readers_cgp() &&
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); 326 rcu_preempt_ctrlblk.exp_tasks == NULL) {
327 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
309 return 0; 328 return 0;
310 } 329 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL && 330 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
312 rcu_preempt_ctrlblk.boost_tasks == NULL && 331 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 && 332 rcu_preempt_ctrlblk.boost_tasks == NULL &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { 333 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; 334 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread(); 337 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else 338 } else
319 RCU_TRACE(rcu_initiate_boost_trace()); 339 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1; 340 return 1;
321} 341}
322 342
323/* 343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
343 344
344/* 345/*
345 * Do priority-boost accounting for the start of a new grace period. 346 * Do priority-boost accounting for the start of a new grace period.
@@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void)
347static void rcu_preempt_boost_start_gp(void) 348static void rcu_preempt_boost_start_gp(void)
348{ 349{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; 350 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352} 351}
353 352
354#else /* #ifdef CONFIG_RCU_BOOST */ 353#else /* #ifdef CONFIG_RCU_BOOST */
@@ -372,13 +371,6 @@ static int rcu_initiate_boost(void)
372} 371}
373 372
374/* 373/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start. 374 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */ 375 */
384static void rcu_preempt_boost_start_gp(void) 376static void rcu_preempt_boost_start_gp(void)
@@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void)
418 if (!rcu_preempt_gp_in_progress()) 410 if (!rcu_preempt_gp_in_progress())
419 return; 411 return;
420 /* 412 /*
421 * Check up on boosting. If there are no readers blocking the 413 * Check up on boosting. If there are readers blocking the
422 * current grace period, leave. 414 * current grace period, leave.
423 */ 415 */
424 if (rcu_initiate_boost()) 416 if (rcu_initiate_boost())
@@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
578 empty = !rcu_preempt_blocked_readers_cgp(); 570 empty = !rcu_preempt_blocked_readers_cgp();
579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 571 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
580 np = rcu_next_node_entry(t); 572 np = rcu_next_node_entry(t);
581 list_del(&t->rcu_node_entry); 573 list_del_init(&t->rcu_node_entry);
582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 574 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
583 rcu_preempt_ctrlblk.gp_tasks = np; 575 rcu_preempt_ctrlblk.gp_tasks = np;
584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 576 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
@@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) 579 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np; 580 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */ 581#endif /* #ifdef CONFIG_RCU_BOOST */
590 INIT_LIST_HEAD(&t->rcu_node_entry);
591 582
592 /* 583 /*
593 * If this was the last task on the current list, and if 584 * If this was the last task on the current list, and if
@@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void)
812 rpcp->exp_tasks = rpcp->blkd_tasks.next; 803 rpcp->exp_tasks = rpcp->blkd_tasks.next;
813 if (rpcp->exp_tasks == &rpcp->blkd_tasks) 804 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
814 rpcp->exp_tasks = NULL; 805 rpcp->exp_tasks = NULL;
815 local_irq_restore(flags);
816 806
817 /* Wait for tail of ->blkd_tasks list to drain. */ 807 /* Wait for tail of ->blkd_tasks list to drain. */
818 if (rcu_preempted_readers_exp()) 808 if (!rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost(); 809 local_irq_restore(flags);
810 else {
811 rcu_initiate_boost();
812 local_irq_restore(flags);
820 wait_event(sync_rcu_preempt_exp_wq, 813 wait_event(sync_rcu_preempt_exp_wq,
821 !rcu_preempted_readers_exp()); 814 !rcu_preempted_readers_exp());
815 }
822 816
823 /* Clean up and exit. */ 817 /* Clean up and exit. */
824 barrier(); /* ensure expedited GP seen before counter increment. */ 818 barrier(); /* ensure expedited GP seen before counter increment. */
@@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void)
931 925
932static void rcu_initiate_boost_trace(void) 926static void rcu_initiate_boost_trace(void)
933{ 927{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL) 928 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; 929 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
930 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
931 rcu_preempt_ctrlblk.exp_tasks == NULL)
932 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL) 933 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; 934 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) 935 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++; 936 rcu_preempt_ctrlblk.n_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else 937 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++; 938 rcu_preempt_ctrlblk.n_balk_nos++;
952} 939}
953 940
954#endif /* #ifdef CONFIG_RCU_BOOST */ 941#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c224da41890c..2e138db03382 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -131,7 +131,7 @@ struct rcu_torture {
131 131
132static LIST_HEAD(rcu_torture_freelist); 132static LIST_HEAD(rcu_torture_freelist);
133static struct rcu_torture __rcu *rcu_torture_current; 133static struct rcu_torture __rcu *rcu_torture_current;
134static long rcu_torture_current_version; 134static unsigned long rcu_torture_current_version;
135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
136static DEFINE_SPINLOCK(rcu_torture_lock); 136static DEFINE_SPINLOCK(rcu_torture_lock);
137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror;
146static atomic_t n_rcu_torture_error; 146static atomic_t n_rcu_torture_error;
147static long n_rcu_torture_boost_ktrerror; 147static long n_rcu_torture_boost_ktrerror;
148static long n_rcu_torture_boost_rterror; 148static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_allocerror;
150static long n_rcu_torture_boost_afferror;
151static long n_rcu_torture_boost_failure; 149static long n_rcu_torture_boost_failure;
152static long n_rcu_torture_boosts; 150static long n_rcu_torture_boosts;
153static long n_rcu_torture_timers; 151static long n_rcu_torture_timers;
@@ -163,11 +161,11 @@ static int stutter_pause_test;
163#endif 161#endif
164int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
165 163
166#ifdef CONFIG_RCU_BOOST 164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
167#define rcu_can_boost() 1 165#define rcu_can_boost() 1
168#else /* #ifdef CONFIG_RCU_BOOST */ 166#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169#define rcu_can_boost() 0 167#define rcu_can_boost() 0
170#endif /* #else #ifdef CONFIG_RCU_BOOST */ 168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
171 169
172static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
173DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -751,6 +749,7 @@ static int rcu_torture_boost(void *arg)
751 n_rcu_torture_boost_rterror++; 749 n_rcu_torture_boost_rterror++;
752 } 750 }
753 751
752 init_rcu_head_on_stack(&rbi.rcu);
754 /* Each pass through the following loop does one boost-test cycle. */ 753 /* Each pass through the following loop does one boost-test cycle. */
755 do { 754 do {
756 /* Wait for the next test interval. */ 755 /* Wait for the next test interval. */
@@ -810,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
810 809
811 /* Clean up and exit. */ 810 /* Clean up and exit. */
812 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 813 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 814 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 815 schedule_timeout_uninterruptible(1);
@@ -886,7 +886,7 @@ rcu_torture_writer(void *arg)
886 old_rp->rtort_pipe_count++; 886 old_rp->rtort_pipe_count++;
887 cur_ops->deferred_free(old_rp); 887 cur_ops->deferred_free(old_rp);
888 } 888 }
889 rcu_torture_current_version++; 889 rcutorture_record_progress(++rcu_torture_current_version);
890 oldbatch = cur_ops->completed(); 890 oldbatch = cur_ops->completed();
891 rcu_stutter_wait("rcu_torture_writer"); 891 rcu_stutter_wait("rcu_torture_writer");
892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -1066,8 +1066,8 @@ rcu_torture_printk(char *page)
1066 } 1066 }
1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1068 cnt += sprintf(&page[cnt], 1068 cnt += sprintf(&page[cnt],
1069 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1069 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1070 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " 1070 "rtmbe: %d rtbke: %ld rtbre: %ld "
1071 "rtbf: %ld rtb: %ld nt: %ld", 1071 "rtbf: %ld rtb: %ld nt: %ld",
1072 rcu_torture_current, 1072 rcu_torture_current,
1073 rcu_torture_current_version, 1073 rcu_torture_current_version,
@@ -1078,16 +1078,12 @@ rcu_torture_printk(char *page)
1078 atomic_read(&n_rcu_torture_mberror), 1078 atomic_read(&n_rcu_torture_mberror),
1079 n_rcu_torture_boost_ktrerror, 1079 n_rcu_torture_boost_ktrerror,
1080 n_rcu_torture_boost_rterror, 1080 n_rcu_torture_boost_rterror,
1081 n_rcu_torture_boost_allocerror,
1082 n_rcu_torture_boost_afferror,
1083 n_rcu_torture_boost_failure, 1081 n_rcu_torture_boost_failure,
1084 n_rcu_torture_boosts, 1082 n_rcu_torture_boosts,
1085 n_rcu_torture_timers); 1083 n_rcu_torture_timers);
1086 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1084 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1087 n_rcu_torture_boost_ktrerror != 0 || 1085 n_rcu_torture_boost_ktrerror != 0 ||
1088 n_rcu_torture_boost_rterror != 0 || 1086 n_rcu_torture_boost_rterror != 0 ||
1089 n_rcu_torture_boost_allocerror != 0 ||
1090 n_rcu_torture_boost_afferror != 0 ||
1091 n_rcu_torture_boost_failure != 0) 1087 n_rcu_torture_boost_failure != 0)
1092 cnt += sprintf(&page[cnt], " !!!"); 1088 cnt += sprintf(&page[cnt], " !!!");
1093 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1089 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
@@ -1331,6 +1327,7 @@ rcu_torture_cleanup(void)
1331 int i; 1327 int i;
1332 1328
1333 mutex_lock(&fullstop_mutex); 1329 mutex_lock(&fullstop_mutex);
1330 rcutorture_record_test_transition();
1334 if (fullstop == FULLSTOP_SHUTDOWN) { 1331 if (fullstop == FULLSTOP_SHUTDOWN) {
1335 printk(KERN_WARNING /* but going down anyway, so... */ 1332 printk(KERN_WARNING /* but going down anyway, so... */
1336 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1333 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
@@ -1486,8 +1483,6 @@ rcu_torture_init(void)
1486 atomic_set(&n_rcu_torture_error, 0); 1483 atomic_set(&n_rcu_torture_error, 0);
1487 n_rcu_torture_boost_ktrerror = 0; 1484 n_rcu_torture_boost_ktrerror = 0;
1488 n_rcu_torture_boost_rterror = 0; 1485 n_rcu_torture_boost_rterror = 0;
1489 n_rcu_torture_boost_allocerror = 0;
1490 n_rcu_torture_boost_afferror = 0;
1491 n_rcu_torture_boost_failure = 0; 1486 n_rcu_torture_boost_failure = 0;
1492 n_rcu_torture_boosts = 0; 1487 n_rcu_torture_boosts = 0;
1493 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1488 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1624,6 +1619,7 @@ rcu_torture_init(void)
1624 } 1619 }
1625 } 1620 }
1626 register_reboot_notifier(&rcutorture_shutdown_nb); 1621 register_reboot_notifier(&rcutorture_shutdown_nb);
1622 rcutorture_record_test_transition();
1627 mutex_unlock(&fullstop_mutex); 1623 mutex_unlock(&fullstop_mutex);
1628 return 0; 1624 return 0;
1629 1625
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8e..f07d2f03181a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -47,6 +47,9 @@
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h> 49#include <linux/kernel_stat.h>
50#include <linux/wait.h>
51#include <linux/kthread.h>
52#include <linux/prefetch.h>
50 53
51#include "rcutree.h" 54#include "rcutree.h"
52 55
@@ -79,10 +82,41 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
79struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
80DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
81 84
85static struct rcu_state *rcu_state;
86
82int rcu_scheduler_active __read_mostly; 87int rcu_scheduler_active __read_mostly;
83EXPORT_SYMBOL_GPL(rcu_scheduler_active); 88EXPORT_SYMBOL_GPL(rcu_scheduler_active);
84 89
85/* 90/*
91 * Control variables for per-CPU and per-rcu_node kthreads. These
92 * handle all flavors of RCU.
93 */
94static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
95DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
96DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
97DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
98static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
99DEFINE_PER_CPU(char, rcu_cpu_has_work);
100static char rcu_kthreads_spawnable;
101
102static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
103static void invoke_rcu_cpu_kthread(void);
104
105#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
106
107/*
108 * Track the rcutorture test sequence number and the update version
109 * number within a given test. The rcutorture_testseq is incremented
110 * on every rcutorture module load and unload, so has an odd value
111 * when a test is running. The rcutorture_vernum is set to zero
112 * when rcutorture starts and is incremented on each rcutorture update.
113 * These variables enable correlating rcutorture output with the
114 * RCU tracing information.
115 */
116unsigned long rcutorture_testseq;
117unsigned long rcutorture_vernum;
118
119/*
86 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 120 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
87 * permit this function to be invoked without holding the root rcu_node 121 * permit this function to be invoked without holding the root rcu_node
88 * structure's ->lock, but of course results can be subject to change. 122 * structure's ->lock, but of course results can be subject to change.
@@ -124,6 +158,7 @@ void rcu_note_context_switch(int cpu)
124 rcu_sched_qs(cpu); 158 rcu_sched_qs(cpu);
125 rcu_preempt_note_context_switch(cpu); 159 rcu_preempt_note_context_switch(cpu);
126} 160}
161EXPORT_SYMBOL_GPL(rcu_note_context_switch);
127 162
128#ifdef CONFIG_NO_HZ 163#ifdef CONFIG_NO_HZ
129DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 164DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
@@ -140,10 +175,8 @@ module_param(blimit, int, 0);
140module_param(qhimark, int, 0); 175module_param(qhimark, int, 0);
141module_param(qlowmark, int, 0); 176module_param(qlowmark, int, 0);
142 177
143#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 178int rcu_cpu_stall_suppress __read_mostly;
144int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
145module_param(rcu_cpu_stall_suppress, int, 0644); 179module_param(rcu_cpu_stall_suppress, int, 0644);
146#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
147 180
148static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 181static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
149static int rcu_pending(int cpu); 182static int rcu_pending(int cpu);
@@ -176,6 +209,31 @@ void rcu_bh_force_quiescent_state(void)
176EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 209EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
177 210
178/* 211/*
212 * Record the number of times rcutorture tests have been initiated and
213 * terminated. This information allows the debugfs tracing stats to be
214 * correlated to the rcutorture messages, even when the rcutorture module
215 * is being repeatedly loaded and unloaded. In other words, we cannot
216 * store this state in rcutorture itself.
217 */
218void rcutorture_record_test_transition(void)
219{
220 rcutorture_testseq++;
221 rcutorture_vernum = 0;
222}
223EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
224
225/*
226 * Record the number of writer passes through the current rcutorture test.
227 * This is also used to correlate debugfs tracing stats with the rcutorture
228 * messages.
229 */
230void rcutorture_record_progress(unsigned long vernum)
231{
232 rcutorture_vernum++;
233}
234EXPORT_SYMBOL_GPL(rcutorture_record_progress);
235
236/*
179 * Force a quiescent state for RCU-sched. 237 * Force a quiescent state for RCU-sched.
180 */ 238 */
181void rcu_sched_force_quiescent_state(void) 239void rcu_sched_force_quiescent_state(void)
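The two helpers added above only bump counters; their value is that a reader of the tracing output can tell whether a torture test was in flight (odd rcutorture_testseq) and how many writer passes it had made (rcutorture_vernum). A throwaway userspace sketch of that decoding, using made-up counter values:

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical values as they might appear in trace output. */
            unsigned long testseq = 7;   /* odd => a test is in progress */
            unsigned long vernum = 42;   /* writer passes in that test   */

            if (testseq & 0x1)
                    printf("torture test #%lu running, writer pass %lu\n",
                           (testseq + 1) / 2, vernum);
            else
                    printf("no test running; %lu tests completed so far\n",
                           testseq / 2);
            return 0;
    }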
@@ -234,8 +292,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
234 return 1; 292 return 1;
235 } 293 }
236 294
237 /* If preemptable RCU, no point in sending reschedule IPI. */ 295 /* If preemptible RCU, no point in sending reschedule IPI. */
238 if (rdp->preemptable) 296 if (rdp->preemptible)
239 return 0; 297 return 0;
240 298
241 /* The CPU is online, so send it a reschedule IPI. */ 299 /* The CPU is online, so send it a reschedule IPI. */
@@ -450,8 +508,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 508
451#endif /* #else #ifdef CONFIG_NO_HZ */ 509#endif /* #else #ifdef CONFIG_NO_HZ */
452 510
453#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
454
455int rcu_cpu_stall_suppress __read_mostly; 511int rcu_cpu_stall_suppress __read_mostly;
456 512
457static void record_gp_stall_check_time(struct rcu_state *rsp) 513static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -537,21 +593,24 @@ static void print_cpu_stall(struct rcu_state *rsp)
537 593
538static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 594static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 595{
540 long delta; 596 unsigned long j;
597 unsigned long js;
541 struct rcu_node *rnp; 598 struct rcu_node *rnp;
542 599
543 if (rcu_cpu_stall_suppress) 600 if (rcu_cpu_stall_suppress)
544 return; 601 return;
545 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); 602 j = ACCESS_ONCE(jiffies);
603 js = ACCESS_ONCE(rsp->jiffies_stall);
546 rnp = rdp->mynode; 604 rnp = rdp->mynode;
547 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { 605 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
548 606
549 /* We haven't checked in, so go dump stack. */ 607 /* We haven't checked in, so go dump stack. */
550 print_cpu_stall(rsp); 608 print_cpu_stall(rsp);
551 609
552 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { 610 } else if (rcu_gp_in_progress(rsp) &&
611 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
553 612
554 /* They had two time units to dump stack, so complain. */ 613 /* They had a few time units to dump stack, so complain. */
555 print_other_cpu_stall(rsp); 614 print_other_cpu_stall(rsp);
556 } 615 }
557} 616}
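The stall check above now compares jiffies values with the wrap-safe ULONG_CMP_GE()/ULONG_CMP_LT() helpers instead of a signed delta. They are defined in <linux/rcupdate.h> as unsigned-subtraction comparisons (roughly ULONG_MAX / 2 >= (a) - (b) for the GE form); a self-contained userspace demonstration under that assumption:

    #include <assert.h>
    #include <limits.h>
    #include <stdio.h>

    /* Wrap-safe time comparisons, defined (approximately) as in <linux/rcupdate.h>. */
    #define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
    #define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

    int main(void)
    {
            unsigned long js = ULONG_MAX - 2;  /* stall deadline set just before the counter wraps */
            unsigned long j  = 3;              /* current jiffies, after the wrap */

            /* Unsigned subtraction sees j as 6 ticks past js, despite the wrap... */
            assert(j - js == 6);
            /* ...so the deadline is treated as expired, which is the desired answer. */
            assert(ULONG_CMP_GE(j, js));
            assert(ULONG_CMP_LT(js, j));
            printf("deadline expired %lu ticks ago\n", j - js);
            return 0;
    }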
@@ -587,26 +646,6 @@ static void __init check_cpu_stall_init(void)
587 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); 646 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
588} 647}
589 648
590#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
591
592static void record_gp_stall_check_time(struct rcu_state *rsp)
593{
594}
595
596static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
597{
598}
599
600void rcu_cpu_stall_reset(void)
601{
602}
603
604static void __init check_cpu_stall_init(void)
605{
606}
607
608#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
609
610/* 649/*
611 * Update CPU-local rcu_data state to record the newly noticed grace period. 650 * Update CPU-local rcu_data state to record the newly noticed grace period.
612 * This is used both when we started the grace period and when we notice 651 * This is used both when we started the grace period and when we notice
@@ -809,6 +848,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
809 rnp->completed = rsp->completed; 848 rnp->completed = rsp->completed;
810 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 849 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
811 rcu_start_gp_per_cpu(rsp, rnp, rdp); 850 rcu_start_gp_per_cpu(rsp, rnp, rdp);
851 rcu_preempt_boost_start_gp(rnp);
812 raw_spin_unlock_irqrestore(&rnp->lock, flags); 852 raw_spin_unlock_irqrestore(&rnp->lock, flags);
813 return; 853 return;
814 } 854 }
@@ -844,6 +884,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
844 rnp->completed = rsp->completed; 884 rnp->completed = rsp->completed;
845 if (rnp == rdp->mynode) 885 if (rnp == rdp->mynode)
846 rcu_start_gp_per_cpu(rsp, rnp, rdp); 886 rcu_start_gp_per_cpu(rsp, rnp, rdp);
887 rcu_preempt_boost_start_gp(rnp);
847 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 888 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
848 } 889 }
849 890
@@ -864,7 +905,12 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
864static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 905static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
865 __releases(rcu_get_root(rsp)->lock) 906 __releases(rcu_get_root(rsp)->lock)
866{ 907{
908 unsigned long gp_duration;
909
867 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 910 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
911 gp_duration = jiffies - rsp->gp_start;
912 if (gp_duration > rsp->gp_max)
913 rsp->gp_max = gp_duration;
868 rsp->completed = rsp->gpnum; 914 rsp->completed = rsp->gpnum;
869 rsp->signaled = RCU_GP_IDLE; 915 rsp->signaled = RCU_GP_IDLE;
870 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 916 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -894,7 +940,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
894 return; 940 return;
895 } 941 }
896 rnp->qsmask &= ~mask; 942 rnp->qsmask &= ~mask;
897 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 943 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
898 944
899 /* Other bits still set at this level, so done. */ 945 /* Other bits still set at this level, so done. */
900 raw_spin_unlock_irqrestore(&rnp->lock, flags); 946 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1037,6 +1083,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1037/* 1083/*
1038 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1084 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1039 * and move all callbacks from the outgoing CPU to the current one. 1085 * and move all callbacks from the outgoing CPU to the current one.
1086 * There can only be one CPU hotplug operation at a time, so no other
1087 * CPU can be attempting to update rcu_cpu_kthread_task.
1040 */ 1088 */
1041static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1089static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1042{ 1090{
@@ -1045,6 +1093,14 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1045 int need_report = 0; 1093 int need_report = 0;
1046 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1094 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1047 struct rcu_node *rnp; 1095 struct rcu_node *rnp;
1096 struct task_struct *t;
1097
1098 /* Stop the CPU's kthread. */
1099 t = per_cpu(rcu_cpu_kthread_task, cpu);
1100 if (t != NULL) {
1101 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1102 kthread_stop(t);
1103 }
1048 1104
1049 /* Exclude any attempts to start a new grace period. */ 1105 /* Exclude any attempts to start a new grace period. */
1050 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1106 raw_spin_lock_irqsave(&rsp->onofflock, flags);
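The offline path above now tears the per-CPU kthread down explicitly before callbacks are migrated. As a reminder of the generic lifecycle those calls follow, an illustrative fragment with hypothetical demo_* names (not part of the patch): kthread_stop() sets the should-stop flag, wakes the thread, and waits for its function to return.

    #include <linux/err.h>
    #include <linux/jiffies.h>
    #include <linux/kthread.h>
    #include <linux/sched.h>

    static struct task_struct *demo_task;

    static int demo_thread_fn(void *unused)
    {
            /* Exit only once kthread_stop() has been called on us. */
            while (!kthread_should_stop())
                    schedule_timeout_interruptible(HZ);
            return 0;
    }

    static int demo_start(void)
    {
            demo_task = kthread_create(demo_thread_fn, NULL, "demo");
            if (IS_ERR(demo_task))
                    return PTR_ERR(demo_task);
            wake_up_process(demo_task);
            return 0;
    }

    static void demo_stop(void)
    {
            if (demo_task) {
                    kthread_stop(demo_task);  /* blocks until demo_thread_fn() returns */
                    demo_task = NULL;
            }
    }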
@@ -1082,6 +1138,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1082 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1138 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1083 if (need_report & RCU_OFL_TASKS_EXP_GP) 1139 if (need_report & RCU_OFL_TASKS_EXP_GP)
1084 rcu_report_exp_rnp(rsp, rnp); 1140 rcu_report_exp_rnp(rsp, rnp);
1141 rcu_node_kthread_setaffinity(rnp, -1);
1085} 1142}
1086 1143
1087/* 1144/*
@@ -1143,7 +1200,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1143 next = list->next; 1200 next = list->next;
1144 prefetch(next); 1201 prefetch(next);
1145 debug_rcu_head_unqueue(list); 1202 debug_rcu_head_unqueue(list);
1146 list->func(list); 1203 __rcu_reclaim(list);
1147 list = next; 1204 list = next;
1148 if (++count >= rdp->blimit) 1205 if (++count >= rdp->blimit)
1149 break; 1206 break;
@@ -1179,7 +1236,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1179 1236
1180 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1237 /* Re-raise the RCU softirq if there are callbacks remaining. */
1181 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1238 if (cpu_has_callbacks_ready_to_invoke(rdp))
1182 raise_softirq(RCU_SOFTIRQ); 1239 invoke_rcu_cpu_kthread();
1183} 1240}
1184 1241
1185/* 1242/*
@@ -1225,7 +1282,7 @@ void rcu_check_callbacks(int cpu, int user)
1225 } 1282 }
1226 rcu_preempt_check_callbacks(cpu); 1283 rcu_preempt_check_callbacks(cpu);
1227 if (rcu_pending(cpu)) 1284 if (rcu_pending(cpu))
1228 raise_softirq(RCU_SOFTIRQ); 1285 invoke_rcu_cpu_kthread();
1229} 1286}
1230 1287
1231#ifdef CONFIG_SMP 1288#ifdef CONFIG_SMP
@@ -1233,6 +1290,8 @@ void rcu_check_callbacks(int cpu, int user)
1233/* 1290/*
1234 * Scan the leaf rcu_node structures, processing dyntick state for any that 1291 * Scan the leaf rcu_node structures, processing dyntick state for any that
1235 * have not yet encountered a quiescent state, using the function specified. 1292 * have not yet encountered a quiescent state, using the function specified.
1293 * Also initiate boosting for any threads blocked on the root rcu_node.
1294 *
1236 * The caller must have suppressed start of new grace periods. 1295 * The caller must have suppressed start of new grace periods.
1237 */ 1296 */
1238static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 1297static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
@@ -1251,7 +1310,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1251 return; 1310 return;
1252 } 1311 }
1253 if (rnp->qsmask == 0) { 1312 if (rnp->qsmask == 0) {
1254 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1313 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
1255 continue; 1314 continue;
1256 } 1315 }
1257 cpu = rnp->grplo; 1316 cpu = rnp->grplo;
@@ -1269,6 +1328,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1269 } 1328 }
1270 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1329 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1271 } 1330 }
1331 rnp = rcu_get_root(rsp);
1332 if (rnp->qsmask == 0) {
1333 raw_spin_lock_irqsave(&rnp->lock, flags);
1334 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1335 }
1272} 1336}
1273 1337
1274/* 1338/*
@@ -1389,7 +1453,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1389/* 1453/*
1390 * Do softirq processing for the current CPU. 1454 * Do softirq processing for the current CPU.
1391 */ 1455 */
1392static void rcu_process_callbacks(struct softirq_action *unused) 1456static void rcu_process_callbacks(void)
1393{ 1457{
1394 /* 1458 /*
1395 * Memory references from any prior RCU read-side critical sections 1459 * Memory references from any prior RCU read-side critical sections
@@ -1414,6 +1478,347 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1414 rcu_needs_cpu_flush(); 1478 rcu_needs_cpu_flush();
1415} 1479}
1416 1480
1481/*
1482 * Wake up the current CPU's kthread. This replaces raise_softirq()
1483 * in earlier versions of RCU. Note that because we are running on
1484 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1485 * cannot disappear out from under us.
1486 */
1487static void invoke_rcu_cpu_kthread(void)
1488{
1489 unsigned long flags;
1490
1491 local_irq_save(flags);
1492 __this_cpu_write(rcu_cpu_has_work, 1);
1493 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1494 local_irq_restore(flags);
1495 return;
1496 }
1497 wake_up(&__get_cpu_var(rcu_cpu_wq));
1498 local_irq_restore(flags);
1499}
1500
1501/*
1502 * Wake up the specified per-rcu_node-structure kthread.
1503 * Because the per-rcu_node kthreads are immortal, we don't need
1504 * to do anything to keep them alive.
1505 */
1506static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1507{
1508 struct task_struct *t;
1509
1510 t = rnp->node_kthread_task;
1511 if (t != NULL)
1512 wake_up_process(t);
1513}
1514
1515/*
1516 * Set the specified CPU's kthread to run RT or not, as specified by
1517 * the to_rt argument. The CPU-hotplug locks are held, so the task
1518 * is not going away.
1519 */
1520static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1521{
1522 int policy;
1523 struct sched_param sp;
1524 struct task_struct *t;
1525
1526 t = per_cpu(rcu_cpu_kthread_task, cpu);
1527 if (t == NULL)
1528 return;
1529 if (to_rt) {
1530 policy = SCHED_FIFO;
1531 sp.sched_priority = RCU_KTHREAD_PRIO;
1532 } else {
1533 policy = SCHED_NORMAL;
1534 sp.sched_priority = 0;
1535 }
1536 sched_setscheduler_nocheck(t, policy, &sp);
1537}
1538
1539/*
1540 * Timer handler to initiate the waking up of per-CPU kthreads that
1541 * have yielded the CPU due to excess numbers of RCU callbacks.
1542 * We wake up the per-rcu_node kthread, which in turn will wake up
1543 * the booster kthread.
1544 */
1545static void rcu_cpu_kthread_timer(unsigned long arg)
1546{
1547 unsigned long flags;
1548 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1549 struct rcu_node *rnp = rdp->mynode;
1550
1551 raw_spin_lock_irqsave(&rnp->lock, flags);
1552 rnp->wakemask |= rdp->grpmask;
1553 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1554 invoke_rcu_node_kthread(rnp);
1555}
1556
1557/*
1558 * Drop to non-real-time priority and yield, but only after posting a
1559 * timer that will cause us to regain our real-time priority if we
1560 * remain preempted. Either way, we restore our real-time priority
1561 * before returning.
1562 */
1563static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1564{
1565 struct sched_param sp;
1566 struct timer_list yield_timer;
1567
1568 setup_timer_on_stack(&yield_timer, f, arg);
1569 mod_timer(&yield_timer, jiffies + 2);
1570 sp.sched_priority = 0;
1571 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1572 set_user_nice(current, 19);
1573 schedule();
1574 sp.sched_priority = RCU_KTHREAD_PRIO;
1575 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1576 del_timer(&yield_timer);
1577}
1578
1579/*
1580 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1581 * This can happen while the corresponding CPU is either coming online
1582 * or going offline. We cannot wait until the CPU is fully online
1583 * before starting the kthread, because the various notifier functions
1584 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1585 * the corresponding CPU is online.
1586 *
1587 * Return 1 if the kthread needs to stop, 0 otherwise.
1588 *
1589 * Caller must disable bh. This function can momentarily enable it.
1590 */
1591static int rcu_cpu_kthread_should_stop(int cpu)
1592{
1593 while (cpu_is_offline(cpu) ||
1594 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1595 smp_processor_id() != cpu) {
1596 if (kthread_should_stop())
1597 return 1;
1598 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1599 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1600 local_bh_enable();
1601 schedule_timeout_uninterruptible(1);
1602 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1603 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1604 local_bh_disable();
1605 }
1606 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1607 return 0;
1608}
1609
1610/*
1611 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1612 * earlier RCU softirq.
1613 */
1614static int rcu_cpu_kthread(void *arg)
1615{
1616 int cpu = (int)(long)arg;
1617 unsigned long flags;
1618 int spincnt = 0;
1619 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1620 wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
1621 char work;
1622 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1623
1624 for (;;) {
1625 *statusp = RCU_KTHREAD_WAITING;
1626 wait_event_interruptible(*wqp,
1627 *workp != 0 || kthread_should_stop());
1628 local_bh_disable();
1629 if (rcu_cpu_kthread_should_stop(cpu)) {
1630 local_bh_enable();
1631 break;
1632 }
1633 *statusp = RCU_KTHREAD_RUNNING;
1634 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1635 local_irq_save(flags);
1636 work = *workp;
1637 *workp = 0;
1638 local_irq_restore(flags);
1639 if (work)
1640 rcu_process_callbacks();
1641 local_bh_enable();
1642 if (*workp != 0)
1643 spincnt++;
1644 else
1645 spincnt = 0;
1646 if (spincnt > 10) {
1647 *statusp = RCU_KTHREAD_YIELDING;
1648 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1649 spincnt = 0;
1650 }
1651 }
1652 *statusp = RCU_KTHREAD_STOPPED;
1653 return 0;
1654}
1655
1656/*
1657 * Spawn a per-CPU kthread, setting up affinity and priority.
1658 * Because the CPU hotplug lock is held, no other CPU will be attempting
1659 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1660 * attempting to access it during boot, but the locking in kthread_bind()
1661 * will enforce sufficient ordering.
1662 */
1663static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1664{
1665 struct sched_param sp;
1666 struct task_struct *t;
1667
1668 if (!rcu_kthreads_spawnable ||
1669 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1670 return 0;
1671 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1672 if (IS_ERR(t))
1673 return PTR_ERR(t);
1674 kthread_bind(t, cpu);
1675 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1676 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1677 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1678 wake_up_process(t);
1679 sp.sched_priority = RCU_KTHREAD_PRIO;
1680 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1681 return 0;
1682}
1683
1684/*
1685 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1686 * kthreads when needed. We ignore requests to wake up kthreads
1687 * for offline CPUs, which is OK because force_quiescent_state()
1688 * takes care of this case.
1689 */
1690static int rcu_node_kthread(void *arg)
1691{
1692 int cpu;
1693 unsigned long flags;
1694 unsigned long mask;
1695 struct rcu_node *rnp = (struct rcu_node *)arg;
1696 struct sched_param sp;
1697 struct task_struct *t;
1698
1699 for (;;) {
1700 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1701 wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0);
1702 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1703 raw_spin_lock_irqsave(&rnp->lock, flags);
1704 mask = rnp->wakemask;
1705 rnp->wakemask = 0;
1706 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1707 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1708 if ((mask & 0x1) == 0)
1709 continue;
1710 preempt_disable();
1711 t = per_cpu(rcu_cpu_kthread_task, cpu);
1712 if (!cpu_online(cpu) || t == NULL) {
1713 preempt_enable();
1714 continue;
1715 }
1716 per_cpu(rcu_cpu_has_work, cpu) = 1;
1717 sp.sched_priority = RCU_KTHREAD_PRIO;
1718 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1719 preempt_enable();
1720 }
1721 }
1722 /* NOTREACHED */
1723 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1724 return 0;
1725}
1726
1727/*
1728 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1729 * served by the rcu_node in question. The CPU hotplug lock is still
1730 * held, so the value of rnp->qsmaskinit will be stable.
1731 *
1732 * We don't include outgoingcpu in the affinity set, use -1 if there is
1733 * no outgoing CPU. If there are no CPUs left in the affinity set,
1734 * this function allows the kthread to execute on any CPU.
1735 */
1736static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1737{
1738 cpumask_var_t cm;
1739 int cpu;
1740 unsigned long mask = rnp->qsmaskinit;
1741
1742 if (rnp->node_kthread_task == NULL)
1743 return;
1744 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1745 return;
1746 cpumask_clear(cm);
1747 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1748 if ((mask & 0x1) && cpu != outgoingcpu)
1749 cpumask_set_cpu(cpu, cm);
1750 if (cpumask_weight(cm) == 0) {
1751 cpumask_setall(cm);
1752 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1753 cpumask_clear_cpu(cpu, cm);
1754 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1755 }
1756 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1757 rcu_boost_kthread_setaffinity(rnp, cm);
1758 free_cpumask_var(cm);
1759}
1760
1761/*
1762 * Spawn a per-rcu_node kthread, setting priority and affinity.
1763 * Called during boot before online/offline can happen, or, if
1764 * during runtime, with the main CPU-hotplug locks held. So only
1765 * one of these can be executing at a time.
1766 */
1767static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1768 struct rcu_node *rnp)
1769{
1770 unsigned long flags;
1771 int rnp_index = rnp - &rsp->node[0];
1772 struct sched_param sp;
1773 struct task_struct *t;
1774
1775 if (!rcu_kthreads_spawnable ||
1776 rnp->qsmaskinit == 0)
1777 return 0;
1778 if (rnp->node_kthread_task == NULL) {
1779 t = kthread_create(rcu_node_kthread, (void *)rnp,
1780 "rcun%d", rnp_index);
1781 if (IS_ERR(t))
1782 return PTR_ERR(t);
1783 raw_spin_lock_irqsave(&rnp->lock, flags);
1784 rnp->node_kthread_task = t;
1785 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1786 wake_up_process(t);
1787 sp.sched_priority = 99;
1788 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1789 }
1790 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1791}
1792
1793/*
1794 * Spawn all kthreads -- called as soon as the scheduler is running.
1795 */
1796static int __init rcu_spawn_kthreads(void)
1797{
1798 int cpu;
1799 struct rcu_node *rnp;
1800
1801 rcu_kthreads_spawnable = 1;
1802 for_each_possible_cpu(cpu) {
1803 init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
1804 per_cpu(rcu_cpu_has_work, cpu) = 0;
1805 if (cpu_online(cpu))
1806 (void)rcu_spawn_one_cpu_kthread(cpu);
1807 }
1808 rnp = rcu_get_root(rcu_state);
1809 init_waitqueue_head(&rnp->node_wq);
1810 rcu_init_boost_waitqueue(rnp);
1811 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1812 if (NUM_RCU_NODES > 1)
1813 rcu_for_each_leaf_node(rcu_state, rnp) {
1814 init_waitqueue_head(&rnp->node_wq);
1815 rcu_init_boost_waitqueue(rnp);
1816 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1817 }
1818 return 0;
1819}
1820early_initcall(rcu_spawn_kthreads);
1821
1417static void 1822static void
1418__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1823__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1419 struct rcu_state *rsp) 1824 struct rcu_state *rsp)
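The large block above replaces RCU's softirq with per-CPU and per-node kthreads: callers set rcu_cpu_has_work and wake the per-CPU wait queue, and rcu_cpu_kthread() loops on wait_event_interruptible(), yielding through rcu_yield() when it keeps finding work. A condensed, illustrative sketch of that producer/consumer handshake follows; it uses a single global instance and hypothetical demo_* names, whereas the real code uses per-CPU state and updates the flag with interrupts disabled.

    #include <linux/kthread.h>
    #include <linux/sched.h>
    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
    static int demo_has_work;

    /* Producer side, analogous to invoke_rcu_cpu_kthread(). */
    static void demo_kick(void)
    {
            demo_has_work = 1;
            wake_up(&demo_wq);      /* harmless if the worker is already running */
    }

    /* Consumer side, analogous to the body of rcu_cpu_kthread(). */
    static int demo_worker(void *unused)
    {
            while (!kthread_should_stop()) {
                    wait_event_interruptible(demo_wq,
                                             demo_has_work || kthread_should_stop());
                    demo_has_work = 0;
                    /* ... invoke the pending callbacks here ... */
            }
            return 0;
    }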
@@ -1439,6 +1844,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1439 /* Add the callback to our list. */ 1844 /* Add the callback to our list. */
1440 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1845 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1846 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1847 rdp->qlen++;
1848
1849 /* If interrupts were disabled, don't dive into RCU core. */
1850 if (irqs_disabled_flags(flags)) {
1851 local_irq_restore(flags);
1852 return;
1853 }
1442 1854
1443 /* 1855 /*
1444 * Force the grace period if too many callbacks or too long waiting. 1856 * Force the grace period if too many callbacks or too long waiting.
@@ -1447,7 +1859,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1447 * invoking force_quiescent_state() if the newly enqueued callback 1859 * invoking force_quiescent_state() if the newly enqueued callback
1448 * is the only one waiting for a grace period to complete. 1860 * is the only one waiting for a grace period to complete.
1449 */ 1861 */
1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1862 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1451 1863
1452 /* Are we ignoring a completed grace period? */ 1864 /* Are we ignoring a completed grace period? */
1453 rcu_process_gp_end(rsp, rdp); 1865 rcu_process_gp_end(rsp, rdp);
@@ -1583,7 +1995,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1583 * or RCU-bh, force a local reschedule. 1995 * or RCU-bh, force a local reschedule.
1584 */ 1996 */
1585 rdp->n_rp_qs_pending++; 1997 rdp->n_rp_qs_pending++;
1586 if (!rdp->preemptable && 1998 if (!rdp->preemptible &&
1587 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1999 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1588 jiffies)) 2000 jiffies))
1589 set_need_resched(); 2001 set_need_resched();
@@ -1760,7 +2172,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1760 * that this CPU cannot possibly have any RCU callbacks in flight yet. 2172 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1761 */ 2173 */
1762static void __cpuinit 2174static void __cpuinit
1763rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 2175rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1764{ 2176{
1765 unsigned long flags; 2177 unsigned long flags;
1766 unsigned long mask; 2178 unsigned long mask;
@@ -1772,7 +2184,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1772 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 2184 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1773 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 2185 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1774 rdp->beenonline = 1; /* We have now been online. */ 2186 rdp->beenonline = 1; /* We have now been online. */
1775 rdp->preemptable = preemptable; 2187 rdp->preemptible = preemptible;
1776 rdp->qlen_last_fqs_check = 0; 2188 rdp->qlen_last_fqs_check = 0;
1777 rdp->n_force_qs_snap = rsp->n_force_qs; 2189 rdp->n_force_qs_snap = rsp->n_force_qs;
1778 rdp->blimit = blimit; 2190 rdp->blimit = blimit;
@@ -1813,6 +2225,19 @@ static void __cpuinit rcu_online_cpu(int cpu)
1813 rcu_preempt_init_percpu_data(cpu); 2225 rcu_preempt_init_percpu_data(cpu);
1814} 2226}
1815 2227
2228static void __cpuinit rcu_online_kthreads(int cpu)
2229{
2230 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2231 struct rcu_node *rnp = rdp->mynode;
2232
2233 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
2234 if (rcu_kthreads_spawnable) {
2235 (void)rcu_spawn_one_cpu_kthread(cpu);
2236 if (rnp->node_kthread_task == NULL)
2237 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
2238 }
2239}
2240
1816/* 2241/*
1817 * Handle CPU online/offline notification events. 2242 * Handle CPU online/offline notification events.
1818 */ 2243 */
@@ -1820,11 +2245,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1820 unsigned long action, void *hcpu) 2245 unsigned long action, void *hcpu)
1821{ 2246{
1822 long cpu = (long)hcpu; 2247 long cpu = (long)hcpu;
2248 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2249 struct rcu_node *rnp = rdp->mynode;
1823 2250
1824 switch (action) { 2251 switch (action) {
1825 case CPU_UP_PREPARE: 2252 case CPU_UP_PREPARE:
1826 case CPU_UP_PREPARE_FROZEN: 2253 case CPU_UP_PREPARE_FROZEN:
1827 rcu_online_cpu(cpu); 2254 rcu_online_cpu(cpu);
2255 rcu_online_kthreads(cpu);
2256 break;
2257 case CPU_ONLINE:
2258 case CPU_DOWN_FAILED:
2259 rcu_node_kthread_setaffinity(rnp, -1);
2260 rcu_cpu_kthread_setrt(cpu, 1);
2261 break;
2262 case CPU_DOWN_PREPARE:
2263 rcu_node_kthread_setaffinity(rnp, cpu);
2264 rcu_cpu_kthread_setrt(cpu, 0);
1828 break; 2265 break;
1829 case CPU_DYING: 2266 case CPU_DYING:
1830 case CPU_DYING_FROZEN: 2267 case CPU_DYING_FROZEN:
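The notifier above now reacts to more lifecycle events: kthreads are spawned at CPU_UP_PREPARE, switched to real-time priority and given full node affinity once the CPU is actually online (or a down attempt fails), and bound away from a CPU that is about to go down. For readers unfamiliar with the pre-cpuhp notifier API used here, a minimal illustrative notifier with hypothetical demo_* names and the same action plumbing:

    #include <linux/cpu.h>
    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/notifier.h>

    static int demo_cpu_notify(struct notifier_block *self,
                               unsigned long action, void *hcpu)
    {
            long cpu = (long)hcpu;

            switch (action) {
            case CPU_ONLINE:
            case CPU_DOWN_FAILED:
                    pr_info("demo: CPU %ld is online\n", cpu);
                    break;
            case CPU_DOWN_PREPARE:
                    pr_info("demo: CPU %ld is about to go offline\n", cpu);
                    break;
            }
            return NOTIFY_OK;
    }

    static struct notifier_block demo_cpu_nb = {
            .notifier_call = demo_cpu_notify,
    };

    static int __init demo_notifier_init(void)
    {
            register_cpu_notifier(&demo_cpu_nb);
            return 0;
    }
    early_initcall(demo_notifier_init);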
@@ -1943,10 +2380,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
1943 j / rsp->levelspread[i - 1]; 2380 j / rsp->levelspread[i - 1];
1944 } 2381 }
1945 rnp->level = i; 2382 rnp->level = i;
1946 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 2383 INIT_LIST_HEAD(&rnp->blkd_tasks);
1947 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1948 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1949 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1950 } 2384 }
1951 } 2385 }
1952 2386
@@ -1968,7 +2402,6 @@ void __init rcu_init(void)
1968 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2402 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1969 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2403 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1970 __rcu_init_preempt(); 2404 __rcu_init_preempt();
1971 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1972 2405
1973 /* 2406 /*
1974 * We don't need protection against CPU-hotplug here because 2407 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index e8f057e44e3e..257664815d5d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -91,6 +91,14 @@ struct rcu_dynticks {
91 /* remains even for nmi from irq handler. */ 91 /* remains even for nmi from irq handler. */
92}; 92};
93 93
94/* RCU's kthread states for tracing. */
95#define RCU_KTHREAD_STOPPED 0
96#define RCU_KTHREAD_RUNNING 1
97#define RCU_KTHREAD_WAITING 2
98#define RCU_KTHREAD_OFFCPU 3
99#define RCU_KTHREAD_YIELDING 4
100#define RCU_KTHREAD_MAX 4
101
94/* 102/*
95 * Definition for node within the RCU grace-period-detection hierarchy. 103 * Definition for node within the RCU grace-period-detection hierarchy.
96 */ 104 */
@@ -109,10 +117,11 @@ struct rcu_node {
109 /* an rcu_data structure, otherwise, each */ 117 /* an rcu_data structure, otherwise, each */
110 /* bit corresponds to a child rcu_node */ 118 /* bit corresponds to a child rcu_node */
111 /* structure. */ 119 /* structure. */
112 unsigned long expmask; /* Groups that have ->blocked_tasks[] */ 120 unsigned long expmask; /* Groups that have ->blkd_tasks */
113 /* elements that need to drain to allow the */ 121 /* elements that need to drain to allow the */
114 /* current expedited grace period to */ 122 /* current expedited grace period to */
115 /* complete (only for TREE_PREEMPT_RCU). */ 123 /* complete (only for TREE_PREEMPT_RCU). */
124 unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */
116 unsigned long qsmaskinit; 125 unsigned long qsmaskinit;
117 /* Per-GP initial value for qsmask & expmask. */ 126 /* Per-GP initial value for qsmask & expmask. */
118 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 127 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -122,11 +131,68 @@ struct rcu_node {
122 u8 grpnum; /* CPU/group number for next level up. */ 131 u8 grpnum; /* CPU/group number for next level up. */
123 u8 level; /* root is at level 0. */ 132 u8 level; /* root is at level 0. */
124 struct rcu_node *parent; 133 struct rcu_node *parent;
125 struct list_head blocked_tasks[4]; 134 struct list_head blkd_tasks;
126 /* Tasks blocked in RCU read-side critsect. */ 135 /* Tasks blocked in RCU read-side critical */
127 /* Grace period number (->gpnum) x blocked */ 136 /* section. Tasks are placed at the head */
128 /* by tasks on the (x & 0x1) element of the */ 137 /* of this list and age towards the tail. */
129 /* blocked_tasks[] array. */ 138 struct list_head *gp_tasks;
139 /* Pointer to the first task blocking the */
140 /* current grace period, or NULL if there */
141 /* is no such task. */
142 struct list_head *exp_tasks;
143 /* Pointer to the first task blocking the */
144 /* current expedited grace period, or NULL */
145 /* if there is no such task. If there */
146 /* is no current expedited grace period, */
147 /* then there cannot be any such task. */

148#ifdef CONFIG_RCU_BOOST
149 struct list_head *boost_tasks;
150 /* Pointer to first task that needs to be */
151 /* priority boosted, or NULL if no priority */
152 /* boosting is needed for this rcu_node */
153 /* structure. If there are no tasks */
154 /* queued on this rcu_node structure that */
155 /* are blocking the current grace period, */
156 /* there can be no such task. */
157 unsigned long boost_time;
158 /* When to start boosting (jiffies). */
159 struct task_struct *boost_kthread_task;
160 /* kthread that takes care of priority */
161 /* boosting for this rcu_node structure. */
162 wait_queue_head_t boost_wq;
163 /* Wait queue on which to park the boost */
164 /* kthread. */
165 unsigned int boost_kthread_status;
166 /* State of boost_kthread_task for tracing. */
167 unsigned long n_tasks_boosted;
168 /* Total number of tasks boosted. */
169 unsigned long n_exp_boosts;
170 /* Number of tasks boosted for expedited GP. */
171 unsigned long n_normal_boosts;
172 /* Number of tasks boosted for normal GP. */
173 unsigned long n_balk_blkd_tasks;
174 /* Refused to boost: no blocked tasks. */
175 unsigned long n_balk_exp_gp_tasks;
176 /* Refused to boost: nothing blocking GP. */
177 unsigned long n_balk_boost_tasks;
178 /* Refused to boost: already boosting. */
179 unsigned long n_balk_notblocked;
180 /* Refused to boost: RCU RS CS still running. */
181 unsigned long n_balk_notyet;
182 /* Refused to boost: not yet time. */
183 unsigned long n_balk_nos;
184 /* Refused to boost: not sure why, though. */
185 /* This can happen due to race conditions. */
186#endif /* #ifdef CONFIG_RCU_BOOST */
187 struct task_struct *node_kthread_task;
188 /* kthread that takes care of this rcu_node */
189 /* structure, for example, awakening the */
190 /* per-CPU kthreads as needed. */
191 wait_queue_head_t node_wq;
192 /* Wait queue on which to park the per-node */
193 /* kthread. */
194 unsigned int node_kthread_status;
195 /* State of node_kthread_task for tracing. */
130} ____cacheline_internodealigned_in_smp; 196} ____cacheline_internodealigned_in_smp;
131 197
132/* 198/*
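The four blocked_tasks[] lists are replaced above by a single ->blkd_tasks list whose ordering carries the same information: tasks are queued at (or near) the head and age toward the tail, and every entry from ->gp_tasks to the tail blocks the current grace period (->exp_tasks and ->boost_tasks mark the analogous boundaries for expedited grace periods and for boosting). A small userspace sketch of that invariant, using mock list primitives rather than the kernel's:

    #include <assert.h>

    /* Userspace stand-ins for the kernel's list primitives, illustration only. */
    struct list_head { struct list_head *next, *prev; };

    static void list_init(struct list_head *h) { h->next = h->prev = h; }

    static void list_add(struct list_head *n, struct list_head *head)
    {
            n->next = head->next;
            n->prev = head;
            head->next->prev = n;
            head->next = n;
    }

    int main(void)
    {
            struct list_head blkd_tasks;       /* newest at head, oldest at tail  */
            struct list_head *gp_tasks;        /* first entry blocking current GP */
            struct list_head t1, t2, t3;

            list_init(&blkd_tasks);

            /* t1 blocks while its CPU still owes a quiescent state: it goes to
             * the head and becomes the ->gp_tasks boundary. */
            list_add(&t1, &blkd_tasks);
            gp_tasks = &t1;

            /* t2 blocks after its CPU already passed its quiescent state: plain
             * head add, the current grace period need not wait for it. */
            list_add(&t2, &blkd_tasks);

            /* t3 must also block the current grace period: insert it just ahead
             * of the boundary and move the boundary to it. */
            list_add(&t3, gp_tasks->prev);
            gp_tasks = &t3;

            /* Invariant: from gp_tasks toward the tail we see only GP-blocking
             * entries (t3, then t1); t2 sits before the boundary. */
            assert(gp_tasks == &t3);
            assert(t3.next == &t1);
            assert(t1.next == &blkd_tasks);
            return 0;
    }

The "insert just ahead of the boundary" case mirrors what rcu_preempt_note_context_switch() does further down in this patch.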
@@ -175,7 +241,7 @@ struct rcu_data {
175 bool passed_quiesc; /* User-mode/idle loop etc. */ 241 bool passed_quiesc; /* User-mode/idle loop etc. */
176 bool qs_pending; /* Core waits for quiesc state. */ 242 bool qs_pending; /* Core waits for quiesc state. */
177 bool beenonline; /* CPU online at least once. */ 243 bool beenonline; /* CPU online at least once. */
178 bool preemptable; /* Preemptable RCU? */ 244 bool preemptible; /* Preemptible RCU? */
179 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 245 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
180 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 246 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
181 247
@@ -254,7 +320,6 @@ struct rcu_data {
254#endif /* #else #ifdef CONFIG_NO_HZ */ 320#endif /* #else #ifdef CONFIG_NO_HZ */
255 321
256#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 322#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
257#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
258 323
259#ifdef CONFIG_PROVE_RCU 324#ifdef CONFIG_PROVE_RCU
260#define RCU_STALL_DELAY_DELTA (5 * HZ) 325#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -272,13 +337,6 @@ struct rcu_data {
272 /* scheduling clock irq */ 337 /* scheduling clock irq */
273 /* before ratting on them. */ 338 /* before ratting on them. */
274 339
275#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
276#define RCU_CPU_STALL_SUPPRESS_INIT 0
277#else
278#define RCU_CPU_STALL_SUPPRESS_INIT 1
279#endif
280
281#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
282 340
283/* 341/*
284 * RCU global state, including node hierarchy. This hierarchy is 342 * RCU global state, including node hierarchy. This hierarchy is
@@ -325,12 +383,12 @@ struct rcu_state {
325 /* due to lock unavailable. */ 383 /* due to lock unavailable. */
326 unsigned long n_force_qs_ngp; /* Number of calls leaving */ 384 unsigned long n_force_qs_ngp; /* Number of calls leaving */
327 /* due to no GP active. */ 385 /* due to no GP active. */
328#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
329 unsigned long gp_start; /* Time at which GP started, */ 386 unsigned long gp_start; /* Time at which GP started, */
330 /* but in jiffies. */ 387 /* but in jiffies. */
331 unsigned long jiffies_stall; /* Time at which to check */ 388 unsigned long jiffies_stall; /* Time at which to check */
332 /* for CPU stalls. */ 389 /* for CPU stalls. */
333#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 390 unsigned long gp_max; /* Maximum GP duration in */
391 /* jiffies. */
334 char *name; /* Name of structure. */ 392 char *name; /* Name of structure. */
335}; 393};
336 394
@@ -361,16 +419,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
361static void rcu_bootup_announce(void); 419static void rcu_bootup_announce(void);
362long rcu_batches_completed(void); 420long rcu_batches_completed(void);
363static void rcu_preempt_note_context_switch(int cpu); 421static void rcu_preempt_note_context_switch(int cpu);
364static int rcu_preempted_readers(struct rcu_node *rnp); 422static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
365#ifdef CONFIG_HOTPLUG_CPU 423#ifdef CONFIG_HOTPLUG_CPU
366static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 424static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
367 unsigned long flags); 425 unsigned long flags);
368#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 426#endif /* #ifdef CONFIG_HOTPLUG_CPU */
369#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
370static void rcu_print_detail_task_stall(struct rcu_state *rsp); 427static void rcu_print_detail_task_stall(struct rcu_state *rsp);
371static void rcu_print_task_stall(struct rcu_node *rnp); 428static void rcu_print_task_stall(struct rcu_node *rnp);
372static void rcu_preempt_stall_reset(void); 429static void rcu_preempt_stall_reset(void);
373#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
374static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 430static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
375#ifdef CONFIG_HOTPLUG_CPU 431#ifdef CONFIG_HOTPLUG_CPU
376static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 432static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
@@ -390,5 +446,13 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
390static void rcu_preempt_send_cbs_to_online(void); 446static void rcu_preempt_send_cbs_to_online(void);
391static void __init __rcu_init_preempt(void); 447static void __init __rcu_init_preempt(void);
392static void rcu_needs_cpu_flush(void); 448static void rcu_needs_cpu_flush(void);
449static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp);
450static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
451static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
452 cpumask_var_t cm);
453static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
454static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
455 struct rcu_node *rnp,
456 int rnp_index);
393 457
394#endif /* #ifndef RCU_TREE_NONCORE */ 458#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a3638710dc67..3f6559a5f5cd 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void)
54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
56#endif 56#endif
57#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
58 printk(KERN_INFO
59 "\tRCU-based detection of stalled CPUs is disabled.\n");
60#endif
61#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
62 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 58 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
63#endif 59#endif
@@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void)
70 66
71struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
72DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state;
73 70
74static int rcu_preempted_readers_exp(struct rcu_node *rnp); 71static int rcu_preempted_readers_exp(struct rcu_node *rnp);
75 72
@@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
78 */ 75 */
79static void __init rcu_bootup_announce(void) 76static void __init rcu_bootup_announce(void)
80{ 77{
81 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); 78 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
82 rcu_bootup_announce_oddness(); 79 rcu_bootup_announce_oddness();
83} 80}
84 81
@@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void)
111EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 108EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
112 109
113/* 110/*
114 * Record a preemptable-RCU quiescent state for the specified CPU. Note 111 * Record a preemptible-RCU quiescent state for the specified CPU. Note
115 * that this just means that the task currently running on the CPU is 112 * that this just means that the task currently running on the CPU is
116 * not in a quiescent state. There might be any number of tasks blocked 113 * not in a quiescent state. There might be any number of tasks blocked
117 * while in an RCU read-side critical section. 114 * while in an RCU read-side critical section.
@@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu)
134 * We have entered the scheduler, and the current task might soon be 131 * We have entered the scheduler, and the current task might soon be
135 * context-switched away from. If this task is in an RCU read-side 132 * context-switched away from. If this task is in an RCU read-side
136 * critical section, we will no longer be able to rely on the CPU to 133 * critical section, we will no longer be able to rely on the CPU to
137 * record that fact, so we enqueue the task on the appropriate entry 134 * record that fact, so we enqueue the task on the blkd_tasks list.
138 * of the blocked_tasks[] array. The task will dequeue itself when 135 * The task will dequeue itself when it exits the outermost enclosing
139 * it exits the outermost enclosing RCU read-side critical section. 136 * RCU read-side critical section. Therefore, the current grace period
140 * Therefore, the current grace period cannot be permitted to complete 137 * cannot be permitted to complete until the blkd_tasks list entries
141 * until the blocked_tasks[] entry indexed by the low-order bit of 138 * predating the current grace period drain, in other words, until
142 * rnp->gpnum empties. 139 * rnp->gp_tasks becomes NULL.
143 * 140 *
144 * Caller must disable preemption. 141 * Caller must disable preemption.
145 */ 142 */
@@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu)
147{ 144{
148 struct task_struct *t = current; 145 struct task_struct *t = current;
149 unsigned long flags; 146 unsigned long flags;
150 int phase;
151 struct rcu_data *rdp; 147 struct rcu_data *rdp;
152 struct rcu_node *rnp; 148 struct rcu_node *rnp;
153 149
@@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu)
169 * (i.e., this CPU has not yet passed through a quiescent 165 * (i.e., this CPU has not yet passed through a quiescent
170 * state for the current grace period), then as long 166 * state for the current grace period), then as long
171 * as that task remains queued, the current grace period 167 * as that task remains queued, the current grace period
172 * cannot end. 168 * cannot end. Note that there is some uncertainty as
169 * to exactly when the current grace period started.
170 * We take a conservative approach, which can result
171 * in unnecessarily waiting on tasks that started very
172 * slightly after the current grace period began. C'est
173 * la vie!!!
173 * 174 *
174 * But first, note that the current CPU must still be 175 * But first, note that the current CPU must still be
175 * on line! 176 * on line!
176 */ 177 */
177 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 178 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
178 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 179 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
179 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 180 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
180 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 181 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
182 rnp->gp_tasks = &t->rcu_node_entry;
183#ifdef CONFIG_RCU_BOOST
184 if (rnp->boost_tasks != NULL)
185 rnp->boost_tasks = rnp->gp_tasks;
186#endif /* #ifdef CONFIG_RCU_BOOST */
187 } else {
188 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
189 if (rnp->qsmask & rdp->grpmask)
190 rnp->gp_tasks = &t->rcu_node_entry;
191 }
181 raw_spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
182 } 193 }
183 194
@@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu)
196} 207}
197 208
198/* 209/*
199 * Tree-preemptable RCU implementation for rcu_read_lock(). 210 * Tree-preemptible RCU implementation for rcu_read_lock().
200 * Just increment ->rcu_read_lock_nesting, shared state will be updated 211 * Just increment ->rcu_read_lock_nesting, shared state will be updated
201 * if we block. 212 * if we block.
202 */ 213 */
@@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
212 * for the specified rcu_node structure. If the caller needs a reliable 223 * for the specified rcu_node structure. If the caller needs a reliable
213 * answer, it must hold the rcu_node's ->lock. 224 * answer, it must hold the rcu_node's ->lock.
214 */ 225 */
215static int rcu_preempted_readers(struct rcu_node *rnp) 226static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
216{ 227{
217 int phase = rnp->gpnum & 0x1; 228 return rnp->gp_tasks != NULL;
218
219 return !list_empty(&rnp->blocked_tasks[phase]) ||
220 !list_empty(&rnp->blocked_tasks[phase + 2]);
221} 229}
222 230
223/* 231/*
@@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
233 unsigned long mask; 241 unsigned long mask;
234 struct rcu_node *rnp_p; 242 struct rcu_node *rnp_p;
235 243
236 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 244 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
237 raw_spin_unlock_irqrestore(&rnp->lock, flags); 245 raw_spin_unlock_irqrestore(&rnp->lock, flags);
238 return; /* Still need more quiescent states! */ 246 return; /* Still need more quiescent states! */
239 } 247 }
@@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
257} 265}
258 266
259/* 267/*
268 * Advance a ->blkd_tasks-list pointer to the next entry, returning
269 * NULL instead if the entry is the last one on the list.
270 */
271static struct list_head *rcu_next_node_entry(struct task_struct *t,
272 struct rcu_node *rnp)
273{
274 struct list_head *np;
275
276 np = t->rcu_node_entry.next;
277 if (np == &rnp->blkd_tasks)
278 np = NULL;
279 return np;
280}
281
282/*
260 * Handle special cases during rcu_read_unlock(), such as needing to 283 * Handle special cases during rcu_read_unlock(), such as needing to
261 * notify RCU core processing or task having blocked during the RCU 284 * notify RCU core processing or task having blocked during the RCU
262 * read-side critical section. 285 * read-side critical section.
@@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
266 int empty; 289 int empty;
267 int empty_exp; 290 int empty_exp;
268 unsigned long flags; 291 unsigned long flags;
292 struct list_head *np;
269 struct rcu_node *rnp; 293 struct rcu_node *rnp;
270 int special; 294 int special;
271 295
@@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t)
306 break; 330 break;
307 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 331 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
308 } 332 }
309 empty = !rcu_preempted_readers(rnp); 333 empty = !rcu_preempt_blocked_readers_cgp(rnp);
310 empty_exp = !rcu_preempted_readers_exp(rnp); 334 empty_exp = !rcu_preempted_readers_exp(rnp);
311 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 335 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
336 np = rcu_next_node_entry(t, rnp);
312 list_del_init(&t->rcu_node_entry); 337 list_del_init(&t->rcu_node_entry);
338 if (&t->rcu_node_entry == rnp->gp_tasks)
339 rnp->gp_tasks = np;
340 if (&t->rcu_node_entry == rnp->exp_tasks)
341 rnp->exp_tasks = np;
342#ifdef CONFIG_RCU_BOOST
343 if (&t->rcu_node_entry == rnp->boost_tasks)
344 rnp->boost_tasks = np;
345#endif /* #ifdef CONFIG_RCU_BOOST */
313 t->rcu_blocked_node = NULL; 346 t->rcu_blocked_node = NULL;
314 347
315 /* 348 /*
@@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t)
322 else 355 else
323 rcu_report_unblock_qs_rnp(rnp, flags); 356 rcu_report_unblock_qs_rnp(rnp, flags);
324 357
358#ifdef CONFIG_RCU_BOOST
359 /* Unboost if we were boosted. */
360 if (special & RCU_READ_UNLOCK_BOOSTED) {
361 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
362 rt_mutex_unlock(t->rcu_boost_mutex);
363 t->rcu_boost_mutex = NULL;
364 }
365#endif /* #ifdef CONFIG_RCU_BOOST */
366
325 /* 367 /*
326 * If this was the last task on the expedited lists, 368 * If this was the last task on the expedited lists,
327 * then we need to report up the rcu_node hierarchy. 369 * then we need to report up the rcu_node hierarchy.
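Because those boundary pointers designate live list entries, the unlock path above must advance them (via rcu_next_node_entry(), which yields NULL at the end of the list) before unlinking the departing task. Reusing the mock list primitives from the sketch after the rcutree.h hunk earlier, the removal rule looks roughly like this:

    #include <stddef.h>

    /* Continues the earlier userspace sketch; struct list_head as defined there. */
    static struct list_head *next_or_null(struct list_head *e,
                                          struct list_head *head)
    {
            return e->next == head ? NULL : e->next;
    }

    static void remove_entry(struct list_head *e, struct list_head *head,
                             struct list_head **gp_tasks)
    {
            if (*gp_tasks == e)                      /* advance the boundary first */
                    *gp_tasks = next_or_null(e, head);
            e->prev->next = e->next;                 /* then unlink, as list_del() does */
            e->next->prev = e->prev;
    }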
@@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
334} 376}
335 377
336/* 378/*
337 * Tree-preemptable RCU implementation for rcu_read_unlock(). 379 * Tree-preemptible RCU implementation for rcu_read_unlock().
338 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost 380 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
339 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then 381 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
340 * invoke rcu_read_unlock_special() to clean up after a context switch 382 * invoke rcu_read_unlock_special() to clean up after a context switch
@@ -356,8 +398,6 @@ void __rcu_read_unlock(void)
356} 398}
357EXPORT_SYMBOL_GPL(__rcu_read_unlock); 399EXPORT_SYMBOL_GPL(__rcu_read_unlock);
358 400
359#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
360
361#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 401#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
362 402
363/* 403/*
@@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
367static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 407static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
368{ 408{
369 unsigned long flags; 409 unsigned long flags;
370 struct list_head *lp;
371 int phase;
372 struct task_struct *t; 410 struct task_struct *t;
373 411
374 if (rcu_preempted_readers(rnp)) { 412 if (!rcu_preempt_blocked_readers_cgp(rnp))
375 raw_spin_lock_irqsave(&rnp->lock, flags); 413 return;
376 phase = rnp->gpnum & 0x1; 414 raw_spin_lock_irqsave(&rnp->lock, flags);
377 lp = &rnp->blocked_tasks[phase]; 415 t = list_entry(rnp->gp_tasks,
378 list_for_each_entry(t, lp, rcu_node_entry) 416 struct task_struct, rcu_node_entry);
379 sched_show_task(t); 417 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
380 raw_spin_unlock_irqrestore(&rnp->lock, flags); 418 sched_show_task(t);
381 } 419 raw_spin_unlock_irqrestore(&rnp->lock, flags);
382} 420}
383 421
384/* 422/*
@@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
408 */ 446 */
409static void rcu_print_task_stall(struct rcu_node *rnp) 447static void rcu_print_task_stall(struct rcu_node *rnp)
410{ 448{
411 struct list_head *lp;
412 int phase;
413 struct task_struct *t; 449 struct task_struct *t;
414 450
415 if (rcu_preempted_readers(rnp)) { 451 if (!rcu_preempt_blocked_readers_cgp(rnp))
416 phase = rnp->gpnum & 0x1; 452 return;
417 lp = &rnp->blocked_tasks[phase]; 453 t = list_entry(rnp->gp_tasks,
418 list_for_each_entry(t, lp, rcu_node_entry) 454 struct task_struct, rcu_node_entry);
419 printk(" P%d", t->pid); 455 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
420 } 456 printk(" P%d", t->pid);
421} 457}
422 458
423/* 459/*
@@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void)
430 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; 466 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
431} 467}
432 468
433#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
434
435/* 469/*
436 * Check that the list of blocked tasks for the newly completed grace 470 * Check that the list of blocked tasks for the newly completed grace
437 * period is in fact empty. It is a serious bug to complete a grace 471 * period is in fact empty. It is a serious bug to complete a grace
438 * period that still has RCU readers blocked! This function must be 472 * period that still has RCU readers blocked! This function must be
439 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock 473 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
440 * must be held by the caller. 474 * must be held by the caller.
475 *
476 * Also, if there are blocked tasks on the list, they automatically
477 * block the newly created grace period, so set up ->gp_tasks accordingly.
441 */ 478 */
442static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 479static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
443{ 480{
444 WARN_ON_ONCE(rcu_preempted_readers(rnp)); 481 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
482 if (!list_empty(&rnp->blkd_tasks))
483 rnp->gp_tasks = rnp->blkd_tasks.next;
445 WARN_ON_ONCE(rnp->qsmask); 484 WARN_ON_ONCE(rnp->qsmask);
446} 485}
447 486
@@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
465 struct rcu_node *rnp, 504 struct rcu_node *rnp,
466 struct rcu_data *rdp) 505 struct rcu_data *rdp)
467{ 506{
468 int i;
469 struct list_head *lp; 507 struct list_head *lp;
470 struct list_head *lp_root; 508 struct list_head *lp_root;
471 int retval = 0; 509 int retval = 0;
472 struct rcu_node *rnp_root = rcu_get_root(rsp); 510 struct rcu_node *rnp_root = rcu_get_root(rsp);
473 struct task_struct *tp; 511 struct task_struct *t;
474 512
475 if (rnp == rnp_root) { 513 if (rnp == rnp_root) {
476 WARN_ONCE(1, "Last CPU thought to be offlined?"); 514 WARN_ONCE(1, "Last CPU thought to be offlined?");
477 return 0; /* Shouldn't happen: at least one CPU online. */ 515 return 0; /* Shouldn't happen: at least one CPU online. */
478 } 516 }
479 WARN_ON_ONCE(rnp != rdp->mynode && 517
480 (!list_empty(&rnp->blocked_tasks[0]) || 518 /* If we are on an internal node, complain bitterly. */
481 !list_empty(&rnp->blocked_tasks[1]) || 519 WARN_ON_ONCE(rnp != rdp->mynode);
482 !list_empty(&rnp->blocked_tasks[2]) ||
483 !list_empty(&rnp->blocked_tasks[3])));
484 520
485 /* 521 /*
486 * Move tasks up to root rcu_node. Rely on the fact that the 522 * Move tasks up to root rcu_node. Don't try to get fancy for
487 * root rcu_node can be at most one ahead of the rest of the 523 * this corner-case operation -- just put this node's tasks
488 * rcu_nodes in terms of gp_num value. This fact allows us to 524 * at the head of the root node's list, and update the root node's
489 * move the blocked_tasks[] array directly, element by element. 525 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
526 * if non-NULL. This might result in waiting for more tasks than
527 * absolutely necessary, but this is a good performance/complexity
528 * tradeoff.
490 */ 529 */
491 if (rcu_preempted_readers(rnp)) 530 if (rcu_preempt_blocked_readers_cgp(rnp))
492 retval |= RCU_OFL_TASKS_NORM_GP; 531 retval |= RCU_OFL_TASKS_NORM_GP;
493 if (rcu_preempted_readers_exp(rnp)) 532 if (rcu_preempted_readers_exp(rnp))
494 retval |= RCU_OFL_TASKS_EXP_GP; 533 retval |= RCU_OFL_TASKS_EXP_GP;
495 for (i = 0; i < 4; i++) { 534 lp = &rnp->blkd_tasks;
496 lp = &rnp->blocked_tasks[i]; 535 lp_root = &rnp_root->blkd_tasks;
497 lp_root = &rnp_root->blocked_tasks[i]; 536 while (!list_empty(lp)) {
498 while (!list_empty(lp)) { 537 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
499 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 538 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
500 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 539 list_del(&t->rcu_node_entry);
501 list_del(&tp->rcu_node_entry); 540 t->rcu_blocked_node = rnp_root;
502 tp->rcu_blocked_node = rnp_root; 541 list_add(&t->rcu_node_entry, lp_root);
503 list_add(&tp->rcu_node_entry, lp_root); 542 if (&t->rcu_node_entry == rnp->gp_tasks)
504 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 543 rnp_root->gp_tasks = rnp->gp_tasks;
505 } 544 if (&t->rcu_node_entry == rnp->exp_tasks)
545 rnp_root->exp_tasks = rnp->exp_tasks;
546#ifdef CONFIG_RCU_BOOST
547 if (&t->rcu_node_entry == rnp->boost_tasks)
548 rnp_root->boost_tasks = rnp->boost_tasks;
549#endif /* #ifdef CONFIG_RCU_BOOST */
550 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
506 } 551 }
552
553#ifdef CONFIG_RCU_BOOST
554 /* In case root is being boosted and leaf is not. */
555 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
556 if (rnp_root->boost_tasks != NULL &&
557 rnp_root->boost_tasks != rnp_root->gp_tasks)
558 rnp_root->boost_tasks = rnp_root->gp_tasks;
559 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
560#endif /* #ifdef CONFIG_RCU_BOOST */
561
562 rnp->gp_tasks = NULL;
563 rnp->exp_tasks = NULL;
507 return retval; 564 return retval;
508} 565}
509 566
510/* 567/*
511 * Do CPU-offline processing for preemptable RCU. 568 * Do CPU-offline processing for preemptible RCU.
512 */ 569 */
513static void rcu_preempt_offline_cpu(int cpu) 570static void rcu_preempt_offline_cpu(int cpu)
514{ 571{
@@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu)
537} 594}
538 595
539/* 596/*
540 * Process callbacks for preemptable RCU. 597 * Process callbacks for preemptible RCU.
541 */ 598 */
542static void rcu_preempt_process_callbacks(void) 599static void rcu_preempt_process_callbacks(void)
543{ 600{
@@ -546,7 +603,7 @@ static void rcu_preempt_process_callbacks(void)
546} 603}
547 604
548/* 605/*
549 * Queue a preemptable-RCU callback for invocation after a grace period. 606 * Queue a preemptible-RCU callback for invocation after a grace period.
550 */ 607 */
551void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 608void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
552{ 609{
@@ -594,8 +651,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
594 */ 651 */
595static int rcu_preempted_readers_exp(struct rcu_node *rnp) 652static int rcu_preempted_readers_exp(struct rcu_node *rnp)
596{ 653{
597 return !list_empty(&rnp->blocked_tasks[2]) || 654 return rnp->exp_tasks != NULL;
598 !list_empty(&rnp->blocked_tasks[3]);
599} 655}
600 656
601/* 657/*
@@ -655,13 +711,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
655static void 711static void
656sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 712sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
657{ 713{
658 int must_wait; 714 unsigned long flags;
715 int must_wait = 0;
659 716
660 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 717 raw_spin_lock_irqsave(&rnp->lock, flags);
661 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 718 if (list_empty(&rnp->blkd_tasks))
662 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 719 raw_spin_unlock_irqrestore(&rnp->lock, flags);
663 must_wait = rcu_preempted_readers_exp(rnp); 720 else {
664 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 721 rnp->exp_tasks = rnp->blkd_tasks.next;
722 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
723 must_wait = 1;
724 }
665 if (!must_wait) 725 if (!must_wait)
666 rcu_report_exp_rnp(rsp, rnp); 726 rcu_report_exp_rnp(rsp, rnp);
667} 727}
@@ -669,9 +729,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
669/* 729/*
670 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 730 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
671 * is to invoke synchronize_sched_expedited() to push all the tasks to 731 * is to invoke synchronize_sched_expedited() to push all the tasks to
672 * the ->blocked_tasks[] lists, move all entries from the first set of 732 * the ->blkd_tasks lists and wait for this list to drain.
673 * ->blocked_tasks[] lists to the second set, and finally wait for this
674 * second set to drain.
675 */ 733 */
676void synchronize_rcu_expedited(void) 734void synchronize_rcu_expedited(void)
677{ 735{
@@ -703,7 +761,7 @@ void synchronize_rcu_expedited(void)
703 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 761 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
704 goto unlock_mb_ret; /* Others did our work for us. */ 762 goto unlock_mb_ret; /* Others did our work for us. */
705 763
706 /* force all RCU readers onto blocked_tasks[]. */ 764 /* force all RCU readers onto ->blkd_tasks lists. */
707 synchronize_sched_expedited(); 765 synchronize_sched_expedited();
708 766
709 raw_spin_lock_irqsave(&rsp->onofflock, flags); 767 raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -715,7 +773,7 @@ void synchronize_rcu_expedited(void)
715 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 773 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
716 } 774 }
717 775
718 /* Snapshot current state of ->blocked_tasks[] lists. */ 776 /* Snapshot current state of ->blkd_tasks lists. */
719 rcu_for_each_leaf_node(rsp, rnp) 777 rcu_for_each_leaf_node(rsp, rnp)
720 sync_rcu_preempt_exp_init(rsp, rnp); 778 sync_rcu_preempt_exp_init(rsp, rnp);
721 if (NUM_RCU_NODES > 1) 779 if (NUM_RCU_NODES > 1)
@@ -723,7 +781,7 @@ void synchronize_rcu_expedited(void)
723 781
724 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 782 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
725 783
726 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 784 /* Wait for snapshotted ->blkd_tasks lists to drain. */
727 rnp = rcu_get_root(rsp); 785 rnp = rcu_get_root(rsp);
728 wait_event(sync_rcu_preempt_exp_wq, 786 wait_event(sync_rcu_preempt_exp_wq,
729 sync_rcu_preempt_exp_done(rnp)); 787 sync_rcu_preempt_exp_done(rnp));
@@ -739,7 +797,7 @@ mb_ret:
739EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 797EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
740 798
741/* 799/*
742 * Check to see if there is any immediate preemptable-RCU-related work 800 * Check to see if there is any immediate preemptible-RCU-related work
743 * to be done. 801 * to be done.
744 */ 802 */
745static int rcu_preempt_pending(int cpu) 803static int rcu_preempt_pending(int cpu)
@@ -749,7 +807,7 @@ static int rcu_preempt_pending(int cpu)
749} 807}
750 808
751/* 809/*
752 * Does preemptable RCU need the CPU to stay out of dynticks mode? 810 * Does preemptible RCU need the CPU to stay out of dynticks mode?
753 */ 811 */
754static int rcu_preempt_needs_cpu(int cpu) 812static int rcu_preempt_needs_cpu(int cpu)
755{ 813{
@@ -766,7 +824,7 @@ void rcu_barrier(void)
766EXPORT_SYMBOL_GPL(rcu_barrier); 824EXPORT_SYMBOL_GPL(rcu_barrier);
767 825
768/* 826/*
769 * Initialize preemptable RCU's per-CPU data. 827 * Initialize preemptible RCU's per-CPU data.
770 */ 828 */
771static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 829static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
772{ 830{
@@ -774,7 +832,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
774} 832}
775 833
776/* 834/*
777 * Move preemptable RCU's callbacks from dying CPU to other online CPU. 835 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
778 */ 836 */
779static void rcu_preempt_send_cbs_to_online(void) 837static void rcu_preempt_send_cbs_to_online(void)
780{ 838{
@@ -782,7 +840,7 @@ static void rcu_preempt_send_cbs_to_online(void)
782} 840}
783 841
784/* 842/*
785 * Initialize preemptable RCU's state structures. 843 * Initialize preemptible RCU's state structures.
786 */ 844 */
787static void __init __rcu_init_preempt(void) 845static void __init __rcu_init_preempt(void)
788{ 846{
@@ -790,7 +848,7 @@ static void __init __rcu_init_preempt(void)
790} 848}
791 849
792/* 850/*
793 * Check for a task exiting while in a preemptable-RCU read-side 851 * Check for a task exiting while in a preemptible-RCU read-side
794 * critical section, clean up if so. No need to issue warnings, 852 * critical section, clean up if so. No need to issue warnings,
795 * as debug_check_no_locks_held() already does this if lockdep 853 * as debug_check_no_locks_held() already does this if lockdep
796 * is enabled. 854 * is enabled.
@@ -802,11 +860,13 @@ void exit_rcu(void)
802 if (t->rcu_read_lock_nesting == 0) 860 if (t->rcu_read_lock_nesting == 0)
803 return; 861 return;
804 t->rcu_read_lock_nesting = 1; 862 t->rcu_read_lock_nesting = 1;
805 rcu_read_unlock(); 863 __rcu_read_unlock();
806} 864}
807 865
808#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 866#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
809 867
868static struct rcu_state *rcu_state = &rcu_sched_state;
869
810/* 870/*
811 * Tell them what RCU they are running. 871 * Tell them what RCU they are running.
812 */ 872 */
@@ -836,7 +896,7 @@ void rcu_force_quiescent_state(void)
836EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 896EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
837 897
838/* 898/*
839 * Because preemptable RCU does not exist, we never have to check for 899 * Because preemptible RCU does not exist, we never have to check for
840 * CPUs being in quiescent states. 900 * CPUs being in quiescent states.
841 */ 901 */
842static void rcu_preempt_note_context_switch(int cpu) 902static void rcu_preempt_note_context_switch(int cpu)
@@ -844,10 +904,10 @@ static void rcu_preempt_note_context_switch(int cpu)
844} 904}
845 905
846/* 906/*
847 * Because preemptable RCU does not exist, there are never any preempted 907 * Because preemptible RCU does not exist, there are never any preempted
848 * RCU readers. 908 * RCU readers.
849 */ 909 */
850static int rcu_preempted_readers(struct rcu_node *rnp) 910static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
851{ 911{
852 return 0; 912 return 0;
853} 913}
@@ -862,10 +922,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
862 922
863#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 923#endif /* #ifdef CONFIG_HOTPLUG_CPU */
864 924
865#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
866
867/* 925/*
868 * Because preemptable RCU does not exist, we never have to check for 926 * Because preemptible RCU does not exist, we never have to check for
869 * tasks blocked within RCU read-side critical sections. 927 * tasks blocked within RCU read-side critical sections.
870 */ 928 */
871static void rcu_print_detail_task_stall(struct rcu_state *rsp) 929static void rcu_print_detail_task_stall(struct rcu_state *rsp)
@@ -873,7 +931,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
873} 931}
874 932
875/* 933/*
876 * Because preemptable RCU does not exist, we never have to check for 934 * Because preemptible RCU does not exist, we never have to check for
877 * tasks blocked within RCU read-side critical sections. 935 * tasks blocked within RCU read-side critical sections.
878 */ 936 */
879static void rcu_print_task_stall(struct rcu_node *rnp) 937static void rcu_print_task_stall(struct rcu_node *rnp)
@@ -888,10 +946,8 @@ static void rcu_preempt_stall_reset(void)
888{ 946{
889} 947}
890 948
891#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
892
893/* 949/*
894 * Because there is no preemptable RCU, there can be no readers blocked, 950 * Because there is no preemptible RCU, there can be no readers blocked,
895 * so there is no need to check for blocked tasks. So check only for 951 * so there is no need to check for blocked tasks. So check only for
896 * bogus qsmask values. 952 * bogus qsmask values.
897 */ 953 */
@@ -903,7 +959,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
903#ifdef CONFIG_HOTPLUG_CPU 959#ifdef CONFIG_HOTPLUG_CPU
904 960
905/* 961/*
906 * Because preemptable RCU does not exist, it never needs to migrate 962 * Because preemptible RCU does not exist, it never needs to migrate
907 * tasks that were blocked within RCU read-side critical sections, and 963 * tasks that were blocked within RCU read-side critical sections, and
908 * such non-existent tasks cannot possibly have been blocking the current 964 * such non-existent tasks cannot possibly have been blocking the current
909 * grace period. 965 * grace period.
@@ -916,7 +972,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
916} 972}
917 973
918/* 974/*
919 * Because preemptable RCU does not exist, it never needs CPU-offline 975 * Because preemptible RCU does not exist, it never needs CPU-offline
920 * processing. 976 * processing.
921 */ 977 */
922static void rcu_preempt_offline_cpu(int cpu) 978static void rcu_preempt_offline_cpu(int cpu)
@@ -926,7 +982,7 @@ static void rcu_preempt_offline_cpu(int cpu)
926#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 982#endif /* #ifdef CONFIG_HOTPLUG_CPU */
927 983
928/* 984/*
929 * Because preemptable RCU does not exist, it never has any callbacks 985 * Because preemptible RCU does not exist, it never has any callbacks
930 * to check. 986 * to check.
931 */ 987 */
932static void rcu_preempt_check_callbacks(int cpu) 988static void rcu_preempt_check_callbacks(int cpu)
@@ -934,7 +990,7 @@ static void rcu_preempt_check_callbacks(int cpu)
934} 990}
935 991
936/* 992/*
937 * Because preemptable RCU does not exist, it never has any callbacks 993 * Because preemptible RCU does not exist, it never has any callbacks
938 * to process. 994 * to process.
939 */ 995 */
940static void rcu_preempt_process_callbacks(void) 996static void rcu_preempt_process_callbacks(void)
@@ -943,7 +999,7 @@ static void rcu_preempt_process_callbacks(void)
943 999
944/* 1000/*
945 * Wait for an rcu-preempt grace period, but make it happen quickly. 1001 * Wait for an rcu-preempt grace period, but make it happen quickly.
946 * But because preemptable RCU does not exist, map to rcu-sched. 1002 * But because preemptible RCU does not exist, map to rcu-sched.
947 */ 1003 */
948void synchronize_rcu_expedited(void) 1004void synchronize_rcu_expedited(void)
949{ 1005{
@@ -954,7 +1010,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
954#ifdef CONFIG_HOTPLUG_CPU 1010#ifdef CONFIG_HOTPLUG_CPU
955 1011
956/* 1012/*
957 * Because preemptable RCU does not exist, there is never any need to 1013 * Because preemptible RCU does not exist, there is never any need to
958 * report on tasks preempted in RCU read-side critical sections during 1014 * report on tasks preempted in RCU read-side critical sections during
959 * expedited RCU grace periods. 1015 * expedited RCU grace periods.
960 */ 1016 */
@@ -966,7 +1022,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
966#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1022#endif /* #ifdef CONFIG_HOTPLUG_CPU */
967 1023
968/* 1024/*
969 * Because preemptable RCU does not exist, it never has any work to do. 1025 * Because preemptible RCU does not exist, it never has any work to do.
970 */ 1026 */
971static int rcu_preempt_pending(int cpu) 1027static int rcu_preempt_pending(int cpu)
972{ 1028{
@@ -974,7 +1030,7 @@ static int rcu_preempt_pending(int cpu)
974} 1030}
975 1031
976/* 1032/*
977 * Because preemptable RCU does not exist, it never needs any CPU. 1033 * Because preemptible RCU does not exist, it never needs any CPU.
978 */ 1034 */
979static int rcu_preempt_needs_cpu(int cpu) 1035static int rcu_preempt_needs_cpu(int cpu)
980{ 1036{
@@ -982,7 +1038,7 @@ static int rcu_preempt_needs_cpu(int cpu)
982} 1038}
983 1039
984/* 1040/*
985 * Because preemptable RCU does not exist, rcu_barrier() is just 1041 * Because preemptible RCU does not exist, rcu_barrier() is just
986 * another name for rcu_barrier_sched(). 1042 * another name for rcu_barrier_sched().
987 */ 1043 */
988void rcu_barrier(void) 1044void rcu_barrier(void)
@@ -992,7 +1048,7 @@ void rcu_barrier(void)
992EXPORT_SYMBOL_GPL(rcu_barrier); 1048EXPORT_SYMBOL_GPL(rcu_barrier);
993 1049
994/* 1050/*
995 * Because preemptable RCU does not exist, there is no per-CPU 1051 * Because preemptible RCU does not exist, there is no per-CPU
996 * data to initialize. 1052 * data to initialize.
997 */ 1053 */
998static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 1054static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
@@ -1000,14 +1056,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1000} 1056}
1001 1057
1002/* 1058/*
1003 * Because there is no preemptable RCU, there are no callbacks to move. 1059 * Because there is no preemptible RCU, there are no callbacks to move.
1004 */ 1060 */
1005static void rcu_preempt_send_cbs_to_online(void) 1061static void rcu_preempt_send_cbs_to_online(void)
1006{ 1062{
1007} 1063}
1008 1064
1009/* 1065/*
1010 * Because preemptable RCU does not exist, it need not be initialized. 1066 * Because preemptible RCU does not exist, it need not be initialized.
1011 */ 1067 */
1012static void __init __rcu_init_preempt(void) 1068static void __init __rcu_init_preempt(void)
1013{ 1069{
@@ -1015,6 +1071,276 @@ static void __init __rcu_init_preempt(void)
1015 1071
1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1072#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1017 1073
1074#ifdef CONFIG_RCU_BOOST
1075
1076#include "rtmutex_common.h"
1077
1078#ifdef CONFIG_RCU_TRACE
1079
1080static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1081{
1082 if (list_empty(&rnp->blkd_tasks))
1083 rnp->n_balk_blkd_tasks++;
1084 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1085 rnp->n_balk_exp_gp_tasks++;
1086 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1087 rnp->n_balk_boost_tasks++;
1088 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1089 rnp->n_balk_notblocked++;
1090 else if (rnp->gp_tasks != NULL &&
1091 ULONG_CMP_LT(jiffies, rnp->boost_time))
1092 rnp->n_balk_notyet++;
1093 else
1094 rnp->n_balk_nos++;
1095}
1096
1097#else /* #ifdef CONFIG_RCU_TRACE */
1098
1099static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1100{
1101}
1102
1103#endif /* #else #ifdef CONFIG_RCU_TRACE */
1104
1105/*
1106 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1107 * or ->boost_tasks, advancing the pointer to the next task in the
1108 * ->blkd_tasks list.
1109 *
1110 * Note that irqs must be enabled: boosting the task can block.
1111 * Returns 1 if there are more tasks needing to be boosted.
1112 */
1113static int rcu_boost(struct rcu_node *rnp)
1114{
1115 unsigned long flags;
1116 struct rt_mutex mtx;
1117 struct task_struct *t;
1118 struct list_head *tb;
1119
1120 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1121 return 0; /* Nothing left to boost. */
1122
1123 raw_spin_lock_irqsave(&rnp->lock, flags);
1124
1125 /*
1126 * Recheck under the lock: all tasks in need of boosting
1127 * might exit their RCU read-side critical sections on their own.
1128 */
1129 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1130 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1131 return 0;
1132 }
1133
1134 /*
1135 * Preferentially boost tasks blocking expedited grace periods.
1136 * This cannot starve the normal grace periods because a second
1137 * expedited grace period must boost all blocked tasks, including
1138 * those blocking the pre-existing normal grace period.
1139 */
1140 if (rnp->exp_tasks != NULL) {
1141 tb = rnp->exp_tasks;
1142 rnp->n_exp_boosts++;
1143 } else {
1144 tb = rnp->boost_tasks;
1145 rnp->n_normal_boosts++;
1146 }
1147 rnp->n_tasks_boosted++;
1148
1149 /*
1150 * We boost task t by manufacturing an rt_mutex that appears to
1151 * be held by task t. We leave a pointer to that rt_mutex where
1152 * task t can find it, and task t will release the mutex when it
1153 * exits its outermost RCU read-side critical section. Then
1154 * simply acquiring this artificial rt_mutex will boost task
1155 * t's priority. (Thanks to tglx for suggesting this approach!)
1156 *
1157 * Note that task t must acquire rnp->lock to remove itself from
1158 * the ->blkd_tasks list, which it will do from exit() if from
1159 * nowhere else. We therefore are guaranteed that task t will
1160 * stay around at least until we drop rnp->lock. Note that
1161 * rnp->lock also resolves races between our priority boosting
1162 * and task t's exiting its outermost RCU read-side critical
1163 * section.
1164 */
1165 t = container_of(tb, struct task_struct, rcu_node_entry);
1166 rt_mutex_init_proxy_locked(&mtx, t);
1167 t->rcu_boost_mutex = &mtx;
1168 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
1169 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1170 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1171 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1172
1173 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1174}
1175
1176/*
1177 * Timer handler to initiate waking up of boost kthreads that
1178 * have yielded the CPU due to excessive numbers of tasks to
1179 * boost. We wake up the per-rcu_node kthread, which in turn
1180 * will wake up the booster kthread.
1181 */
1182static void rcu_boost_kthread_timer(unsigned long arg)
1183{
1184 invoke_rcu_node_kthread((struct rcu_node *)arg);
1185}
1186
1187/*
1188 * Priority-boosting kthread. One per leaf rcu_node and one for the
1189 * root rcu_node.
1190 */
1191static int rcu_boost_kthread(void *arg)
1192{
1193 struct rcu_node *rnp = (struct rcu_node *)arg;
1194 int spincnt = 0;
1195 int more2boost;
1196
1197 for (;;) {
1198 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1199 wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks ||
1200 rnp->exp_tasks);
1201 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1202 more2boost = rcu_boost(rnp);
1203 if (more2boost)
1204 spincnt++;
1205 else
1206 spincnt = 0;
1207 if (spincnt > 10) {
1208 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1209 spincnt = 0;
1210 }
1211 }
1212 /* NOTREACHED */
1213 return 0;
1214}
1215
1216/*
1217 * Check to see if it is time to start boosting RCU readers that are
1218 * blocking the current grace period, and, if so, tell the per-rcu_node
1219 * kthread to start boosting them. If there is an expedited grace
1220 * period in progress, it is always time to boost.
1221 *
1222 * The caller must hold rnp->lock, which this function releases,
1223 * but irqs remain disabled. The ->boost_kthread_task is immortal,
1224 * so we don't need to worry about it going away.
1225 */
1226static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1227{
1228 struct task_struct *t;
1229
1230 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1231 rnp->n_balk_exp_gp_tasks++;
1232 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1233 return;
1234 }
1235 if (rnp->exp_tasks != NULL ||
1236 (rnp->gp_tasks != NULL &&
1237 rnp->boost_tasks == NULL &&
1238 rnp->qsmask == 0 &&
1239 ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1240 if (rnp->exp_tasks == NULL)
1241 rnp->boost_tasks = rnp->gp_tasks;
1242 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1243 t = rnp->boost_kthread_task;
1244 if (t != NULL)
1245 wake_up_process(t);
1246 } else {
1247 rcu_initiate_boost_trace(rnp);
1248 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1249 }
1250}
1251
1252/*
1253 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1254 * held, so no one should be messing with the existence of the boost
1255 * kthread.
1256 */
1257static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1258 cpumask_var_t cm)
1259{
1260 struct task_struct *t;
1261
1262 t = rnp->boost_kthread_task;
1263 if (t != NULL)
1264 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1265}
1266
1267#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1268
1269/*
1270 * Do priority-boost accounting for the start of a new grace period.
1271 */
1272static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1273{
1274 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1275}
1276
1277/*
1278 * Initialize the RCU-boost waitqueue.
1279 */
1280static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1281{
1282 init_waitqueue_head(&rnp->boost_wq);
1283}
1284
1285/*
1286 * Create an RCU-boost kthread for the specified node if one does not
1287 * already exist. We only create this kthread for preemptible RCU.
1288 * Returns zero if all is well, a negated errno otherwise.
1289 */
1290static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1291 struct rcu_node *rnp,
1292 int rnp_index)
1293{
1294 unsigned long flags;
1295 struct sched_param sp;
1296 struct task_struct *t;
1297
1298 if (&rcu_preempt_state != rsp)
1299 return 0;
1300 if (rnp->boost_kthread_task != NULL)
1301 return 0;
1302 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1303 "rcub%d", rnp_index);
1304 if (IS_ERR(t))
1305 return PTR_ERR(t);
1306 raw_spin_lock_irqsave(&rnp->lock, flags);
1307 rnp->boost_kthread_task = t;
1308 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1309 wake_up_process(t);
1310 sp.sched_priority = RCU_KTHREAD_PRIO;
1311 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1312 return 0;
1313}
1314
1315#else /* #ifdef CONFIG_RCU_BOOST */
1316
1317static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1318{
1319 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1320}
1321
1322static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1323 cpumask_var_t cm)
1324{
1325}
1326
1327static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1328{
1329}
1330
1331static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1332{
1333}
1334
1335static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1336 struct rcu_node *rnp,
1337 int rnp_index)
1338{
1339 return 0;
1340}
1341
1342#endif /* #else #ifdef CONFIG_RCU_BOOST */
1343
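The boost code above defers boosting until ->boost_time and gates on ULONG_CMP_LT()/ULONG_CMP_GE() against jiffies, and the expedited path earlier in this diff tests a snapshotted sequence counter the same way: by comparing unsigned values through a signed difference, which stays correct across counter wrap. A small sketch of why that works; the macro definitions here mirror the usual pattern and are illustrative, not copied from the kernel headers.

/* Illustrative wraparound-safe comparison, in the spirit of the
 * ULONG_CMP_GE()/ULONG_CMP_LT() uses above.  These definitions are a
 * sketch, not the kernel's. */
#include <stdio.h>
#include <limits.h>

#define ULONG_CMP_GE(a, b)	((long)((a) - (b)) >= 0)
#define ULONG_CMP_LT(a, b)	((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long boost_time = ULONG_MAX - 5;	/* set just before wrap */
	unsigned long now        = 10;			/* 16 ticks later, wrapped */

	/* A naive comparison gets it wrong once the counter wraps... */
	printf("naive:  %d\n", now >= boost_time);		/* prints 0 */
	/* ...while the signed difference of the unsigned values stays right
	 * as long as the two values are within LONG_MAX of each other. */
	printf("signed: %d\n", ULONG_CMP_GE(now, boost_time));	/* prints 1 */
	return 0;
}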
1018#ifndef CONFIG_SMP 1344#ifndef CONFIG_SMP
1019 1345
1020void synchronize_sched_expedited(void) 1346void synchronize_sched_expedited(void)
@@ -1187,8 +1513,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1187 * 1513 *
1188 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1514 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1189 * disabled, we do one pass of force_quiescent_state(), then do a 1515 * disabled, we do one pass of force_quiescent_state(), then do a
1190 * raise_softirq() to cause rcu_process_callbacks() to be invoked later. 1516 * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked
1191 * The per-cpu rcu_dyntick_drain variable controls the sequencing. 1517 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1192 */ 1518 */
1193int rcu_needs_cpu(int cpu) 1519int rcu_needs_cpu(int cpu)
1194{ 1520{
@@ -1239,7 +1565,7 @@ int rcu_needs_cpu(int cpu)
1239 1565
1240 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1566 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1241 if (c) 1567 if (c)
1242 raise_softirq(RCU_SOFTIRQ); 1568 invoke_rcu_cpu_kthread();
1243 return c; 1569 return c;
1244} 1570}
1245 1571
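The rcu_needs_cpu() hunk above switches from raise_softirq(RCU_SOFTIRQ) to invoke_rcu_cpu_kthread(): pending callback work is now handed to a per-CPU kthread that gets woken rather than to a softirq. A rough user-space analogue of that "mark work pending, wake the worker" handoff follows, with pthreads standing in for the kthread and wake_up_process(); everything here is an assumption-level sketch, not the kernel mechanism.

/* Sketch of the kick-the-worker pattern that invoke_rcu_cpu_kthread()
 * stands for above. */
#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static bool has_work;
static bool stop;

static void invoke_worker(void)		/* invoke_rcu_cpu_kthread() analogue */
{
	pthread_mutex_lock(&lock);
	has_work = true;
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)		/* per-CPU kthread analogue */
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!stop) {
		while (!has_work && !stop)
			pthread_cond_wait(&wake, &lock);
		if (has_work) {
			has_work = false;
			pthread_mutex_unlock(&lock);
			printf("processing callbacks\n");
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, worker, NULL);
	invoke_worker();		/* what rcu_needs_cpu() now does */
	pthread_mutex_lock(&lock);
	stop = true;
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);
	pthread_join(tid, NULL);
	return 0;
}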
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c8e97853b970..aa0fd72b4bc7 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,18 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
50DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
52DECLARE_PER_CPU(char, rcu_cpu_has_work);
53
54static char convert_kthread_status(unsigned int kthread_status)
55{
56 if (kthread_status > RCU_KTHREAD_MAX)
57 return '?';
58 return "SRWOY"[kthread_status];
59}
60
49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 61static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 62{
51 if (!rdp->beenonline) 63 if (!rdp->beenonline)
@@ -64,7 +76,21 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
64 rdp->dynticks_fqs); 76 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 77#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 78 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); 79 seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld",
80 rdp->qlen,
81 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
82 rdp->nxttail[RCU_NEXT_TAIL]],
83 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
84 rdp->nxttail[RCU_NEXT_READY_TAIL]],
85 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
86 rdp->nxttail[RCU_WAIT_TAIL]],
87 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
88 per_cpu(rcu_cpu_has_work, rdp->cpu),
89 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
90 rdp->cpu)),
91 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
92 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff,
93 rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 94 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 95 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
70} 96}
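The new qs=%c%c%c%c and kt= fields above lean on two compact idioms: indexing a two-character string literal with a boolean (".N"[cond]) to print either '.' or the flag letter, and mapping a small status value through "SRWOY"[status] with a range guard. A standalone sketch of both; the letters match the trace output above, but the enum values are made up for illustration.

/* Sketch of the ".X"[flag] and "SRWOY"[status] tricks used in the
 * rcudata output above; enum values are illustrative only. */
#include <stdio.h>

enum kthread_status { KT_STOPPED, KT_RUNNING, KT_WAITING, KT_OFFCPU, KT_YIELDING };

static char convert_status(unsigned int s)
{
	if (s > KT_YIELDING)
		return '?';		/* out-of-range guard, as in the hunk */
	return "SRWOY"[s];		/* one letter per state */
}

int main(void)
{
	int have_next_ready = 1;
	int have_wait = 0;

	/* ".N"[cond] picks '.' when cond is 0 and 'N' when cond is 1. */
	printf("qs=%c%c kt=%c\n",
	       ".N"[have_next_ready],
	       ".R"[have_wait],
	       convert_status(KT_WAITING));	/* prints "qs=N. kt=W" */
	return 0;
}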
@@ -121,7 +147,18 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
121 rdp->dynticks_fqs); 147 rdp->dynticks_fqs);
122#endif /* #ifdef CONFIG_NO_HZ */ 148#endif /* #ifdef CONFIG_NO_HZ */
123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 149 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); 150 seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen,
151 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
152 rdp->nxttail[RCU_NEXT_TAIL]],
153 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
154 rdp->nxttail[RCU_NEXT_READY_TAIL]],
155 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
156 rdp->nxttail[RCU_WAIT_TAIL]],
157 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
158 per_cpu(rcu_cpu_has_work, rdp->cpu),
159 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
160 rdp->cpu)),
161 rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n", 162 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 163 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
127} 164}
@@ -157,11 +194,76 @@ static const struct file_operations rcudata_csv_fops = {
157 .release = single_release, 194 .release = single_release,
158}; 195};
159 196
197#ifdef CONFIG_RCU_BOOST
198
199static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
200{
201 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
202 "j=%04x bt=%04x\n",
203 rnp->grplo, rnp->grphi,
204 "T."[list_empty(&rnp->blkd_tasks)],
205 "N."[!rnp->gp_tasks],
206 "E."[!rnp->exp_tasks],
207 "B."[!rnp->boost_tasks],
208 convert_kthread_status(rnp->boost_kthread_status),
209 rnp->n_tasks_boosted, rnp->n_exp_boosts,
210 rnp->n_normal_boosts,
211 (int)(jiffies & 0xffff),
212 (int)(rnp->boost_time & 0xffff));
213 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
214 " balk",
215 rnp->n_balk_blkd_tasks,
216 rnp->n_balk_exp_gp_tasks,
217 rnp->n_balk_boost_tasks,
218 rnp->n_balk_notblocked,
219 rnp->n_balk_notyet,
220 rnp->n_balk_nos);
221}
222
223static int show_rcu_node_boost(struct seq_file *m, void *unused)
224{
225 struct rcu_node *rnp;
226
227 rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
228 print_one_rcu_node_boost(m, rnp);
229 return 0;
230}
231
232static int rcu_node_boost_open(struct inode *inode, struct file *file)
233{
234 return single_open(file, show_rcu_node_boost, NULL);
235}
236
237static const struct file_operations rcu_node_boost_fops = {
238 .owner = THIS_MODULE,
239 .open = rcu_node_boost_open,
240 .read = seq_read,
241 .llseek = seq_lseek,
242 .release = single_release,
243};
244
245/*
246 * Create the rcuboost debugfs entry. Standard error return.
247 */
248static int rcu_boost_trace_create_file(struct dentry *rcudir)
249{
250 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
251 &rcu_node_boost_fops);
252}
253
254#else /* #ifdef CONFIG_RCU_BOOST */
255
256static int rcu_boost_trace_create_file(struct dentry *rcudir)
257{
258 return 0; /* There cannot be an error if we didn't create it! */
259}
260
261#endif /* #else #ifdef CONFIG_RCU_BOOST */
262
160static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
161{ 264{
162 unsigned long gpnum; 265 unsigned long gpnum;
163 int level = 0; 266 int level = 0;
164 int phase;
165 struct rcu_node *rnp; 267 struct rcu_node *rnp;
166 268
167 gpnum = rsp->gpnum; 269 gpnum = rsp->gpnum;
@@ -178,13 +280,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
178 seq_puts(m, "\n"); 280 seq_puts(m, "\n");
179 level = rnp->level; 281 level = rnp->level;
180 } 282 }
181 phase = gpnum & 0x1; 283 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
182 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
183 rnp->qsmask, rnp->qsmaskinit, 284 rnp->qsmask, rnp->qsmaskinit,
184 "T."[list_empty(&rnp->blocked_tasks[phase])], 285 ".G"[rnp->gp_tasks != NULL],
185 "E."[list_empty(&rnp->blocked_tasks[phase + 2])], 286 ".E"[rnp->exp_tasks != NULL],
186 "T."[list_empty(&rnp->blocked_tasks[!phase])], 287 ".T"[!list_empty(&rnp->blkd_tasks)],
187 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
188 rnp->grplo, rnp->grphi, rnp->grpnum); 288 rnp->grplo, rnp->grphi, rnp->grpnum);
189 } 289 }
190 seq_puts(m, "\n"); 290 seq_puts(m, "\n");
@@ -216,16 +316,35 @@ static const struct file_operations rcuhier_fops = {
216 .release = single_release, 316 .release = single_release,
217}; 317};
218 318
319static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
320{
321 unsigned long flags;
322 unsigned long completed;
323 unsigned long gpnum;
324 unsigned long gpage;
325 unsigned long gpmax;
326 struct rcu_node *rnp = &rsp->node[0];
327
328 raw_spin_lock_irqsave(&rnp->lock, flags);
329 completed = rsp->completed;
330 gpnum = rsp->gpnum;
331 if (rsp->completed == rsp->gpnum)
332 gpage = 0;
333 else
334 gpage = jiffies - rsp->gp_start;
335 gpmax = rsp->gp_max;
336 raw_spin_unlock_irqrestore(&rnp->lock, flags);
337 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
338 rsp->name, completed, gpnum, gpage, gpmax);
339}
340
219static int show_rcugp(struct seq_file *m, void *unused) 341static int show_rcugp(struct seq_file *m, void *unused)
220{ 342{
221#ifdef CONFIG_TREE_PREEMPT_RCU 343#ifdef CONFIG_TREE_PREEMPT_RCU
222 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", 344 show_one_rcugp(m, &rcu_preempt_state);
223 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
224#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 345#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
225 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", 346 show_one_rcugp(m, &rcu_sched_state);
226 rcu_sched_state.completed, rcu_sched_state.gpnum); 347 show_one_rcugp(m, &rcu_bh_state);
227 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
228 rcu_bh_state.completed, rcu_bh_state.gpnum);
229 return 0; 348 return 0;
230} 349}
231 350
@@ -298,6 +417,29 @@ static const struct file_operations rcu_pending_fops = {
298 .release = single_release, 417 .release = single_release,
299}; 418};
300 419
420static int show_rcutorture(struct seq_file *m, void *unused)
421{
422 seq_printf(m, "rcutorture test sequence: %lu %s\n",
423 rcutorture_testseq >> 1,
424 (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
425 seq_printf(m, "rcutorture update version number: %lu\n",
426 rcutorture_vernum);
427 return 0;
428}
429
430static int rcutorture_open(struct inode *inode, struct file *file)
431{
432 return single_open(file, show_rcutorture, NULL);
433}
434
435static const struct file_operations rcutorture_fops = {
436 .owner = THIS_MODULE,
437 .open = rcutorture_open,
438 .read = seq_read,
439 .llseek = seq_lseek,
440 .release = single_release,
441};
442
301static struct dentry *rcudir; 443static struct dentry *rcudir;
302 444
303static int __init rcutree_trace_init(void) 445static int __init rcutree_trace_init(void)
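The rcutorture debugfs file above prints rcutorture_testseq >> 1 and tests bit 0 for "(test in progress)": the counter is presumably bumped once when a test starts (going odd) and once when it finishes (going even), so the low bit doubles as an in-progress flag and the upper bits form the test number. A tiny sketch of that encoding; the helper names below are invented for illustration.

/* Sketch of the sequence-counter-with-progress-bit encoding behind
 * rcutorture_testseq above. */
#include <stdio.h>

static unsigned long testseq;

static void test_start(void) { testseq++; }	/* now odd: in progress */
static void test_end(void)   { testseq++; }	/* now even: idle again  */

static void report(void)
{
	printf("sequence: %lu %s\n", testseq >> 1,
	       (testseq & 0x1) ? "(test in progress)" : "");
}

int main(void)
{
	report();	/* sequence: 0 */
	test_start();
	report();	/* sequence: 0 (test in progress) */
	test_end();
	report();	/* sequence: 1 */
	return 0;
}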
@@ -318,6 +460,9 @@ static int __init rcutree_trace_init(void)
318 if (!retval) 460 if (!retval)
319 goto free_out; 461 goto free_out;
320 462
463 if (rcu_boost_trace_create_file(rcudir))
464 goto free_out;
465
321 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 466 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
322 if (!retval) 467 if (!retval)
323 goto free_out; 468 goto free_out;
@@ -331,6 +476,11 @@ static int __init rcutree_trace_init(void)
331 NULL, &rcu_pending_fops); 476 NULL, &rcu_pending_fops);
332 if (!retval) 477 if (!retval)
333 goto free_out; 478 goto free_out;
479
480 retval = debugfs_create_file("rcutorture", 0444, rcudir,
481 NULL, &rcutorture_fops);
482 if (!retval)
483 goto free_out;
334 return 0; 484 return 0;
335free_out: 485free_out:
336 debugfs_remove_recursive(rcudir); 486 debugfs_remove_recursive(rcudir);
diff --git a/kernel/sched.c b/kernel/sched.c
index a8845516ace6..c62acf45d3b9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
231#endif 231#endif
232 232
233/* 233/*
234 * sched_domains_mutex serializes calls to arch_init_sched_domains, 234 * sched_domains_mutex serializes calls to init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
236 */ 236 */
237static DEFINE_MUTEX(sched_domains_mutex); 237static DEFINE_MUTEX(sched_domains_mutex);
@@ -312,6 +312,9 @@ struct cfs_rq {
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315#ifndef CONFIG_64BIT
316 u64 min_vruntime_copy;
317#endif
315 318
316 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
317 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
@@ -325,7 +328,9 @@ struct cfs_rq {
325 */ 328 */
326 struct sched_entity *curr, *next, *last, *skip; 329 struct sched_entity *curr, *next, *last, *skip;
327 330
331#ifdef CONFIG_SCHED_DEBUG
328 unsigned int nr_spread_over; 332 unsigned int nr_spread_over;
333#endif
329 334
330#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
331 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -417,6 +422,7 @@ struct rt_rq {
417 */ 422 */
418struct root_domain { 423struct root_domain {
419 atomic_t refcount; 424 atomic_t refcount;
425 struct rcu_head rcu;
420 cpumask_var_t span; 426 cpumask_var_t span;
421 cpumask_var_t online; 427 cpumask_var_t online;
422 428
@@ -460,7 +466,7 @@ struct rq {
460 u64 nohz_stamp; 466 u64 nohz_stamp;
461 unsigned char nohz_balance_kick; 467 unsigned char nohz_balance_kick;
462#endif 468#endif
463 unsigned int skip_clock_update; 469 int skip_clock_update;
464 470
465 /* capture load from *all* tasks on this cpu: */ 471 /* capture load from *all* tasks on this cpu: */
466 struct load_weight load; 472 struct load_weight load;
@@ -553,6 +559,10 @@ struct rq {
553 unsigned int ttwu_count; 559 unsigned int ttwu_count;
554 unsigned int ttwu_local; 560 unsigned int ttwu_local;
555#endif 561#endif
562
563#ifdef CONFIG_SMP
564 struct task_struct *wake_list;
565#endif
556}; 566};
557 567
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 568static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -571,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
571 581
572#define rcu_dereference_check_sched_domain(p) \ 582#define rcu_dereference_check_sched_domain(p) \
573 rcu_dereference_check((p), \ 583 rcu_dereference_check((p), \
574 rcu_read_lock_sched_held() || \ 584 rcu_read_lock_held() || \
575 lockdep_is_held(&sched_domains_mutex)) 585 lockdep_is_held(&sched_domains_mutex))
576 586
577/* 587/*
@@ -596,7 +606,7 @@ static inline int cpu_of(struct rq *rq)
596 * Return the group to which this tasks belongs. 606 * Return the group to which this tasks belongs.
597 * 607 *
598 * We use task_subsys_state_check() and extend the RCU verification 608 * We use task_subsys_state_check() and extend the RCU verification
599 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 609 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
600 * holds that lock for each task it moves into the cgroup. Therefore 610 * holds that lock for each task it moves into the cgroup. Therefore
601 * by holding that lock, we pin the task to the current cgroup. 611 * by holding that lock, we pin the task to the current cgroup.
602 */ 612 */
@@ -606,7 +616,7 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct cgroup_subsys_state *css; 616 struct cgroup_subsys_state *css;
607 617
608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
609 lockdep_is_held(&task_rq(p)->lock)); 619 lockdep_is_held(&p->pi_lock));
610 tg = container_of(css, struct task_group, css); 620 tg = container_of(css, struct task_group, css);
611 621
612 return autogroup_task_group(p, tg); 622 return autogroup_task_group(p, tg);
@@ -642,7 +652,7 @@ static void update_rq_clock(struct rq *rq)
642{ 652{
643 s64 delta; 653 s64 delta;
644 654
645 if (rq->skip_clock_update) 655 if (rq->skip_clock_update > 0)
646 return; 656 return;
647 657
648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 658 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -838,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
838 return rq->curr == p; 848 return rq->curr == p;
839} 849}
840 850
841#ifndef __ARCH_WANT_UNLOCKED_CTXSW
842static inline int task_running(struct rq *rq, struct task_struct *p) 851static inline int task_running(struct rq *rq, struct task_struct *p)
843{ 852{
853#ifdef CONFIG_SMP
854 return p->on_cpu;
855#else
844 return task_current(rq, p); 856 return task_current(rq, p);
857#endif
845} 858}
846 859
860#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 861static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
848{ 862{
863#ifdef CONFIG_SMP
864 /*
865 * We can optimise this out completely for !SMP, because the
866 * SMP rebalancing from interrupt is the only thing that cares
867 * here.
868 */
869 next->on_cpu = 1;
870#endif
849} 871}
850 872
851static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 873static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
852{ 874{
875#ifdef CONFIG_SMP
876 /*
877 * After ->on_cpu is cleared, the task can be moved to a different CPU.
878 * We must ensure this doesn't happen until the switch is completely
879 * finished.
880 */
881 smp_wmb();
882 prev->on_cpu = 0;
883#endif
853#ifdef CONFIG_DEBUG_SPINLOCK 884#ifdef CONFIG_DEBUG_SPINLOCK
854 /* this is a valid case when another task releases the spinlock */ 885 /* this is a valid case when another task releases the spinlock */
855 rq->lock.owner = current; 886 rq->lock.owner = current;
@@ -865,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
865} 896}
866 897
867#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 898#else /* __ARCH_WANT_UNLOCKED_CTXSW */
868static inline int task_running(struct rq *rq, struct task_struct *p)
869{
870#ifdef CONFIG_SMP
871 return p->oncpu;
872#else
873 return task_current(rq, p);
874#endif
875}
876
877static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 899static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
878{ 900{
879#ifdef CONFIG_SMP 901#ifdef CONFIG_SMP
@@ -882,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
882 * SMP rebalancing from interrupt is the only thing that cares 904 * SMP rebalancing from interrupt is the only thing that cares
883 * here. 905 * here.
884 */ 906 */
885 next->oncpu = 1; 907 next->on_cpu = 1;
886#endif 908#endif
887#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 909#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
888 raw_spin_unlock_irq(&rq->lock); 910 raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
895{ 917{
896#ifdef CONFIG_SMP 918#ifdef CONFIG_SMP
897 /* 919 /*
898 * After ->oncpu is cleared, the task can be moved to a different CPU. 920 * After ->on_cpu is cleared, the task can be moved to a different CPU.
899 * We must ensure this doesn't happen until the switch is completely 921 * We must ensure this doesn't happen until the switch is completely
900 * finished. 922 * finished.
901 */ 923 */
902 smp_wmb(); 924 smp_wmb();
903 prev->oncpu = 0; 925 prev->on_cpu = 0;
904#endif 926#endif
905#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 927#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
906 local_irq_enable(); 928 local_irq_enable();
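Both finish_lock_switch() variants above clear ->on_cpu only after an smp_wmb(), so a CPU that later observes ->on_cpu == 0 also observes the writes the outgoing switch made and can safely migrate the task. The sketch below models that handshake with C11 release/acquire as a rough stand-in; smp_wmb() is a weaker write-only barrier, so this is an analogue of the intent, not the scheduler's code.

/* Rough C11 analogue of the ->on_cpu publication above. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_cpu = 1;
static int switch_done;			/* stands for "context switch finished" */

static void *finish_switch(void *arg)	/* finish_lock_switch() analogue */
{
	(void)arg;
	switch_done = 1;		/* writes done while still on the CPU */
	atomic_store_explicit(&on_cpu, 0, memory_order_release);
	return NULL;
}

static void *migrator(void *arg)	/* waiter that wants to move the task */
{
	(void)arg;
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;			/* spin until the task is off the CPU */
	printf("switch_done=%d\n", switch_done);	/* always prints 1 */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, migrator, NULL);
	pthread_create(&b, NULL, finish_switch, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}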
@@ -909,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
909#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 931#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
910 932
911/* 933/*
912 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 934 * __task_rq_lock - lock the rq @p resides on.
913 * against ttwu().
914 */
915static inline int task_is_waking(struct task_struct *p)
916{
917 return unlikely(p->state == TASK_WAKING);
918}
919
920/*
921 * __task_rq_lock - lock the runqueue a given task resides on.
922 * Must be called interrupts disabled.
923 */ 935 */
924static inline struct rq *__task_rq_lock(struct task_struct *p) 936static inline struct rq *__task_rq_lock(struct task_struct *p)
925 __acquires(rq->lock) 937 __acquires(rq->lock)
926{ 938{
927 struct rq *rq; 939 struct rq *rq;
928 940
941 lockdep_assert_held(&p->pi_lock);
942
929 for (;;) { 943 for (;;) {
930 rq = task_rq(p); 944 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
@@ -936,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936} 950}
937 951
938/* 952/*
939 * task_rq_lock - lock the runqueue a given task resides on and disable 953 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
940 * interrupts. Note the ordering: we can safely lookup the task_rq without
941 * explicitly disabling preemption.
942 */ 954 */
943static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 955static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
956 __acquires(p->pi_lock)
944 __acquires(rq->lock) 957 __acquires(rq->lock)
945{ 958{
946 struct rq *rq; 959 struct rq *rq;
947 960
948 for (;;) { 961 for (;;) {
949 local_irq_save(*flags); 962 raw_spin_lock_irqsave(&p->pi_lock, *flags);
950 rq = task_rq(p); 963 rq = task_rq(p);
951 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 965 if (likely(rq == task_rq(p)))
953 return rq; 966 return rq;
954 raw_spin_unlock_irqrestore(&rq->lock, *flags); 967 raw_spin_unlock(&rq->lock);
968 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
955 } 969 }
956} 970}
957 971
@@ -961,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
961 raw_spin_unlock(&rq->lock); 975 raw_spin_unlock(&rq->lock);
962} 976}
963 977
964static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 978static inline void
979task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
965 __releases(rq->lock) 980 __releases(rq->lock)
981 __releases(p->pi_lock)
966{ 982{
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 983 raw_spin_unlock(&rq->lock);
984 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
968} 985}
969 986
970/* 987/*
@@ -1193,11 +1210,17 @@ int get_nohz_timer_target(void)
1193 int i; 1210 int i;
1194 struct sched_domain *sd; 1211 struct sched_domain *sd;
1195 1212
1213 rcu_read_lock();
1196 for_each_domain(cpu, sd) { 1214 for_each_domain(cpu, sd) {
1197 for_each_cpu(i, sched_domain_span(sd)) 1215 for_each_cpu(i, sched_domain_span(sd)) {
1198 if (!idle_cpu(i)) 1216 if (!idle_cpu(i)) {
1199 return i; 1217 cpu = i;
1218 goto unlock;
1219 }
1220 }
1200 } 1221 }
1222unlock:
1223 rcu_read_unlock();
1201 return cpu; 1224 return cpu;
1202} 1225}
1203/* 1226/*
@@ -1307,15 +1330,15 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1307{ 1330{
1308 u64 tmp; 1331 u64 tmp;
1309 1332
1333 tmp = (u64)delta_exec * weight;
1334
1310 if (!lw->inv_weight) { 1335 if (!lw->inv_weight) {
1311 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1336 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1312 lw->inv_weight = 1; 1337 lw->inv_weight = 1;
1313 else 1338 else
1314 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1339 lw->inv_weight = WMULT_CONST / lw->weight;
1315 / (lw->weight+1);
1316 } 1340 }
1317 1341
1318 tmp = (u64)delta_exec * weight;
1319 /* 1342 /*
1320 * Check whether we'd overflow the 64-bit multiplication: 1343 * Check whether we'd overflow the 64-bit multiplication:
1321 */ 1344 */
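The calc_delta_mine() hunk above caches inv_weight as WMULT_CONST / weight; elsewhere in that function the weighted delta is then formed with a multiply and a shift rather than a fresh divide on every call. A small sketch of that reciprocal-multiplication idea follows; the 32-bit scale factor, the missing overflow guard, and the use of GCC/Clang's __uint128_t are simplifications assumed for illustration, not the scheduler's exact constants.

/* Sketch of computing delta * weight / lw->weight via a precomputed
 * fixed-point inverse, as in calc_delta_mine() above. */
#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT	32
#define WMULT_CONST	(1ULL << WMULT_SHIFT)

struct load_weight {
	uint64_t weight;
	uint64_t inv_weight;	/* WMULT_CONST / weight, cached */
};

static uint64_t calc_delta(uint64_t delta_exec, uint64_t weight,
			   struct load_weight *lw)
{
	if (!lw->inv_weight)
		lw->inv_weight = WMULT_CONST / lw->weight;

	/* delta * weight / lw->weight with only a multiply and a shift.
	 * (The real code instead guards the 64-bit multiply against
	 * overflow; a 128-bit intermediate keeps this sketch short.) */
	return (uint64_t)(((__uint128_t)(delta_exec * weight) *
			   lw->inv_weight) >> WMULT_SHIFT);
}

int main(void)
{
	struct load_weight lw = { .weight = 3072 };	/* e.g. three nice-0 tasks */
	uint64_t delta = calc_delta(3000000, 1024, &lw);

	printf("scaled delta: %llu ns\n", (unsigned long long)delta);
	/* exact value is 1000000; the fixed-point result is within rounding */
	return 0;
}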
@@ -1773,7 +1796,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1773 update_rq_clock(rq); 1796 update_rq_clock(rq);
1774 sched_info_queued(p); 1797 sched_info_queued(p);
1775 p->sched_class->enqueue_task(rq, p, flags); 1798 p->sched_class->enqueue_task(rq, p, flags);
1776 p->se.on_rq = 1;
1777} 1799}
1778 1800
1779static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1801static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1803,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1781 update_rq_clock(rq); 1803 update_rq_clock(rq);
1782 sched_info_dequeued(p); 1804 sched_info_dequeued(p);
1783 p->sched_class->dequeue_task(rq, p, flags); 1805 p->sched_class->dequeue_task(rq, p, flags);
1784 p->se.on_rq = 0;
1785} 1806}
1786 1807
1787/* 1808/*
@@ -2116,7 +2137,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2116 * A queue event has occurred, and we're going to schedule. In 2137 * A queue event has occurred, and we're going to schedule. In
2117 * this case, we can save a useless back to back clock update. 2138 * this case, we can save a useless back to back clock update.
2118 */ 2139 */
2119 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2140 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2120 rq->skip_clock_update = 1; 2141 rq->skip_clock_update = 1;
2121} 2142}
2122 2143
@@ -2162,6 +2183,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2162 */ 2183 */
2163 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2184 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2164 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2185 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2186
2187#ifdef CONFIG_LOCKDEP
2188 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2189 lockdep_is_held(&task_rq(p)->lock)));
2190#endif
2165#endif 2191#endif
2166 2192
2167 trace_sched_migrate_task(p, new_cpu); 2193 trace_sched_migrate_task(p, new_cpu);
@@ -2182,19 +2208,6 @@ struct migration_arg {
2182static int migration_cpu_stop(void *data); 2208static int migration_cpu_stop(void *data);
2183 2209
2184/* 2210/*
2185 * The task's runqueue lock must be held.
2186 * Returns true if you have to wait for migration thread.
2187 */
2188static bool migrate_task(struct task_struct *p, struct rq *rq)
2189{
2190 /*
2191 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task.
2193 */
2194 return p->se.on_rq || task_running(rq, p);
2195}
2196
2197/*
2198 * wait_task_inactive - wait for a thread to unschedule. 2211 * wait_task_inactive - wait for a thread to unschedule.
2199 * 2212 *
2200 * If @match_state is nonzero, it's the @p->state value just checked and 2213 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2251,11 +2264,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2251 rq = task_rq_lock(p, &flags); 2264 rq = task_rq_lock(p, &flags);
2252 trace_sched_wait_task(p); 2265 trace_sched_wait_task(p);
2253 running = task_running(rq, p); 2266 running = task_running(rq, p);
2254 on_rq = p->se.on_rq; 2267 on_rq = p->on_rq;
2255 ncsw = 0; 2268 ncsw = 0;
2256 if (!match_state || p->state == match_state) 2269 if (!match_state || p->state == match_state)
2257 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2270 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2258 task_rq_unlock(rq, &flags); 2271 task_rq_unlock(rq, p, &flags);
2259 2272
2260 /* 2273 /*
2261 * If it changed from the expected state, bail out now. 2274 * If it changed from the expected state, bail out now.
@@ -2309,7 +2322,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2309 * Cause a process which is running on another CPU to enter 2322 * Cause a process which is running on another CPU to enter
2310 * kernel-mode, without any delay. (to get signals handled.) 2323 * kernel-mode, without any delay. (to get signals handled.)
2311 * 2324 *
2312 * NOTE: this function doesnt have to take the runqueue lock, 2325 * NOTE: this function doesn't have to take the runqueue lock,
2313 * because all it wants to ensure is that the remote task enters 2326 * because all it wants to ensure is that the remote task enters
2314 * the kernel. If the IPI races and the task has been migrated 2327 * the kernel. If the IPI races and the task has been migrated
2315 * to another CPU then no harm is done and the purpose has been 2328 * to another CPU then no harm is done and the purpose has been
@@ -2330,7 +2343,7 @@ EXPORT_SYMBOL_GPL(kick_process);
2330 2343
2331#ifdef CONFIG_SMP 2344#ifdef CONFIG_SMP
2332/* 2345/*
2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2346 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2334 */ 2347 */
2335static int select_fallback_rq(int cpu, struct task_struct *p) 2348static int select_fallback_rq(int cpu, struct task_struct *p)
2336{ 2349{
@@ -2363,12 +2376,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2363} 2376}
2364 2377
2365/* 2378/*
2366 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2379 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2367 */ 2380 */
2368static inline 2381static inline
2369int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2382int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2370{ 2383{
2371 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2384 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2372 2385
2373 /* 2386 /*
2374 * In order not to call set_task_cpu() on a blocking task we need 2387 * In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2407,62 @@ static void update_avg(u64 *avg, u64 sample)
2394} 2407}
2395#endif 2408#endif
2396 2409
2397static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2410static void
2398 bool is_sync, bool is_migrate, bool is_local, 2411ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2399 unsigned long en_flags)
2400{ 2412{
2413#ifdef CONFIG_SCHEDSTATS
2414 struct rq *rq = this_rq();
2415
2416#ifdef CONFIG_SMP
2417 int this_cpu = smp_processor_id();
2418
2419 if (cpu == this_cpu) {
2420 schedstat_inc(rq, ttwu_local);
2421 schedstat_inc(p, se.statistics.nr_wakeups_local);
2422 } else {
2423 struct sched_domain *sd;
2424
2425 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2426 rcu_read_lock();
2427 for_each_domain(this_cpu, sd) {
2428 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2429 schedstat_inc(sd, ttwu_wake_remote);
2430 break;
2431 }
2432 }
2433 rcu_read_unlock();
2434 }
2435#endif /* CONFIG_SMP */
2436
2437 schedstat_inc(rq, ttwu_count);
2401 schedstat_inc(p, se.statistics.nr_wakeups); 2438 schedstat_inc(p, se.statistics.nr_wakeups);
2402 if (is_sync) 2439
2440 if (wake_flags & WF_SYNC)
2403 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2441 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2404 if (is_migrate) 2442
2443 if (cpu != task_cpu(p))
2405 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2444 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2406 if (is_local)
2407 schedstat_inc(p, se.statistics.nr_wakeups_local);
2408 else
2409 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2410 2445
2446#endif /* CONFIG_SCHEDSTATS */
2447}
2448
2449static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2450{
2411 activate_task(rq, p, en_flags); 2451 activate_task(rq, p, en_flags);
2452 p->on_rq = 1;
2453
2454 /* if a worker is waking up, notify workqueue */
2455 if (p->flags & PF_WQ_WORKER)
2456 wq_worker_waking_up(p, cpu_of(rq));
2412} 2457}
2413 2458
2414static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2459/*
2415 int wake_flags, bool success) 2460 * Mark the task runnable and perform wakeup-preemption.
2461 */
2462static void
2463ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2416{ 2464{
2417 trace_sched_wakeup(p, success); 2465 trace_sched_wakeup(p, true);
2418 check_preempt_curr(rq, p, wake_flags); 2466 check_preempt_curr(rq, p, wake_flags);
2419 2467
2420 p->state = TASK_RUNNING; 2468 p->state = TASK_RUNNING;
@@ -2433,9 +2481,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2433 rq->idle_stamp = 0; 2481 rq->idle_stamp = 0;
2434 } 2482 }
2435#endif 2483#endif
2436 /* if a worker is waking up, notify workqueue */ 2484}
2437 if ((p->flags & PF_WQ_WORKER) && success) 2485
2438 wq_worker_waking_up(p, cpu_of(rq)); 2486static void
2487ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2488{
2489#ifdef CONFIG_SMP
2490 if (p->sched_contributes_to_load)
2491 rq->nr_uninterruptible--;
2492#endif
2493
2494 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2495 ttwu_do_wakeup(rq, p, wake_flags);
2496}
2497
2498/*
2499 * Called in case the task @p isn't fully descheduled from its runqueue;
2500 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2501 * since all we need to do is flip p->state to TASK_RUNNING, since
2502 * the task is still ->on_rq.
2503 */
2504static int ttwu_remote(struct task_struct *p, int wake_flags)
2505{
2506 struct rq *rq;
2507 int ret = 0;
2508
2509 rq = __task_rq_lock(p);
2510 if (p->on_rq) {
2511 ttwu_do_wakeup(rq, p, wake_flags);
2512 ret = 1;
2513 }
2514 __task_rq_unlock(rq);
2515
2516 return ret;
2517}
2518
2519#ifdef CONFIG_SMP
2520static void sched_ttwu_pending(void)
2521{
2522 struct rq *rq = this_rq();
2523 struct task_struct *list = xchg(&rq->wake_list, NULL);
2524
2525 if (!list)
2526 return;
2527
2528 raw_spin_lock(&rq->lock);
2529
2530 while (list) {
2531 struct task_struct *p = list;
2532 list = list->wake_entry;
2533 ttwu_do_activate(rq, p, 0);
2534 }
2535
2536 raw_spin_unlock(&rq->lock);
2537}
2538
2539void scheduler_ipi(void)
2540{
2541 sched_ttwu_pending();
2542}
2543
2544static void ttwu_queue_remote(struct task_struct *p, int cpu)
2545{
2546 struct rq *rq = cpu_rq(cpu);
2547 struct task_struct *next = rq->wake_list;
2548
2549 for (;;) {
2550 struct task_struct *old = next;
2551
2552 p->wake_entry = next;
2553 next = cmpxchg(&rq->wake_list, old, p);
2554 if (next == old)
2555 break;
2556 }
2557
2558 if (!next)
2559 smp_send_reschedule(cpu);
2560}
2561#endif
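
The remote-wakeup path above is a lock-free multi-producer queue: wakers push the task onto rq->wake_list with cmpxchg() and only send the reschedule IPI when the list was previously empty, while scheduler_ipi() drains everything with one xchg(). Below is a minimal userspace sketch of that push/drain protocol using C11 atomics; the node and function names are invented for illustration and are not kernel API.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct wake_node {
	int tid;			/* stand-in for the task being woken */
	struct wake_node *next;
};

/* head of the pending-wakeup list; NULL means nothing is queued */
static _Atomic(struct wake_node *) wake_list;

/* Producer: push one node; returns 1 if the list was empty (send the "IPI"). */
static int queue_remote(struct wake_node *n)
{
	struct wake_node *old = atomic_load(&wake_list);

	do {
		n->next = old;		/* link before publishing, like p->wake_entry */
	} while (!atomic_compare_exchange_weak(&wake_list, &old, n));

	return old == NULL;
}

/* Consumer: take the whole list atomically, then walk it without any lock. */
static void drain_pending(void)
{
	struct wake_node *n = atomic_exchange(&wake_list, NULL);

	while (n) {
		struct wake_node *next = n->next;
		printf("waking tid %d\n", n->tid);
		free(n);
		n = next;
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct wake_node *n = malloc(sizeof(*n));
		n->tid = i;
		if (queue_remote(n))
			printf("list was empty, would send an IPI here\n");
	}
	drain_pending();
	return 0;
}

In the kernel the consumer side runs from the IPI handler with rq->lock held; a plain function stands in for it here.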
2562
2563static void ttwu_queue(struct task_struct *p, int cpu)
2564{
2565 struct rq *rq = cpu_rq(cpu);
2566
2567#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE)
2568 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2569 ttwu_queue_remote(p, cpu);
2570 return;
2571 }
2572#endif
2573
2574 raw_spin_lock(&rq->lock);
2575 ttwu_do_activate(rq, p, 0);
2576 raw_spin_unlock(&rq->lock);
2439} 2577}
2440 2578
2441/** 2579/**
@@ -2453,92 +2591,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2453 * Returns %true if @p was woken up, %false if it was already running 2591 * Returns %true if @p was woken up, %false if it was already running
2454 * or @state didn't match @p's state. 2592 * or @state didn't match @p's state.
2455 */ 2593 */
2456static int try_to_wake_up(struct task_struct *p, unsigned int state, 2594static int
2457 int wake_flags) 2595try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2458{ 2596{
2459 int cpu, orig_cpu, this_cpu, success = 0;
2460 unsigned long flags; 2597 unsigned long flags;
2461 unsigned long en_flags = ENQUEUE_WAKEUP; 2598 int cpu, success = 0;
2462 struct rq *rq;
2463
2464 this_cpu = get_cpu();
2465 2599
2466 smp_wmb(); 2600 smp_wmb();
2467 rq = task_rq_lock(p, &flags); 2601 raw_spin_lock_irqsave(&p->pi_lock, flags);
2468 if (!(p->state & state)) 2602 if (!(p->state & state))
2469 goto out; 2603 goto out;
2470 2604
2471 if (p->se.on_rq) 2605 success = 1; /* we're going to change ->state */
2472 goto out_running;
2473
2474 cpu = task_cpu(p); 2606 cpu = task_cpu(p);
2475 orig_cpu = cpu;
2476 2607
2477#ifdef CONFIG_SMP 2608 if (p->on_rq && ttwu_remote(p, wake_flags))
2478 if (unlikely(task_running(rq, p))) 2609 goto stat;
2479 goto out_activate;
2480 2610
2611#ifdef CONFIG_SMP
2481 /* 2612 /*
2482 * In order to handle concurrent wakeups and release the rq->lock 2613 * If the owning (remote) cpu is still in the middle of schedule() with
2483 * we put the task in TASK_WAKING state. 2614 * this task as prev, wait until it's done referencing the task.
2484 *
2485 * First fix up the nr_uninterruptible count:
2486 */ 2615 */
2487 if (task_contributes_to_load(p)) { 2616 while (p->on_cpu) {
2488 if (likely(cpu_online(orig_cpu))) 2617#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2489 rq->nr_uninterruptible--; 2618 /*
2490 else 2619 * If called from interrupt context we could have landed in the
2491 this_rq()->nr_uninterruptible--; 2620 * middle of schedule(), in this case we should take care not
2492 } 2621 * to spin on ->on_cpu if p is current, since that would
2493 p->state = TASK_WAKING; 2622 * deadlock.
2494 2623 */
2495 if (p->sched_class->task_waking) { 2624 if (p == current) {
2496 p->sched_class->task_waking(rq, p); 2625 ttwu_queue(p, cpu);
2497 en_flags |= ENQUEUE_WAKING; 2626 goto stat;
2627 }
2628#endif
2629 cpu_relax();
2498 } 2630 }
2499
2500 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2501 if (cpu != orig_cpu)
2502 set_task_cpu(p, cpu);
2503 __task_rq_unlock(rq);
2504
2505 rq = cpu_rq(cpu);
2506 raw_spin_lock(&rq->lock);
2507
2508 /* 2631 /*
2509 * We migrated the task without holding either rq->lock, however 2632 * Pairs with the smp_wmb() in finish_lock_switch().
2510 * since the task is not on the task list itself, nobody else
2511 * will try and migrate the task, hence the rq should match the
2512 * cpu we just moved it to.
2513 */ 2633 */
2514 WARN_ON(task_cpu(p) != cpu); 2634 smp_rmb();
2515 WARN_ON(p->state != TASK_WAKING);
2516 2635
2517#ifdef CONFIG_SCHEDSTATS 2636 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2518 schedstat_inc(rq, ttwu_count); 2637 p->state = TASK_WAKING;
2519 if (cpu == this_cpu)
2520 schedstat_inc(rq, ttwu_local);
2521 else {
2522 struct sched_domain *sd;
2523 for_each_domain(this_cpu, sd) {
2524 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2525 schedstat_inc(sd, ttwu_wake_remote);
2526 break;
2527 }
2528 }
2529 }
2530#endif /* CONFIG_SCHEDSTATS */
2531 2638
2532out_activate: 2639 if (p->sched_class->task_waking)
2640 p->sched_class->task_waking(p);
2641
2642 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2643 if (task_cpu(p) != cpu)
2644 set_task_cpu(p, cpu);
2533#endif /* CONFIG_SMP */ 2645#endif /* CONFIG_SMP */
2534 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2646
2535 cpu == this_cpu, en_flags); 2647 ttwu_queue(p, cpu);
2536 success = 1; 2648stat:
2537out_running: 2649 ttwu_stat(p, cpu, wake_flags);
2538 ttwu_post_activation(p, rq, wake_flags, success);
2539out: 2650out:
2540 task_rq_unlock(rq, &flags); 2651 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2541 put_cpu();
2542 2652
2543 return success; 2653 return success;
2544} 2654}
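
Note the ordering the new try_to_wake_up() depends on: the previous CPU finishes its last stores and only then clears p->on_cpu behind a write barrier (in finish_lock_switch()), and the waker spins on ->on_cpu and issues smp_rmb() before touching the task. The sketch below models that publish/observe pairing with C11 release/acquire orders instead of explicit barriers; all names are made up.

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>
#include <assert.h>

static int task_ctx;			/* data the previous CPU writes while descheduling */
static atomic_int on_cpu = 1;		/* 1 while the previous CPU still references the task */

/* "Previous CPU": finish the context switch, then publish that we're done. */
static void *finish_switch(void *arg)
{
	(void)arg;
	task_ctx = 42;			/* last stores before the task is released */
	atomic_store_explicit(&on_cpu, 0, memory_order_release);
	return NULL;
}

/* "Waker": wait until the task is fully descheduled before migrating it. */
static void *waker(void *arg)
{
	(void)arg;
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;			/* the kernel does cpu_relax() in this loop */

	/* the acquire load pairs with the release store, so task_ctx is visible */
	assert(task_ctx == 42);
	printf("task fully descheduled, ctx=%d\n", task_ctx);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, waker, NULL);
	pthread_create(&b, NULL, finish_switch, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}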
@@ -2547,31 +2657,34 @@ out:
2547 * try_to_wake_up_local - try to wake up a local task with rq lock held 2657 * try_to_wake_up_local - try to wake up a local task with rq lock held
2548 * @p: the thread to be awakened 2658 * @p: the thread to be awakened
2549 * 2659 *
2550 * Put @p on the run-queue if it's not already there. The caller must 2660 * Put @p on the run-queue if it's not already there. The caller must
2551 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2661 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2552 * the current task. this_rq() stays locked over invocation. 2662 * the current task.
2553 */ 2663 */
2554static void try_to_wake_up_local(struct task_struct *p) 2664static void try_to_wake_up_local(struct task_struct *p)
2555{ 2665{
2556 struct rq *rq = task_rq(p); 2666 struct rq *rq = task_rq(p);
2557 bool success = false;
2558 2667
2559 BUG_ON(rq != this_rq()); 2668 BUG_ON(rq != this_rq());
2560 BUG_ON(p == current); 2669 BUG_ON(p == current);
2561 lockdep_assert_held(&rq->lock); 2670 lockdep_assert_held(&rq->lock);
2562 2671
2672 if (!raw_spin_trylock(&p->pi_lock)) {
2673 raw_spin_unlock(&rq->lock);
2674 raw_spin_lock(&p->pi_lock);
2675 raw_spin_lock(&rq->lock);
2676 }
2677
2563 if (!(p->state & TASK_NORMAL)) 2678 if (!(p->state & TASK_NORMAL))
2564 return; 2679 goto out;
2565 2680
2566 if (!p->se.on_rq) { 2681 if (!p->on_rq)
2567 if (likely(!task_running(rq, p))) { 2682 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2568 schedstat_inc(rq, ttwu_count); 2683
2569 schedstat_inc(rq, ttwu_local); 2684 ttwu_do_wakeup(rq, p, 0);
2570 } 2685 ttwu_stat(p, smp_processor_id(), 0);
2571 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2686out:
2572 success = true; 2687 raw_spin_unlock(&p->pi_lock);
2573 }
2574 ttwu_post_activation(p, rq, 0, success);
2575} 2688}
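
try_to_wake_up_local() runs with rq->lock held but now also needs p->pi_lock, which in the new scheme nests outside rq->lock; it therefore trylocks pi_lock and, if that fails, drops rq->lock and retakes both in the documented order. A small pthread sketch of that trylock-or-back-off pattern follows; the lock names are hypothetical, and (as in the kernel, which re-tests p->state) anything checked before the back-off must be rechecked afterwards.

#include <pthread.h>
#include <stdio.h>

/* outer_lock must always be taken before inner_lock (think pi_lock vs. rq->lock) */
static pthread_mutex_t outer_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t inner_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Called with inner_lock already held; acquire outer_lock without
 * deadlocking against a thread taking the locks in the documented order.
 */
static void take_outer_while_holding_inner(void)
{
	if (pthread_mutex_trylock(&outer_lock) != 0) {
		/*
		 * Someone may hold outer_lock and be waiting for inner_lock:
		 * back off and retake both in the correct order.  Any state
		 * read under inner_lock before this point is now stale.
		 */
		pthread_mutex_unlock(&inner_lock);
		pthread_mutex_lock(&outer_lock);
		pthread_mutex_lock(&inner_lock);
	}
}

int main(void)
{
	pthread_mutex_lock(&inner_lock);
	take_outer_while_holding_inner();
	printf("holding both locks, deadlock-free\n");
	pthread_mutex_unlock(&inner_lock);
	pthread_mutex_unlock(&outer_lock);
	return 0;
}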
2576 2689
2577/** 2690/**
@@ -2604,19 +2717,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2604 */ 2717 */
2605static void __sched_fork(struct task_struct *p) 2718static void __sched_fork(struct task_struct *p)
2606{ 2719{
2720 p->on_rq = 0;
2721
2722 p->se.on_rq = 0;
2607 p->se.exec_start = 0; 2723 p->se.exec_start = 0;
2608 p->se.sum_exec_runtime = 0; 2724 p->se.sum_exec_runtime = 0;
2609 p->se.prev_sum_exec_runtime = 0; 2725 p->se.prev_sum_exec_runtime = 0;
2610 p->se.nr_migrations = 0; 2726 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0; 2727 p->se.vruntime = 0;
2728 INIT_LIST_HEAD(&p->se.group_node);
2612 2729
2613#ifdef CONFIG_SCHEDSTATS 2730#ifdef CONFIG_SCHEDSTATS
2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2731 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2615#endif 2732#endif
2616 2733
2617 INIT_LIST_HEAD(&p->rt.run_list); 2734 INIT_LIST_HEAD(&p->rt.run_list);
2618 p->se.on_rq = 0;
2619 INIT_LIST_HEAD(&p->se.group_node);
2620 2735
2621#ifdef CONFIG_PREEMPT_NOTIFIERS 2736#ifdef CONFIG_PREEMPT_NOTIFIERS
2622 INIT_HLIST_HEAD(&p->preempt_notifiers); 2737 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2626,8 +2741,9 @@ static void __sched_fork(struct task_struct *p)
2626/* 2741/*
2627 * fork()/clone()-time setup: 2742 * fork()/clone()-time setup:
2628 */ 2743 */
2629void sched_fork(struct task_struct *p, int clone_flags) 2744void sched_fork(struct task_struct *p)
2630{ 2745{
2746 unsigned long flags;
2631 int cpu = get_cpu(); 2747 int cpu = get_cpu();
2632 2748
2633 __sched_fork(p); 2749 __sched_fork(p);
@@ -2678,16 +2794,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2678 * 2794 *
2679 * Silence PROVE_RCU. 2795 * Silence PROVE_RCU.
2680 */ 2796 */
2681 rcu_read_lock(); 2797 raw_spin_lock_irqsave(&p->pi_lock, flags);
2682 set_task_cpu(p, cpu); 2798 set_task_cpu(p, cpu);
2683 rcu_read_unlock(); 2799 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2684 2800
2685#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2801#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2686 if (likely(sched_info_on())) 2802 if (likely(sched_info_on()))
2687 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2803 memset(&p->sched_info, 0, sizeof(p->sched_info));
2688#endif 2804#endif
2689#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2805#if defined(CONFIG_SMP)
2690 p->oncpu = 0; 2806 p->on_cpu = 0;
2691#endif 2807#endif
2692#ifdef CONFIG_PREEMPT 2808#ifdef CONFIG_PREEMPT
2693 /* Want to start with kernel preemption disabled. */ 2809 /* Want to start with kernel preemption disabled. */
@@ -2707,41 +2823,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2707 * that must be done for every newly created context, then puts the task 2823 * that must be done for every newly created context, then puts the task
2708 * on the runqueue and wakes it. 2824 * on the runqueue and wakes it.
2709 */ 2825 */
2710void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2826void wake_up_new_task(struct task_struct *p)
2711{ 2827{
2712 unsigned long flags; 2828 unsigned long flags;
2713 struct rq *rq; 2829 struct rq *rq;
2714 int cpu __maybe_unused = get_cpu();
2715 2830
2831 raw_spin_lock_irqsave(&p->pi_lock, flags);
2716#ifdef CONFIG_SMP 2832#ifdef CONFIG_SMP
2717 rq = task_rq_lock(p, &flags);
2718 p->state = TASK_WAKING;
2719
2720 /* 2833 /*
2721 * Fork balancing, do it here and not earlier because: 2834 * Fork balancing, do it here and not earlier because:
2722 * - cpus_allowed can change in the fork path 2835 * - cpus_allowed can change in the fork path
2723 * - any previously selected cpu might disappear through hotplug 2836 * - any previously selected cpu might disappear through hotplug
2724 *
2725 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2726 * without people poking at ->cpus_allowed.
2727 */ 2837 */
2728 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2838 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2729 set_task_cpu(p, cpu);
2730
2731 p->state = TASK_RUNNING;
2732 task_rq_unlock(rq, &flags);
2733#endif 2839#endif
2734 2840
2735 rq = task_rq_lock(p, &flags); 2841 rq = __task_rq_lock(p);
2736 activate_task(rq, p, 0); 2842 activate_task(rq, p, 0);
2737 trace_sched_wakeup_new(p, 1); 2843 p->on_rq = 1;
2844 trace_sched_wakeup_new(p, true);
2738 check_preempt_curr(rq, p, WF_FORK); 2845 check_preempt_curr(rq, p, WF_FORK);
2739#ifdef CONFIG_SMP 2846#ifdef CONFIG_SMP
2740 if (p->sched_class->task_woken) 2847 if (p->sched_class->task_woken)
2741 p->sched_class->task_woken(rq, p); 2848 p->sched_class->task_woken(rq, p);
2742#endif 2849#endif
2743 task_rq_unlock(rq, &flags); 2850 task_rq_unlock(rq, p, &flags);
2744 put_cpu();
2745} 2851}
2746 2852
2747#ifdef CONFIG_PREEMPT_NOTIFIERS 2853#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3450,27 +3556,22 @@ void sched_exec(void)
3450{ 3556{
3451 struct task_struct *p = current; 3557 struct task_struct *p = current;
3452 unsigned long flags; 3558 unsigned long flags;
3453 struct rq *rq;
3454 int dest_cpu; 3559 int dest_cpu;
3455 3560
3456 rq = task_rq_lock(p, &flags); 3561 raw_spin_lock_irqsave(&p->pi_lock, flags);
3457 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3562 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3458 if (dest_cpu == smp_processor_id()) 3563 if (dest_cpu == smp_processor_id())
3459 goto unlock; 3564 goto unlock;
3460 3565
3461 /* 3566 if (likely(cpu_active(dest_cpu))) {
3462 * select_task_rq() can race against ->cpus_allowed
3463 */
3464 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3465 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3466 struct migration_arg arg = { p, dest_cpu }; 3567 struct migration_arg arg = { p, dest_cpu };
3467 3568
3468 task_rq_unlock(rq, &flags); 3569 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3469 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3570 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3470 return; 3571 return;
3471 } 3572 }
3472unlock: 3573unlock:
3473 task_rq_unlock(rq, &flags); 3574 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3474} 3575}
3475 3576
3476#endif 3577#endif
@@ -3507,7 +3608,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3507 3608
3508 rq = task_rq_lock(p, &flags); 3609 rq = task_rq_lock(p, &flags);
3509 ns = do_task_delta_exec(p, rq); 3610 ns = do_task_delta_exec(p, rq);
3510 task_rq_unlock(rq, &flags); 3611 task_rq_unlock(rq, p, &flags);
3511 3612
3512 return ns; 3613 return ns;
3513} 3614}
@@ -3525,7 +3626,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3525 3626
3526 rq = task_rq_lock(p, &flags); 3627 rq = task_rq_lock(p, &flags);
3527 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3628 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3528 task_rq_unlock(rq, &flags); 3629 task_rq_unlock(rq, p, &flags);
3529 3630
3530 return ns; 3631 return ns;
3531} 3632}
@@ -3549,7 +3650,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3549 rq = task_rq_lock(p, &flags); 3650 rq = task_rq_lock(p, &flags);
3550 thread_group_cputime(p, &totals); 3651 thread_group_cputime(p, &totals);
3551 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3652 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3552 task_rq_unlock(rq, &flags); 3653 task_rq_unlock(rq, p, &flags);
3553 3654
3554 return ns; 3655 return ns;
3555} 3656}
@@ -3903,9 +4004,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3903/* 4004/*
3904 * This function gets called by the timer code, with HZ frequency. 4005 * This function gets called by the timer code, with HZ frequency.
3905 * We call it with interrupts disabled. 4006 * We call it with interrupts disabled.
3906 *
3907 * It also gets called by the fork code, when changing the parent's
3908 * timeslices.
3909 */ 4007 */
3910void scheduler_tick(void) 4008void scheduler_tick(void)
3911{ 4009{
@@ -4025,17 +4123,11 @@ static inline void schedule_debug(struct task_struct *prev)
4025 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4123 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4026 4124
4027 schedstat_inc(this_rq(), sched_count); 4125 schedstat_inc(this_rq(), sched_count);
4028#ifdef CONFIG_SCHEDSTATS
4029 if (unlikely(prev->lock_depth >= 0)) {
4030 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4031 schedstat_inc(prev, sched_info.bkl_count);
4032 }
4033#endif
4034} 4126}
4035 4127
4036static void put_prev_task(struct rq *rq, struct task_struct *prev) 4128static void put_prev_task(struct rq *rq, struct task_struct *prev)
4037{ 4129{
4038 if (prev->se.on_rq) 4130 if (prev->on_rq || rq->skip_clock_update < 0)
4039 update_rq_clock(rq); 4131 update_rq_clock(rq);
4040 prev->sched_class->put_prev_task(rq, prev); 4132 prev->sched_class->put_prev_task(rq, prev);
4041} 4133}
@@ -4097,11 +4189,13 @@ need_resched:
4097 if (unlikely(signal_pending_state(prev->state, prev))) { 4189 if (unlikely(signal_pending_state(prev->state, prev))) {
4098 prev->state = TASK_RUNNING; 4190 prev->state = TASK_RUNNING;
4099 } else { 4191 } else {
4192 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4193 prev->on_rq = 0;
4194
4100 /* 4195 /*
4101 * If a worker is going to sleep, notify and 4196 * If a worker went to sleep, notify and ask workqueue
4102 * ask workqueue whether it wants to wake up a 4197 * whether it wants to wake up a task to maintain
4103 * task to maintain concurrency. If so, wake 4198 * concurrency.
4104 * up the task.
4105 */ 4199 */
4106 if (prev->flags & PF_WQ_WORKER) { 4200 if (prev->flags & PF_WQ_WORKER) {
4107 struct task_struct *to_wakeup; 4201 struct task_struct *to_wakeup;
@@ -4110,21 +4204,20 @@ need_resched:
4110 if (to_wakeup) 4204 if (to_wakeup)
4111 try_to_wake_up_local(to_wakeup); 4205 try_to_wake_up_local(to_wakeup);
4112 } 4206 }
4113 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4207
4208 /*
4209 * If we are going to sleep and we have plugged IO
4210 * queued, make sure to submit it to avoid deadlocks.
4211 */
4212 if (blk_needs_flush_plug(prev)) {
4213 raw_spin_unlock(&rq->lock);
4214 blk_schedule_flush_plug(prev);
4215 raw_spin_lock(&rq->lock);
4216 }
4114 } 4217 }
4115 switch_count = &prev->nvcsw; 4218 switch_count = &prev->nvcsw;
4116 } 4219 }
4117 4220
4118 /*
4119 * If we are going to sleep and we have plugged IO queued, make
4120 * sure to submit it to avoid deadlocks.
4121 */
4122 if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
4123 raw_spin_unlock(&rq->lock);
4124 blk_flush_plug(prev);
4125 raw_spin_lock(&rq->lock);
4126 }
4127
4128 pre_schedule(rq, prev); 4221 pre_schedule(rq, prev);
4129 4222
4130 if (unlikely(!rq->nr_running)) 4223 if (unlikely(!rq->nr_running))
@@ -4161,70 +4254,53 @@ need_resched:
4161EXPORT_SYMBOL(schedule); 4254EXPORT_SYMBOL(schedule);
4162 4255
4163#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4256#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4164/*
4165 * Look out! "owner" is an entirely speculative pointer
4166 * access and not reliable.
4167 */
4168int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4169{
4170 unsigned int cpu;
4171 struct rq *rq;
4172 4257
4173 if (!sched_feat(OWNER_SPIN)) 4258static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4174 return 0; 4259{
4260 bool ret = false;
4175 4261
4176#ifdef CONFIG_DEBUG_PAGEALLOC 4262 rcu_read_lock();
4177 /* 4263 if (lock->owner != owner)
4178 * Need to access the cpu field knowing that 4264 goto fail;
4179 * DEBUG_PAGEALLOC could have unmapped it if
4180 * the mutex owner just released it and exited.
4181 */
4182 if (probe_kernel_address(&owner->cpu, cpu))
4183 return 0;
4184#else
4185 cpu = owner->cpu;
4186#endif
4187 4265
4188 /* 4266 /*
4190 * the cpu field may no longer be valid. 4268 * that lock->owner still matches owner. If that fails, owner might
4189 * Even if the access succeeded (likely case), 4267 * Ensure we emit the owner->on_cpu dereference _after_ checking
4269 * point to free()d memory; if it still matches, the rcu_read_lock()
4270 * ensures the memory stays valid.
4191 */ 4271 */
4192 if (cpu >= nr_cpumask_bits) 4272 barrier();
4193 return 0;
4194 4273
4195 /* 4274 ret = owner->on_cpu;
4196 * We need to validate that we can do a 4275fail:
4197 * get_cpu() and that we have the percpu area. 4276 rcu_read_unlock();
4198 */
4199 if (!cpu_online(cpu))
4200 return 0;
4201 4277
4202 rq = cpu_rq(cpu); 4278 return ret;
4279}
4203 4280
4204 for (;;) { 4281/*
4205 /* 4282 * Look out! "owner" is an entirely speculative pointer
4206 * Owner changed, break to re-assess state. 4283 * access and not reliable.
4207 */ 4284 */
4208 if (lock->owner != owner) { 4285int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4209 /* 4286{
4210 * If the lock has switched to a different owner, 4287 if (!sched_feat(OWNER_SPIN))
4211 * we likely have heavy contention. Return 0 to quit 4288 return 0;
4212 * optimistic spinning and not contend further:
4213 */
4214 if (lock->owner)
4215 return 0;
4216 break;
4217 }
4218 4289
4219 /* 4290 while (owner_running(lock, owner)) {
4220 * Is that owner really running on that cpu? 4291 if (need_resched())
4221 */
4222 if (task_thread_info(rq->curr) != owner || need_resched())
4223 return 0; 4292 return 0;
4224 4293
4225 arch_mutex_cpu_relax(); 4294 arch_mutex_cpu_relax();
4226 } 4295 }
4227 4296
4297 /*
4298 * If the owner changed to another task, there is likely
4299 * heavy contention; stop spinning.
4300 */
4301 if (lock->owner)
4302 return 0;
4303
4228 return 1; 4304 return 1;
4229} 4305}
4230#endif 4306#endif
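
The rewritten mutex_spin_on_owner() keeps spinning only while the current owner is actually on a CPU, and gives up as soon as the spinner itself should reschedule or the lock changes hands; rcu_read_lock() in owner_running() is what keeps the owner's task_struct from being freed while it is inspected. The sketch below models only the spin policy with C11 atomics; the RCU lifetime guarantee is not reproduced and every name is invented.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct task {
	atomic_bool on_cpu;		/* is this "task" currently running somewhere? */
};

struct spinnable_lock {
	_Atomic(struct task *) owner;	/* NULL when the lock is free */
};

static atomic_bool resched_wanted;	/* stand-in for need_resched() */

static bool owner_running(struct spinnable_lock *lock, struct task *owner)
{
	/* if ownership changed, the old owner pointer tells us nothing */
	if (atomic_load(&lock->owner) != owner)
		return false;

	return atomic_load(&owner->on_cpu);
}

/* Return 1 if optimistic spinning paid off (lock became free), 0 to block. */
static int spin_on_owner(struct spinnable_lock *lock, struct task *owner)
{
	while (owner_running(lock, owner)) {
		if (atomic_load(&resched_wanted))
			return 0;	/* we should yield the CPU ourselves */
		/* cpu_relax() would go here */
	}

	/*
	 * If the lock still has an owner, that owner is either preempted
	 * or a different task; either way, stop spinning and block.
	 */
	if (atomic_load(&lock->owner))
		return 0;

	return 1;
}

int main(void)
{
	struct task t = { .on_cpu = false };
	struct spinnable_lock l = { .owner = &t };

	/* owner exists but is not running: the spinner decides to block (0) */
	printf("spin decision: %d\n", spin_on_owner(&l, &t));
	return 0;
}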
@@ -4684,19 +4760,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4684 */ 4760 */
4685void rt_mutex_setprio(struct task_struct *p, int prio) 4761void rt_mutex_setprio(struct task_struct *p, int prio)
4686{ 4762{
4687 unsigned long flags;
4688 int oldprio, on_rq, running; 4763 int oldprio, on_rq, running;
4689 struct rq *rq; 4764 struct rq *rq;
4690 const struct sched_class *prev_class; 4765 const struct sched_class *prev_class;
4691 4766
4692 BUG_ON(prio < 0 || prio > MAX_PRIO); 4767 BUG_ON(prio < 0 || prio > MAX_PRIO);
4693 4768
4694 rq = task_rq_lock(p, &flags); 4769 rq = __task_rq_lock(p);
4695 4770
4696 trace_sched_pi_setprio(p, prio); 4771 trace_sched_pi_setprio(p, prio);
4697 oldprio = p->prio; 4772 oldprio = p->prio;
4698 prev_class = p->sched_class; 4773 prev_class = p->sched_class;
4699 on_rq = p->se.on_rq; 4774 on_rq = p->on_rq;
4700 running = task_current(rq, p); 4775 running = task_current(rq, p);
4701 if (on_rq) 4776 if (on_rq)
4702 dequeue_task(rq, p, 0); 4777 dequeue_task(rq, p, 0);
@@ -4716,7 +4791,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4791 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4717 4792
4718 check_class_changed(rq, p, prev_class, oldprio); 4793 check_class_changed(rq, p, prev_class, oldprio);
4719 task_rq_unlock(rq, &flags); 4794 __task_rq_unlock(rq);
4720} 4795}
4721 4796
4722#endif 4797#endif
@@ -4744,7 +4819,7 @@ void set_user_nice(struct task_struct *p, long nice)
4744 p->static_prio = NICE_TO_PRIO(nice); 4819 p->static_prio = NICE_TO_PRIO(nice);
4745 goto out_unlock; 4820 goto out_unlock;
4746 } 4821 }
4747 on_rq = p->se.on_rq; 4822 on_rq = p->on_rq;
4748 if (on_rq) 4823 if (on_rq)
4749 dequeue_task(rq, p, 0); 4824 dequeue_task(rq, p, 0);
4750 4825
@@ -4764,7 +4839,7 @@ void set_user_nice(struct task_struct *p, long nice)
4764 resched_task(rq->curr); 4839 resched_task(rq->curr);
4765 } 4840 }
4766out_unlock: 4841out_unlock:
4767 task_rq_unlock(rq, &flags); 4842 task_rq_unlock(rq, p, &flags);
4768} 4843}
4769EXPORT_SYMBOL(set_user_nice); 4844EXPORT_SYMBOL(set_user_nice);
4770 4845
@@ -4878,8 +4953,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4878static void 4953static void
4879__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4954__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4880{ 4955{
4881 BUG_ON(p->se.on_rq);
4882
4883 p->policy = policy; 4956 p->policy = policy;
4884 p->rt_priority = prio; 4957 p->rt_priority = prio;
4885 p->normal_prio = normal_prio(p); 4958 p->normal_prio = normal_prio(p);
@@ -4994,20 +5067,17 @@ recheck:
4994 /* 5067 /*
4995 * make sure no PI-waiters arrive (or leave) while we are 5068 * make sure no PI-waiters arrive (or leave) while we are
4996 * changing the priority of the task: 5069 * changing the priority of the task:
4997 */ 5070 *
4998 raw_spin_lock_irqsave(&p->pi_lock, flags); 5071 * To be able to change p->policy safely, the appropriate
4999 /*
5000 * To be able to change p->policy safely, the apropriate
5001 * runqueue lock must be held. 5072 * runqueue lock must be held.
5002 */ 5073 */
5003 rq = __task_rq_lock(p); 5074 rq = task_rq_lock(p, &flags);
5004 5075
5005 /* 5076 /*
5006 * Changing the policy of the stop threads is a very bad idea 5077 * Changing the policy of the stop threads is a very bad idea
5007 */ 5078 */
5008 if (p == rq->stop) { 5079 if (p == rq->stop) {
5009 __task_rq_unlock(rq); 5080 task_rq_unlock(rq, p, &flags);
5010 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5011 return -EINVAL; 5081 return -EINVAL;
5012 } 5082 }
5013 5083
@@ -5031,8 +5101,7 @@ recheck:
5031 if (rt_bandwidth_enabled() && rt_policy(policy) && 5101 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5032 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5102 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5033 !task_group_is_autogroup(task_group(p))) { 5103 !task_group_is_autogroup(task_group(p))) {
5034 __task_rq_unlock(rq); 5104 task_rq_unlock(rq, p, &flags);
5035 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5036 return -EPERM; 5105 return -EPERM;
5037 } 5106 }
5038 } 5107 }
@@ -5041,11 +5110,10 @@ recheck:
5041 /* recheck policy now with rq lock held */ 5110 /* recheck policy now with rq lock held */
5042 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5111 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5043 policy = oldpolicy = -1; 5112 policy = oldpolicy = -1;
5044 __task_rq_unlock(rq); 5113 task_rq_unlock(rq, p, &flags);
5045 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5046 goto recheck; 5114 goto recheck;
5047 } 5115 }
5048 on_rq = p->se.on_rq; 5116 on_rq = p->on_rq;
5049 running = task_current(rq, p); 5117 running = task_current(rq, p);
5050 if (on_rq) 5118 if (on_rq)
5051 deactivate_task(rq, p, 0); 5119 deactivate_task(rq, p, 0);
@@ -5064,8 +5132,7 @@ recheck:
5064 activate_task(rq, p, 0); 5132 activate_task(rq, p, 0);
5065 5133
5066 check_class_changed(rq, p, prev_class, oldprio); 5134 check_class_changed(rq, p, prev_class, oldprio);
5067 __task_rq_unlock(rq); 5135 task_rq_unlock(rq, p, &flags);
5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5069 5136
5070 rt_mutex_adjust_pi(p); 5137 rt_mutex_adjust_pi(p);
5071 5138
@@ -5316,7 +5383,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5316{ 5383{
5317 struct task_struct *p; 5384 struct task_struct *p;
5318 unsigned long flags; 5385 unsigned long flags;
5319 struct rq *rq;
5320 int retval; 5386 int retval;
5321 5387
5322 get_online_cpus(); 5388 get_online_cpus();
@@ -5331,9 +5397,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5331 if (retval) 5397 if (retval)
5332 goto out_unlock; 5398 goto out_unlock;
5333 5399
5334 rq = task_rq_lock(p, &flags); 5400 raw_spin_lock_irqsave(&p->pi_lock, flags);
5335 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5401 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5336 task_rq_unlock(rq, &flags); 5402 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5337 5403
5338out_unlock: 5404out_unlock:
5339 rcu_read_unlock(); 5405 rcu_read_unlock();
@@ -5658,7 +5724,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5658 5724
5659 rq = task_rq_lock(p, &flags); 5725 rq = task_rq_lock(p, &flags);
5660 time_slice = p->sched_class->get_rr_interval(rq, p); 5726 time_slice = p->sched_class->get_rr_interval(rq, p);
5661 task_rq_unlock(rq, &flags); 5727 task_rq_unlock(rq, p, &flags);
5662 5728
5663 rcu_read_unlock(); 5729 rcu_read_unlock();
5664 jiffies_to_timespec(time_slice, &t); 5730 jiffies_to_timespec(time_slice, &t);
@@ -5716,7 +5782,7 @@ void show_state_filter(unsigned long state_filter)
5716 do_each_thread(g, p) { 5782 do_each_thread(g, p) {
5717 /* 5783 /*
5718 * reset the NMI-timeout, listing all files on a slow 5784 * reset the NMI-timeout, listing all files on a slow
5719 * console might take alot of time: 5785 * console might take a lot of time:
5720 */ 5786 */
5721 touch_nmi_watchdog(); 5787 touch_nmi_watchdog();
5722 if (!state_filter || (p->state & state_filter)) 5788 if (!state_filter || (p->state & state_filter))
@@ -5776,17 +5842,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5776 rcu_read_unlock(); 5842 rcu_read_unlock();
5777 5843
5778 rq->curr = rq->idle = idle; 5844 rq->curr = rq->idle = idle;
5779#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5845#if defined(CONFIG_SMP)
5780 idle->oncpu = 1; 5846 idle->on_cpu = 1;
5781#endif 5847#endif
5782 raw_spin_unlock_irqrestore(&rq->lock, flags); 5848 raw_spin_unlock_irqrestore(&rq->lock, flags);
5783 5849
5784 /* Set the preempt count _outside_ the spinlocks! */ 5850 /* Set the preempt count _outside_ the spinlocks! */
5785#if defined(CONFIG_PREEMPT)
5786 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5787#else
5788 task_thread_info(idle)->preempt_count = 0; 5851 task_thread_info(idle)->preempt_count = 0;
5789#endif 5852
5790 /* 5853 /*
5791 * The idle tasks have their own, simple scheduling class: 5854 * The idle tasks have their own, simple scheduling class:
5792 */ 5855 */
@@ -5881,26 +5944,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5881 unsigned int dest_cpu; 5944 unsigned int dest_cpu;
5882 int ret = 0; 5945 int ret = 0;
5883 5946
5884 /*
5885 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5886 * drop the rq->lock and still rely on ->cpus_allowed.
5887 */
5888again:
5889 while (task_is_waking(p))
5890 cpu_relax();
5891 rq = task_rq_lock(p, &flags); 5947 rq = task_rq_lock(p, &flags);
5892 if (task_is_waking(p)) { 5948
5893 task_rq_unlock(rq, &flags); 5949 if (cpumask_equal(&p->cpus_allowed, new_mask))
5894 goto again; 5950 goto out;
5895 }
5896 5951
5897 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5952 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5898 ret = -EINVAL; 5953 ret = -EINVAL;
5899 goto out; 5954 goto out;
5900 } 5955 }
5901 5956
5902 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5957 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5903 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5904 ret = -EINVAL; 5958 ret = -EINVAL;
5905 goto out; 5959 goto out;
5906 } 5960 }
@@ -5917,16 +5971,16 @@ again:
5917 goto out; 5971 goto out;
5918 5972
5919 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5973 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5920 if (migrate_task(p, rq)) { 5974 if (p->on_rq) {
5921 struct migration_arg arg = { p, dest_cpu }; 5975 struct migration_arg arg = { p, dest_cpu };
5922 /* Need help from migration thread: drop lock and wait. */ 5976 /* Need help from migration thread: drop lock and wait. */
5923 task_rq_unlock(rq, &flags); 5977 task_rq_unlock(rq, p, &flags);
5924 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5978 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5925 tlb_migrate_finish(p->mm); 5979 tlb_migrate_finish(p->mm);
5926 return 0; 5980 return 0;
5927 } 5981 }
5928out: 5982out:
5929 task_rq_unlock(rq, &flags); 5983 task_rq_unlock(rq, p, &flags);
5930 5984
5931 return ret; 5985 return ret;
5932} 5986}
@@ -5954,6 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5954 rq_src = cpu_rq(src_cpu); 6008 rq_src = cpu_rq(src_cpu);
5955 rq_dest = cpu_rq(dest_cpu); 6009 rq_dest = cpu_rq(dest_cpu);
5956 6010
6011 raw_spin_lock(&p->pi_lock);
5957 double_rq_lock(rq_src, rq_dest); 6012 double_rq_lock(rq_src, rq_dest);
5958 /* Already moved. */ 6013 /* Already moved. */
5959 if (task_cpu(p) != src_cpu) 6014 if (task_cpu(p) != src_cpu)
@@ -5966,7 +6021,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5966 * If we're not on a rq, the next wake-up will ensure we're 6021 * If we're not on a rq, the next wake-up will ensure we're
5967 * placed properly. 6022 * placed properly.
5968 */ 6023 */
5969 if (p->se.on_rq) { 6024 if (p->on_rq) {
5970 deactivate_task(rq_src, p, 0); 6025 deactivate_task(rq_src, p, 0);
5971 set_task_cpu(p, dest_cpu); 6026 set_task_cpu(p, dest_cpu);
5972 activate_task(rq_dest, p, 0); 6027 activate_task(rq_dest, p, 0);
@@ -5976,6 +6031,7 @@ done:
5976 ret = 1; 6031 ret = 1;
5977fail: 6032fail:
5978 double_rq_unlock(rq_src, rq_dest); 6033 double_rq_unlock(rq_src, rq_dest);
6034 raw_spin_unlock(&p->pi_lock);
5979 return ret; 6035 return ret;
5980} 6036}
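
__migrate_task() now takes p->pi_lock and then both runqueue locks via double_rq_lock(), which avoids an AB-BA deadlock by acquiring the two rq locks in a fixed order. A pthread sketch of address-ordered double locking follows; the names are illustrative only.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t rq_src = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_dst = PTHREAD_MUTEX_INITIALIZER;

/* Always lock the lower-addressed lock first so two threads can't deadlock. */
static void double_lock(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	if (l1 == l2) {
		pthread_mutex_lock(l1);		/* same queue: one lock suffices */
		return;
	}
	if ((uintptr_t)l1 > (uintptr_t)l2) {
		pthread_mutex_t *tmp = l1;
		l1 = l2;
		l2 = tmp;
	}
	pthread_mutex_lock(l1);
	pthread_mutex_lock(l2);
}

static void double_unlock(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	pthread_mutex_unlock(l1);
	if (l1 != l2)
		pthread_mutex_unlock(l2);
}

int main(void)
{
	double_lock(&rq_src, &rq_dst);
	printf("both runqueues locked\n");
	double_unlock(&rq_src, &rq_dst);
	return 0;
}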
5981 6037
@@ -6316,6 +6372,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6316 6372
6317#ifdef CONFIG_HOTPLUG_CPU 6373#ifdef CONFIG_HOTPLUG_CPU
6318 case CPU_DYING: 6374 case CPU_DYING:
6375 sched_ttwu_pending();
6319 /* Update our root-domain */ 6376 /* Update our root-domain */
6320 raw_spin_lock_irqsave(&rq->lock, flags); 6377 raw_spin_lock_irqsave(&rq->lock, flags);
6321 if (rq->rd) { 6378 if (rq->rd) {
@@ -6331,6 +6388,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6331 break; 6388 break;
6332#endif 6389#endif
6333 } 6390 }
6391
6392 update_max_interval();
6393
6334 return NOTIFY_OK; 6394 return NOTIFY_OK;
6335} 6395}
6336 6396
@@ -6391,6 +6451,8 @@ early_initcall(migration_init);
6391 6451
6392#ifdef CONFIG_SMP 6452#ifdef CONFIG_SMP
6393 6453
6454static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6455
6394#ifdef CONFIG_SCHED_DEBUG 6456#ifdef CONFIG_SCHED_DEBUG
6395 6457
6396static __read_mostly int sched_domain_debug_enabled; 6458static __read_mostly int sched_domain_debug_enabled;
@@ -6486,7 +6548,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6486 6548
6487static void sched_domain_debug(struct sched_domain *sd, int cpu) 6549static void sched_domain_debug(struct sched_domain *sd, int cpu)
6488{ 6550{
6489 cpumask_var_t groupmask;
6490 int level = 0; 6551 int level = 0;
6491 6552
6492 if (!sched_domain_debug_enabled) 6553 if (!sched_domain_debug_enabled)
@@ -6499,20 +6560,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6499 6560
6500 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6561 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6501 6562
6502 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6503 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6504 return;
6505 }
6506
6507 for (;;) { 6563 for (;;) {
6508 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6564 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6509 break; 6565 break;
6510 level++; 6566 level++;
6511 sd = sd->parent; 6567 sd = sd->parent;
6512 if (!sd) 6568 if (!sd)
6513 break; 6569 break;
6514 } 6570 }
6515 free_cpumask_var(groupmask);
6516} 6571}
6517#else /* !CONFIG_SCHED_DEBUG */ 6572#else /* !CONFIG_SCHED_DEBUG */
6518# define sched_domain_debug(sd, cpu) do { } while (0) 6573# define sched_domain_debug(sd, cpu) do { } while (0)
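
sched_domain_debug() no longer allocates a cpumask on every call; it borrows sched_domains_tmpmask, a single preallocated scratch mask that is only valid while sched_domains_mutex is held (hence the lockdep_assert_held() in its other users). A tiny sketch of that shared-scratch-under-the-serializing-lock idea, with made-up buffer and lock names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t domains_mutex = PTHREAD_MUTEX_INITIALIZER;
static char scratch[256];		/* shared scratch; valid only under domains_mutex */

/* Callers are already serialized by domains_mutex, so no per-call allocation. */
static void debug_one(int level)
{
	/* the kernel would assert this with lockdep_assert_held() */
	snprintf(scratch, sizeof(scratch), "domain level %d", level);
	printf("%s\n", scratch);
}

static void rebuild_domains(void)
{
	pthread_mutex_lock(&domains_mutex);
	for (int level = 0; level < 3; level++)
		debug_one(level);
	pthread_mutex_unlock(&domains_mutex);
}

int main(void)
{
	rebuild_domains();
	return 0;
}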
@@ -6569,12 +6624,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6569 return 1; 6624 return 1;
6570} 6625}
6571 6626
6572static void free_rootdomain(struct root_domain *rd) 6627static void free_rootdomain(struct rcu_head *rcu)
6573{ 6628{
6574 synchronize_sched(); 6629 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6575 6630
6576 cpupri_cleanup(&rd->cpupri); 6631 cpupri_cleanup(&rd->cpupri);
6577
6578 free_cpumask_var(rd->rto_mask); 6632 free_cpumask_var(rd->rto_mask);
6579 free_cpumask_var(rd->online); 6633 free_cpumask_var(rd->online);
6580 free_cpumask_var(rd->span); 6634 free_cpumask_var(rd->span);
@@ -6615,7 +6669,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6615 raw_spin_unlock_irqrestore(&rq->lock, flags); 6669 raw_spin_unlock_irqrestore(&rq->lock, flags);
6616 6670
6617 if (old_rd) 6671 if (old_rd)
6618 free_rootdomain(old_rd); 6672 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6619} 6673}
6620 6674
6621static int init_rootdomain(struct root_domain *rd) 6675static int init_rootdomain(struct root_domain *rd)
@@ -6666,6 +6720,25 @@ static struct root_domain *alloc_rootdomain(void)
6666 return rd; 6720 return rd;
6667} 6721}
6668 6722
6723static void free_sched_domain(struct rcu_head *rcu)
6724{
6725 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6726 if (atomic_dec_and_test(&sd->groups->ref))
6727 kfree(sd->groups);
6728 kfree(sd);
6729}
6730
6731static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6732{
6733 call_rcu(&sd->rcu, free_sched_domain);
6734}
6735
6736static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6737{
6738 for (; sd; sd = sd->parent)
6739 destroy_sched_domain(sd, cpu);
6740}
6741
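
free_sched_domain() and free_rootdomain() are now RCU callbacks: the rcu_head is embedded in the object being freed, and the callback recovers the enclosing structure with container_of() once a grace period has elapsed. A plain-C sketch of that embedded-callback-head pattern follows; here the callback runs immediately instead of after a real grace period, and the structure names are invented.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* same definition the kernel uses */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cb_head {
	void (*func)(struct cb_head *head);	/* deferred-free callback */
};

struct domain {
	int level;
	struct cb_head rcu;			/* embedded head, like sd->rcu */
};

static void free_domain(struct cb_head *head)
{
	struct domain *d = container_of(head, struct domain, rcu);

	printf("freeing domain at level %d\n", d->level);
	free(d);
}

/* Stand-in for call_rcu(): a real implementation would only run the
 * callback after every pre-existing reader has finished. */
static void fake_call_rcu(struct cb_head *head, void (*func)(struct cb_head *))
{
	head->func = func;
	head->func(head);
}

int main(void)
{
	struct domain *d = malloc(sizeof(*d));

	d->level = 1;
	fake_call_rcu(&d->rcu, free_domain);
	return 0;
}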
6669/* 6742/*
6670 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6743 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6671 * hold the hotplug lock. 6744 * hold the hotplug lock.
@@ -6676,9 +6749,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6676 struct rq *rq = cpu_rq(cpu); 6749 struct rq *rq = cpu_rq(cpu);
6677 struct sched_domain *tmp; 6750 struct sched_domain *tmp;
6678 6751
6679 for (tmp = sd; tmp; tmp = tmp->parent)
6680 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6681
6682 /* Remove the sched domains which do not contribute to scheduling. */ 6752 /* Remove the sched domains which do not contribute to scheduling. */
6683 for (tmp = sd; tmp; ) { 6753 for (tmp = sd; tmp; ) {
6684 struct sched_domain *parent = tmp->parent; 6754 struct sched_domain *parent = tmp->parent;
@@ -6689,12 +6759,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6689 tmp->parent = parent->parent; 6759 tmp->parent = parent->parent;
6690 if (parent->parent) 6760 if (parent->parent)
6691 parent->parent->child = tmp; 6761 parent->parent->child = tmp;
6762 destroy_sched_domain(parent, cpu);
6692 } else 6763 } else
6693 tmp = tmp->parent; 6764 tmp = tmp->parent;
6694 } 6765 }
6695 6766
6696 if (sd && sd_degenerate(sd)) { 6767 if (sd && sd_degenerate(sd)) {
6768 tmp = sd;
6697 sd = sd->parent; 6769 sd = sd->parent;
6770 destroy_sched_domain(tmp, cpu);
6698 if (sd) 6771 if (sd)
6699 sd->child = NULL; 6772 sd->child = NULL;
6700 } 6773 }
@@ -6702,7 +6775,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6702 sched_domain_debug(sd, cpu); 6775 sched_domain_debug(sd, cpu);
6703 6776
6704 rq_attach_root(rq, rd); 6777 rq_attach_root(rq, rd);
6778 tmp = rq->sd;
6705 rcu_assign_pointer(rq->sd, sd); 6779 rcu_assign_pointer(rq->sd, sd);
6780 destroy_sched_domains(tmp, cpu);
6706} 6781}
6707 6782
6708/* cpus with isolated domains */ 6783/* cpus with isolated domains */
@@ -6718,56 +6793,6 @@ static int __init isolated_cpu_setup(char *str)
6718 6793
6719__setup("isolcpus=", isolated_cpu_setup); 6794__setup("isolcpus=", isolated_cpu_setup);
6720 6795
6721/*
6722 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6723 * to a function which identifies what group(along with sched group) a CPU
6724 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6725 * (due to the fact that we keep track of groups covered with a struct cpumask).
6726 *
6727 * init_sched_build_groups will build a circular linked list of the groups
6728 * covered by the given span, and will set each group's ->cpumask correctly,
6729 * and ->cpu_power to 0.
6730 */
6731static void
6732init_sched_build_groups(const struct cpumask *span,
6733 const struct cpumask *cpu_map,
6734 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6735 struct sched_group **sg,
6736 struct cpumask *tmpmask),
6737 struct cpumask *covered, struct cpumask *tmpmask)
6738{
6739 struct sched_group *first = NULL, *last = NULL;
6740 int i;
6741
6742 cpumask_clear(covered);
6743
6744 for_each_cpu(i, span) {
6745 struct sched_group *sg;
6746 int group = group_fn(i, cpu_map, &sg, tmpmask);
6747 int j;
6748
6749 if (cpumask_test_cpu(i, covered))
6750 continue;
6751
6752 cpumask_clear(sched_group_cpus(sg));
6753 sg->cpu_power = 0;
6754
6755 for_each_cpu(j, span) {
6756 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6757 continue;
6758
6759 cpumask_set_cpu(j, covered);
6760 cpumask_set_cpu(j, sched_group_cpus(sg));
6761 }
6762 if (!first)
6763 first = sg;
6764 if (last)
6765 last->next = sg;
6766 last = sg;
6767 }
6768 last->next = first;
6769}
6770
6771#define SD_NODES_PER_DOMAIN 16 6796#define SD_NODES_PER_DOMAIN 16
6772 6797
6773#ifdef CONFIG_NUMA 6798#ifdef CONFIG_NUMA
@@ -6784,7 +6809,7 @@ init_sched_build_groups(const struct cpumask *span,
6784 */ 6809 */
6785static int find_next_best_node(int node, nodemask_t *used_nodes) 6810static int find_next_best_node(int node, nodemask_t *used_nodes)
6786{ 6811{
6787 int i, n, val, min_val, best_node = 0; 6812 int i, n, val, min_val, best_node = -1;
6788 6813
6789 min_val = INT_MAX; 6814 min_val = INT_MAX;
6790 6815
@@ -6808,7 +6833,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6808 } 6833 }
6809 } 6834 }
6810 6835
6811 node_set(best_node, *used_nodes); 6836 if (best_node != -1)
6837 node_set(best_node, *used_nodes);
6812 return best_node; 6838 return best_node;
6813} 6839}
6814 6840
@@ -6834,315 +6860,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6834 6860
6835 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6861 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6836 int next_node = find_next_best_node(node, &used_nodes); 6862 int next_node = find_next_best_node(node, &used_nodes);
6837 6863 if (next_node < 0)
6864 break;
6838 cpumask_or(span, span, cpumask_of_node(next_node)); 6865 cpumask_or(span, span, cpumask_of_node(next_node));
6839 } 6866 }
6840} 6867}
6868
6869static const struct cpumask *cpu_node_mask(int cpu)
6870{
6871 lockdep_assert_held(&sched_domains_mutex);
6872
6873 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6874
6875 return sched_domains_tmpmask;
6876}
6877
6878static const struct cpumask *cpu_allnodes_mask(int cpu)
6879{
6880 return cpu_possible_mask;
6881}
6841#endif /* CONFIG_NUMA */ 6882#endif /* CONFIG_NUMA */
6842 6883
6843int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6884static const struct cpumask *cpu_cpu_mask(int cpu)
6885{
6886 return cpumask_of_node(cpu_to_node(cpu));
6887}
6844 6888
6845/* 6889int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6846 * The cpus mask in sched_group and sched_domain hangs off the end.
6847 *
6848 * ( See the the comments in include/linux/sched.h:struct sched_group
6849 * and struct sched_domain. )
6850 */
6851struct static_sched_group {
6852 struct sched_group sg;
6853 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6854};
6855 6890
6856struct static_sched_domain { 6891struct sd_data {
6857 struct sched_domain sd; 6892 struct sched_domain **__percpu sd;
6858 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6893 struct sched_group **__percpu sg;
6859}; 6894};
6860 6895
6861struct s_data { 6896struct s_data {
6862#ifdef CONFIG_NUMA 6897 struct sched_domain ** __percpu sd;
6863 int sd_allnodes;
6864 cpumask_var_t domainspan;
6865 cpumask_var_t covered;
6866 cpumask_var_t notcovered;
6867#endif
6868 cpumask_var_t nodemask;
6869 cpumask_var_t this_sibling_map;
6870 cpumask_var_t this_core_map;
6871 cpumask_var_t this_book_map;
6872 cpumask_var_t send_covered;
6873 cpumask_var_t tmpmask;
6874 struct sched_group **sched_group_nodes;
6875 struct root_domain *rd; 6898 struct root_domain *rd;
6876}; 6899};
6877 6900
6878enum s_alloc { 6901enum s_alloc {
6879 sa_sched_groups = 0,
6880 sa_rootdomain, 6902 sa_rootdomain,
6881 sa_tmpmask, 6903 sa_sd,
6882 sa_send_covered, 6904 sa_sd_storage,
6883 sa_this_book_map,
6884 sa_this_core_map,
6885 sa_this_sibling_map,
6886 sa_nodemask,
6887 sa_sched_group_nodes,
6888#ifdef CONFIG_NUMA
6889 sa_notcovered,
6890 sa_covered,
6891 sa_domainspan,
6892#endif
6893 sa_none, 6905 sa_none,
6894}; 6906};
6895 6907
6896/* 6908struct sched_domain_topology_level;
6897 * SMT sched-domains:
6898 */
6899#ifdef CONFIG_SCHED_SMT
6900static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6901static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6902 6909
6903static int 6910typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6904cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6911typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6905 struct sched_group **sg, struct cpumask *unused)
6906{
6907 if (sg)
6908 *sg = &per_cpu(sched_groups, cpu).sg;
6909 return cpu;
6910}
6911#endif /* CONFIG_SCHED_SMT */
6912 6912
6913/* 6913struct sched_domain_topology_level {
6914 * multi-core sched-domains: 6914 sched_domain_init_f init;
6915 */ 6915 sched_domain_mask_f mask;
6916#ifdef CONFIG_SCHED_MC 6916 struct sd_data data;
6917static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6917};
6918static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6919
6920static int
6921cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6922 struct sched_group **sg, struct cpumask *mask)
6923{
6924 int group;
6925#ifdef CONFIG_SCHED_SMT
6926 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6927 group = cpumask_first(mask);
6928#else
6929 group = cpu;
6930#endif
6931 if (sg)
6932 *sg = &per_cpu(sched_group_core, group).sg;
6933 return group;
6934}
6935#endif /* CONFIG_SCHED_MC */
6936 6918
6937/* 6919/*
6938 * book sched-domains: 6920 * Assumes the sched_domain tree is fully constructed
6939 */ 6921 */
6940#ifdef CONFIG_SCHED_BOOK 6922static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6941static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6942static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6943
6944static int
6945cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6946 struct sched_group **sg, struct cpumask *mask)
6947{ 6923{
6948 int group = cpu; 6924 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6949#ifdef CONFIG_SCHED_MC 6925 struct sched_domain *child = sd->child;
6950 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6951 group = cpumask_first(mask);
6952#elif defined(CONFIG_SCHED_SMT)
6953 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6954 group = cpumask_first(mask);
6955#endif
6956 if (sg)
6957 *sg = &per_cpu(sched_group_book, group).sg;
6958 return group;
6959}
6960#endif /* CONFIG_SCHED_BOOK */
6961 6926
6962static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6927 if (child)
6963static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6928 cpu = cpumask_first(sched_domain_span(child));
6964 6929
6965static int
6966cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6967 struct sched_group **sg, struct cpumask *mask)
6968{
6969 int group;
6970#ifdef CONFIG_SCHED_BOOK
6971 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6972 group = cpumask_first(mask);
6973#elif defined(CONFIG_SCHED_MC)
6974 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6975 group = cpumask_first(mask);
6976#elif defined(CONFIG_SCHED_SMT)
6977 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6978 group = cpumask_first(mask);
6979#else
6980 group = cpu;
6981#endif
6982 if (sg) 6930 if (sg)
6983 *sg = &per_cpu(sched_group_phys, group).sg; 6931 *sg = *per_cpu_ptr(sdd->sg, cpu);
6984 return group; 6932
6933 return cpu;
6985} 6934}
6986 6935
6987#ifdef CONFIG_NUMA
6988/* 6936/*
6989 * The init_sched_build_groups can't handle what we want to do with node 6937 * build_sched_groups takes the cpumask we wish to span, and a pointer
6990 * groups, so roll our own. Now each node has its own list of groups which 6938 * to a function which identifies what group (along with sched group) a CPU
6991 * gets dynamically allocated. 6939 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6940 * (due to the fact that we keep track of groups covered with a struct cpumask).
6941 *
6942 * build_sched_groups will build a circular linked list of the groups
6943 * covered by the given span, and will set each group's ->cpumask correctly,
6944 * and ->cpu_power to 0.
6992 */ 6945 */
6993static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6946static void
6994static struct sched_group ***sched_group_nodes_bycpu; 6947build_sched_groups(struct sched_domain *sd)
6995
6996static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
6997static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
6998
6999static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7000 struct sched_group **sg,
7001 struct cpumask *nodemask)
7002{
7003 int group;
7004
7005 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7006 group = cpumask_first(nodemask);
7007
7008 if (sg)
7009 *sg = &per_cpu(sched_group_allnodes, group).sg;
7010 return group;
7011}
7012
7013static void init_numa_sched_groups_power(struct sched_group *group_head)
7014{
7015 struct sched_group *sg = group_head;
7016 int j;
7017
7018 if (!sg)
7019 return;
7020 do {
7021 for_each_cpu(j, sched_group_cpus(sg)) {
7022 struct sched_domain *sd;
7023
7024 sd = &per_cpu(phys_domains, j).sd;
7025 if (j != group_first_cpu(sd->groups)) {
7026 /*
7027 * Only add "power" once for each
7028 * physical package.
7029 */
7030 continue;
7031 }
7032
7033 sg->cpu_power += sd->groups->cpu_power;
7034 }
7035 sg = sg->next;
7036 } while (sg != group_head);
7037}
7038
7039static int build_numa_sched_groups(struct s_data *d,
7040 const struct cpumask *cpu_map, int num)
7041{ 6948{
7042 struct sched_domain *sd; 6949 struct sched_group *first = NULL, *last = NULL;
7043 struct sched_group *sg, *prev; 6950 struct sd_data *sdd = sd->private;
7044 int n, j; 6951 const struct cpumask *span = sched_domain_span(sd);
7045 6952 struct cpumask *covered;
7046 cpumask_clear(d->covered); 6953 int i;
7047 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
7048 if (cpumask_empty(d->nodemask)) {
7049 d->sched_group_nodes[num] = NULL;
7050 goto out;
7051 }
7052
7053 sched_domain_node_span(num, d->domainspan);
7054 cpumask_and(d->domainspan, d->domainspan, cpu_map);
7055
7056 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7057 GFP_KERNEL, num);
7058 if (!sg) {
7059 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
7060 num);
7061 return -ENOMEM;
7062 }
7063 d->sched_group_nodes[num] = sg;
7064
7065 for_each_cpu(j, d->nodemask) {
7066 sd = &per_cpu(node_domains, j).sd;
7067 sd->groups = sg;
7068 }
7069 6954
7070 sg->cpu_power = 0; 6955 lockdep_assert_held(&sched_domains_mutex);
7071 cpumask_copy(sched_group_cpus(sg), d->nodemask); 6956 covered = sched_domains_tmpmask;
7072 sg->next = sg;
7073 cpumask_or(d->covered, d->covered, d->nodemask);
7074 6957
7075 prev = sg; 6958 cpumask_clear(covered);
7076 for (j = 0; j < nr_node_ids; j++) {
7077 n = (num + j) % nr_node_ids;
7078 cpumask_complement(d->notcovered, d->covered);
7079 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
7080 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
7081 if (cpumask_empty(d->tmpmask))
7082 break;
7083 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
7084 if (cpumask_empty(d->tmpmask))
7085 continue;
7086 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7087 GFP_KERNEL, num);
7088 if (!sg) {
7089 printk(KERN_WARNING
7090 "Can not alloc domain group for node %d\n", j);
7091 return -ENOMEM;
7092 }
7093 sg->cpu_power = 0;
7094 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
7095 sg->next = prev->next;
7096 cpumask_or(d->covered, d->covered, d->tmpmask);
7097 prev->next = sg;
7098 prev = sg;
7099 }
7100out:
7101 return 0;
7102}
7103#endif /* CONFIG_NUMA */
7104
7105#ifdef CONFIG_NUMA
7106/* Free memory allocated for various sched_group structures */
7107static void free_sched_groups(const struct cpumask *cpu_map,
7108 struct cpumask *nodemask)
7109{
7110 int cpu, i;
7111 6959
7112 for_each_cpu(cpu, cpu_map) { 6960 for_each_cpu(i, span) {
7113 struct sched_group **sched_group_nodes 6961 struct sched_group *sg;
7114 = sched_group_nodes_bycpu[cpu]; 6962 int group = get_group(i, sdd, &sg);
6963 int j;
7115 6964
7116 if (!sched_group_nodes) 6965 if (cpumask_test_cpu(i, covered))
7117 continue; 6966 continue;
7118 6967
7119 for (i = 0; i < nr_node_ids; i++) { 6968 cpumask_clear(sched_group_cpus(sg));
7120 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6969 sg->cpu_power = 0;
7121 6970
7122 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6971 for_each_cpu(j, span) {
7123 if (cpumask_empty(nodemask)) 6972 if (get_group(j, sdd, NULL) != group)
7124 continue; 6973 continue;
7125 6974
7126 if (sg == NULL) 6975 cpumask_set_cpu(j, covered);
7127 continue; 6976 cpumask_set_cpu(j, sched_group_cpus(sg));
7128 sg = sg->next;
7129next_sg:
7130 oldsg = sg;
7131 sg = sg->next;
7132 kfree(oldsg);
7133 if (oldsg != sched_group_nodes[i])
7134 goto next_sg;
7135 } 6977 }
7136 kfree(sched_group_nodes); 6978
7137 sched_group_nodes_bycpu[cpu] = NULL; 6979 if (!first)
6980 first = sg;
6981 if (last)
6982 last->next = sg;
6983 last = sg;
7138 } 6984 }
6985 last->next = first;
7139} 6986}
7140#else /* !CONFIG_NUMA */
7141static void free_sched_groups(const struct cpumask *cpu_map,
7142 struct cpumask *nodemask)
7143{
7144}
7145#endif /* CONFIG_NUMA */
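
build_sched_groups() above walks the span once, creates a group for each representative CPU returned by get_group(), fills in that group's members, and links the groups into a circular list (last->next = first) so iteration can start at any group and visit them all. Below is a self-contained sketch of the same construction over plain integers, with an arbitrary grouping key; all names are illustrative.

#include <stdio.h>
#include <stdlib.h>

#define NR 8

struct group {
	int members[NR];
	int nr;
	struct group *next;
};

/* stand-in for get_group(): which group does item i belong to? */
static int group_of(int i)
{
	return i / 3;			/* items 0-2, 3-5, 6-7 share a group */
}

static struct group *build_groups(void)
{
	struct group *first = NULL, *last = NULL;
	int covered[NR] = { 0 };

	for (int i = 0; i < NR; i++) {
		if (covered[i])
			continue;	/* already placed by an earlier representative */

		struct group *g = calloc(1, sizeof(*g));
		int key = group_of(i);

		for (int j = 0; j < NR; j++) {
			if (group_of(j) != key)
				continue;
			covered[j] = 1;
			g->members[g->nr++] = j;
		}

		if (!first)
			first = g;
		if (last)
			last->next = g;
		last = g;
	}
	last->next = first;		/* close the circle */
	return first;
}

int main(void)
{
	/* groups are intentionally leaked in this tiny demo */
	struct group *first = build_groups(), *g = first;

	do {
		printf("group:");
		for (int k = 0; k < g->nr; k++)
			printf(" %d", g->members[k]);
		printf("\n");
		g = g->next;
	} while (g != first);
	return 0;
}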
7146 6987
7147/* 6988/*
7148 * Initialize sched groups cpu_power. 6989 * Initialize sched groups cpu_power.
@@ -7156,11 +6997,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
7156 */ 6997 */
7157static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6998static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7158{ 6999{
7159 struct sched_domain *child;
7160 struct sched_group *group;
7161 long power;
7162 int weight;
7163
7164 WARN_ON(!sd || !sd->groups); 7000 WARN_ON(!sd || !sd->groups);
7165 7001
7166 if (cpu != group_first_cpu(sd->groups)) 7002 if (cpu != group_first_cpu(sd->groups))
@@ -7168,36 +7004,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7168 7004
7169 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7005 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7170 7006
7171 child = sd->child; 7007 update_group_power(sd, cpu);
7172
7173 sd->groups->cpu_power = 0;
7174
7175 if (!child) {
7176 power = SCHED_LOAD_SCALE;
7177 weight = cpumask_weight(sched_domain_span(sd));
7178 /*
7179 * SMT siblings share the power of a single core.
7180 * Usually multiple threads get a better yield out of
7181 * that one core than a single thread would have,
7182 * reflect that in sd->smt_gain.
7183 */
7184 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
7185 power *= sd->smt_gain;
7186 power /= weight;
7187 power >>= SCHED_LOAD_SHIFT;
7188 }
7189 sd->groups->cpu_power += power;
7190 return;
7191 }
7192
7193 /*
7194 * Add cpu_power of each child group to this groups cpu_power.
7195 */
7196 group = child->groups;
7197 do {
7198 sd->groups->cpu_power += group->cpu_power;
7199 group = group->next;
7200 } while (group != child->groups);
7201} 7008}
7202 7009
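
For readers tracking what the deleted branch used to compute: with SCHED_LOAD_SCALE = 1024 (SCHED_LOAD_SHIFT = 10) and the default smt_gain of this era, roughly 1178 (about a 15% bonus), two SMT siblings each ended up with a cpu_power of 589 instead of the 512 a plain halving would give. A quick standalone check of that arithmetic; the constants are assumptions stated in the code, and the real computation now lives in update_group_power():

/* Worked example of the removed SMT power calculation: two hardware
 * threads sharing one core each get a bit more than half a core's
 * worth of cpu_power. */
#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)       /* 1024 */

int main(void)
{
        long smt_gain = 1178;   /* assumed default, ~15% over SCHED_LOAD_SCALE */
        int weight = 2;         /* two SMT siblings in the domain span */
        long power = SCHED_LOAD_SCALE;

        power *= smt_gain;      /* 1024 * 1178 = 1206272 */
        power /= weight;        /* 603136 */
        power >>= SCHED_LOAD_SHIFT;     /* 589: each sibling ~0.58 of a core */

        printf("per-sibling cpu_power = %ld (full core = %ld)\n",
               power, SCHED_LOAD_SCALE);
        return 0;
}
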
7203/* 7010/*
@@ -7211,15 +7018,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7211# define SD_INIT_NAME(sd, type) do { } while (0) 7018# define SD_INIT_NAME(sd, type) do { } while (0)
7212#endif 7019#endif
7213 7020
7214#define SD_INIT(sd, type) sd_init_##type(sd) 7021#define SD_INIT_FUNC(type) \
7215 7022static noinline struct sched_domain * \
7216#define SD_INIT_FUNC(type) \ 7023sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7217static noinline void sd_init_##type(struct sched_domain *sd) \ 7024{ \
7218{ \ 7025 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7219 memset(sd, 0, sizeof(*sd)); \ 7026 *sd = SD_##type##_INIT; \
7220 *sd = SD_##type##_INIT; \ 7027 SD_INIT_NAME(sd, type); \
7221 sd->level = SD_LV_##type; \ 7028 sd->private = &tl->data; \
7222 SD_INIT_NAME(sd, type); \ 7029 return sd; \
7223} 7030}
7224 7031
7225SD_INIT_FUNC(CPU) 7032SD_INIT_FUNC(CPU)
@@ -7238,13 +7045,14 @@ SD_INIT_FUNC(CPU)
7238#endif 7045#endif
7239 7046
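
SD_INIT_FUNC() now stamps out one sd_init_<type>() per topology level by token pasting, pulling both the function name and the SD_<type>_INIT initializer from the macro argument. A toy, non-kernel version of the same preprocessor trick (struct cfg and the CFG_*_INIT initializers below are invented for the example):

/* Toy version of SD_INIT_FUNC(): one macro expansion per "type"
 * generates a function named make_<type>() returning a canned struct. */
#include <stdio.h>

struct cfg { int level; const char *name; };

#define CFG_CPU_INIT  (struct cfg){ .level = 1, .name = "CPU"  }
#define CFG_NODE_INIT (struct cfg){ .level = 2, .name = "NODE" }

#define DEFINE_CFG_INIT(type)                           \
static struct cfg make_##type(void)                     \
{                                                       \
        struct cfg c = CFG_##type##_INIT;               \
        return c;                                       \
}

DEFINE_CFG_INIT(CPU)
DEFINE_CFG_INIT(NODE)

int main(void)
{
        struct cfg a = make_CPU(), b = make_NODE();

        printf("%s level=%d, %s level=%d\n", a.name, a.level, b.name, b.level);
        return 0;
}
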
7240static int default_relax_domain_level = -1; 7047static int default_relax_domain_level = -1;
7048int sched_domain_level_max;
7241 7049
7242static int __init setup_relax_domain_level(char *str) 7050static int __init setup_relax_domain_level(char *str)
7243{ 7051{
7244 unsigned long val; 7052 unsigned long val;
7245 7053
7246 val = simple_strtoul(str, NULL, 0); 7054 val = simple_strtoul(str, NULL, 0);
7247 if (val < SD_LV_MAX) 7055 if (val < sched_domain_level_max)
7248 default_relax_domain_level = val; 7056 default_relax_domain_level = val;
7249 7057
7250 return 1; 7058 return 1;
@@ -7272,37 +7080,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7272 } 7080 }
7273} 7081}
7274 7082
7083static void __sdt_free(const struct cpumask *cpu_map);
7084static int __sdt_alloc(const struct cpumask *cpu_map);
7085
7275static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7086static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7276 const struct cpumask *cpu_map) 7087 const struct cpumask *cpu_map)
7277{ 7088{
7278 switch (what) { 7089 switch (what) {
7279 case sa_sched_groups:
7280 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7281 d->sched_group_nodes = NULL;
7282 case sa_rootdomain: 7090 case sa_rootdomain:
7283 free_rootdomain(d->rd); /* fall through */ 7091 if (!atomic_read(&d->rd->refcount))
7284 case sa_tmpmask: 7092 free_rootdomain(&d->rd->rcu); /* fall through */
7285 free_cpumask_var(d->tmpmask); /* fall through */ 7093 case sa_sd:
7286 case sa_send_covered: 7094 free_percpu(d->sd); /* fall through */
7287 free_cpumask_var(d->send_covered); /* fall through */ 7095 case sa_sd_storage:
7288 case sa_this_book_map: 7096 __sdt_free(cpu_map); /* fall through */
7289 free_cpumask_var(d->this_book_map); /* fall through */
7290 case sa_this_core_map:
7291 free_cpumask_var(d->this_core_map); /* fall through */
7292 case sa_this_sibling_map:
7293 free_cpumask_var(d->this_sibling_map); /* fall through */
7294 case sa_nodemask:
7295 free_cpumask_var(d->nodemask); /* fall through */
7296 case sa_sched_group_nodes:
7297#ifdef CONFIG_NUMA
7298 kfree(d->sched_group_nodes); /* fall through */
7299 case sa_notcovered:
7300 free_cpumask_var(d->notcovered); /* fall through */
7301 case sa_covered:
7302 free_cpumask_var(d->covered); /* fall through */
7303 case sa_domainspan:
7304 free_cpumask_var(d->domainspan); /* fall through */
7305#endif
7306 case sa_none: 7097 case sa_none:
7307 break; 7098 break;
7308 } 7099 }
@@ -7311,308 +7102,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7311static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7102static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7312 const struct cpumask *cpu_map) 7103 const struct cpumask *cpu_map)
7313{ 7104{
7314#ifdef CONFIG_NUMA 7105 memset(d, 0, sizeof(*d));
7315 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7106
7316 return sa_none; 7107 if (__sdt_alloc(cpu_map))
7317 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7108 return sa_sd_storage;
7318 return sa_domainspan; 7109 d->sd = alloc_percpu(struct sched_domain *);
7319 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7110 if (!d->sd)
7320 return sa_covered; 7111 return sa_sd_storage;
7321 /* Allocate the per-node list of sched groups */
7322 d->sched_group_nodes = kcalloc(nr_node_ids,
7323 sizeof(struct sched_group *), GFP_KERNEL);
7324 if (!d->sched_group_nodes) {
7325 printk(KERN_WARNING "Can not alloc sched group node list\n");
7326 return sa_notcovered;
7327 }
7328 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7329#endif
7330 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7331 return sa_sched_group_nodes;
7332 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7333 return sa_nodemask;
7334 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7335 return sa_this_sibling_map;
7336 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7337 return sa_this_core_map;
7338 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7339 return sa_this_book_map;
7340 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7341 return sa_send_covered;
7342 d->rd = alloc_rootdomain(); 7112 d->rd = alloc_rootdomain();
7343 if (!d->rd) { 7113 if (!d->rd)
7344 printk(KERN_WARNING "Cannot alloc root domain\n"); 7114 return sa_sd;
7345 return sa_tmpmask;
7346 }
7347 return sa_rootdomain; 7115 return sa_rootdomain;
7348} 7116}
7349 7117
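
The enum-driven unwind used by __visit_domain_allocation_hell() and __free_domain_allocs() is worth calling out: the allocator returns the last stage whose resources need freeing, and the teardown switch deliberately falls through so each case also cleans up everything below it. A self-contained sketch of the idiom with made-up stage names:

/*
 * The s_alloc pattern in miniature: allocation reports how far it got,
 * and teardown starts at that stage and falls through downward.
 */
#include <stdlib.h>

enum stage { st_none, st_bufa, st_all };

struct ctx {
        char *a;
        char *b;
};

static enum stage ctx_alloc(struct ctx *c)
{
        c->a = malloc(64);
        if (!c->a)
                return st_none;         /* nothing to undo */
        c->b = malloc(64);
        if (!c->b)
                return st_bufa;         /* only 'a' needs undoing */
        return st_all;
}

static void ctx_free(struct ctx *c, enum stage reached)
{
        switch (reached) {
        case st_all:
                free(c->b);
                /* fall through */
        case st_bufa:
                free(c->a);
                /* fall through */
        case st_none:
                break;
        }
}

int main(void)
{
        struct ctx c = { 0, 0 };
        enum stage s = ctx_alloc(&c);

        if (s != st_all) {
                ctx_free(&c, s);        /* unwind only what was set up */
                return 1;
        }
        /* ... use c.a and c.b ... */
        ctx_free(&c, st_all);           /* full teardown */
        return 0;
}
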
7350static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7118/*
7351 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7119 * NULL the sd_data elements we've used to build the sched_domain and
7120 * sched_group structure so that the subsequent __free_domain_allocs()
7121 * will not free the data we're using.
7122 */
7123static void claim_allocations(int cpu, struct sched_domain *sd)
7352{ 7124{
7353 struct sched_domain *sd = NULL; 7125 struct sd_data *sdd = sd->private;
7354#ifdef CONFIG_NUMA 7126 struct sched_group *sg = sd->groups;
7355 struct sched_domain *parent;
7356
7357 d->sd_allnodes = 0;
7358 if (cpumask_weight(cpu_map) >
7359 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7360 sd = &per_cpu(allnodes_domains, i).sd;
7361 SD_INIT(sd, ALLNODES);
7362 set_domain_attribute(sd, attr);
7363 cpumask_copy(sched_domain_span(sd), cpu_map);
7364 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7365 d->sd_allnodes = 1;
7366 }
7367 parent = sd;
7368
7369 sd = &per_cpu(node_domains, i).sd;
7370 SD_INIT(sd, NODE);
7371 set_domain_attribute(sd, attr);
7372 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7373 sd->parent = parent;
7374 if (parent)
7375 parent->child = sd;
7376 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7377#endif
7378 return sd;
7379}
7380 7127
7381static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7128 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7382 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7129 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7383 struct sched_domain *parent, int i)
7384{
7385 struct sched_domain *sd;
7386 sd = &per_cpu(phys_domains, i).sd;
7387 SD_INIT(sd, CPU);
7388 set_domain_attribute(sd, attr);
7389 cpumask_copy(sched_domain_span(sd), d->nodemask);
7390 sd->parent = parent;
7391 if (parent)
7392 parent->child = sd;
7393 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7394 return sd;
7395}
7396 7130
7397static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7131 if (cpu == cpumask_first(sched_group_cpus(sg))) {
7398 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7132 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7399 struct sched_domain *parent, int i) 7133 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7400{ 7134 }
7401 struct sched_domain *sd = parent;
7402#ifdef CONFIG_SCHED_BOOK
7403 sd = &per_cpu(book_domains, i).sd;
7404 SD_INIT(sd, BOOK);
7405 set_domain_attribute(sd, attr);
7406 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7407 sd->parent = parent;
7408 parent->child = sd;
7409 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7410#endif
7411 return sd;
7412} 7135}
7413 7136
7414static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7137#ifdef CONFIG_SCHED_SMT
7415 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7138static const struct cpumask *cpu_smt_mask(int cpu)
7416 struct sched_domain *parent, int i)
7417{ 7139{
7418 struct sched_domain *sd = parent; 7140 return topology_thread_cpumask(cpu);
7419#ifdef CONFIG_SCHED_MC
7420 sd = &per_cpu(core_domains, i).sd;
7421 SD_INIT(sd, MC);
7422 set_domain_attribute(sd, attr);
7423 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7424 sd->parent = parent;
7425 parent->child = sd;
7426 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7427#endif
7428 return sd;
7429} 7141}
7430
7431static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7432 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7433 struct sched_domain *parent, int i)
7434{
7435 struct sched_domain *sd = parent;
7436#ifdef CONFIG_SCHED_SMT
7437 sd = &per_cpu(cpu_domains, i).sd;
7438 SD_INIT(sd, SIBLING);
7439 set_domain_attribute(sd, attr);
7440 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7441 sd->parent = parent;
7442 parent->child = sd;
7443 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7444#endif 7142#endif
7445 return sd;
7446}
7447 7143
7448static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7144/*
7449 const struct cpumask *cpu_map, int cpu) 7145 * Topology list, bottom-up.
7450{ 7146 */
7451 switch (l) { 7147static struct sched_domain_topology_level default_topology[] = {
7452#ifdef CONFIG_SCHED_SMT 7148#ifdef CONFIG_SCHED_SMT
7453 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7149 { sd_init_SIBLING, cpu_smt_mask, },
7454 cpumask_and(d->this_sibling_map, cpu_map,
7455 topology_thread_cpumask(cpu));
7456 if (cpu == cpumask_first(d->this_sibling_map))
7457 init_sched_build_groups(d->this_sibling_map, cpu_map,
7458 &cpu_to_cpu_group,
7459 d->send_covered, d->tmpmask);
7460 break;
7461#endif 7150#endif
7462#ifdef CONFIG_SCHED_MC 7151#ifdef CONFIG_SCHED_MC
7463 case SD_LV_MC: /* set up multi-core groups */ 7152 { sd_init_MC, cpu_coregroup_mask, },
7464 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7465 if (cpu == cpumask_first(d->this_core_map))
7466 init_sched_build_groups(d->this_core_map, cpu_map,
7467 &cpu_to_core_group,
7468 d->send_covered, d->tmpmask);
7469 break;
7470#endif 7153#endif
7471#ifdef CONFIG_SCHED_BOOK 7154#ifdef CONFIG_SCHED_BOOK
7472 case SD_LV_BOOK: /* set up book groups */ 7155 { sd_init_BOOK, cpu_book_mask, },
7473 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7474 if (cpu == cpumask_first(d->this_book_map))
7475 init_sched_build_groups(d->this_book_map, cpu_map,
7476 &cpu_to_book_group,
7477 d->send_covered, d->tmpmask);
7478 break;
7479#endif 7156#endif
7480 case SD_LV_CPU: /* set up physical groups */ 7157 { sd_init_CPU, cpu_cpu_mask, },
7481 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7482 if (!cpumask_empty(d->nodemask))
7483 init_sched_build_groups(d->nodemask, cpu_map,
7484 &cpu_to_phys_group,
7485 d->send_covered, d->tmpmask);
7486 break;
7487#ifdef CONFIG_NUMA 7158#ifdef CONFIG_NUMA
7488 case SD_LV_ALLNODES: 7159 { sd_init_NODE, cpu_node_mask, },
7489 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7160 { sd_init_ALLNODES, cpu_allnodes_mask, },
7490 d->send_covered, d->tmpmask);
7491 break;
7492#endif 7161#endif
7493 default: 7162 { NULL, },
7494 break; 7163};
7164
7165static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7166
7167static int __sdt_alloc(const struct cpumask *cpu_map)
7168{
7169 struct sched_domain_topology_level *tl;
7170 int j;
7171
7172 for (tl = sched_domain_topology; tl->init; tl++) {
7173 struct sd_data *sdd = &tl->data;
7174
7175 sdd->sd = alloc_percpu(struct sched_domain *);
7176 if (!sdd->sd)
7177 return -ENOMEM;
7178
7179 sdd->sg = alloc_percpu(struct sched_group *);
7180 if (!sdd->sg)
7181 return -ENOMEM;
7182
7183 for_each_cpu(j, cpu_map) {
7184 struct sched_domain *sd;
7185 struct sched_group *sg;
7186
7187 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7188 GFP_KERNEL, cpu_to_node(j));
7189 if (!sd)
7190 return -ENOMEM;
7191
7192 *per_cpu_ptr(sdd->sd, j) = sd;
7193
7194 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7195 GFP_KERNEL, cpu_to_node(j));
7196 if (!sg)
7197 return -ENOMEM;
7198
7199 *per_cpu_ptr(sdd->sg, j) = sg;
7200 }
7201 }
7202
7203 return 0;
7204}
7205
7206static void __sdt_free(const struct cpumask *cpu_map)
7207{
7208 struct sched_domain_topology_level *tl;
7209 int j;
7210
7211 for (tl = sched_domain_topology; tl->init; tl++) {
7212 struct sd_data *sdd = &tl->data;
7213
7214 for_each_cpu(j, cpu_map) {
7215 kfree(*per_cpu_ptr(sdd->sd, j));
7216 kfree(*per_cpu_ptr(sdd->sg, j));
7217 }
7218 free_percpu(sdd->sd);
7219 free_percpu(sdd->sg);
7495 } 7220 }
7496} 7221}
7497 7222
7223struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7224 struct s_data *d, const struct cpumask *cpu_map,
7225 struct sched_domain_attr *attr, struct sched_domain *child,
7226 int cpu)
7227{
7228 struct sched_domain *sd = tl->init(tl, cpu);
7229 if (!sd)
7230 return child;
7231
7232 set_domain_attribute(sd, attr);
7233 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7234 if (child) {
7235 sd->level = child->level + 1;
7236 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7237 child->parent = sd;
7238 }
7239 sd->child = child;
7240
7241 return sd;
7242}
7243
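
Editorial aside: build_sched_domain() together with the NULL-terminated default_topology[] table replaces the old hard-coded chain of __build_*_sched_domain() helpers. Each level contributes one domain, and the loop links them child to parent while tracking the deepest level. A standalone model of that walk, with invented level names and a plain calloc() standing in for the per-cpu storage __sdt_alloc() sets up:

/*
 * Table-driven, bottom-up construction: iterate a NULL-terminated list
 * of levels, let each level build one node, and chain the nodes the way
 * build_sched_domains() chains the per-cpu sched_domain hierarchy.
 */
#include <stdio.h>
#include <stdlib.h>

struct level;

struct domain {
        const char *name;
        int level;
        struct domain *parent;
        struct domain *child;
};

struct level {
        const char *name;
        struct domain *(*init)(const struct level *tl);
};

static struct domain *generic_init(const struct level *tl)
{
        struct domain *d = calloc(1, sizeof(*d));

        if (d)
                d->name = tl->name;
        return d;
}

/* bottom-up; a NULL init pointer terminates the table */
static const struct level topology[] = {
        { "SMT",  generic_init },
        { "MC",   generic_init },
        { "CPU",  generic_init },
        { "NODE", generic_init },
        { NULL,   NULL },
};

int main(void)
{
        const struct level *tl;
        struct domain *sd = NULL;
        int max_level = 0;

        for (tl = topology; tl->init; tl++) {
                struct domain *d = tl->init(tl);

                if (!d)
                        continue;       /* a level may opt out, keep the chain */
                if (sd) {
                        d->level = sd->level + 1;
                        if (d->level > max_level)
                                max_level = d->level;
                        sd->parent = d;
                }
                d->child = sd;
                sd = d;
        }

        /* walk back down from the top, as the scheduler does via sd->child */
        for (struct domain *d = sd; d; d = d->child)
                printf("level %d: %s\n", d->level, d->name);
        printf("max level = %d\n", max_level);
        return 0;
}
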
7498/* 7244/*
7499 * Build sched domains for a given set of cpus and attach the sched domains 7245 * Build sched domains for a given set of cpus and attach the sched domains
7500 * to the individual cpus 7246 * to the individual cpus
7501 */ 7247 */
7502static int __build_sched_domains(const struct cpumask *cpu_map, 7248static int build_sched_domains(const struct cpumask *cpu_map,
7503 struct sched_domain_attr *attr) 7249 struct sched_domain_attr *attr)
7504{ 7250{
7505 enum s_alloc alloc_state = sa_none; 7251 enum s_alloc alloc_state = sa_none;
7506 struct s_data d;
7507 struct sched_domain *sd; 7252 struct sched_domain *sd;
7508 int i; 7253 struct s_data d;
7509#ifdef CONFIG_NUMA 7254 int i, ret = -ENOMEM;
7510 d.sd_allnodes = 0;
7511#endif
7512 7255
7513 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7256 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7514 if (alloc_state != sa_rootdomain) 7257 if (alloc_state != sa_rootdomain)
7515 goto error; 7258 goto error;
7516 alloc_state = sa_sched_groups;
7517 7259
7518 /* 7260 /* Set up domains for cpus specified by the cpu_map. */
7519 * Set up domains for cpus specified by the cpu_map.
7520 */
7521 for_each_cpu(i, cpu_map) { 7261 for_each_cpu(i, cpu_map) {
7522 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7262 struct sched_domain_topology_level *tl;
7523 cpu_map);
7524 7263
7525 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7264 sd = NULL;
7526 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7265 for (tl = sched_domain_topology; tl->init; tl++)
7527 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); 7266 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7528 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7529 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7530 }
7531
7532 for_each_cpu(i, cpu_map) {
7533 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7534 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7535 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7536 }
7537
7538 /* Set up physical groups */
7539 for (i = 0; i < nr_node_ids; i++)
7540 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7541 7267
7542#ifdef CONFIG_NUMA 7268 while (sd->child)
7543 /* Set up node groups */ 7269 sd = sd->child;
7544 if (d.sd_allnodes)
7545 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7546
7547 for (i = 0; i < nr_node_ids; i++)
7548 if (build_numa_sched_groups(&d, cpu_map, i))
7549 goto error;
7550#endif
7551 7270
7552 /* Calculate CPU power for physical packages and nodes */ 7271 *per_cpu_ptr(d.sd, i) = sd;
7553#ifdef CONFIG_SCHED_SMT
7554 for_each_cpu(i, cpu_map) {
7555 sd = &per_cpu(cpu_domains, i).sd;
7556 init_sched_groups_power(i, sd);
7557 }
7558#endif
7559#ifdef CONFIG_SCHED_MC
7560 for_each_cpu(i, cpu_map) {
7561 sd = &per_cpu(core_domains, i).sd;
7562 init_sched_groups_power(i, sd);
7563 } 7272 }
7564#endif
7565#ifdef CONFIG_SCHED_BOOK
7566 for_each_cpu(i, cpu_map) {
7567 sd = &per_cpu(book_domains, i).sd;
7568 init_sched_groups_power(i, sd);
7569 }
7570#endif
7571 7273
7274 /* Build the groups for the domains */
7572 for_each_cpu(i, cpu_map) { 7275 for_each_cpu(i, cpu_map) {
7573 sd = &per_cpu(phys_domains, i).sd; 7276 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7574 init_sched_groups_power(i, sd); 7277 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7575 } 7278 get_group(i, sd->private, &sd->groups);
7279 atomic_inc(&sd->groups->ref);
7576 7280
7577#ifdef CONFIG_NUMA 7281 if (i != cpumask_first(sched_domain_span(sd)))
7578 for (i = 0; i < nr_node_ids; i++) 7282 continue;
7579 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7580 7283
7581 if (d.sd_allnodes) { 7284 build_sched_groups(sd);
7582 struct sched_group *sg; 7285 }
7286 }
7583 7287
7584 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7288 /* Calculate CPU power for physical packages and nodes */
7585 d.tmpmask); 7289 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7586 init_numa_sched_groups_power(sg); 7290 if (!cpumask_test_cpu(i, cpu_map))
7291 continue;
7292
7293 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7294 claim_allocations(i, sd);
7295 init_sched_groups_power(i, sd);
7296 }
7587 } 7297 }
7588#endif
7589 7298
7590 /* Attach the domains */ 7299 /* Attach the domains */
7300 rcu_read_lock();
7591 for_each_cpu(i, cpu_map) { 7301 for_each_cpu(i, cpu_map) {
7592#ifdef CONFIG_SCHED_SMT 7302 sd = *per_cpu_ptr(d.sd, i);
7593 sd = &per_cpu(cpu_domains, i).sd;
7594#elif defined(CONFIG_SCHED_MC)
7595 sd = &per_cpu(core_domains, i).sd;
7596#elif defined(CONFIG_SCHED_BOOK)
7597 sd = &per_cpu(book_domains, i).sd;
7598#else
7599 sd = &per_cpu(phys_domains, i).sd;
7600#endif
7601 cpu_attach_domain(sd, d.rd, i); 7303 cpu_attach_domain(sd, d.rd, i);
7602 } 7304 }
7305 rcu_read_unlock();
7603 7306
7604 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7307 ret = 0;
7605 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7606 return 0;
7607
7608error: 7308error:
7609 __free_domain_allocs(&d, alloc_state, cpu_map); 7309 __free_domain_allocs(&d, alloc_state, cpu_map);
7610 return -ENOMEM; 7310 return ret;
7611}
7612
7613static int build_sched_domains(const struct cpumask *cpu_map)
7614{
7615 return __build_sched_domains(cpu_map, NULL);
7616} 7311}
7617 7312
7618static cpumask_var_t *doms_cur; /* current sched domains */ 7313static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7667,7 +7362,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7667 * For now this just excludes isolated cpus, but could be used to 7362 * For now this just excludes isolated cpus, but could be used to
7668 * exclude other special cases in the future. 7363 * exclude other special cases in the future.
7669 */ 7364 */
7670static int arch_init_sched_domains(const struct cpumask *cpu_map) 7365static int init_sched_domains(const struct cpumask *cpu_map)
7671{ 7366{
7672 int err; 7367 int err;
7673 7368
@@ -7678,32 +7373,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7678 doms_cur = &fallback_doms; 7373 doms_cur = &fallback_doms;
7679 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7374 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7680 dattr_cur = NULL; 7375 dattr_cur = NULL;
7681 err = build_sched_domains(doms_cur[0]); 7376 err = build_sched_domains(doms_cur[0], NULL);
7682 register_sched_domain_sysctl(); 7377 register_sched_domain_sysctl();
7683 7378
7684 return err; 7379 return err;
7685} 7380}
7686 7381
7687static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7688 struct cpumask *tmpmask)
7689{
7690 free_sched_groups(cpu_map, tmpmask);
7691}
7692
7693/* 7382/*
7694 * Detach sched domains from a group of cpus specified in cpu_map 7383 * Detach sched domains from a group of cpus specified in cpu_map
7695 * These cpus will now be attached to the NULL domain 7384 * These cpus will now be attached to the NULL domain
7696 */ 7385 */
7697static void detach_destroy_domains(const struct cpumask *cpu_map) 7386static void detach_destroy_domains(const struct cpumask *cpu_map)
7698{ 7387{
7699 /* Save because hotplug lock held. */
7700 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7701 int i; 7388 int i;
7702 7389
7390 rcu_read_lock();
7703 for_each_cpu(i, cpu_map) 7391 for_each_cpu(i, cpu_map)
7704 cpu_attach_domain(NULL, &def_root_domain, i); 7392 cpu_attach_domain(NULL, &def_root_domain, i);
7705 synchronize_sched(); 7393 rcu_read_unlock();
7706 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7707} 7394}
7708 7395
7709/* handle null as "default" */ 7396/* handle null as "default" */
@@ -7792,8 +7479,7 @@ match1:
7792 goto match2; 7479 goto match2;
7793 } 7480 }
7794 /* no match - add a new doms_new */ 7481 /* no match - add a new doms_new */
7795 __build_sched_domains(doms_new[i], 7482 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7796 dattr_new ? dattr_new + i : NULL);
7797match2: 7483match2:
7798 ; 7484 ;
7799 } 7485 }
@@ -7812,7 +7498,7 @@ match2:
7812} 7498}
7813 7499
7814#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7500#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7815static void arch_reinit_sched_domains(void) 7501static void reinit_sched_domains(void)
7816{ 7502{
7817 get_online_cpus(); 7503 get_online_cpus();
7818 7504
@@ -7845,7 +7531,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7845 else 7531 else
7846 sched_mc_power_savings = level; 7532 sched_mc_power_savings = level;
7847 7533
7848 arch_reinit_sched_domains(); 7534 reinit_sched_domains();
7849 7535
7850 return count; 7536 return count;
7851} 7537}
@@ -7964,14 +7650,9 @@ void __init sched_init_smp(void)
7964 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7650 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7965 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7651 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7966 7652
7967#if defined(CONFIG_NUMA)
7968 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7969 GFP_KERNEL);
7970 BUG_ON(sched_group_nodes_bycpu == NULL);
7971#endif
7972 get_online_cpus(); 7653 get_online_cpus();
7973 mutex_lock(&sched_domains_mutex); 7654 mutex_lock(&sched_domains_mutex);
7974 arch_init_sched_domains(cpu_active_mask); 7655 init_sched_domains(cpu_active_mask);
7975 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7656 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7976 if (cpumask_empty(non_isolated_cpus)) 7657 if (cpumask_empty(non_isolated_cpus))
7977 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7658 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -8278,6 +7959,7 @@ void __init sched_init(void)
8278 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7959 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8279 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7960 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8280#ifdef CONFIG_SMP 7961#ifdef CONFIG_SMP
7962 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8281#ifdef CONFIG_NO_HZ 7963#ifdef CONFIG_NO_HZ
8282 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 7964 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8283 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 7965 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8337,7 +8019,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8337 int old_prio = p->prio; 8019 int old_prio = p->prio;
8338 int on_rq; 8020 int on_rq;
8339 8021
8340 on_rq = p->se.on_rq; 8022 on_rq = p->on_rq;
8341 if (on_rq) 8023 if (on_rq)
8342 deactivate_task(rq, p, 0); 8024 deactivate_task(rq, p, 0);
8343 __setscheduler(rq, p, SCHED_NORMAL, 0); 8025 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8550,7 +8232,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8550{ 8232{
8551 struct rt_rq *rt_rq; 8233 struct rt_rq *rt_rq;
8552 struct sched_rt_entity *rt_se; 8234 struct sched_rt_entity *rt_se;
8553 struct rq *rq;
8554 int i; 8235 int i;
8555 8236
8556 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8237 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8564,8 +8245,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8564 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8245 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8565 8246
8566 for_each_possible_cpu(i) { 8247 for_each_possible_cpu(i) {
8567 rq = cpu_rq(i);
8568
8569 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8248 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8570 GFP_KERNEL, cpu_to_node(i)); 8249 GFP_KERNEL, cpu_to_node(i));
8571 if (!rt_rq) 8250 if (!rt_rq)
@@ -8680,7 +8359,7 @@ void sched_move_task(struct task_struct *tsk)
8680 rq = task_rq_lock(tsk, &flags); 8359 rq = task_rq_lock(tsk, &flags);
8681 8360
8682 running = task_current(rq, tsk); 8361 running = task_current(rq, tsk);
8683 on_rq = tsk->se.on_rq; 8362 on_rq = tsk->on_rq;
8684 8363
8685 if (on_rq) 8364 if (on_rq)
8686 dequeue_task(rq, tsk, 0); 8365 dequeue_task(rq, tsk, 0);
@@ -8699,7 +8378,7 @@ void sched_move_task(struct task_struct *tsk)
8699 if (on_rq) 8378 if (on_rq)
8700 enqueue_task(rq, tsk, 0); 8379 enqueue_task(rq, tsk, 0);
8701 8380
8702 task_rq_unlock(rq, &flags); 8381 task_rq_unlock(rq, tsk, &flags);
8703} 8382}
8704#endif /* CONFIG_CGROUP_SCHED */ 8383#endif /* CONFIG_CGROUP_SCHED */
8705 8384
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 5946ac515602..429242f3c484 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -179,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p)
179 struct autogroup *ag = autogroup_create(); 179 struct autogroup *ag = autogroup_create();
180 180
181 autogroup_move_group(p, ag); 181 autogroup_move_group(p, ag);
182 /* drop extra refrence added by autogroup_create() */ 182 /* drop extra reference added by autogroup_create() */
183 autogroup_kref_put(ag); 183 autogroup_kref_put(ag);
184} 184}
185EXPORT_SYMBOL(sched_autogroup_create_attach); 185EXPORT_SYMBOL(sched_autogroup_create_attach);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 7bacd83a4158..a6710a112b4f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
152 read_lock_irqsave(&tasklist_lock, flags); 152 read_lock_irqsave(&tasklist_lock, flags);
153 153
154 do_each_thread(g, p) { 154 do_each_thread(g, p) {
155 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 155 if (!p->on_rq || task_cpu(p) != rq_cpu)
156 continue; 156 continue;
157 157
158 print_task(m, rq, p); 158 print_task(m, rq, p);
@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
301
302#undef P 299#undef P
303#undef P64 300#undef P64
304#endif 301#endif
@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
441 P(se.statistics.wait_count); 438 P(se.statistics.wait_count);
442 PN(se.statistics.iowait_sum); 439 PN(se.statistics.iowait_sum);
443 P(se.statistics.iowait_count); 440 P(se.statistics.iowait_count);
444 P(sched_info.bkl_count);
445 P(se.nr_migrations); 441 P(se.nr_migrations);
446 P(se.statistics.nr_migrations_cold); 442 P(se.statistics.nr_migrations_cold);
447 P(se.statistics.nr_failed_migrations_affine); 443 P(se.statistics.nr_failed_migrations_affine);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c7ec5c8e7b44..37f22626225e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
358 } 358 }
359 359
360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
361#ifndef CONFIG_64BIT
362 smp_wmb();
363 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
364#endif
361} 365}
362 366
363/* 367/*
@@ -1340,6 +1344,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1340 hrtick_update(rq); 1344 hrtick_update(rq);
1341} 1345}
1342 1346
1347static void set_next_buddy(struct sched_entity *se);
1348
1343/* 1349/*
1344 * The dequeue_task method is called before nr_running is 1350 * The dequeue_task method is called before nr_running is
1345 * decreased. We remove the task from the rbtree and 1351 * decreased. We remove the task from the rbtree and
@@ -1349,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1349{ 1355{
1350 struct cfs_rq *cfs_rq; 1356 struct cfs_rq *cfs_rq;
1351 struct sched_entity *se = &p->se; 1357 struct sched_entity *se = &p->se;
1358 int task_sleep = flags & DEQUEUE_SLEEP;
1352 1359
1353 for_each_sched_entity(se) { 1360 for_each_sched_entity(se) {
1354 cfs_rq = cfs_rq_of(se); 1361 cfs_rq = cfs_rq_of(se);
1355 dequeue_entity(cfs_rq, se, flags); 1362 dequeue_entity(cfs_rq, se, flags);
1356 1363
1357 /* Don't dequeue parent if it has other entities besides us */ 1364 /* Don't dequeue parent if it has other entities besides us */
1358 if (cfs_rq->load.weight) 1365 if (cfs_rq->load.weight) {
1366 /*
1367 * Bias pick_next to pick a task from this cfs_rq, as
1368 * p is sleeping when it is within its sched_slice.
1369 */
1370 if (task_sleep && parent_entity(se))
1371 set_next_buddy(parent_entity(se));
1359 break; 1372 break;
1373 }
1360 flags |= DEQUEUE_SLEEP; 1374 flags |= DEQUEUE_SLEEP;
1361 } 1375 }
1362 1376
@@ -1372,12 +1386,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1372 1386
1373#ifdef CONFIG_SMP 1387#ifdef CONFIG_SMP
1374 1388
1375static void task_waking_fair(struct rq *rq, struct task_struct *p) 1389static void task_waking_fair(struct task_struct *p)
1376{ 1390{
1377 struct sched_entity *se = &p->se; 1391 struct sched_entity *se = &p->se;
1378 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1392 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1393 u64 min_vruntime;
1379 1394
1380 se->vruntime -= cfs_rq->min_vruntime; 1395#ifndef CONFIG_64BIT
1396 u64 min_vruntime_copy;
1397
1398 do {
1399 min_vruntime_copy = cfs_rq->min_vruntime_copy;
1400 smp_rmb();
1401 min_vruntime = cfs_rq->min_vruntime;
1402 } while (min_vruntime != min_vruntime_copy);
1403#else
1404 min_vruntime = cfs_rq->min_vruntime;
1405#endif
1406
1407 se->vruntime -= min_vruntime;
1381} 1408}
1382 1409
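
The min_vruntime_copy field added above is a hand-rolled way to read a 64-bit, lock-protected value from outside the lock on 32-bit machines: the writer stores the value, issues smp_wmb(), then publishes the copy; the reader loads the copy, issues smp_rmb(), loads the value, and retries until the two agree. A portable sketch of the retry loop using C11 atomics in place of the kernel barriers; it models the idiom, not the kernel's exact memory-ordering guarantees:

/*
 * Shape of the min_vruntime / min_vruntime_copy pattern: the writer
 * publishes a copy with release semantics after storing the value, the
 * reader loads the copy with acquire semantics and retries until value
 * and copy match.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t value;  /* updated only by the "writer" */
static _Atomic uint64_t copy;   /* published after value */

static void publish(uint64_t v)
{
        atomic_store_explicit(&value, v, memory_order_relaxed);
        /* release pairs with the reader's acquire, like smp_wmb()/smp_rmb() */
        atomic_store_explicit(&copy, v, memory_order_release);
}

static uint64_t read_stable(void)
{
        uint64_t c, v;

        do {
                c = atomic_load_explicit(&copy, memory_order_acquire);
                v = atomic_load_explicit(&value, memory_order_relaxed);
        } while (v != c);       /* a concurrent update forces a retry */

        return v;
}

int main(void)
{
        publish(42);
        printf("read %llu\n", (unsigned long long)read_stable());
        return 0;
}
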
1383#ifdef CONFIG_FAIR_GROUP_SCHED 1410#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1622,6 +1649,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1622 /* 1649 /*
1623 * Otherwise, iterate the domains and find an eligible idle cpu. 1650
1624 */ 1651 */
1652 rcu_read_lock();
1625 for_each_domain(target, sd) { 1653 for_each_domain(target, sd) {
1626 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 1654 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1627 break; 1655 break;
@@ -1641,6 +1669,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1641 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 1669 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1642 break; 1670 break;
1643 } 1671 }
1672 rcu_read_unlock();
1644 1673
1645 return target; 1674 return target;
1646} 1675}
@@ -1657,7 +1686,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1657 * preempt must be disabled. 1686 * preempt must be disabled.
1658 */ 1687 */
1659static int 1688static int
1660select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1689select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1661{ 1690{
1662 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1691 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1663 int cpu = smp_processor_id(); 1692 int cpu = smp_processor_id();
@@ -1673,6 +1702,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1673 new_cpu = prev_cpu; 1702 new_cpu = prev_cpu;
1674 } 1703 }
1675 1704
1705 rcu_read_lock();
1676 for_each_domain(cpu, tmp) { 1706 for_each_domain(cpu, tmp) {
1677 if (!(tmp->flags & SD_LOAD_BALANCE)) 1707 if (!(tmp->flags & SD_LOAD_BALANCE))
1678 continue; 1708 continue;
@@ -1723,9 +1753,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1723 1753
1724 if (affine_sd) { 1754 if (affine_sd) {
1725 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1755 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1726 return select_idle_sibling(p, cpu); 1756 prev_cpu = cpu;
1727 else 1757
1728 return select_idle_sibling(p, prev_cpu); 1758 new_cpu = select_idle_sibling(p, prev_cpu);
1759 goto unlock;
1729 } 1760 }
1730 1761
1731 while (sd) { 1762 while (sd) {
@@ -1766,6 +1797,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1766 } 1797 }
1767 /* while loop will break here if sd == NULL */ 1798 /* while loop will break here if sd == NULL */
1768 } 1799 }
1800unlock:
1801 rcu_read_unlock();
1769 1802
1770 return new_cpu; 1803 return new_cpu;
1771} 1804}
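
Note the structural change in select_task_rq_fair(): because the domain walk is now inside rcu_read_lock(), the early returns become goto unlock so the read-side critical section always balances. The same single-exit shape in a small standalone example, with a pthread mutex standing in for the RCU read lock:

/* Single exit point releasing the lock, no matter which branch bails. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int table[4] = { 5, -1, 7, 0 };

static int lookup(int idx)
{
        int ret = -1;

        pthread_mutex_lock(&lock);
        if (idx < 0 || idx >= 4)
                goto unlock;            /* early exit, lock still released */
        if (table[idx] <= 0)
                goto unlock;            /* another early exit */
        ret = table[idx];
unlock:
        pthread_mutex_unlock(&lock);
        return ret;
}

int main(void)
{
        printf("%d %d %d\n", lookup(0), lookup(1), lookup(9));
        return 0;
}
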
@@ -1789,10 +1822,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1789 * This is especially important for buddies when the leftmost 1822 * This is especially important for buddies when the leftmost
1790 * task is higher priority than the buddy. 1823 * task is higher priority than the buddy.
1791 */ 1824 */
1792 if (unlikely(se->load.weight != NICE_0_LOAD)) 1825 return calc_delta_fair(gran, se);
1793 gran = calc_delta_fair(gran, se);
1794
1795 return gran;
1796} 1826}
1797 1827
1798/* 1828/*
@@ -1826,26 +1856,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1826 1856
1827static void set_last_buddy(struct sched_entity *se) 1857static void set_last_buddy(struct sched_entity *se)
1828{ 1858{
1829 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1859 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1830 for_each_sched_entity(se) 1860 return;
1831 cfs_rq_of(se)->last = se; 1861
1832 } 1862 for_each_sched_entity(se)
1863 cfs_rq_of(se)->last = se;
1833} 1864}
1834 1865
1835static void set_next_buddy(struct sched_entity *se) 1866static void set_next_buddy(struct sched_entity *se)
1836{ 1867{
1837 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1868 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1838 for_each_sched_entity(se) 1869 return;
1839 cfs_rq_of(se)->next = se; 1870
1840 } 1871 for_each_sched_entity(se)
1872 cfs_rq_of(se)->next = se;
1841} 1873}
1842 1874
1843static void set_skip_buddy(struct sched_entity *se) 1875static void set_skip_buddy(struct sched_entity *se)
1844{ 1876{
1845 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1877 for_each_sched_entity(se)
1846 for_each_sched_entity(se) 1878 cfs_rq_of(se)->skip = se;
1847 cfs_rq_of(se)->skip = se;
1848 }
1849} 1879}
1850 1880
1851/* 1881/*
@@ -1857,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 struct sched_entity *se = &curr->se, *pse = &p->se; 1887 struct sched_entity *se = &curr->se, *pse = &p->se;
1858 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1888 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1859 int scale = cfs_rq->nr_running >= sched_nr_latency; 1889 int scale = cfs_rq->nr_running >= sched_nr_latency;
1890 int next_buddy_marked = 0;
1860 1891
1861 if (unlikely(se == pse)) 1892 if (unlikely(se == pse))
1862 return; 1893 return;
1863 1894
1864 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) 1895 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1865 set_next_buddy(pse); 1896 set_next_buddy(pse);
1897 next_buddy_marked = 1;
1898 }
1866 1899
1867 /* 1900 /*
1868 * We can come here with TIF_NEED_RESCHED already set from new task 1901 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1890,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1890 update_curr(cfs_rq); 1923 update_curr(cfs_rq);
1891 find_matching_se(&se, &pse); 1924 find_matching_se(&se, &pse);
1892 BUG_ON(!pse); 1925 BUG_ON(!pse);
1893 if (wakeup_preempt_entity(se, pse) == 1) 1926 if (wakeup_preempt_entity(se, pse) == 1) {
1927 /*
1928 * Bias pick_next to pick the sched entity that is
1929 * triggering this preemption.
1930 */
1931 if (!next_buddy_marked)
1932 set_next_buddy(pse);
1894 goto preempt; 1933 goto preempt;
1934 }
1895 1935
1896 return; 1936 return;
1897 1937
@@ -2102,23 +2142,22 @@ static unsigned long
2102balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2142balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2103 unsigned long max_load_move, struct sched_domain *sd, 2143 unsigned long max_load_move, struct sched_domain *sd,
2104 enum cpu_idle_type idle, int *all_pinned, 2144 enum cpu_idle_type idle, int *all_pinned,
2105 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2145 struct cfs_rq *busiest_cfs_rq)
2106{ 2146{
2107 int loops = 0, pulled = 0, pinned = 0; 2147 int loops = 0, pulled = 0;
2108 long rem_load_move = max_load_move; 2148 long rem_load_move = max_load_move;
2109 struct task_struct *p, *n; 2149 struct task_struct *p, *n;
2110 2150
2111 if (max_load_move == 0) 2151 if (max_load_move == 0)
2112 goto out; 2152 goto out;
2113 2153
2114 pinned = 1;
2115
2116 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 2154 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2117 if (loops++ > sysctl_sched_nr_migrate) 2155 if (loops++ > sysctl_sched_nr_migrate)
2118 break; 2156 break;
2119 2157
2120 if ((p->se.load.weight >> 1) > rem_load_move || 2158 if ((p->se.load.weight >> 1) > rem_load_move ||
2121 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) 2159 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2160 all_pinned))
2122 continue; 2161 continue;
2123 2162
2124 pull_task(busiest, p, this_rq, this_cpu); 2163 pull_task(busiest, p, this_rq, this_cpu);
@@ -2141,9 +2180,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2141 */ 2180 */
2142 if (rem_load_move <= 0) 2181 if (rem_load_move <= 0)
2143 break; 2182 break;
2144
2145 if (p->prio < *this_best_prio)
2146 *this_best_prio = p->prio;
2147 } 2183 }
2148out: 2184out:
2149 /* 2185 /*
@@ -2153,9 +2189,6 @@ out:
2153 */ 2189 */
2154 schedstat_add(sd, lb_gained[idle], pulled); 2190 schedstat_add(sd, lb_gained[idle], pulled);
2155 2191
2156 if (all_pinned)
2157 *all_pinned = pinned;
2158
2159 return max_load_move - rem_load_move; 2192 return max_load_move - rem_load_move;
2160} 2193}
2161 2194
@@ -2206,7 +2239,7 @@ static unsigned long
2206load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2239load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2207 unsigned long max_load_move, 2240 unsigned long max_load_move,
2208 struct sched_domain *sd, enum cpu_idle_type idle, 2241 struct sched_domain *sd, enum cpu_idle_type idle,
2209 int *all_pinned, int *this_best_prio) 2242 int *all_pinned)
2210{ 2243{
2211 long rem_load_move = max_load_move; 2244 long rem_load_move = max_load_move;
2212 int busiest_cpu = cpu_of(busiest); 2245 int busiest_cpu = cpu_of(busiest);
@@ -2231,7 +2264,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2231 rem_load = div_u64(rem_load, busiest_h_load + 1); 2264 rem_load = div_u64(rem_load, busiest_h_load + 1);
2232 2265
2233 moved_load = balance_tasks(this_rq, this_cpu, busiest, 2266 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2234 rem_load, sd, idle, all_pinned, this_best_prio, 2267 rem_load, sd, idle, all_pinned,
2235 busiest_cfs_rq); 2268 busiest_cfs_rq);
2236 2269
2237 if (!moved_load) 2270 if (!moved_load)
@@ -2257,11 +2290,11 @@ static unsigned long
2257load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2290load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2258 unsigned long max_load_move, 2291 unsigned long max_load_move,
2259 struct sched_domain *sd, enum cpu_idle_type idle, 2292 struct sched_domain *sd, enum cpu_idle_type idle,
2260 int *all_pinned, int *this_best_prio) 2293 int *all_pinned)
2261{ 2294{
2262 return balance_tasks(this_rq, this_cpu, busiest, 2295 return balance_tasks(this_rq, this_cpu, busiest,
2263 max_load_move, sd, idle, all_pinned, 2296 max_load_move, sd, idle, all_pinned,
2264 this_best_prio, &busiest->cfs); 2297 &busiest->cfs);
2265} 2298}
2266#endif 2299#endif
2267 2300
@@ -2278,12 +2311,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2278 int *all_pinned) 2311 int *all_pinned)
2279{ 2312{
2280 unsigned long total_load_moved = 0, load_moved; 2313 unsigned long total_load_moved = 0, load_moved;
2281 int this_best_prio = this_rq->curr->prio;
2282 2314
2283 do { 2315 do {
2284 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 2316 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2285 max_load_move - total_load_moved, 2317 max_load_move - total_load_moved,
2286 sd, idle, all_pinned, &this_best_prio); 2318 sd, idle, all_pinned);
2287 2319
2288 total_load_moved += load_moved; 2320 total_load_moved += load_moved;
2289 2321
@@ -2652,7 +2684,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2652 /* 2684 /*
2653 * Only siblings can have significantly less than SCHED_LOAD_SCALE 2685 * Only siblings can have significantly less than SCHED_LOAD_SCALE
2654 */ 2686 */
2655 if (sd->level != SD_LV_SIBLING) 2687 if (!(sd->flags & SD_SHARE_CPUPOWER))
2656 return 0; 2688 return 0;
2657 2689
2658 /* 2690 /*
@@ -3062,7 +3094,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3062 3094
3063 /* 3095 /*
3064 * if *imbalance is less than the average load per runnable task 3096 * if *imbalance is less than the average load per runnable task
3065 * there is no gaurantee that any tasks will be moved so we'll have 3097 * there is no guarantee that any tasks will be moved so we'll have
3066 * a think about bumping its value to force at least one task to be 3098 * a think about bumping its value to force at least one task to be
3067 * moved 3099 * moved
3068 */ 3100 */
@@ -3127,6 +3159,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3127 if (!sds.busiest || sds.busiest_nr_running == 0) 3159 if (!sds.busiest || sds.busiest_nr_running == 0)
3128 goto out_balanced; 3160 goto out_balanced;
3129 3161
3162 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3163
3130 /* 3164 /*
3131 * If the busiest group is imbalanced the below checks don't 3165 * If the busiest group is imbalanced the below checks don't
3132 * work because they assume all things are equal, which typically 3166
@@ -3151,7 +3185,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3151 * Don't pull any tasks if this group is already above the domain 3185 * Don't pull any tasks if this group is already above the domain
3152 * average load. 3186 * average load.
3153 */ 3187 */
3154 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3155 if (sds.this_load >= sds.avg_load) 3188 if (sds.this_load >= sds.avg_load)
3156 goto out_balanced; 3189 goto out_balanced;
3157 3190
@@ -3340,6 +3373,7 @@ redo:
3340 * still unbalanced. ld_moved simply stays zero, so it is 3373 * still unbalanced. ld_moved simply stays zero, so it is
3341 * correctly treated as an imbalance. 3374 * correctly treated as an imbalance.
3342 */ 3375 */
3376 all_pinned = 1;
3343 local_irq_save(flags); 3377 local_irq_save(flags);
3344 double_rq_lock(this_rq, busiest); 3378 double_rq_lock(this_rq, busiest);
3345 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3379 ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3467,6 +3501,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3467 raw_spin_unlock(&this_rq->lock); 3501 raw_spin_unlock(&this_rq->lock);
3468 3502
3469 update_shares(this_cpu); 3503 update_shares(this_cpu);
3504 rcu_read_lock();
3470 for_each_domain(this_cpu, sd) { 3505 for_each_domain(this_cpu, sd) {
3471 unsigned long interval; 3506 unsigned long interval;
3472 int balance = 1; 3507 int balance = 1;
@@ -3488,6 +3523,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3488 break; 3523 break;
3489 } 3524 }
3490 } 3525 }
3526 rcu_read_unlock();
3491 3527
3492 raw_spin_lock(&this_rq->lock); 3528 raw_spin_lock(&this_rq->lock);
3493 3529
@@ -3536,6 +3572,7 @@ static int active_load_balance_cpu_stop(void *data)
3536 double_lock_balance(busiest_rq, target_rq); 3572 double_lock_balance(busiest_rq, target_rq);
3537 3573
3538 /* Search for an sd spanning us and the target CPU. */ 3574 /* Search for an sd spanning us and the target CPU. */
3575 rcu_read_lock();
3539 for_each_domain(target_cpu, sd) { 3576 for_each_domain(target_cpu, sd) {
3540 if ((sd->flags & SD_LOAD_BALANCE) && 3577 if ((sd->flags & SD_LOAD_BALANCE) &&
3541 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 3578 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3551,6 +3588,7 @@ static int active_load_balance_cpu_stop(void *data)
3551 else 3588 else
3552 schedstat_inc(sd, alb_failed); 3589 schedstat_inc(sd, alb_failed);
3553 } 3590 }
3591 rcu_read_unlock();
3554 double_unlock_balance(busiest_rq, target_rq); 3592 double_unlock_balance(busiest_rq, target_rq);
3555out_unlock: 3593out_unlock:
3556 busiest_rq->active_balance = 0; 3594 busiest_rq->active_balance = 0;
@@ -3677,6 +3715,7 @@ static int find_new_ilb(int cpu)
3677{ 3715{
3678 struct sched_domain *sd; 3716 struct sched_domain *sd;
3679 struct sched_group *ilb_group; 3717 struct sched_group *ilb_group;
3718 int ilb = nr_cpu_ids;
3680 3719
3681 /* 3720 /*
3682 * Have idle load balancer selection from semi-idle packages only 3721 * Have idle load balancer selection from semi-idle packages only
@@ -3692,20 +3731,25 @@ static int find_new_ilb(int cpu)
3692 if (cpumask_weight(nohz.idle_cpus_mask) < 2) 3731 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3693 goto out_done; 3732 goto out_done;
3694 3733
3734 rcu_read_lock();
3695 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3735 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3696 ilb_group = sd->groups; 3736 ilb_group = sd->groups;
3697 3737
3698 do { 3738 do {
3699 if (is_semi_idle_group(ilb_group)) 3739 if (is_semi_idle_group(ilb_group)) {
3700 return cpumask_first(nohz.grp_idle_mask); 3740 ilb = cpumask_first(nohz.grp_idle_mask);
3741 goto unlock;
3742 }
3701 3743
3702 ilb_group = ilb_group->next; 3744 ilb_group = ilb_group->next;
3703 3745
3704 } while (ilb_group != sd->groups); 3746 } while (ilb_group != sd->groups);
3705 } 3747 }
3748unlock:
3749 rcu_read_unlock();
3706 3750
3707out_done: 3751out_done:
3708 return nr_cpu_ids; 3752 return ilb;
3709} 3753}
3710#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3754#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3711static inline int find_new_ilb(int call_cpu) 3755static inline int find_new_ilb(int call_cpu)
@@ -3820,6 +3864,17 @@ void select_nohz_load_balancer(int stop_tick)
3820 3864
3821static DEFINE_SPINLOCK(balancing); 3865static DEFINE_SPINLOCK(balancing);
3822 3866
3867static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3868
3869/*
3870 * Scale the max load_balance interval with the number of CPUs in the system.
3871 * This trades load-balance latency on larger machines for less cross talk.
3872 */
3873static void update_max_interval(void)
3874{
3875 max_load_balance_interval = HZ*num_online_cpus()/10;
3876}
3877
3823/* 3878/*
3824 * It checks each scheduling domain to see if it is due to be balanced, 3879 * It checks each scheduling domain to see if it is due to be balanced,
3825 * and initiates a balancing operation if so. 3880 * and initiates a balancing operation if so.
@@ -3839,6 +3894,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3839 3894
3840 update_shares(cpu); 3895 update_shares(cpu);
3841 3896
3897 rcu_read_lock();
3842 for_each_domain(cpu, sd) { 3898 for_each_domain(cpu, sd) {
3843 if (!(sd->flags & SD_LOAD_BALANCE)) 3899 if (!(sd->flags & SD_LOAD_BALANCE))
3844 continue; 3900 continue;
@@ -3849,10 +3905,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3849 3905
3850 /* scale ms to jiffies */ 3906 /* scale ms to jiffies */
3851 interval = msecs_to_jiffies(interval); 3907 interval = msecs_to_jiffies(interval);
3852 if (unlikely(!interval)) 3908 interval = clamp(interval, 1UL, max_load_balance_interval);
3853 interval = 1;
3854 if (interval > HZ*num_online_cpus()/10)
3855 interval = HZ*num_online_cpus()/10;
3856 3909
3857 need_serialize = sd->flags & SD_SERIALIZE; 3910 need_serialize = sd->flags & SD_SERIALIZE;
3858 3911
@@ -3887,6 +3940,7 @@ out:
3887 if (!balance) 3940 if (!balance)
3888 break; 3941 break;
3889 } 3942 }
3943 rcu_read_unlock();
3890 3944
3891 /* 3945 /*
3892 * next_balance will be updated only when there is a need. 3946 * next_balance will be updated only when there is a need.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69acc29b9..be40f7371ee1 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1)
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on irq activity
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONIRQ_POWER, 1)
67
68/*
69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */
72SCHED_FEAT(TTWU_QUEUE, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a776a6396427..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
11{ 11{
12 return task_cpu(p); /* IDLE tasks are never migrated */ 12
13} 13}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index db308cb08b75..64b2a37c07d0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186typedef struct task_group *rt_rq_iter_t;
187
188#define for_each_rt_rq(rt_rq, iter, rq) \
189 for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
190 (&iter->list != &task_groups) && \
191 (rt_rq = iter->rt_rq[cpu_of(rq)]); \
192 iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
193
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 194static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{ 195{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list, 196 list_add_rcu(&rt_rq->leaf_rt_rq_list,
@@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
288 return ktime_to_ns(def_rt_bandwidth.rt_period); 296 return ktime_to_ns(def_rt_bandwidth.rt_period);
289} 297}
290 298
299typedef struct rt_rq *rt_rq_iter_t;
300
301#define for_each_rt_rq(rt_rq, iter, rq) \
302 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
303
291static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 304static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
292{ 305{
293} 306}
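
The two for_each_rt_rq() definitions are a nice example of giving group and non-group builds the same loop syntax: one walks the RCU task_groups list through a typedef'd iterator, the other degenerates to visiting the single rq->rt while casting the unused iterator to void. A userspace sketch of the same macro shape, using an ordinary singly linked list rather than the kernel's RCU list:

/* Two iterator macros with identical call syntax: a real list walk and
 * a degenerate single-element visit. */
#include <stdio.h>

struct node {
        int val;
        struct node *next;
};

typedef struct node *node_iter_t;

#define for_each_node(pos, iter, head)                          \
        for (iter = (head); (pos = iter) != NULL; iter = iter->next)

/* degenerate variant: visit exactly one element, ignore the iterator,
 * mirroring the !RT_GROUP_SCHED for_each_rt_rq() */
#define for_each_single(pos, iter, elem)                        \
        for ((void)(iter), pos = (elem); pos; pos = NULL)

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct node lone = { 99, NULL };
        struct node *pos;
        node_iter_t iter;

        for_each_node(pos, iter, &a)
                printf("list: %d\n", pos->val);

        for_each_single(pos, iter, &lone)
                printf("single: %d\n", pos->val);
        return 0;
}
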
@@ -402,12 +415,13 @@ next:
402static void __disable_runtime(struct rq *rq) 415static void __disable_runtime(struct rq *rq)
403{ 416{
404 struct root_domain *rd = rq->rd; 417 struct root_domain *rd = rq->rd;
418 rt_rq_iter_t iter;
405 struct rt_rq *rt_rq; 419 struct rt_rq *rt_rq;
406 420
407 if (unlikely(!scheduler_running)) 421 if (unlikely(!scheduler_running))
408 return; 422 return;
409 423
410 for_each_leaf_rt_rq(rt_rq, rq) { 424 for_each_rt_rq(rt_rq, iter, rq) {
411 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 425 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
412 s64 want; 426 s64 want;
413 int i; 427 int i;
@@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq)
487 501
488static void __enable_runtime(struct rq *rq) 502static void __enable_runtime(struct rq *rq)
489{ 503{
504 rt_rq_iter_t iter;
490 struct rt_rq *rt_rq; 505 struct rt_rq *rt_rq;
491 506
492 if (unlikely(!scheduler_running)) 507 if (unlikely(!scheduler_running))
@@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq)
495 /* 510 /*
496 * Reset each runqueue's bandwidth settings 511 * Reset each runqueue's bandwidth settings
497 */ 512 */
498 for_each_leaf_rt_rq(rt_rq, rq) { 513 for_each_rt_rq(rt_rq, iter, rq) {
499 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 514 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
500 515
501 raw_spin_lock(&rt_b->rt_runtime_lock); 516 raw_spin_lock(&rt_b->rt_runtime_lock);
@@ -562,6 +577,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
562 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 577 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
563 rt_rq->rt_throttled = 0; 578 rt_rq->rt_throttled = 0;
564 enqueue = 1; 579 enqueue = 1;
580
581 /*
582 * Force a clock update if the CPU was idle,
583 * lest wakeup -> unthrottle time accumulate.
584 */
585 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
586 rq->skip_clock_update = -1;
565 } 587 }
566 if (rt_rq->rt_time || rt_rq->rt_nr_running) 588 if (rt_rq->rt_time || rt_rq->rt_nr_running)
567 idle = 0; 589 idle = 0;
@@ -977,13 +999,23 @@ static void yield_task_rt(struct rq *rq)
977static int find_lowest_rq(struct task_struct *task); 999static int find_lowest_rq(struct task_struct *task);
978 1000
979static int 1001static int
980select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 1002select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
981{ 1003{
1004 struct task_struct *curr;
1005 struct rq *rq;
1006 int cpu;
1007
982 if (sd_flag != SD_BALANCE_WAKE) 1008 if (sd_flag != SD_BALANCE_WAKE)
983 return smp_processor_id(); 1009 return smp_processor_id();
984 1010
1011 cpu = task_cpu(p);
1012 rq = cpu_rq(cpu);
1013
1014 rcu_read_lock();
1015 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1016
985 /* 1017 /*
986 * If the current task is an RT task, then 1018 * If the current task on @p's runqueue is an RT task, then
987 * try to see if we can wake this RT task up on another 1019 * try to see if we can wake this RT task up on another
988 * runqueue. Otherwise simply start this RT task 1020 * runqueue. Otherwise simply start this RT task
989 * on its current runqueue. 1021 * on its current runqueue.
@@ -997,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
997 * lock? 1029 * lock?
998 * 1030 *
999 * For equal prio tasks, we just let the scheduler sort it out. 1031 * For equal prio tasks, we just let the scheduler sort it out.
1032 *
1033 * Otherwise, just let it ride on the affined RQ and the
1034 * post-schedule router will push the preempted task away
1035 *
 1036 * This test is optimistic; if we get it wrong, the load-balancer
1037 * will have to sort it out.
1000 */ 1038 */
1001 if (unlikely(rt_task(rq->curr)) && 1039 if (curr && unlikely(rt_task(curr)) &&
1002 (rq->curr->rt.nr_cpus_allowed < 2 || 1040 (curr->rt.nr_cpus_allowed < 2 ||
1003 rq->curr->prio < p->prio) && 1041 curr->prio < p->prio) &&
1004 (p->rt.nr_cpus_allowed > 1)) { 1042 (p->rt.nr_cpus_allowed > 1)) {
1005 int cpu = find_lowest_rq(p); 1043 int target = find_lowest_rq(p);
1006 1044
1007 return (cpu == -1) ? task_cpu(p) : cpu; 1045 if (target != -1)
1046 cpu = target;
1008 } 1047 }
1048 rcu_read_unlock();
1009 1049
1010 /* 1050 return cpu;
1011 * Otherwise, just let it ride on the affined RQ and the
1012 * post-schedule router will push the preempted task away
1013 */
1014 return task_cpu(p);
1015} 1051}
1016 1052
1017static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1053static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
@@ -1136,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1136 * The previous task needs to be made eligible for pushing 1172 * The previous task needs to be made eligible for pushing
1137 * if it is still active 1173 * if it is still active
1138 */ 1174 */
1139 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1175 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1140 enqueue_pushable_task(rq, p); 1176 enqueue_pushable_task(rq, p);
1141} 1177}
1142 1178
@@ -1287,7 +1323,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1287 !cpumask_test_cpu(lowest_rq->cpu, 1323 !cpumask_test_cpu(lowest_rq->cpu,
1288 &task->cpus_allowed) || 1324 &task->cpus_allowed) ||
1289 task_running(rq, task) || 1325 task_running(rq, task) ||
1290 !task->se.on_rq)) { 1326 !task->on_rq)) {
1291 1327
1292 raw_spin_unlock(&lowest_rq->lock); 1328 raw_spin_unlock(&lowest_rq->lock);
1293 lowest_rq = NULL; 1329 lowest_rq = NULL;
@@ -1321,7 +1357,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1321 BUG_ON(task_current(rq, p)); 1357 BUG_ON(task_current(rq, p));
1322 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1358 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1323 1359
1324 BUG_ON(!p->se.on_rq); 1360 BUG_ON(!p->on_rq);
1325 BUG_ON(!rt_task(p)); 1361 BUG_ON(!rt_task(p));
1326 1362
1327 return p; 1363 return p;
@@ -1378,7 +1414,7 @@ retry:
1378 task = pick_next_pushable_task(rq); 1414 task = pick_next_pushable_task(rq);
1379 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1415 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1380 /* 1416 /*
1381 * If we get here, the task hasnt moved at all, but 1417 * If we get here, the task hasn't moved at all, but
1382 * it has failed to push. We will not try again, 1418 * it has failed to push. We will not try again,
1383 * since the other cpus will pull from us when they 1419 * since the other cpus will pull from us when they
1384 * are ready. 1420 * are ready.
@@ -1467,7 +1503,7 @@ static int pull_rt_task(struct rq *this_rq)
1467 */ 1503 */
1468 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1504 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1469 WARN_ON(p == src_rq->curr); 1505 WARN_ON(p == src_rq->curr);
1470 WARN_ON(!p->se.on_rq); 1506 WARN_ON(!p->on_rq);
1471 1507
1472 /* 1508 /*
1473 * There's a chance that p is higher in priority 1509 * There's a chance that p is higher in priority
@@ -1488,7 +1524,7 @@ static int pull_rt_task(struct rq *this_rq)
1488 /* 1524 /*
1489 * We continue with the search, just in 1525 * We continue with the search, just in
1490 * case there's an even higher prio task 1526 * case there's an even higher prio task
1491 * in another runqueue. (low likelyhood 1527 * in another runqueue. (low likelihood
1492 * but possible) 1528 * but possible)
1493 */ 1529 */
1494 } 1530 }
@@ -1538,7 +1574,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1538 * Update the migration status of the RQ if we have an RT task 1574 * Update the migration status of the RQ if we have an RT task
1539 * which is running AND changing its weight value. 1575 * which is running AND changing its weight value.
1540 */ 1576 */
1541 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1577 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1542 struct rq *rq = task_rq(p); 1578 struct rq *rq = task_rq(p);
1543 1579
1544 if (!task_current(rq, p)) { 1580 if (!task_current(rq, p)) {
@@ -1608,7 +1644,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1608 * we may need to handle the pulling of RT tasks 1644 * we may need to handle the pulling of RT tasks
1609 * now. 1645 * now.
1610 */ 1646 */
1611 if (p->se.on_rq && !rq->rt.rt_nr_running) 1647 if (p->on_rq && !rq->rt.rt_nr_running)
1612 pull_rt_task(rq); 1648 pull_rt_task(rq);
1613} 1649}
1614 1650
@@ -1638,7 +1674,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1638 * If that current running task is also an RT task 1674 * If that current running task is also an RT task
1639 * then see if we can move to another run queue. 1675 * then see if we can move to another run queue.
1640 */ 1676 */
1641 if (p->se.on_rq && rq->curr != p) { 1677 if (p->on_rq && rq->curr != p) {
1642#ifdef CONFIG_SMP 1678#ifdef CONFIG_SMP
1643 if (rq->rt.overloaded && push_rt_task(rq) && 1679 if (rq->rt.overloaded && push_rt_task(rq) &&
1644 /* Don't resched if we changed runqueues */ 1680 /* Don't resched if we changed runqueues */
@@ -1657,7 +1693,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1657static void 1693static void
1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1694prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1659{ 1695{
1660 if (!p->se.on_rq) 1696 if (!p->on_rq)
1661 return; 1697 return;
1662 1698
1663 if (rq->curr == p) { 1699 if (rq->curr == p) {
@@ -1796,10 +1832,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1796 1832
1797static void print_rt_stats(struct seq_file *m, int cpu) 1833static void print_rt_stats(struct seq_file *m, int cpu)
1798{ 1834{
1835 rt_rq_iter_t iter;
1799 struct rt_rq *rt_rq; 1836 struct rt_rq *rt_rq;
1800 1837
1801 rcu_read_lock(); 1838 rcu_read_lock();
1802 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) 1839 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1803 print_rt_rq(m, cpu, rt_rq); 1840 print_rt_rq(m, cpu, rt_rq);
1804 rcu_read_unlock(); 1841 rcu_read_unlock();
1805} 1842}
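
In the !CONFIG_RT_GROUP_SCHED case above, for_each_rt_rq() degenerates to a single pass over the root rt_rq, and "(void) iter" only keeps the signature identical to the group-scheduling variant. A userspace sketch of that idiom, with the structures pared down and the macro argument parenthesised here so a plain pointer can be passed (not code from the patch):

/*
 * Single-pass iterator sketch: the loop body runs exactly once, for
 * the root rt_rq, and the iterator variable is never dereferenced.
 */
#include <stdio.h>

struct rt_rq { int rt_nr_running; };
struct rq { struct rt_rq rt; };

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &(rq)->rt; rt_rq; rt_rq = NULL)

int main(void)
{
	struct rq rq = { .rt = { .rt_nr_running = 2 } };
	rt_rq_iter_t iter = NULL;
	struct rt_rq *rt_rq;

	for_each_rt_rq(rt_rq, iter, &rq)	/* body runs exactly once */
		printf("rt_nr_running=%d\n", rt_rq->rt_nr_running);

	return 0;
}
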
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 1ba2bd40fdac..6f437632afab 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -9,8 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p, 12select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
13 int sd_flag, int flags)
14{ 13{
15 return task_cpu(p); /* stop tasks as never migrate */ 14 return task_cpu(p); /* stop tasks as never migrate */
16} 15}
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 25{
27 struct task_struct *stop = rq->stop; 26 struct task_struct *stop = rq->stop;
28 27
29 if (stop && stop->se.on_rq) 28 if (stop && stop->on_rq)
30 return stop; 29 return stop;
31 30
32 return NULL; 31 return NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index 35c5f4b05344..ad5e818baacc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2068,7 +2068,7 @@ relock:
2068 for (;;) { 2068 for (;;) {
2069 struct k_sigaction *ka; 2069 struct k_sigaction *ka;
2070 /* 2070 /*
2071 * Tracing can induce an artifical signal and choose sigaction. 2071 * Tracing can induce an artificial signal and choose sigaction.
2072 * The return value in @signr determines the default action, 2072 * The return value in @signr determines the default action,
2073 * but @info->si_signo is the signal number we will report. 2073 * but @info->si_signo is the signal number we will report.
2074 */ 2074 */
@@ -2941,8 +2941,8 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
2941/** 2941/**
2942 * sys_rt_sigaction - alter an action taken by a process 2942 * sys_rt_sigaction - alter an action taken by a process
2943 * @sig: signal to be sent 2943 * @sig: signal to be sent
2944 * @act: the thread group ID of the thread 2944 * @act: new sigaction
2945 * @oact: the PID of the thread 2945 * @oact: used to save the previous sigaction
2946 * @sigsetsize: size of sigset_t type 2946 * @sigsetsize: size of sigset_t type
2947 */ 2947 */
2948SYSCALL_DEFINE4(rt_sigaction, int, sig, 2948SYSCALL_DEFINE4(rt_sigaction, int, sig,
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 735d87095172..13960170cad4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER"
62}; 62};
63 63
64/* 64/*
@@ -567,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data)
567/** 567/**
568 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks 568 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
569 * @ttimer: tasklet_hrtimer which is initialized 569 * @ttimer: tasklet_hrtimer which is initialized
570 * @function: hrtimer callback funtion which gets called from softirq context 570 * @function: hrtimer callback function which gets called from softirq context
571 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) 571 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
572 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) 572 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
573 */ 573 */
diff --git a/kernel/sys.c b/kernel/sys.c
index af468edf096a..e4128b278f23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -314,8 +314,8 @@ void kernel_restart_prepare(char *cmd)
314{ 314{
315 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 315 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
316 system_state = SYSTEM_RESTART; 316 system_state = SYSTEM_RESTART;
317 usermodehelper_disable();
317 device_shutdown(); 318 device_shutdown();
318 sysdev_shutdown();
319 syscore_shutdown(); 319 syscore_shutdown();
320} 320}
321 321
@@ -344,6 +344,7 @@ static void kernel_shutdown_prepare(enum system_states state)
344 blocking_notifier_call_chain(&reboot_notifier_list, 344 blocking_notifier_call_chain(&reboot_notifier_list,
345 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 345 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
346 system_state = state; 346 system_state = state;
347 usermodehelper_disable();
347 device_shutdown(); 348 device_shutdown();
348} 349}
349/** 350/**
@@ -354,7 +355,6 @@ static void kernel_shutdown_prepare(enum system_states state)
354void kernel_halt(void) 355void kernel_halt(void)
355{ 356{
356 kernel_shutdown_prepare(SYSTEM_HALT); 357 kernel_shutdown_prepare(SYSTEM_HALT);
357 sysdev_shutdown();
358 syscore_shutdown(); 358 syscore_shutdown();
359 printk(KERN_EMERG "System halted.\n"); 359 printk(KERN_EMERG "System halted.\n");
360 kmsg_dump(KMSG_DUMP_HALT); 360 kmsg_dump(KMSG_DUMP_HALT);
@@ -374,7 +374,6 @@ void kernel_power_off(void)
374 if (pm_power_off_prepare) 374 if (pm_power_off_prepare)
375 pm_power_off_prepare(); 375 pm_power_off_prepare();
376 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
377 sysdev_shutdown();
378 syscore_shutdown(); 377 syscore_shutdown();
379 printk(KERN_EMERG "Power down.\n"); 378 printk(KERN_EMERG "Power down.\n");
380 kmsg_dump(KMSG_DUMP_POWEROFF); 379 kmsg_dump(KMSG_DUMP_POWEROFF);
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index b0425991e9ac..e2fd74b8e8c2 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,5 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o 2obj-y += timeconv.o posix-clock.o alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
new file mode 100644
index 000000000000..9265014cb4db
--- /dev/null
+++ b/kernel/time/alarmtimer.c
@@ -0,0 +1,694 @@
1/*
2 * Alarmtimer interface
3 *
 4 * This interface provides a timer which is similar to hrtimers,
 5 * but triggers an RTC alarm if the box is suspended.
6 *
7 * This interface is influenced by the Android RTC Alarm timer
8 * interface.
9 *
 10 * Copyright (C) 2010 IBM Corporation
11 *
12 * Author: John Stultz <john.stultz@linaro.org>
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License version 2 as
16 * published by the Free Software Foundation.
17 */
18#include <linux/time.h>
19#include <linux/hrtimer.h>
20#include <linux/timerqueue.h>
21#include <linux/rtc.h>
22#include <linux/alarmtimer.h>
23#include <linux/mutex.h>
24#include <linux/platform_device.h>
25#include <linux/posix-timers.h>
26#include <linux/workqueue.h>
27#include <linux/freezer.h>
28
29/**
30 * struct alarm_base - Alarm timer bases
 31 * @lock: Lock for synchronized access to the base
32 * @timerqueue: Timerqueue head managing the list of events
33 * @timer: hrtimer used to schedule events while running
34 * @gettime: Function to read the time correlating to the base
35 * @base_clockid: clockid for the base
36 */
37static struct alarm_base {
38 spinlock_t lock;
39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void);
42 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE];
44
45#ifdef CONFIG_RTC_CLASS
46/* rtc timer and device for setting alarm wakeups at suspend */
47static struct rtc_timer rtctimer;
48static struct rtc_device *rtcdev;
49#endif
50
51/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
52static ktime_t freezer_delta;
53static DEFINE_SPINLOCK(freezer_delta_lock);
54
55
56/**
57 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
58 * @base: pointer to the base where the timer is being run
59 * @alarm: pointer to alarm being enqueued.
60 *
 61 * Adds the alarm to an alarm_base timerqueue and if necessary sets
62 * an hrtimer to run.
63 *
64 * Must hold base->lock when calling.
65 */
66static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
67{
68 timerqueue_add(&base->timerqueue, &alarm->node);
69 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
70 hrtimer_try_to_cancel(&base->timer);
71 hrtimer_start(&base->timer, alarm->node.expires,
72 HRTIMER_MODE_ABS);
73 }
74}
75
76/**
77 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
78 * @base: pointer to the base where the timer is running
79 * @alarm: pointer to alarm being removed
80 *
 81 * Removes the alarm from an alarm_base timerqueue and if necessary sets
82 * a new timer to run.
83 *
84 * Must hold base->lock when calling.
85 */
86static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
87{
88 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
89
90 timerqueue_del(&base->timerqueue, &alarm->node);
91 if (next == &alarm->node) {
92 hrtimer_try_to_cancel(&base->timer);
93 next = timerqueue_getnext(&base->timerqueue);
94 if (!next)
95 return;
96 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
97 }
98}
99
100
101/**
102 * alarmtimer_fired - Handles alarm hrtimer being fired.
103 * @timer: pointer to hrtimer being run
104 *
 105 * When an alarm timer fires, this runs through the timerqueue to
106 * see which alarms expired, and runs those. If there are more alarm
107 * timers queued for the future, we set the hrtimer to fire when
 108 * the next future alarm timer expires.
109 */
110static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
111{
112 struct alarm_base *base = container_of(timer, struct alarm_base, timer);
113 struct timerqueue_node *next;
114 unsigned long flags;
115 ktime_t now;
116 int ret = HRTIMER_NORESTART;
117
118 spin_lock_irqsave(&base->lock, flags);
119 now = base->gettime();
120 while ((next = timerqueue_getnext(&base->timerqueue))) {
121 struct alarm *alarm;
122 ktime_t expired = next->expires;
123
124 if (expired.tv64 >= now.tv64)
125 break;
126
127 alarm = container_of(next, struct alarm, node);
128
129 timerqueue_del(&base->timerqueue, &alarm->node);
130 alarm->enabled = 0;
131 /* Re-add periodic timers */
132 if (alarm->period.tv64) {
133 alarm->node.expires = ktime_add(expired, alarm->period);
134 timerqueue_add(&base->timerqueue, &alarm->node);
135 alarm->enabled = 1;
136 }
137 spin_unlock_irqrestore(&base->lock, flags);
138 if (alarm->function)
139 alarm->function(alarm);
140 spin_lock_irqsave(&base->lock, flags);
141 }
142
143 if (next) {
144 hrtimer_set_expires(&base->timer, next->expires);
145 ret = HRTIMER_RESTART;
146 }
147 spin_unlock_irqrestore(&base->lock, flags);
148
149 return ret;
150
151}
152
153#ifdef CONFIG_RTC_CLASS
154/**
155 * alarmtimer_suspend - Suspend time callback
156 * @dev: unused
157 * @state: unused
158 *
159 * When we are going into suspend, we look through the bases
160 * to see which is the soonest timer to expire. We then
161 * set an rtc timer to fire that far into the future, which
162 * will wake us from suspend.
163 */
164static int alarmtimer_suspend(struct device *dev)
165{
166 struct rtc_time tm;
167 ktime_t min, now;
168 unsigned long flags;
169 int i;
170
171 spin_lock_irqsave(&freezer_delta_lock, flags);
172 min = freezer_delta;
173 freezer_delta = ktime_set(0, 0);
174 spin_unlock_irqrestore(&freezer_delta_lock, flags);
175
176 /* If we have no rtcdev, just return */
177 if (!rtcdev)
178 return 0;
179
 180 /* Find the soonest timer to expire */
181 for (i = 0; i < ALARM_NUMTYPE; i++) {
182 struct alarm_base *base = &alarm_bases[i];
183 struct timerqueue_node *next;
184 ktime_t delta;
185
186 spin_lock_irqsave(&base->lock, flags);
187 next = timerqueue_getnext(&base->timerqueue);
188 spin_unlock_irqrestore(&base->lock, flags);
189 if (!next)
190 continue;
191 delta = ktime_sub(next->expires, base->gettime());
192 if (!min.tv64 || (delta.tv64 < min.tv64))
193 min = delta;
194 }
195 if (min.tv64 == 0)
196 return 0;
197
198 /* XXX - Should we enforce a minimum sleep time? */
199 WARN_ON(min.tv64 < NSEC_PER_SEC);
200
201 /* Setup an rtc timer to fire that far in the future */
202 rtc_timer_cancel(rtcdev, &rtctimer);
203 rtc_read_time(rtcdev, &tm);
204 now = rtc_tm_to_ktime(tm);
205 now = ktime_add(now, min);
206
207 rtc_timer_start(rtcdev, &rtctimer, now, ktime_set(0, 0));
208
209 return 0;
210}
211#else
212static int alarmtimer_suspend(struct device *dev)
213{
214 return 0;
215}
216#endif
217
218static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
219{
220 ktime_t delta;
221 unsigned long flags;
222 struct alarm_base *base = &alarm_bases[type];
223
224 delta = ktime_sub(absexp, base->gettime());
225
226 spin_lock_irqsave(&freezer_delta_lock, flags);
227 if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
228 freezer_delta = delta;
229 spin_unlock_irqrestore(&freezer_delta_lock, flags);
230}
231
232
233/**
234 * alarm_init - Initialize an alarm structure
235 * @alarm: ptr to alarm to be initialized
236 * @type: the type of the alarm
237 * @function: callback that is run when the alarm fires
238 */
239void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
240 void (*function)(struct alarm *))
241{
242 timerqueue_init(&alarm->node);
243 alarm->period = ktime_set(0, 0);
244 alarm->function = function;
245 alarm->type = type;
246 alarm->enabled = 0;
247}
248
249/**
250 * alarm_start - Sets an alarm to fire
251 * @alarm: ptr to alarm to set
252 * @start: time to run the alarm
253 * @period: period at which the alarm will recur
254 */
255void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
256{
257 struct alarm_base *base = &alarm_bases[alarm->type];
258 unsigned long flags;
259
260 spin_lock_irqsave(&base->lock, flags);
261 if (alarm->enabled)
262 alarmtimer_remove(base, alarm);
263 alarm->node.expires = start;
264 alarm->period = period;
265 alarmtimer_enqueue(base, alarm);
266 alarm->enabled = 1;
267 spin_unlock_irqrestore(&base->lock, flags);
268}
269
270/**
271 * alarm_cancel - Tries to cancel an alarm timer
272 * @alarm: ptr to alarm to be canceled
273 */
274void alarm_cancel(struct alarm *alarm)
275{
276 struct alarm_base *base = &alarm_bases[alarm->type];
277 unsigned long flags;
278
279 spin_lock_irqsave(&base->lock, flags);
280 if (alarm->enabled)
281 alarmtimer_remove(base, alarm);
282 alarm->enabled = 0;
283 spin_unlock_irqrestore(&base->lock, flags);
284}
285
286
287/**
288 * clock2alarm - helper that converts from clockid to alarmtypes
289 * @clockid: clockid.
290 */
291static enum alarmtimer_type clock2alarm(clockid_t clockid)
292{
293 if (clockid == CLOCK_REALTIME_ALARM)
294 return ALARM_REALTIME;
295 if (clockid == CLOCK_BOOTTIME_ALARM)
296 return ALARM_BOOTTIME;
297 return -1;
298}
299
300/**
301 * alarm_handle_timer - Callback for posix timers
302 * @alarm: alarm that fired
303 *
304 * Posix timer callback for expired alarm timers.
305 */
306static void alarm_handle_timer(struct alarm *alarm)
307{
308 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
309 it.alarmtimer);
310 if (posix_timer_event(ptr, 0) != 0)
311 ptr->it_overrun++;
312}
313
314/**
315 * alarm_clock_getres - posix getres interface
316 * @which_clock: clockid
317 * @tp: timespec to fill
318 *
319 * Returns the granularity of underlying alarm base clock
320 */
321static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
322{
323 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
324
325 return hrtimer_get_res(baseid, tp);
326}
327
328/**
329 * alarm_clock_get - posix clock_get interface
330 * @which_clock: clockid
331 * @tp: timespec to fill.
332 *
333 * Provides the underlying alarm base time.
334 */
335static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
336{
337 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
338
339 *tp = ktime_to_timespec(base->gettime());
340 return 0;
341}
342
343/**
344 * alarm_timer_create - posix timer_create interface
345 * @new_timer: k_itimer pointer to manage
346 *
347 * Initializes the k_itimer structure.
348 */
349static int alarm_timer_create(struct k_itimer *new_timer)
350{
351 enum alarmtimer_type type;
352 struct alarm_base *base;
353
354 if (!capable(CAP_WAKE_ALARM))
355 return -EPERM;
356
357 type = clock2alarm(new_timer->it_clock);
358 base = &alarm_bases[type];
359 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
360 return 0;
361}
362
363/**
364 * alarm_timer_get - posix timer_get interface
 365 * @timr: k_itimer pointer
366 * @cur_setting: itimerspec data to fill
367 *
368 * Copies the itimerspec data out from the k_itimer
369 */
370static void alarm_timer_get(struct k_itimer *timr,
371 struct itimerspec *cur_setting)
372{
373 cur_setting->it_interval =
374 ktime_to_timespec(timr->it.alarmtimer.period);
375 cur_setting->it_value =
376 ktime_to_timespec(timr->it.alarmtimer.node.expires);
377 return;
378}
379
380/**
381 * alarm_timer_del - posix timer_del interface
382 * @timr: k_itimer pointer to be deleted
383 *
384 * Cancels any programmed alarms for the given timer.
385 */
386static int alarm_timer_del(struct k_itimer *timr)
387{
388 alarm_cancel(&timr->it.alarmtimer);
389 return 0;
390}
391
392/**
393 * alarm_timer_set - posix timer_set interface
 394 * @timr: k_itimer pointer to be set
395 * @flags: timer flags
396 * @new_setting: itimerspec to be used
397 * @old_setting: itimerspec being replaced
398 *
399 * Sets the timer to new_setting, and starts the timer.
400 */
401static int alarm_timer_set(struct k_itimer *timr, int flags,
402 struct itimerspec *new_setting,
403 struct itimerspec *old_setting)
404{
405 /* Save old values */
406 old_setting->it_interval =
407 ktime_to_timespec(timr->it.alarmtimer.period);
408 old_setting->it_value =
409 ktime_to_timespec(timr->it.alarmtimer.node.expires);
410
411 /* If the timer was already set, cancel it */
412 alarm_cancel(&timr->it.alarmtimer);
413
414 /* start the timer */
415 alarm_start(&timr->it.alarmtimer,
416 timespec_to_ktime(new_setting->it_value),
417 timespec_to_ktime(new_setting->it_interval));
418 return 0;
419}
420
421/**
422 * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
423 * @alarm: ptr to alarm that fired
424 *
425 * Wakes up the task that set the alarmtimer
426 */
427static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
428{
429 struct task_struct *task = (struct task_struct *)alarm->data;
430
431 alarm->data = NULL;
432 if (task)
433 wake_up_process(task);
434}
435
436/**
437 * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
438 * @alarm: ptr to alarmtimer
439 * @absexp: absolute expiration time
440 *
441 * Sets the alarm timer and sleeps until it is fired or interrupted.
442 */
443static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
444{
445 alarm->data = (void *)current;
446 do {
447 set_current_state(TASK_INTERRUPTIBLE);
448 alarm_start(alarm, absexp, ktime_set(0, 0));
449 if (likely(alarm->data))
450 schedule();
451
452 alarm_cancel(alarm);
453 } while (alarm->data && !signal_pending(current));
454
455 __set_current_state(TASK_RUNNING);
456
457 return (alarm->data == NULL);
458}
459
460
461/**
462 * update_rmtp - Update remaining timespec value
463 * @exp: expiration time
464 * @type: timer type
 465 * @rmtp: user pointer to remaining timespec value
466 *
467 * Helper function that fills in rmtp value with time between
468 * now and the exp value
469 */
470static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
471 struct timespec __user *rmtp)
472{
473 struct timespec rmt;
474 ktime_t rem;
475
476 rem = ktime_sub(exp, alarm_bases[type].gettime());
477
478 if (rem.tv64 <= 0)
479 return 0;
480 rmt = ktime_to_timespec(rem);
481
482 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
483 return -EFAULT;
484
485 return 1;
486
487}
488
489/**
490 * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
491 * @restart: ptr to restart block
492 *
493 * Handles restarted clock_nanosleep calls
494 */
495static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
496{
497 enum alarmtimer_type type = restart->nanosleep.index;
498 ktime_t exp;
499 struct timespec __user *rmtp;
500 struct alarm alarm;
501 int ret = 0;
502
503 exp.tv64 = restart->nanosleep.expires;
504 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
505
506 if (alarmtimer_do_nsleep(&alarm, exp))
507 goto out;
508
509 if (freezing(current))
510 alarmtimer_freezerset(exp, type);
511
512 rmtp = restart->nanosleep.rmtp;
513 if (rmtp) {
514 ret = update_rmtp(exp, type, rmtp);
515 if (ret <= 0)
516 goto out;
517 }
518
519
520 /* The other values in restart are already filled in */
521 ret = -ERESTART_RESTARTBLOCK;
522out:
523 return ret;
524}
525
526/**
527 * alarm_timer_nsleep - alarmtimer nanosleep
528 * @which_clock: clockid
 529 * @flags: determines abstime or relative
530 * @tsreq: requested sleep time (abs or rel)
531 * @rmtp: remaining sleep time saved
532 *
533 * Handles clock_nanosleep calls against _ALARM clockids
534 */
535static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
536 struct timespec *tsreq, struct timespec __user *rmtp)
537{
538 enum alarmtimer_type type = clock2alarm(which_clock);
539 struct alarm alarm;
540 ktime_t exp;
541 int ret = 0;
542 struct restart_block *restart;
543
544 if (!capable(CAP_WAKE_ALARM))
545 return -EPERM;
546
547 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
548
549 exp = timespec_to_ktime(*tsreq);
550 /* Convert (if necessary) to absolute time */
551 if (flags != TIMER_ABSTIME) {
552 ktime_t now = alarm_bases[type].gettime();
553 exp = ktime_add(now, exp);
554 }
555
556 if (alarmtimer_do_nsleep(&alarm, exp))
557 goto out;
558
559 if (freezing(current))
560 alarmtimer_freezerset(exp, type);
561
562 /* abs timers don't set remaining time or restart */
563 if (flags == TIMER_ABSTIME) {
564 ret = -ERESTARTNOHAND;
565 goto out;
566 }
567
568 if (rmtp) {
569 ret = update_rmtp(exp, type, rmtp);
570 if (ret <= 0)
571 goto out;
572 }
573
574 restart = &current_thread_info()->restart_block;
575 restart->fn = alarm_timer_nsleep_restart;
576 restart->nanosleep.index = type;
577 restart->nanosleep.expires = exp.tv64;
578 restart->nanosleep.rmtp = rmtp;
579 ret = -ERESTART_RESTARTBLOCK;
580
581out:
582 return ret;
583}
584
585
586/* Suspend hook structures */
587static const struct dev_pm_ops alarmtimer_pm_ops = {
588 .suspend = alarmtimer_suspend,
589};
590
591static struct platform_driver alarmtimer_driver = {
592 .driver = {
593 .name = "alarmtimer",
594 .pm = &alarmtimer_pm_ops,
595 }
596};
597
598/**
599 * alarmtimer_init - Initialize alarm timer code
600 *
601 * This function initializes the alarm bases and registers
602 * the posix clock ids.
603 */
604static int __init alarmtimer_init(void)
605{
606 int error = 0;
607 int i;
608 struct k_clock alarm_clock = {
609 .clock_getres = alarm_clock_getres,
610 .clock_get = alarm_clock_get,
611 .timer_create = alarm_timer_create,
612 .timer_set = alarm_timer_set,
613 .timer_del = alarm_timer_del,
614 .timer_get = alarm_timer_get,
615 .nsleep = alarm_timer_nsleep,
616 };
617
618 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
619 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
620
621 /* Initialize alarm bases */
622 alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
623 alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
624 alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
625 alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
626 for (i = 0; i < ALARM_NUMTYPE; i++) {
627 timerqueue_init_head(&alarm_bases[i].timerqueue);
628 spin_lock_init(&alarm_bases[i].lock);
629 hrtimer_init(&alarm_bases[i].timer,
630 alarm_bases[i].base_clockid,
631 HRTIMER_MODE_ABS);
632 alarm_bases[i].timer.function = alarmtimer_fired;
633 }
634 error = platform_driver_register(&alarmtimer_driver);
635 platform_device_register_simple("alarmtimer", -1, NULL, 0);
636
637 return error;
638}
639device_initcall(alarmtimer_init);
640
641#ifdef CONFIG_RTC_CLASS
642/**
 643 * has_wakealarm - check whether the rtc device has wakealarm ability
644 * @dev: current device
645 * @name_ptr: name to be returned
646 *
647 * This helper function checks to see if the rtc device can wake
648 * from suspend.
649 */
650static int __init has_wakealarm(struct device *dev, void *name_ptr)
651{
652 struct rtc_device *candidate = to_rtc_device(dev);
653
654 if (!candidate->ops->set_alarm)
655 return 0;
656 if (!device_may_wakeup(candidate->dev.parent))
657 return 0;
658
659 *(const char **)name_ptr = dev_name(dev);
660 return 1;
661}
662
663/**
664 * alarmtimer_init_late - Late initializing of alarmtimer code
665 *
666 * This function locates a rtc device to use for wakealarms.
667 * Run as late_initcall to make sure rtc devices have been
668 * registered.
669 */
670static int __init alarmtimer_init_late(void)
671{
672 char *str;
673
674 /* Find an rtc device and init the rtc_timer */
675 class_find_device(rtc_class, NULL, &str, has_wakealarm);
676 if (str)
677 rtcdev = rtc_class_open(str);
678 if (!rtcdev) {
679 printk(KERN_WARNING "No RTC device found, ALARM timers will"
680 " not wake from suspend");
681 }
682 rtc_timer_init(&rtctimer, NULL, NULL);
683
684 return 0;
685}
686#else
687static int __init alarmtimer_init_late(void)
688{
689 printk(KERN_WARNING "Kernel not built with RTC support, ALARM timers"
690 " will not wake from suspend");
691 return 0;
692}
693#endif
694late_initcall(alarmtimer_init_late);
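
The new CLOCK_REALTIME_ALARM and CLOCK_BOOTTIME_ALARM ids are reachable from userspace through the ordinary POSIX timer calls. A minimal sketch, assuming a libc that does not yet define the clockid (the numeric value below is copied from the kernel header) and a caller holding CAP_WAKE_ALARM:

/*
 * Arm a CLOCK_REALTIME_ALARM timer that can wake the machine from
 * suspend. Sketch only; link with -lrt on older glibc.
 */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#ifndef CLOCK_REALTIME_ALARM
#define CLOCK_REALTIME_ALARM 8	/* value from the kernel header */
#endif

static void on_alarm(int sig) { (void)sig; }

int main(void)
{
	struct sigevent sev;
	struct itimerspec its;
	timer_t id;

	signal(SIGALRM, on_alarm);

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGALRM;

	/* Fails with EPERM without CAP_WAKE_ALARM, as in alarm_timer_create() */
	if (timer_create(CLOCK_REALTIME_ALARM, &sev, &id)) {
		perror("timer_create");
		return 1;
	}

	/* Fire (and wake the box if it is suspended) one minute from now */
	memset(&its, 0, sizeof(its));
	clock_gettime(CLOCK_REALTIME, &its.it_value);
	its.it_value.tv_sec += 60;

	if (timer_settime(id, TIMER_ABSTIME, &its, NULL)) {
		perror("timer_settime");
		return 1;
	}

	pause();	/* returns once the alarm signal is delivered */
	printf("alarm fired\n");
	return 0;
}
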
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 0d74b9ba90c8..22a9da9a9c96 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -194,6 +194,70 @@ void clockevents_register_device(struct clock_event_device *dev)
194} 194}
195EXPORT_SYMBOL_GPL(clockevents_register_device); 195EXPORT_SYMBOL_GPL(clockevents_register_device);
196 196
197static void clockevents_config(struct clock_event_device *dev,
198 u32 freq)
199{
200 unsigned long sec;
201
202 if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
203 return;
204
205 /*
206 * Calculate the maximum number of seconds we can sleep. Limit
207 * to 10 minutes for hardware which can program more than
208 * 32bit ticks so we still get reasonable conversion values.
209 */
210 sec = dev->max_delta_ticks;
211 do_div(sec, freq);
212 if (!sec)
213 sec = 1;
214 else if (sec > 600 && dev->max_delta_ticks > UINT_MAX)
215 sec = 600;
216
217 clockevents_calc_mult_shift(dev, freq, sec);
218 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
219 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
220}
221
222/**
223 * clockevents_config_and_register - Configure and register a clock event device
224 * @dev: device to register
225 * @freq: The clock frequency
226 * @min_delta: The minimum clock ticks to program in oneshot mode
227 * @max_delta: The maximum clock ticks to program in oneshot mode
228 *
229 * min/max_delta can be 0 for devices which do not support oneshot mode.
230 */
231void clockevents_config_and_register(struct clock_event_device *dev,
232 u32 freq, unsigned long min_delta,
233 unsigned long max_delta)
234{
235 dev->min_delta_ticks = min_delta;
236 dev->max_delta_ticks = max_delta;
237 clockevents_config(dev, freq);
238 clockevents_register_device(dev);
239}
240
241/**
242 * clockevents_update_freq - Update frequency and reprogram a clock event device.
243 * @dev: device to modify
244 * @freq: new device frequency
245 *
246 * Reconfigure and reprogram a clock event device in oneshot
247 * mode. Must be called on the cpu for which the device delivers per
248 * cpu timer events with interrupts disabled! Returns 0 on success,
249 * -ETIME when the event is in the past.
250 */
251int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
252{
253 clockevents_config(dev, freq);
254
255 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
256 return 0;
257
258 return clockevents_program_event(dev, dev->next_event, ktime_get());
259}
260
197/* 261/*
198 * Noop handler when we shut down an event device 262 * Noop handler when we shut down an event device
199 */ 263 */
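
With the new helper a clockevent driver only supplies its raw hardware limits and lets the core derive mult/shift and the min/max delta in nanoseconds. A sketch with an invented device; the name, input frequency and tick limits are illustrative, not from this patch:

/*
 * Hypothetical clockevent driver registration using
 * clockevents_config_and_register() instead of open-coded mult/shift.
 */
#include <linux/clockchips.h>
#include <linux/cpumask.h>
#include <linux/init.h>

static void foo_timer_set_mode(enum clock_event_mode mode,
			       struct clock_event_device *evt)
{
	/* program the (hypothetical) timer for periodic/oneshot/shutdown */
}

static int foo_timer_set_next_event(unsigned long ticks,
				    struct clock_event_device *evt)
{
	/* load 'ticks' into the (hypothetical) comparator register */
	return 0;
}

static struct clock_event_device foo_clockevent = {
	.name		= "foo-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 300,
	.set_mode	= foo_timer_set_mode,
	.set_next_event	= foo_timer_set_next_event,
};

static void __init foo_timer_init(void)
{
	foo_clockevent.cpumask = cpumask_of(0);

	/* 24 MHz clock, at least 0xf and at most 0xffffffff ticks per event */
	clockevents_config_and_register(&foo_clockevent, 24000000,
					0xf, 0xffffffff);
}
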
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6519cf62d9cd..d9d5f8c885f6 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -626,19 +626,6 @@ static void clocksource_enqueue(struct clocksource *cs)
626 list_add(&cs->list, entry); 626 list_add(&cs->list, entry);
627} 627}
628 628
629
630/*
631 * Maximum time we expect to go between ticks. This includes idle
632 * tickless time. It provides the trade off between selecting a
633 * mult/shift pair that is very precise but can only handle a short
634 * period of time, vs. a mult/shift pair that can handle long periods
635 * of time but isn't as precise.
636 *
637 * This is a subsystem constant, and actual hardware limitations
638 * may override it (ie: clocksources that wrap every 3 seconds).
639 */
640#define MAX_UPDATE_LENGTH 5 /* Seconds */
641
642/** 629/**
643 * __clocksource_updatefreq_scale - Used update clocksource with new freq 630 * __clocksource_updatefreq_scale - Used update clocksource with new freq
644 * @t: clocksource to be registered 631 * @t: clocksource to be registered
@@ -652,15 +639,28 @@ static void clocksource_enqueue(struct clocksource *cs)
652 */ 639 */
653void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 640void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
654{ 641{
642 unsigned long sec;
643
655 /* 644 /*
656 * Ideally we want to use some of the limits used in 645 * Calc the maximum number of seconds which we can run before
657 * clocksource_max_deferment, to provide a more informed 646 * wrapping around. For clocksources which have a mask > 32bit
658 * MAX_UPDATE_LENGTH. But for now this just gets the 647 * we need to limit the max sleep time to have a good
659 * register interface working properly. 648 * conversion precision. 10 minutes is still a reasonable
649 * amount. That results in a shift value of 24 for a
650 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
651 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
652 * margin as we do in clocksource_max_deferment()
660 */ 653 */
654 sec = (cs->mask - (cs->mask >> 5));
655 do_div(sec, freq);
656 do_div(sec, scale);
657 if (!sec)
658 sec = 1;
659 else if (sec > 600 && cs->mask > UINT_MAX)
660 sec = 600;
661
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 662 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale, 663 NSEC_PER_SEC / scale, sec * scale);
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs); 664 cs->max_idle_ns = clocksource_max_deferment(cs);
665} 665}
666EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 666EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
@@ -685,8 +685,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 685 /* Add clocksource to the clocksource list */ 685 /* Add clocksource to the clocksource list */
686 mutex_lock(&clocksource_mutex); 686 mutex_lock(&clocksource_mutex);
687 clocksource_enqueue(cs); 687 clocksource_enqueue(cs);
688 clocksource_select();
689 clocksource_enqueue_watchdog(cs); 688 clocksource_enqueue_watchdog(cs);
689 clocksource_select();
690 mutex_unlock(&clocksource_mutex); 690 mutex_unlock(&clocksource_mutex);
691 return 0; 691 return 0;
692} 692}
@@ -706,8 +706,8 @@ int clocksource_register(struct clocksource *cs)
706 706
707 mutex_lock(&clocksource_mutex); 707 mutex_lock(&clocksource_mutex);
708 clocksource_enqueue(cs); 708 clocksource_enqueue(cs);
709 clocksource_select();
710 clocksource_enqueue_watchdog(cs); 709 clocksource_enqueue_watchdog(cs);
710 clocksource_select();
711 mutex_unlock(&clocksource_mutex); 711 mutex_unlock(&clocksource_mutex);
712 return 0; 712 return 0;
713} 713}
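
The clamp above bounds how long the mult/shift pair has to stay accurate; a small userspace program mirroring the same arithmetic shows the resulting limits for two invented clocksources, a 32-bit counter at 24 MHz and a 64-bit counter at 4 GHz:

/*
 * Standalone sketch of the max-seconds clamp; the example clocksource
 * parameters are made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t max_seconds(uint64_t mask, uint32_t freq, uint32_t scale)
{
	uint64_t sec = mask - (mask >> 5);	/* keep the 12.5% margin */

	sec /= freq;
	sec /= scale;
	if (!sec)
		sec = 1;
	else if (sec > 600 && mask > UINT32_MAX)
		sec = 600;			/* cap wide counters at 10 min */
	return sec;
}

int main(void)
{
	printf("32-bit mask @ 24 MHz: %llu s\n",
	       (unsigned long long)max_seconds(0xffffffffULL, 24000000, 1));
	printf("64-bit mask @ 4 GHz : %llu s\n",
	       (unsigned long long)max_seconds(~0ULL, 4000000000U, 1));
	return 0;
}
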
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index b2fa506667c0..a470154e0408 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -34,7 +34,7 @@
34 * inaccuracies caused by missed or lost timer 34 * inaccuracies caused by missed or lost timer
35 * interrupts and the inability for the timer 35 * interrupts and the inability for the timer
 36 * interrupt hardware to accurately tick at the 36 * interrupt hardware to accurately tick at the
37 * requested HZ value. It is also not reccomended 37 * requested HZ value. It is also not recommended
38 * for "tick-less" systems. 38 * for "tick-less" systems.
39 */ 39 */
40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) 40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 25028dd4fa18..c340ca658f37 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -19,7 +19,6 @@
19 */ 19 */
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/mutex.h>
23#include <linux/posix-clock.h> 22#include <linux/posix-clock.h>
24#include <linux/slab.h> 23#include <linux/slab.h>
25#include <linux/syscalls.h> 24#include <linux/syscalls.h>
@@ -34,19 +33,19 @@ static struct posix_clock *get_posix_clock(struct file *fp)
34{ 33{
35 struct posix_clock *clk = fp->private_data; 34 struct posix_clock *clk = fp->private_data;
36 35
37 mutex_lock(&clk->mutex); 36 down_read(&clk->rwsem);
38 37
39 if (!clk->zombie) 38 if (!clk->zombie)
40 return clk; 39 return clk;
41 40
42 mutex_unlock(&clk->mutex); 41 up_read(&clk->rwsem);
43 42
44 return NULL; 43 return NULL;
45} 44}
46 45
47static void put_posix_clock(struct posix_clock *clk) 46static void put_posix_clock(struct posix_clock *clk)
48{ 47{
49 mutex_unlock(&clk->mutex); 48 up_read(&clk->rwsem);
50} 49}
51 50
52static ssize_t posix_clock_read(struct file *fp, char __user *buf, 51static ssize_t posix_clock_read(struct file *fp, char __user *buf,
@@ -156,7 +155,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
156 struct posix_clock *clk = 155 struct posix_clock *clk =
157 container_of(inode->i_cdev, struct posix_clock, cdev); 156 container_of(inode->i_cdev, struct posix_clock, cdev);
158 157
159 mutex_lock(&clk->mutex); 158 down_read(&clk->rwsem);
160 159
161 if (clk->zombie) { 160 if (clk->zombie) {
162 err = -ENODEV; 161 err = -ENODEV;
@@ -172,7 +171,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
172 fp->private_data = clk; 171 fp->private_data = clk;
173 } 172 }
174out: 173out:
175 mutex_unlock(&clk->mutex); 174 up_read(&clk->rwsem);
176 return err; 175 return err;
177} 176}
178 177
@@ -211,25 +210,20 @@ int posix_clock_register(struct posix_clock *clk, dev_t devid)
211 int err; 210 int err;
212 211
213 kref_init(&clk->kref); 212 kref_init(&clk->kref);
214 mutex_init(&clk->mutex); 213 init_rwsem(&clk->rwsem);
215 214
216 cdev_init(&clk->cdev, &posix_clock_file_operations); 215 cdev_init(&clk->cdev, &posix_clock_file_operations);
217 clk->cdev.owner = clk->ops.owner; 216 clk->cdev.owner = clk->ops.owner;
218 err = cdev_add(&clk->cdev, devid, 1); 217 err = cdev_add(&clk->cdev, devid, 1);
219 if (err)
220 goto no_cdev;
221 218
222 return err; 219 return err;
223no_cdev:
224 mutex_destroy(&clk->mutex);
225 return err;
226} 220}
227EXPORT_SYMBOL_GPL(posix_clock_register); 221EXPORT_SYMBOL_GPL(posix_clock_register);
228 222
229static void delete_clock(struct kref *kref) 223static void delete_clock(struct kref *kref)
230{ 224{
231 struct posix_clock *clk = container_of(kref, struct posix_clock, kref); 225 struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
232 mutex_destroy(&clk->mutex); 226
233 if (clk->release) 227 if (clk->release)
234 clk->release(clk); 228 clk->release(clk);
235} 229}
@@ -238,9 +232,9 @@ void posix_clock_unregister(struct posix_clock *clk)
238{ 232{
239 cdev_del(&clk->cdev); 233 cdev_del(&clk->cdev);
240 234
241 mutex_lock(&clk->mutex); 235 down_write(&clk->rwsem);
242 clk->zombie = true; 236 clk->zombie = true;
243 mutex_unlock(&clk->mutex); 237 up_write(&clk->rwsem);
244 238
245 kref_put(&clk->kref, delete_clock); 239 kref_put(&clk->kref, delete_clock);
246} 240}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index da800ffa810c..723c7637e55a 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -522,10 +522,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
522 */ 522 */
523void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 523void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
524{ 524{
525 int cpu = smp_processor_id();
526
525 /* Set it up only once ! */ 527 /* Set it up only once ! */
526 if (bc->event_handler != tick_handle_oneshot_broadcast) { 528 if (bc->event_handler != tick_handle_oneshot_broadcast) {
527 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 529 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
528 int cpu = smp_processor_id();
529 530
530 bc->event_handler = tick_handle_oneshot_broadcast; 531 bc->event_handler = tick_handle_oneshot_broadcast;
531 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 532 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -551,6 +552,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
551 tick_broadcast_set_event(tick_next_period, 1); 552 tick_broadcast_set_event(tick_next_period, 1);
552 } else 553 } else
553 bc->next_event.tv64 = KTIME_MAX; 554 bc->next_event.tv64 = KTIME_MAX;
555 } else {
556 /*
557 * The first cpu which switches to oneshot mode sets
558 * the bit for all other cpus which are in the general
559 * (periodic) broadcast mask. So the bit is set and
 560 * would prevent the first broadcast entry after this
 561 * from programming the bc device.
562 */
563 tick_broadcast_clear_oneshot(cpu);
554 } 564 }
555} 565}
556 566
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8ad5d576755e..8e6a05a5915a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -596,6 +596,58 @@ void __init timekeeping_init(void)
596static struct timespec timekeeping_suspend_time; 596static struct timespec timekeeping_suspend_time;
597 597
598/** 598/**
599 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
600 * @delta: pointer to a timespec delta value
601 *
602 * Takes a timespec offset measuring a suspend interval and properly
603 * adds the sleep offset to the timekeeping variables.
604 */
605static void __timekeeping_inject_sleeptime(struct timespec *delta)
606{
607 xtime = timespec_add(xtime, *delta);
608 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
609 total_sleep_time = timespec_add(total_sleep_time, *delta);
610}
611
612
613/**
 614 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
615 * @delta: pointer to a timespec delta value
616 *
617 * This hook is for architectures that cannot support read_persistent_clock
618 * because their RTC/persistent clock is only accessible when irqs are enabled.
619 *
620 * This function should only be called by rtc_resume(), and allows
621 * a suspend offset to be injected into the timekeeping values.
622 */
623void timekeeping_inject_sleeptime(struct timespec *delta)
624{
625 unsigned long flags;
626 struct timespec ts;
627
628 /* Make sure we don't set the clock twice */
629 read_persistent_clock(&ts);
630 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
631 return;
632
633 write_seqlock_irqsave(&xtime_lock, flags);
634 timekeeping_forward_now();
635
636 __timekeeping_inject_sleeptime(delta);
637
638 timekeeper.ntp_error = 0;
639 ntp_clear();
640 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
641 timekeeper.mult);
642
643 write_sequnlock_irqrestore(&xtime_lock, flags);
644
645 /* signal hrtimers about time change */
646 clock_was_set();
647}
648
649
650/**
599 * timekeeping_resume - Resumes the generic timekeeping subsystem. 651 * timekeeping_resume - Resumes the generic timekeeping subsystem.
600 * 652 *
601 * This is for the generic clocksource timekeeping. 653 * This is for the generic clocksource timekeeping.
@@ -615,9 +667,7 @@ static void timekeeping_resume(void)
615 667
616 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 668 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
617 ts = timespec_sub(ts, timekeeping_suspend_time); 669 ts = timespec_sub(ts, timekeeping_suspend_time);
618 xtime = timespec_add(xtime, ts); 670 __timekeeping_inject_sleeptime(&ts);
619 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
620 total_sleep_time = timespec_add(total_sleep_time, ts);
621 } 671 }
622 /* re-base the last cycle value */ 672 /* re-base the last cycle value */
623 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 673 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
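
timekeeping_inject_sleeptime() is meant to be driven from rtc_resume(); a sketch of such a caller follows, with illustrative names rather than the actual drivers/rtc/class.c code, and assuming the declaration lands in <linux/time.h> via the companion patch:

/*
 * Hypothetical RTC resume path: measure the sleep length with the RTC
 * and feed it back into timekeeping.
 */
#include <linux/rtc.h>
#include <linux/time.h>

static struct timespec example_oldtime;	/* RTC reading saved at suspend */

static void example_rtc_resume(struct rtc_device *rtc)
{
	struct rtc_time tm;
	struct timespec newtime, sleep_time;
	unsigned long secs;

	if (rtc_read_time(rtc, &tm))
		return;

	rtc_tm_to_time(&tm, &secs);
	newtime.tv_sec = secs;
	newtime.tv_nsec = 0;

	/* Only inject if the RTC actually moved forward across suspend */
	if (timespec_compare(&newtime, &example_oldtime) <= 0)
		return;

	sleep_time = timespec_sub(newtime, example_oldtime);
	timekeeping_inject_sleeptime(&sleep_time);
}
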
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 2f3b585b8d7d..a5d0a3a85dd8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
236 unsigned int timer_flag) 236 unsigned int timer_flag)
237{ 237{
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesn't matter which lock we take:
240 */ 240 */
241 raw_spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 61d7d59f4a1a..2ad39e556cb4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,7 @@ if FTRACE
141config FUNCTION_TRACER 141config FUNCTION_TRACER
142 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
143 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !S390 144 select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
145 select KALLSYMS 145 select KALLSYMS
146 select GENERIC_TRACER 146 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7aa40f8e182d..6957aa298dfa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -850,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
850 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 850 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
851} 851}
852 852
853static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) 853static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
854 unsigned int depth, bool explicit)
854{ 855{
855 struct blk_trace *bt = q->blk_trace; 856 struct blk_trace *bt = q->blk_trace;
856 857
857 if (bt) { 858 if (bt) {
858 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; 859 __be64 rpdu = cpu_to_be64(depth);
859 __be64 rpdu = cpu_to_be64(pdu); 860 u32 what;
860 861
861 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, 862 if (explicit)
862 sizeof(rpdu), &rpdu); 863 what = BLK_TA_UNPLUG_IO;
863 } 864 else
864} 865 what = BLK_TA_UNPLUG_TIMER;
865
866static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
867{
868 struct blk_trace *bt = q->blk_trace;
869
870 if (bt) {
871 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
872 __be64 rpdu = cpu_to_be64(pdu);
873 866
874 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, 867 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
875 sizeof(rpdu), &rpdu);
876 } 868 }
877} 869}
878 870
@@ -1015,9 +1007,7 @@ static void blk_register_tracepoints(void)
1015 WARN_ON(ret); 1007 WARN_ON(ret);
1016 ret = register_trace_block_plug(blk_add_trace_plug, NULL); 1008 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
1017 WARN_ON(ret); 1009 WARN_ON(ret);
1018 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1010 ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
1019 WARN_ON(ret);
1020 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1021 WARN_ON(ret); 1011 WARN_ON(ret);
1022 ret = register_trace_block_split(blk_add_trace_split, NULL); 1012 ret = register_trace_block_split(blk_add_trace_split, NULL);
1023 WARN_ON(ret); 1013 WARN_ON(ret);
@@ -1032,8 +1022,7 @@ static void blk_unregister_tracepoints(void)
1032 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1022 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1033 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); 1023 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1034 unregister_trace_block_split(blk_add_trace_split, NULL); 1024 unregister_trace_block_split(blk_add_trace_split, NULL);
1035 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1025 unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
1036 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
1037 unregister_trace_block_plug(blk_add_trace_plug, NULL); 1026 unregister_trace_block_plug(blk_add_trace_plug, NULL);
1038 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); 1027 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
1039 unregister_trace_block_getrq(blk_add_trace_getrq, NULL); 1028 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c075f4ea6b94..d017c2c82c44 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -39,20 +39,26 @@
39#include "trace_stat.h" 39#include "trace_stat.h"
40 40
41#define FTRACE_WARN_ON(cond) \ 41#define FTRACE_WARN_ON(cond) \
42 do { \ 42 ({ \
43 if (WARN_ON(cond)) \ 43 int ___r = cond; \
44 if (WARN_ON(___r)) \
44 ftrace_kill(); \ 45 ftrace_kill(); \
45 } while (0) 46 ___r; \
47 })
46 48
47#define FTRACE_WARN_ON_ONCE(cond) \ 49#define FTRACE_WARN_ON_ONCE(cond) \
48 do { \ 50 ({ \
49 if (WARN_ON_ONCE(cond)) \ 51 int ___r = cond; \
52 if (WARN_ON_ONCE(___r)) \
50 ftrace_kill(); \ 53 ftrace_kill(); \
51 } while (0) 54 ___r; \
55 })
52 56
53/* hash bits for specific function selection */ 57/* hash bits for specific function selection */
54#define FTRACE_HASH_BITS 7 58#define FTRACE_HASH_BITS 7
55#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) 59#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
60#define FTRACE_HASH_DEFAULT_BITS 10
61#define FTRACE_HASH_MAX_BITS 12
56 62
57/* ftrace_enabled is a method to turn ftrace on or off */ 63/* ftrace_enabled is a method to turn ftrace on or off */
58int ftrace_enabled __read_mostly; 64int ftrace_enabled __read_mostly;
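
The macros above now use GNU statement expressions: unlike the old do { } while (0) form they evaluate to the tested condition, so a caller can warn and branch in a single if(). A userspace sketch of the pattern, with WARN_ON()/ftrace_kill() reduced to a stub:

/*
 * Statement-expression macro sketch: the macro both reports the
 * condition and yields its value to the caller.
 */
#include <stdio.h>

static void ftrace_kill(void)
{
	puts("ftrace disabled");
}

#define FTRACE_WARN_ON(cond)		\
	({				\
		int ___r = !!(cond);	\
		if (___r)		\
			ftrace_kill();	\
		___r;			\
	})

int main(void)
{
	void *rec = NULL;

	if (FTRACE_WARN_ON(rec == NULL))	/* warns and takes the branch */
		return 1;

	return 0;
}
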
@@ -81,23 +87,29 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
81 .func = ftrace_stub, 87 .func = ftrace_stub,
82}; 88};
83 89
84static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 90static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
91static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
85ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 93ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 94ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
95static struct ftrace_ops global_ops;
96
97static void
98ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
88 99
89/* 100/*
90 * Traverse the ftrace_list, invoking all entries. The reason that we 101 * Traverse the ftrace_global_list, invoking all entries. The reason that we
91 * can use rcu_dereference_raw() is that elements removed from this list 102 * can use rcu_dereference_raw() is that elements removed from this list
92 * are simply leaked, so there is no need to interact with a grace-period 103 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle 104 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list. 105 * concurrent insertions into the ftrace_global_list.
95 * 106 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations! 107 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */ 108 */
98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 109static void ftrace_global_list_func(unsigned long ip,
110 unsigned long parent_ip)
99{ 111{
100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ 112 struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/
101 113
102 while (op != &ftrace_list_end) { 114 while (op != &ftrace_list_end) {
103 op->func(ip, parent_ip); 115 op->func(ip, parent_ip);
@@ -147,46 +159,69 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
147} 159}
148#endif 160#endif
149 161
150static int __register_ftrace_function(struct ftrace_ops *ops) 162static void update_global_ops(void)
151{ 163{
152 ops->next = ftrace_list; 164 ftrace_func_t func;
165
153 /* 166 /*
154 * We are entering ops into the ftrace_list but another 167 * If there's only one function registered, then call that
155 * CPU might be walking that list. We need to make sure 168 * function directly. Otherwise, we need to iterate over the
156 * the ops->next pointer is valid before another CPU sees 169 * registered callers.
157 * the ops pointer included into the ftrace_list.
158 */ 170 */
159 rcu_assign_pointer(ftrace_list, ops); 171 if (ftrace_global_list == &ftrace_list_end ||
172 ftrace_global_list->next == &ftrace_list_end)
173 func = ftrace_global_list->func;
174 else
175 func = ftrace_global_list_func;
160 176
161 if (ftrace_enabled) { 177 /* If we filter on pids, update to use the pid function */
162 ftrace_func_t func; 178 if (!list_empty(&ftrace_pids)) {
179 set_ftrace_pid_function(func);
180 func = ftrace_pid_func;
181 }
163 182
164 if (ops->next == &ftrace_list_end) 183 global_ops.func = func;
165 func = ops->func; 184}
166 else
167 func = ftrace_list_func;
168 185
169 if (!list_empty(&ftrace_pids)) { 186static void update_ftrace_function(void)
170 set_ftrace_pid_function(func); 187{
171 func = ftrace_pid_func; 188 ftrace_func_t func;
172 } 189
190 update_global_ops();
191
192 /*
 193	 * If the list is empty, or has exactly one entry that is
 194	 * not dynamic, then have the mcount trampoline call that
 195	 * entry's function directly.
196 */
197 if (ftrace_ops_list == &ftrace_list_end ||
198 (ftrace_ops_list->next == &ftrace_list_end &&
199 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
200 func = ftrace_ops_list->func;
201 else
202 func = ftrace_ops_list_func;
173 203
174 /*
175 * For one func, simply call it directly.
176 * For more than one func, call the chain.
177 */
178#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 204#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
179 ftrace_trace_function = func; 205 ftrace_trace_function = func;
180#else 206#else
181 __ftrace_trace_function = func; 207 __ftrace_trace_function = func;
182 ftrace_trace_function = ftrace_test_stop_func; 208 ftrace_trace_function = ftrace_test_stop_func;
183#endif 209#endif
184 } 210}
185 211
186 return 0; 212static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
213{
214 ops->next = *list;
215 /*
216 * We are entering ops into the list but another
217 * CPU might be walking that list. We need to make sure
218 * the ops->next pointer is valid before another CPU sees
219 * the ops pointer included into the list.
220 */
221 rcu_assign_pointer(*list, ops);
187} 222}
188 223
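add_ftrace_ops() sets ops->next before publishing ops with rcu_assign_pointer(), so a lockless walker never follows a half-initialized node. A user-space sketch of that publish ordering, with C11 release/acquire standing in for the kernel's rcu_assign_pointer()/rcu_dereference_raw() pair (illustrative names; a single writer is assumed, as the kernel serializes writers under ftrace_lock):

#include <stdatomic.h>

struct ops {
        void (*func)(unsigned long ip, unsigned long parent_ip);
        struct ops *next;
};

static struct ops list_end;                     /* sentinel, like ftrace_list_end */
static _Atomic(struct ops *) list_head = &list_end;

/* Writer side (serialized by a lock in the kernel): fully initialize the
 * node, then publish it with a release store so readers see ->next first. */
static void add_ops(struct ops *op)
{
        op->next = atomic_load_explicit(&list_head, memory_order_relaxed);
        atomic_store_explicit(&list_head, op, memory_order_release);
}

/* Reader side: the acquire load pairs with the release store above. */
static void call_all(unsigned long ip, unsigned long parent_ip)
{
        struct ops *op = atomic_load_explicit(&list_head, memory_order_acquire);

        while (op != &list_end) {
                op->func(ip, parent_ip);
                op = op->next;
        }
}

static void hello(unsigned long ip, unsigned long parent_ip)
{
        (void)ip;
        (void)parent_ip;
}

int main(void)
{
        static struct ops my_ops = { .func = hello };

        add_ops(&my_ops);
        call_all(0, 0);
        return 0;
}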
189static int __unregister_ftrace_function(struct ftrace_ops *ops) 224static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
190{ 225{
191 struct ftrace_ops **p; 226 struct ftrace_ops **p;
192 227
@@ -194,13 +229,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
194 * If we are removing the last function, then simply point 229 * If we are removing the last function, then simply point
195 * to the ftrace_stub. 230 * to the ftrace_stub.
196 */ 231 */
197 if (ftrace_list == ops && ops->next == &ftrace_list_end) { 232 if (*list == ops && ops->next == &ftrace_list_end) {
198 ftrace_trace_function = ftrace_stub; 233 *list = &ftrace_list_end;
199 ftrace_list = &ftrace_list_end;
200 return 0; 234 return 0;
201 } 235 }
202 236
203 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) 237 for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
204 if (*p == ops) 238 if (*p == ops)
205 break; 239 break;
206 240
@@ -208,53 +242,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
208 return -1; 242 return -1;
209 243
210 *p = (*p)->next; 244 *p = (*p)->next;
245 return 0;
246}
211 247
212 if (ftrace_enabled) { 248static int __register_ftrace_function(struct ftrace_ops *ops)
213 /* If we only have one func left, then call that directly */ 249{
214 if (ftrace_list->next == &ftrace_list_end) { 250 if (ftrace_disabled)
215 ftrace_func_t func = ftrace_list->func; 251 return -ENODEV;
216 252
217 if (!list_empty(&ftrace_pids)) { 253 if (FTRACE_WARN_ON(ops == &global_ops))
218 set_ftrace_pid_function(func); 254 return -EINVAL;
219 func = ftrace_pid_func; 255
220 } 256 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
221#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 257 return -EBUSY;
222 ftrace_trace_function = func; 258
223#else 259 if (!core_kernel_data((unsigned long)ops))
224 __ftrace_trace_function = func; 260 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
225#endif 261
226 } 262 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
227 } 263 int first = ftrace_global_list == &ftrace_list_end;
264 add_ftrace_ops(&ftrace_global_list, ops);
265 ops->flags |= FTRACE_OPS_FL_ENABLED;
266 if (first)
267 add_ftrace_ops(&ftrace_ops_list, &global_ops);
268 } else
269 add_ftrace_ops(&ftrace_ops_list, ops);
270
271 if (ftrace_enabled)
272 update_ftrace_function();
228 273
229 return 0; 274 return 0;
230} 275}
231 276
232static void ftrace_update_pid_func(void) 277static int __unregister_ftrace_function(struct ftrace_ops *ops)
233{ 278{
234 ftrace_func_t func; 279 int ret;
235 280
236 if (ftrace_trace_function == ftrace_stub) 281 if (ftrace_disabled)
237 return; 282 return -ENODEV;
238 283
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 284 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
240 func = ftrace_trace_function; 285 return -EBUSY;
241#else
242 func = __ftrace_trace_function;
243#endif
244 286
245 if (!list_empty(&ftrace_pids)) { 287 if (FTRACE_WARN_ON(ops == &global_ops))
246 set_ftrace_pid_function(func); 288 return -EINVAL;
247 func = ftrace_pid_func;
248 } else {
249 if (func == ftrace_pid_func)
250 func = ftrace_pid_function;
251 }
252 289
253#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 290 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
254 ftrace_trace_function = func; 291 ret = remove_ftrace_ops(&ftrace_global_list, ops);
255#else 292 if (!ret && ftrace_global_list == &ftrace_list_end)
256 __ftrace_trace_function = func; 293 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
257#endif 294 if (!ret)
295 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
296 } else
297 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
298
299 if (ret < 0)
300 return ret;
301
302 if (ftrace_enabled)
303 update_ftrace_function();
304
305 /*
306 * Dynamic ops may be freed, we must make sure that all
307 * callers are done before leaving this function.
308 */
309 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
310 synchronize_sched();
311
312 return 0;
313}
314
315static void ftrace_update_pid_func(void)
316{
317 /* Only do something if we are tracing something */
318 if (ftrace_trace_function == ftrace_stub)
319 return;
320
321 update_ftrace_function();
258} 322}
259 323
260#ifdef CONFIG_FUNCTION_PROFILER 324#ifdef CONFIG_FUNCTION_PROFILER
@@ -888,8 +952,35 @@ enum {
888 FTRACE_START_FUNC_RET = (1 << 3), 952 FTRACE_START_FUNC_RET = (1 << 3),
889 FTRACE_STOP_FUNC_RET = (1 << 4), 953 FTRACE_STOP_FUNC_RET = (1 << 4),
890}; 954};
955struct ftrace_func_entry {
956 struct hlist_node hlist;
957 unsigned long ip;
958};
891 959
892static int ftrace_filtered; 960struct ftrace_hash {
961 unsigned long size_bits;
962 struct hlist_head *buckets;
963 unsigned long count;
964 struct rcu_head rcu;
965};
966
967/*
968 * We make these constant because no one should touch them,
969 * but they are used as the default "empty hash", to avoid allocating
970 * it all the time. These are in a read only section such that if
971 * anyone does try to modify it, it will cause an exception.
972 */
973static const struct hlist_head empty_buckets[1];
974static const struct ftrace_hash empty_hash = {
975 .buckets = (struct hlist_head *)empty_buckets,
976};
977#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
978
979static struct ftrace_ops global_ops = {
980 .func = ftrace_stub,
981 .notrace_hash = EMPTY_HASH,
982 .filter_hash = EMPTY_HASH,
983};
893 984
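EMPTY_HASH above gives every ops with no filter or notrace entries one shared, read-only hash to point at, instead of allocating a private empty one; updates replace the pointer rather than touching the shared object. A stand-alone sketch of the same const-sentinel idea (names are illustrative, not the kernel types):

#include <stdio.h>

struct bucket { void *first; };

struct hash {
        unsigned long   size_bits;
        struct bucket  *buckets;
        unsigned long   count;
};

/* One shared "no entries" instance, kept in .rodata so a stray write
 * faults instead of silently corrupting every user of the default. */
static const struct bucket empty_buckets[1];
static const struct hash empty_hash = {
        .buckets = (struct bucket *)empty_buckets,
};
#define EMPTY_HASH ((struct hash *)&empty_hash)

static int hash_is_empty(const struct hash *hash)
{
        return !hash || !hash->count;
}

int main(void)
{
        struct hash *filter = EMPTY_HASH;  /* default until a filter is set */

        printf("empty: %d\n", hash_is_empty(filter));
        return 0;
}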
894static struct dyn_ftrace *ftrace_new_addrs; 985static struct dyn_ftrace *ftrace_new_addrs;
895 986
@@ -912,6 +1003,269 @@ static struct ftrace_page *ftrace_pages;
912 1003
913static struct dyn_ftrace *ftrace_free_records; 1004static struct dyn_ftrace *ftrace_free_records;
914 1005
1006static struct ftrace_func_entry *
1007ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1008{
1009 unsigned long key;
1010 struct ftrace_func_entry *entry;
1011 struct hlist_head *hhd;
1012 struct hlist_node *n;
1013
1014 if (!hash->count)
1015 return NULL;
1016
1017 if (hash->size_bits > 0)
1018 key = hash_long(ip, hash->size_bits);
1019 else
1020 key = 0;
1021
1022 hhd = &hash->buckets[key];
1023
1024 hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
1025 if (entry->ip == ip)
1026 return entry;
1027 }
1028 return NULL;
1029}
1030
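ftrace_lookup_ip() is a plain chained-hash lookup: fold the ip down to a bucket with hash_long() and scan that bucket's list. Below is a user-space sketch of the same shape; the golden-ratio multiplier stands in for the kernel's hash_long() and a singly linked list stands in for the hlist (illustrative only):

#include <stdio.h>

struct func_entry {
        struct func_entry *next;
        unsigned long      ip;
};

struct func_hash {
        unsigned long       size_bits;
        struct func_entry **buckets;    /* 1 << size_bits chains */
        unsigned long       count;
};

/* Multiplicative hash folded to size_bits, in the spirit of hash_long(). */
static unsigned long bucket_of(unsigned long ip, unsigned long bits)
{
        return (unsigned long)((ip * 0x9E3779B97F4A7C15ULL) >> (64 - bits));
}

static struct func_entry *lookup_ip(struct func_hash *hash, unsigned long ip)
{
        unsigned long key = hash->size_bits ? bucket_of(ip, hash->size_bits) : 0;
        struct func_entry *entry;

        if (!hash->count)
                return NULL;
        for (entry = hash->buckets[key]; entry; entry = entry->next)
                if (entry->ip == ip)
                        return entry;
        return NULL;
}

int main(void)
{
        struct func_entry *chains[4] = { 0 };
        struct func_hash hash = { .size_bits = 2, .buckets = chains, .count = 1 };
        struct func_entry entry = { .next = NULL, .ip = 0x1234 };

        chains[bucket_of(entry.ip, hash.size_bits)] = &entry;
        printf("found: %d\n", lookup_ip(&hash, 0x1234) != NULL);
        return 0;
}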
1031static void __add_hash_entry(struct ftrace_hash *hash,
1032 struct ftrace_func_entry *entry)
1033{
1034 struct hlist_head *hhd;
1035 unsigned long key;
1036
1037 if (hash->size_bits)
1038 key = hash_long(entry->ip, hash->size_bits);
1039 else
1040 key = 0;
1041
1042 hhd = &hash->buckets[key];
1043 hlist_add_head(&entry->hlist, hhd);
1044 hash->count++;
1045}
1046
1047static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
1048{
1049 struct ftrace_func_entry *entry;
1050
1051 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1052 if (!entry)
1053 return -ENOMEM;
1054
1055 entry->ip = ip;
1056 __add_hash_entry(hash, entry);
1057
1058 return 0;
1059}
1060
1061static void
1062free_hash_entry(struct ftrace_hash *hash,
1063 struct ftrace_func_entry *entry)
1064{
1065 hlist_del(&entry->hlist);
1066 kfree(entry);
1067 hash->count--;
1068}
1069
1070static void
1071remove_hash_entry(struct ftrace_hash *hash,
1072 struct ftrace_func_entry *entry)
1073{
1074 hlist_del(&entry->hlist);
1075 hash->count--;
1076}
1077
1078static void ftrace_hash_clear(struct ftrace_hash *hash)
1079{
1080 struct hlist_head *hhd;
1081 struct hlist_node *tp, *tn;
1082 struct ftrace_func_entry *entry;
1083 int size = 1 << hash->size_bits;
1084 int i;
1085
1086 if (!hash->count)
1087 return;
1088
1089 for (i = 0; i < size; i++) {
1090 hhd = &hash->buckets[i];
1091 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
1092 free_hash_entry(hash, entry);
1093 }
1094 FTRACE_WARN_ON(hash->count);
1095}
1096
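ftrace_hash_clear() iterates with the _safe variant because entries are freed during the walk; the next pointer has to be captured before the entry goes away. The same idea on a plain user-space chain (a sketch, not the kernel hlist API):

#include <stdlib.h>

struct func_entry {
        struct func_entry *next;
        unsigned long      ip;
};

/* Free an entire chain; ->next must be read before free(), which is
 * exactly what hlist_for_each_entry_safe() guarantees for the kernel loop. */
static void clear_chain(struct func_entry **head)
{
        struct func_entry *entry = *head;
        struct func_entry *next;

        while (entry) {
                next = entry->next;
                free(entry);
                entry = next;
        }
        *head = NULL;
}

int main(void)
{
        struct func_entry *head = NULL;

        for (int i = 0; i < 3; i++) {
                struct func_entry *e = malloc(sizeof(*e));

                if (!e)
                        break;
                e->ip = i;
                e->next = head;
                head = e;
        }
        clear_chain(&head);
        return 0;
}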
1097static void free_ftrace_hash(struct ftrace_hash *hash)
1098{
1099 if (!hash || hash == EMPTY_HASH)
1100 return;
1101 ftrace_hash_clear(hash);
1102 kfree(hash->buckets);
1103 kfree(hash);
1104}
1105
1106static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
1107{
1108 struct ftrace_hash *hash;
1109
1110 hash = container_of(rcu, struct ftrace_hash, rcu);
1111 free_ftrace_hash(hash);
1112}
1113
1114static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1115{
1116 if (!hash || hash == EMPTY_HASH)
1117 return;
1118 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1119}
1120
1121static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1122{
1123 struct ftrace_hash *hash;
1124 int size;
1125
1126 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
1127 if (!hash)
1128 return NULL;
1129
1130 size = 1 << size_bits;
1131 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
1132
1133 if (!hash->buckets) {
1134 kfree(hash);
1135 return NULL;
1136 }
1137
1138 hash->size_bits = size_bits;
1139
1140 return hash;
1141}
1142
1143static struct ftrace_hash *
1144alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1145{
1146 struct ftrace_func_entry *entry;
1147 struct ftrace_hash *new_hash;
1148 struct hlist_node *tp;
1149 int size;
1150 int ret;
1151 int i;
1152
1153 new_hash = alloc_ftrace_hash(size_bits);
1154 if (!new_hash)
1155 return NULL;
1156
1157 /* Empty hash? */
1158 if (!hash || !hash->count)
1159 return new_hash;
1160
1161 size = 1 << hash->size_bits;
1162 for (i = 0; i < size; i++) {
1163 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
1164 ret = add_hash_entry(new_hash, entry->ip);
1165 if (ret < 0)
1166 goto free_hash;
1167 }
1168 }
1169
1170 FTRACE_WARN_ON(new_hash->count != hash->count);
1171
1172 return new_hash;
1173
1174 free_hash:
1175 free_ftrace_hash(new_hash);
1176 return NULL;
1177}
1178
1179static int
1180ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1181{
1182 struct ftrace_func_entry *entry;
1183 struct hlist_node *tp, *tn;
1184 struct hlist_head *hhd;
1185 struct ftrace_hash *old_hash;
1186 struct ftrace_hash *new_hash;
1187 unsigned long key;
1188 int size = src->count;
1189 int bits = 0;
1190 int i;
1191
1192 /*
1193 * If the new source is empty, just free dst and assign it
1194 * the empty_hash.
1195 */
1196 if (!src->count) {
1197 free_ftrace_hash_rcu(*dst);
1198 rcu_assign_pointer(*dst, EMPTY_HASH);
1199 return 0;
1200 }
1201
1202 /*
1203 * Make the hash size about 1/2 the # found
1204 */
1205 for (size /= 2; size; size >>= 1)
1206 bits++;
1207
1208 /* Don't allocate too much */
1209 if (bits > FTRACE_HASH_MAX_BITS)
1210 bits = FTRACE_HASH_MAX_BITS;
1211
1212 new_hash = alloc_ftrace_hash(bits);
1213 if (!new_hash)
1214 return -ENOMEM;
1215
1216 size = 1 << src->size_bits;
1217 for (i = 0; i < size; i++) {
1218 hhd = &src->buckets[i];
1219 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
1220 if (bits > 0)
1221 key = hash_long(entry->ip, bits);
1222 else
1223 key = 0;
1224 remove_hash_entry(src, entry);
1225 __add_hash_entry(new_hash, entry);
1226 }
1227 }
1228
1229 old_hash = *dst;
1230 rcu_assign_pointer(*dst, new_hash);
1231 free_ftrace_hash_rcu(old_hash);
1232
1233 return 0;
1234}
1235
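ftrace_hash_move() re-links the existing entries into a fresh table sized so there are roughly half as many buckets as entries, capped at FTRACE_HASH_MAX_BITS, then publishes the new table with rcu_assign_pointer() and defers freeing the old one. The bit-counting rule in isolation (the cap mirrors the define added earlier in this patch; the rest is illustrative):

#include <stdio.h>

#define HASH_MAX_BITS 12        /* mirrors FTRACE_HASH_MAX_BITS */

/* Pick a bucket-count exponent so that (1 << bits) is about count / 2. */
static int size_bits_for(int count)
{
        int size = count;
        int bits = 0;

        for (size /= 2; size; size >>= 1)
                bits++;
        if (bits > HASH_MAX_BITS)
                bits = HASH_MAX_BITS;
        return bits;
}

int main(void)
{
        int counts[] = { 1, 2, 5, 100, 100000 };

        for (unsigned int i = 0; i < sizeof(counts) / sizeof(counts[0]); i++)
                printf("%6d entries -> %2d bits (%d buckets)\n",
                       counts[i], size_bits_for(counts[i]),
                       1 << size_bits_for(counts[i]));
        return 0;
}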
1236/*
1237 * Test the hashes for this ops to see if we want to call
1238 * the ops->func or not.
1239 *
1240 * It's a match if the ip is in the ops->filter_hash or
1241 * the filter_hash does not exist or is empty,
1242 * AND
1243 * the ip is not in the ops->notrace_hash.
1244 *
1245 * This needs to be called with preemption disabled as
1246 * the hashes are freed with call_rcu_sched().
1247 */
1248static int
1249ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1250{
1251 struct ftrace_hash *filter_hash;
1252 struct ftrace_hash *notrace_hash;
1253 int ret;
1254
1255 filter_hash = rcu_dereference_raw(ops->filter_hash);
1256 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1257
1258 if ((!filter_hash || !filter_hash->count ||
1259 ftrace_lookup_ip(filter_hash, ip)) &&
1260 (!notrace_hash || !notrace_hash->count ||
1261 !ftrace_lookup_ip(notrace_hash, ip)))
1262 ret = 1;
1263 else
1264 ret = 0;
1265
1266 return ret;
1267}
1268
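ftrace_ops_test() reduces to: call this ops for the ip if the filter hash is empty or contains the ip, and the notrace hash does not contain it. The same predicate reduced to booleans, with a few worked cases (a sketch, not the kernel helper):

#include <stdio.h>

/* in_filter/in_notrace: is the ip present in the respective hash?
 * filter_empty: the filter hash has no entries, meaning "trace everything". */
static int ops_wants_ip(int filter_empty, int in_filter, int in_notrace)
{
        return (filter_empty || in_filter) && !in_notrace;
}

int main(void)
{
        /* ip listed in the filter and not in notrace -> traced */
        printf("%d\n", ops_wants_ip(0, 1, 0));  /* 1 */
        /* empty filter means every function, unless notrace'd */
        printf("%d\n", ops_wants_ip(1, 0, 0));  /* 1 */
        printf("%d\n", ops_wants_ip(1, 0, 1));  /* 0 */
        /* non-empty filter that does not list the ip -> not traced */
        printf("%d\n", ops_wants_ip(0, 0, 0));  /* 0 */
        return 0;
}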
915/* 1269/*
916 * This is a double for. Do not use 'break' to break out of the loop, 1270 * This is a double for. Do not use 'break' to break out of the loop,
917 * you must use a goto. 1271 * you must use a goto.
@@ -926,6 +1280,105 @@ static struct dyn_ftrace *ftrace_free_records;
926 } \ 1280 } \
927 } 1281 }
928 1282
1283static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1284 int filter_hash,
1285 bool inc)
1286{
1287 struct ftrace_hash *hash;
1288 struct ftrace_hash *other_hash;
1289 struct ftrace_page *pg;
1290 struct dyn_ftrace *rec;
1291 int count = 0;
1292 int all = 0;
1293
1294 /* Only update if the ops has been registered */
1295 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1296 return;
1297
1298 /*
1299 * In the filter_hash case:
1300 * If the count is zero, we update all records.
1301 * Otherwise we just update the items in the hash.
1302 *
1303 * In the notrace_hash case:
1304 * We enable the update in the hash.
1305 * As disabling notrace means enabling the tracing,
 1306	 * and enabling notrace means disabling it, the inc variable
 1307	 * gets inverted.
1308 */
1309 if (filter_hash) {
1310 hash = ops->filter_hash;
1311 other_hash = ops->notrace_hash;
1312 if (!hash || !hash->count)
1313 all = 1;
1314 } else {
1315 inc = !inc;
1316 hash = ops->notrace_hash;
1317 other_hash = ops->filter_hash;
1318 /*
1319 * If the notrace hash has no items,
1320 * then there's nothing to do.
1321 */
1322 if (hash && !hash->count)
1323 return;
1324 }
1325
1326 do_for_each_ftrace_rec(pg, rec) {
1327 int in_other_hash = 0;
1328 int in_hash = 0;
1329 int match = 0;
1330
1331 if (all) {
1332 /*
1333 * Only the filter_hash affects all records.
1334 * Update if the record is not in the notrace hash.
1335 */
1336 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1337 match = 1;
1338 } else {
1339 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
1340 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
1341
 1342			/* A filter-hash record counts only when it is not also
 1343			 * in the notrace hash; a notrace-hash record counts only
 1344			 * when it is actually traced (filter hash empty or hit). */
1345 if (filter_hash && in_hash && !in_other_hash)
1346 match = 1;
1347 else if (!filter_hash && in_hash &&
1348 (in_other_hash || !other_hash->count))
1349 match = 1;
1350 }
1351 if (!match)
1352 continue;
1353
1354 if (inc) {
1355 rec->flags++;
1356 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1357 return;
1358 } else {
1359 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1360 return;
1361 rec->flags--;
1362 }
1363 count++;
1364 /* Shortcut, if we handled all records, we are done. */
1365 if (!all && count == hash->count)
1366 return;
1367 } while_for_each_ftrace_rec();
1368}
1369
1370static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
1371 int filter_hash)
1372{
1373 __ftrace_hash_rec_update(ops, filter_hash, 0);
1374}
1375
1376static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1377 int filter_hash)
1378{
1379 __ftrace_hash_rec_update(ops, filter_hash, 1);
1380}
1381
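With this patch, the bits of rec->flags outside FTRACE_FL_MASK act as a per-record reference count: __ftrace_hash_rec_update() bumps or drops it for each registered ops whose hashes select the record, and __ftrace_replace_code() later enables exactly the records whose count is non-zero. A reduced sketch of that counting (the bit layout below is made up for illustration):

#include <stdio.h>

#define FL_MASK         (3u << 30)              /* illustrative status bits */
#define REF_MAX         ((1u << 30) - 1)        /* the rest is the ref count */

struct record { unsigned int flags; };

/* One registered ops started caring about this record's ip. */
static void rec_ref_inc(struct record *rec)
{
        if ((rec->flags & ~FL_MASK) == REF_MAX)
                return;                         /* refuse to overflow */
        rec->flags++;
}

/* One registered ops stopped caring about it. */
static void rec_ref_dec(struct record *rec)
{
        if ((rec->flags & ~FL_MASK) == 0)
                return;                         /* refuse to underflow */
        rec->flags--;
}

/* Should the call site be patched to call into ftrace? */
static int rec_wanted(const struct record *rec)
{
        return (rec->flags & ~FL_MASK) != 0;
}

int main(void)
{
        struct record rec = { 0 };

        rec_ref_inc(&rec);
        printf("patch in: %d\n", rec_wanted(&rec));     /* 1 */
        rec_ref_dec(&rec);
        printf("patch in: %d\n", rec_wanted(&rec));     /* 0 */
        return 0;
}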
929static void ftrace_free_rec(struct dyn_ftrace *rec) 1382static void ftrace_free_rec(struct dyn_ftrace *rec)
930{ 1383{
931 rec->freelist = ftrace_free_records; 1384 rec->freelist = ftrace_free_records;
@@ -1047,18 +1500,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1047 ftrace_addr = (unsigned long)FTRACE_ADDR; 1500 ftrace_addr = (unsigned long)FTRACE_ADDR;
1048 1501
1049 /* 1502 /*
1050 * If this record is not to be traced or we want to disable it, 1503 * If we are enabling tracing:
1051 * then disable it. 1504 *
1505 * If the record has a ref count, then we need to enable it
1506 * because someone is using it.
1052 * 1507 *
 1053	 * If we want to enable it and filtering is off, then enable it. 1508	 * Otherwise we make sure it is disabled.
1054 * 1509 *
1055 * If we want to enable it and filtering is on, enable it only if 1510 * If we are disabling tracing, then disable all records that
1056 * it's filtered 1511 * are enabled.
1057 */ 1512 */
1058 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { 1513 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1059 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) 1514 flag = FTRACE_FL_ENABLED;
1060 flag = FTRACE_FL_ENABLED;
1061 }
1062 1515
1063 /* If the state of this record hasn't changed, then do nothing */ 1516 /* If the state of this record hasn't changed, then do nothing */
1064 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1517 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1079,19 +1532,16 @@ static void ftrace_replace_code(int enable)
1079 struct ftrace_page *pg; 1532 struct ftrace_page *pg;
1080 int failed; 1533 int failed;
1081 1534
1535 if (unlikely(ftrace_disabled))
1536 return;
1537
1082 do_for_each_ftrace_rec(pg, rec) { 1538 do_for_each_ftrace_rec(pg, rec) {
1083 /* 1539 /* Skip over free records */
1084 * Skip over free records, records that have 1540 if (rec->flags & FTRACE_FL_FREE)
1085 * failed and not converted.
1086 */
1087 if (rec->flags & FTRACE_FL_FREE ||
1088 rec->flags & FTRACE_FL_FAILED ||
1089 !(rec->flags & FTRACE_FL_CONVERTED))
1090 continue; 1541 continue;
1091 1542
1092 failed = __ftrace_replace_code(rec, enable); 1543 failed = __ftrace_replace_code(rec, enable);
1093 if (failed) { 1544 if (failed) {
1094 rec->flags |= FTRACE_FL_FAILED;
1095 ftrace_bug(failed, rec->ip); 1545 ftrace_bug(failed, rec->ip);
1096 /* Stop processing */ 1546 /* Stop processing */
1097 return; 1547 return;
@@ -1107,10 +1557,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1107 1557
1108 ip = rec->ip; 1558 ip = rec->ip;
1109 1559
1560 if (unlikely(ftrace_disabled))
1561 return 0;
1562
1110 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 1563 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
1111 if (ret) { 1564 if (ret) {
1112 ftrace_bug(ret, ip); 1565 ftrace_bug(ret, ip);
1113 rec->flags |= FTRACE_FL_FAILED;
1114 return 0; 1566 return 0;
1115 } 1567 }
1116 return 1; 1568 return 1;
@@ -1171,6 +1623,7 @@ static void ftrace_run_update_code(int command)
1171 1623
1172static ftrace_func_t saved_ftrace_func; 1624static ftrace_func_t saved_ftrace_func;
1173static int ftrace_start_up; 1625static int ftrace_start_up;
1626static int global_start_up;
1174 1627
1175static void ftrace_startup_enable(int command) 1628static void ftrace_startup_enable(int command)
1176{ 1629{
@@ -1185,19 +1638,36 @@ static void ftrace_startup_enable(int command)
1185 ftrace_run_update_code(command); 1638 ftrace_run_update_code(command);
1186} 1639}
1187 1640
1188static void ftrace_startup(int command) 1641static void ftrace_startup(struct ftrace_ops *ops, int command)
1189{ 1642{
1643 bool hash_enable = true;
1644
1190 if (unlikely(ftrace_disabled)) 1645 if (unlikely(ftrace_disabled))
1191 return; 1646 return;
1192 1647
1193 ftrace_start_up++; 1648 ftrace_start_up++;
1194 command |= FTRACE_ENABLE_CALLS; 1649 command |= FTRACE_ENABLE_CALLS;
1195 1650
1651 /* ops marked global share the filter hashes */
1652 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1653 ops = &global_ops;
1654 /* Don't update hash if global is already set */
1655 if (global_start_up)
1656 hash_enable = false;
1657 global_start_up++;
1658 }
1659
1660 ops->flags |= FTRACE_OPS_FL_ENABLED;
1661 if (hash_enable)
1662 ftrace_hash_rec_enable(ops, 1);
1663
1196 ftrace_startup_enable(command); 1664 ftrace_startup_enable(command);
1197} 1665}
1198 1666
1199static void ftrace_shutdown(int command) 1667static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1200{ 1668{
1669 bool hash_disable = true;
1670
1201 if (unlikely(ftrace_disabled)) 1671 if (unlikely(ftrace_disabled))
1202 return; 1672 return;
1203 1673
@@ -1209,6 +1679,23 @@ static void ftrace_shutdown(int command)
1209 */ 1679 */
1210 WARN_ON_ONCE(ftrace_start_up < 0); 1680 WARN_ON_ONCE(ftrace_start_up < 0);
1211 1681
1682 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1683 ops = &global_ops;
1684 global_start_up--;
1685 WARN_ON_ONCE(global_start_up < 0);
1686 /* Don't update hash if global still has users */
1687 if (global_start_up) {
1688 WARN_ON_ONCE(!ftrace_start_up);
1689 hash_disable = false;
1690 }
1691 }
1692
1693 if (hash_disable)
1694 ftrace_hash_rec_disable(ops, 1);
1695
1696 if (ops != &global_ops || !global_start_up)
1697 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1698
1212 if (!ftrace_start_up) 1699 if (!ftrace_start_up)
1213 command |= FTRACE_DISABLE_CALLS; 1700 command |= FTRACE_DISABLE_CALLS;
1214 1701
@@ -1268,15 +1755,15 @@ static int ftrace_update_code(struct module *mod)
1268 p->flags = 0L; 1755 p->flags = 0L;
1269 1756
1270 /* 1757 /*
1271 * Do the initial record convertion from mcount jump 1758 * Do the initial record conversion from mcount jump
1272 * to the NOP instructions. 1759 * to the NOP instructions.
1273 */ 1760 */
1274 if (!ftrace_code_disable(mod, p)) { 1761 if (!ftrace_code_disable(mod, p)) {
1275 ftrace_free_rec(p); 1762 ftrace_free_rec(p);
1276 continue; 1763 /* Game over */
1764 break;
1277 } 1765 }
1278 1766
1279 p->flags |= FTRACE_FL_CONVERTED;
1280 ftrace_update_cnt++; 1767 ftrace_update_cnt++;
1281 1768
1282 /* 1769 /*
@@ -1351,9 +1838,9 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1351enum { 1838enum {
1352 FTRACE_ITER_FILTER = (1 << 0), 1839 FTRACE_ITER_FILTER = (1 << 0),
1353 FTRACE_ITER_NOTRACE = (1 << 1), 1840 FTRACE_ITER_NOTRACE = (1 << 1),
1354 FTRACE_ITER_FAILURES = (1 << 2), 1841 FTRACE_ITER_PRINTALL = (1 << 2),
1355 FTRACE_ITER_PRINTALL = (1 << 3), 1842 FTRACE_ITER_HASH = (1 << 3),
1356 FTRACE_ITER_HASH = (1 << 4), 1843 FTRACE_ITER_ENABLED = (1 << 4),
1357}; 1844};
1358 1845
1359#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1846#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1365,6 +1852,8 @@ struct ftrace_iterator {
1365 struct dyn_ftrace *func; 1852 struct dyn_ftrace *func;
1366 struct ftrace_func_probe *probe; 1853 struct ftrace_func_probe *probe;
1367 struct trace_parser parser; 1854 struct trace_parser parser;
1855 struct ftrace_hash *hash;
1856 struct ftrace_ops *ops;
1368 int hidx; 1857 int hidx;
1369 int idx; 1858 int idx;
1370 unsigned flags; 1859 unsigned flags;
@@ -1461,8 +1950,12 @@ static void *
1461t_next(struct seq_file *m, void *v, loff_t *pos) 1950t_next(struct seq_file *m, void *v, loff_t *pos)
1462{ 1951{
1463 struct ftrace_iterator *iter = m->private; 1952 struct ftrace_iterator *iter = m->private;
1953 struct ftrace_ops *ops = &global_ops;
1464 struct dyn_ftrace *rec = NULL; 1954 struct dyn_ftrace *rec = NULL;
1465 1955
1956 if (unlikely(ftrace_disabled))
1957 return NULL;
1958
1466 if (iter->flags & FTRACE_ITER_HASH) 1959 if (iter->flags & FTRACE_ITER_HASH)
1467 return t_hash_next(m, pos); 1960 return t_hash_next(m, pos);
1468 1961
@@ -1483,17 +1976,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1483 rec = &iter->pg->records[iter->idx++]; 1976 rec = &iter->pg->records[iter->idx++];
1484 if ((rec->flags & FTRACE_FL_FREE) || 1977 if ((rec->flags & FTRACE_FL_FREE) ||
1485 1978
1486 (!(iter->flags & FTRACE_ITER_FAILURES) &&
1487 (rec->flags & FTRACE_FL_FAILED)) ||
1488
1489 ((iter->flags & FTRACE_ITER_FAILURES) &&
1490 !(rec->flags & FTRACE_FL_FAILED)) ||
1491
1492 ((iter->flags & FTRACE_ITER_FILTER) && 1979 ((iter->flags & FTRACE_ITER_FILTER) &&
1493 !(rec->flags & FTRACE_FL_FILTER)) || 1980 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
1494 1981
1495 ((iter->flags & FTRACE_ITER_NOTRACE) && 1982 ((iter->flags & FTRACE_ITER_NOTRACE) &&
1496 !(rec->flags & FTRACE_FL_NOTRACE))) { 1983 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
1984
1985 ((iter->flags & FTRACE_ITER_ENABLED) &&
1986 !(rec->flags & ~FTRACE_FL_MASK))) {
1987
1497 rec = NULL; 1988 rec = NULL;
1498 goto retry; 1989 goto retry;
1499 } 1990 }
@@ -1517,10 +2008,15 @@ static void reset_iter_read(struct ftrace_iterator *iter)
1517static void *t_start(struct seq_file *m, loff_t *pos) 2008static void *t_start(struct seq_file *m, loff_t *pos)
1518{ 2009{
1519 struct ftrace_iterator *iter = m->private; 2010 struct ftrace_iterator *iter = m->private;
2011 struct ftrace_ops *ops = &global_ops;
1520 void *p = NULL; 2012 void *p = NULL;
1521 loff_t l; 2013 loff_t l;
1522 2014
1523 mutex_lock(&ftrace_lock); 2015 mutex_lock(&ftrace_lock);
2016
2017 if (unlikely(ftrace_disabled))
2018 return NULL;
2019
1524 /* 2020 /*
1525 * If an lseek was done, then reset and start from beginning. 2021 * If an lseek was done, then reset and start from beginning.
1526 */ 2022 */
@@ -1532,7 +2028,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1532 * off, we can short cut and just print out that all 2028 * off, we can short cut and just print out that all
1533 * functions are enabled. 2029 * functions are enabled.
1534 */ 2030 */
1535 if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { 2031 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
1536 if (*pos > 0) 2032 if (*pos > 0)
1537 return t_hash_start(m, pos); 2033 return t_hash_start(m, pos);
1538 iter->flags |= FTRACE_ITER_PRINTALL; 2034 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -1590,7 +2086,11 @@ static int t_show(struct seq_file *m, void *v)
1590 if (!rec) 2086 if (!rec)
1591 return 0; 2087 return 0;
1592 2088
1593 seq_printf(m, "%ps\n", (void *)rec->ip); 2089 seq_printf(m, "%ps", (void *)rec->ip);
2090 if (iter->flags & FTRACE_ITER_ENABLED)
2091 seq_printf(m, " (%ld)",
2092 rec->flags & ~FTRACE_FL_MASK);
2093 seq_printf(m, "\n");
1594 2094
1595 return 0; 2095 return 0;
1596} 2096}
@@ -1630,44 +2130,46 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1630} 2130}
1631 2131
1632static int 2132static int
1633ftrace_failures_open(struct inode *inode, struct file *file) 2133ftrace_enabled_open(struct inode *inode, struct file *file)
1634{ 2134{
1635 int ret;
1636 struct seq_file *m;
1637 struct ftrace_iterator *iter; 2135 struct ftrace_iterator *iter;
2136 int ret;
2137
2138 if (unlikely(ftrace_disabled))
2139 return -ENODEV;
2140
2141 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2142 if (!iter)
2143 return -ENOMEM;
2144
2145 iter->pg = ftrace_pages_start;
2146 iter->flags = FTRACE_ITER_ENABLED;
1638 2147
1639 ret = ftrace_avail_open(inode, file); 2148 ret = seq_open(file, &show_ftrace_seq_ops);
1640 if (!ret) { 2149 if (!ret) {
1641 m = file->private_data; 2150 struct seq_file *m = file->private_data;
1642 iter = m->private; 2151
1643 iter->flags = FTRACE_ITER_FAILURES; 2152 m->private = iter;
2153 } else {
2154 kfree(iter);
1644 } 2155 }
1645 2156
1646 return ret; 2157 return ret;
1647} 2158}
1648 2159
1649 2160static void ftrace_filter_reset(struct ftrace_hash *hash)
1650static void ftrace_filter_reset(int enable)
1651{ 2161{
1652 struct ftrace_page *pg;
1653 struct dyn_ftrace *rec;
1654 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1655
1656 mutex_lock(&ftrace_lock); 2162 mutex_lock(&ftrace_lock);
1657 if (enable) 2163 ftrace_hash_clear(hash);
1658 ftrace_filtered = 0;
1659 do_for_each_ftrace_rec(pg, rec) {
1660 if (rec->flags & FTRACE_FL_FAILED)
1661 continue;
1662 rec->flags &= ~type;
1663 } while_for_each_ftrace_rec();
1664 mutex_unlock(&ftrace_lock); 2164 mutex_unlock(&ftrace_lock);
1665} 2165}
1666 2166
1667static int 2167static int
1668ftrace_regex_open(struct inode *inode, struct file *file, int enable) 2168ftrace_regex_open(struct ftrace_ops *ops, int flag,
2169 struct inode *inode, struct file *file)
1669{ 2170{
1670 struct ftrace_iterator *iter; 2171 struct ftrace_iterator *iter;
2172 struct ftrace_hash *hash;
1671 int ret = 0; 2173 int ret = 0;
1672 2174
1673 if (unlikely(ftrace_disabled)) 2175 if (unlikely(ftrace_disabled))
@@ -1682,21 +2184,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1682 return -ENOMEM; 2184 return -ENOMEM;
1683 } 2185 }
1684 2186
2187 if (flag & FTRACE_ITER_NOTRACE)
2188 hash = ops->notrace_hash;
2189 else
2190 hash = ops->filter_hash;
2191
2192 iter->ops = ops;
2193 iter->flags = flag;
2194
2195 if (file->f_mode & FMODE_WRITE) {
2196 mutex_lock(&ftrace_lock);
2197 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
2198 mutex_unlock(&ftrace_lock);
2199
2200 if (!iter->hash) {
2201 trace_parser_put(&iter->parser);
2202 kfree(iter);
2203 return -ENOMEM;
2204 }
2205 }
2206
1685 mutex_lock(&ftrace_regex_lock); 2207 mutex_lock(&ftrace_regex_lock);
2208
1686 if ((file->f_mode & FMODE_WRITE) && 2209 if ((file->f_mode & FMODE_WRITE) &&
1687 (file->f_flags & O_TRUNC)) 2210 (file->f_flags & O_TRUNC))
1688 ftrace_filter_reset(enable); 2211 ftrace_filter_reset(iter->hash);
1689 2212
1690 if (file->f_mode & FMODE_READ) { 2213 if (file->f_mode & FMODE_READ) {
1691 iter->pg = ftrace_pages_start; 2214 iter->pg = ftrace_pages_start;
1692 iter->flags = enable ? FTRACE_ITER_FILTER :
1693 FTRACE_ITER_NOTRACE;
1694 2215
1695 ret = seq_open(file, &show_ftrace_seq_ops); 2216 ret = seq_open(file, &show_ftrace_seq_ops);
1696 if (!ret) { 2217 if (!ret) {
1697 struct seq_file *m = file->private_data; 2218 struct seq_file *m = file->private_data;
1698 m->private = iter; 2219 m->private = iter;
1699 } else { 2220 } else {
2221 /* Failed */
2222 free_ftrace_hash(iter->hash);
1700 trace_parser_put(&iter->parser); 2223 trace_parser_put(&iter->parser);
1701 kfree(iter); 2224 kfree(iter);
1702 } 2225 }
@@ -1710,13 +2233,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1710static int 2233static int
1711ftrace_filter_open(struct inode *inode, struct file *file) 2234ftrace_filter_open(struct inode *inode, struct file *file)
1712{ 2235{
1713 return ftrace_regex_open(inode, file, 1); 2236 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
2237 inode, file);
1714} 2238}
1715 2239
1716static int 2240static int
1717ftrace_notrace_open(struct inode *inode, struct file *file) 2241ftrace_notrace_open(struct inode *inode, struct file *file)
1718{ 2242{
1719 return ftrace_regex_open(inode, file, 0); 2243 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
2244 inode, file);
1720} 2245}
1721 2246
1722static loff_t 2247static loff_t
@@ -1761,86 +2286,99 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1761} 2286}
1762 2287
1763static int 2288static int
1764ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) 2289enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
2290{
2291 struct ftrace_func_entry *entry;
2292 int ret = 0;
2293
2294 entry = ftrace_lookup_ip(hash, rec->ip);
2295 if (not) {
2296 /* Do nothing if it doesn't exist */
2297 if (!entry)
2298 return 0;
2299
2300 free_hash_entry(hash, entry);
2301 } else {
2302 /* Do nothing if it exists */
2303 if (entry)
2304 return 0;
2305
2306 ret = add_hash_entry(hash, rec->ip);
2307 }
2308 return ret;
2309}
2310
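enter_record() keeps the regex walk idempotent: a negated pattern removes the ip only if it is present, a normal pattern adds it only if it is missing. The same toggle on a plain user-space chain (sketch only; the kernel version works on the ftrace_hash buckets above):

#include <stdlib.h>

struct func_entry {
        struct func_entry *next;
        unsigned long      ip;
};

/* Idempotent add/remove on a single chain, mirroring enter_record():
 * with "not" set a missing entry is fine, without it a duplicate is. */
static int toggle_ip(struct func_entry **head, unsigned long ip, int not)
{
        struct func_entry **p, *entry;

        for (p = head; (entry = *p); p = &entry->next)
                if (entry->ip == ip)
                        break;

        if (not) {
                if (!entry)
                        return 0;       /* nothing to remove */
                *p = entry->next;
                free(entry);
                return 0;
        }

        if (entry)
                return 0;               /* already present */

        entry = malloc(sizeof(*entry));
        if (!entry)
                return -1;
        entry->ip = ip;
        entry->next = *head;
        *head = entry;
        return 0;
}

int main(void)
{
        struct func_entry *head = NULL;

        toggle_ip(&head, 0x1234, 0);    /* add */
        toggle_ip(&head, 0x1234, 0);    /* duplicate: no-op */
        toggle_ip(&head, 0x1234, 1);    /* "!" pattern: remove */
        return head ? 1 : 0;
}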
2311static int
2312ftrace_match_record(struct dyn_ftrace *rec, char *mod,
2313 char *regex, int len, int type)
1765{ 2314{
1766 char str[KSYM_SYMBOL_LEN]; 2315 char str[KSYM_SYMBOL_LEN];
2316 char *modname;
2317
2318 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
2319
2320 if (mod) {
2321 /* module lookup requires matching the module */
2322 if (!modname || strcmp(modname, mod))
2323 return 0;
2324
2325 /* blank search means to match all funcs in the mod */
2326 if (!len)
2327 return 1;
2328 }
1767 2329
1768 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1769 return ftrace_match(str, regex, len, type); 2330 return ftrace_match(str, regex, len, type);
1770} 2331}
1771 2332
1772static int ftrace_match_records(char *buff, int len, int enable) 2333static int
2334match_records(struct ftrace_hash *hash, char *buff,
2335 int len, char *mod, int not)
1773{ 2336{
1774 unsigned int search_len; 2337 unsigned search_len = 0;
1775 struct ftrace_page *pg; 2338 struct ftrace_page *pg;
1776 struct dyn_ftrace *rec; 2339 struct dyn_ftrace *rec;
1777 unsigned long flag; 2340 int type = MATCH_FULL;
1778 char *search; 2341 char *search = buff;
1779 int type;
1780 int not;
1781 int found = 0; 2342 int found = 0;
2343 int ret;
1782 2344
1783 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 2345 if (len) {
1784 type = filter_parse_regex(buff, len, &search, &not); 2346 type = filter_parse_regex(buff, len, &search, &not);
1785 2347 search_len = strlen(search);
1786 search_len = strlen(search); 2348 }
1787 2349
1788 mutex_lock(&ftrace_lock); 2350 mutex_lock(&ftrace_lock);
1789 do_for_each_ftrace_rec(pg, rec) {
1790 2351
1791 if (rec->flags & FTRACE_FL_FAILED) 2352 if (unlikely(ftrace_disabled))
1792 continue; 2353 goto out_unlock;
1793 2354
1794 if (ftrace_match_record(rec, search, search_len, type)) { 2355 do_for_each_ftrace_rec(pg, rec) {
1795 if (not) 2356
1796 rec->flags &= ~flag; 2357 if (ftrace_match_record(rec, mod, search, search_len, type)) {
1797 else 2358 ret = enter_record(hash, rec, not);
1798 rec->flags |= flag; 2359 if (ret < 0) {
2360 found = ret;
2361 goto out_unlock;
2362 }
1799 found = 1; 2363 found = 1;
1800 } 2364 }
1801 /*
1802 * Only enable filtering if we have a function that
1803 * is filtered on.
1804 */
1805 if (enable && (rec->flags & FTRACE_FL_FILTER))
1806 ftrace_filtered = 1;
1807 } while_for_each_ftrace_rec(); 2365 } while_for_each_ftrace_rec();
2366 out_unlock:
1808 mutex_unlock(&ftrace_lock); 2367 mutex_unlock(&ftrace_lock);
1809 2368
1810 return found; 2369 return found;
1811} 2370}
1812 2371
1813static int 2372static int
1814ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, 2373ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
1815 char *regex, int len, int type)
1816{ 2374{
1817 char str[KSYM_SYMBOL_LEN]; 2375 return match_records(hash, buff, len, NULL, 0);
1818 char *modname;
1819
1820 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
1821
1822 if (!modname || strcmp(modname, mod))
1823 return 0;
1824
1825 /* blank search means to match all funcs in the mod */
1826 if (len)
1827 return ftrace_match(str, regex, len, type);
1828 else
1829 return 1;
1830} 2376}
1831 2377
1832static int ftrace_match_module_records(char *buff, char *mod, int enable) 2378static int
2379ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
1833{ 2380{
1834 unsigned search_len = 0;
1835 struct ftrace_page *pg;
1836 struct dyn_ftrace *rec;
1837 int type = MATCH_FULL;
1838 char *search = buff;
1839 unsigned long flag;
1840 int not = 0; 2381 int not = 0;
1841 int found = 0;
1842
1843 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1844 2382
1845 /* blank or '*' mean the same */ 2383 /* blank or '*' mean the same */
1846 if (strcmp(buff, "*") == 0) 2384 if (strcmp(buff, "*") == 0)
@@ -1852,32 +2390,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1852 not = 1; 2390 not = 1;
1853 } 2391 }
1854 2392
1855 if (strlen(buff)) { 2393 return match_records(hash, buff, strlen(buff), mod, not);
1856 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1857 search_len = strlen(search);
1858 }
1859
1860 mutex_lock(&ftrace_lock);
1861 do_for_each_ftrace_rec(pg, rec) {
1862
1863 if (rec->flags & FTRACE_FL_FAILED)
1864 continue;
1865
1866 if (ftrace_match_module_record(rec, mod,
1867 search, search_len, type)) {
1868 if (not)
1869 rec->flags &= ~flag;
1870 else
1871 rec->flags |= flag;
1872 found = 1;
1873 }
1874 if (enable && (rec->flags & FTRACE_FL_FILTER))
1875 ftrace_filtered = 1;
1876
1877 } while_for_each_ftrace_rec();
1878 mutex_unlock(&ftrace_lock);
1879
1880 return found;
1881} 2394}
1882 2395
1883/* 2396/*
@@ -1888,7 +2401,10 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1888static int 2401static int
1889ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2402ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1890{ 2403{
2404 struct ftrace_ops *ops = &global_ops;
2405 struct ftrace_hash *hash;
1891 char *mod; 2406 char *mod;
2407 int ret = -EINVAL;
1892 2408
1893 /* 2409 /*
1894 * cmd == 'mod' because we only registered this func 2410 * cmd == 'mod' because we only registered this func
@@ -1900,15 +2416,24 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1900 2416
1901 /* we must have a module name */ 2417 /* we must have a module name */
1902 if (!param) 2418 if (!param)
1903 return -EINVAL; 2419 return ret;
1904 2420
1905 mod = strsep(&param, ":"); 2421 mod = strsep(&param, ":");
1906 if (!strlen(mod)) 2422 if (!strlen(mod))
1907 return -EINVAL; 2423 return ret;
1908 2424
1909 if (ftrace_match_module_records(func, mod, enable)) 2425 if (enable)
1910 return 0; 2426 hash = ops->filter_hash;
1911 return -EINVAL; 2427 else
2428 hash = ops->notrace_hash;
2429
2430 ret = ftrace_match_module_records(hash, func, mod);
2431 if (!ret)
2432 ret = -EINVAL;
2433 if (ret < 0)
2434 return ret;
2435
2436 return 0;
1912} 2437}
1913 2438
1914static struct ftrace_func_command ftrace_mod_cmd = { 2439static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1959,6 +2484,7 @@ static int ftrace_probe_registered;
1959 2484
1960static void __enable_ftrace_function_probe(void) 2485static void __enable_ftrace_function_probe(void)
1961{ 2486{
2487 int ret;
1962 int i; 2488 int i;
1963 2489
1964 if (ftrace_probe_registered) 2490 if (ftrace_probe_registered)
@@ -1973,13 +2499,16 @@ static void __enable_ftrace_function_probe(void)
1973 if (i == FTRACE_FUNC_HASHSIZE) 2499 if (i == FTRACE_FUNC_HASHSIZE)
1974 return; 2500 return;
1975 2501
1976 __register_ftrace_function(&trace_probe_ops); 2502 ret = __register_ftrace_function(&trace_probe_ops);
1977 ftrace_startup(0); 2503 if (!ret)
2504 ftrace_startup(&trace_probe_ops, 0);
2505
1978 ftrace_probe_registered = 1; 2506 ftrace_probe_registered = 1;
1979} 2507}
1980 2508
1981static void __disable_ftrace_function_probe(void) 2509static void __disable_ftrace_function_probe(void)
1982{ 2510{
2511 int ret;
1983 int i; 2512 int i;
1984 2513
1985 if (!ftrace_probe_registered) 2514 if (!ftrace_probe_registered)
@@ -1992,8 +2521,10 @@ static void __disable_ftrace_function_probe(void)
1992 } 2521 }
1993 2522
1994 /* no more funcs left */ 2523 /* no more funcs left */
1995 __unregister_ftrace_function(&trace_probe_ops); 2524 ret = __unregister_ftrace_function(&trace_probe_ops);
1996 ftrace_shutdown(0); 2525 if (!ret)
2526 ftrace_shutdown(&trace_probe_ops, 0);
2527
1997 ftrace_probe_registered = 0; 2528 ftrace_probe_registered = 0;
1998} 2529}
1999 2530
@@ -2029,12 +2560,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2029 return -EINVAL; 2560 return -EINVAL;
2030 2561
2031 mutex_lock(&ftrace_lock); 2562 mutex_lock(&ftrace_lock);
2032 do_for_each_ftrace_rec(pg, rec) {
2033 2563
2034 if (rec->flags & FTRACE_FL_FAILED) 2564 if (unlikely(ftrace_disabled))
2035 continue; 2565 goto out_unlock;
2566
2567 do_for_each_ftrace_rec(pg, rec) {
2036 2568
2037 if (!ftrace_match_record(rec, search, len, type)) 2569 if (!ftrace_match_record(rec, NULL, search, len, type))
2038 continue; 2570 continue;
2039 2571
2040 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2572 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -2195,18 +2727,22 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)
2195 return ret; 2727 return ret;
2196} 2728}
2197 2729
2198static int ftrace_process_regex(char *buff, int len, int enable) 2730static int ftrace_process_regex(struct ftrace_hash *hash,
2731 char *buff, int len, int enable)
2199{ 2732{
2200 char *func, *command, *next = buff; 2733 char *func, *command, *next = buff;
2201 struct ftrace_func_command *p; 2734 struct ftrace_func_command *p;
2202 int ret = -EINVAL; 2735 int ret;
2203 2736
2204 func = strsep(&next, ":"); 2737 func = strsep(&next, ":");
2205 2738
2206 if (!next) { 2739 if (!next) {
2207 if (ftrace_match_records(func, len, enable)) 2740 ret = ftrace_match_records(hash, func, len);
2208 return 0; 2741 if (!ret)
2209 return ret; 2742 ret = -EINVAL;
2743 if (ret < 0)
2744 return ret;
2745 return 0;
2210 } 2746 }
2211 2747
2212 /* command found */ 2748 /* command found */
@@ -2239,6 +2775,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2239 2775
2240 mutex_lock(&ftrace_regex_lock); 2776 mutex_lock(&ftrace_regex_lock);
2241 2777
2778 ret = -ENODEV;
2779 if (unlikely(ftrace_disabled))
2780 goto out_unlock;
2781
2242 if (file->f_mode & FMODE_READ) { 2782 if (file->f_mode & FMODE_READ) {
2243 struct seq_file *m = file->private_data; 2783 struct seq_file *m = file->private_data;
2244 iter = m->private; 2784 iter = m->private;
@@ -2250,7 +2790,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2250 2790
2251 if (read >= 0 && trace_parser_loaded(parser) && 2791 if (read >= 0 && trace_parser_loaded(parser) &&
2252 !trace_parser_cont(parser)) { 2792 !trace_parser_cont(parser)) {
2253 ret = ftrace_process_regex(parser->buffer, 2793 ret = ftrace_process_regex(iter->hash, parser->buffer,
2254 parser->idx, enable); 2794 parser->idx, enable);
2255 trace_parser_clear(parser); 2795 trace_parser_clear(parser);
2256 if (ret) 2796 if (ret)
@@ -2278,22 +2818,49 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
2278 return ftrace_regex_write(file, ubuf, cnt, ppos, 0); 2818 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
2279} 2819}
2280 2820
2281static void 2821static int
2282ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) 2822ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2823 int reset, int enable)
2283{ 2824{
2825 struct ftrace_hash **orig_hash;
2826 struct ftrace_hash *hash;
2827 int ret;
2828
2829 /* All global ops uses the global ops filters */
2830 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
2831 ops = &global_ops;
2832
2284 if (unlikely(ftrace_disabled)) 2833 if (unlikely(ftrace_disabled))
2285 return; 2834 return -ENODEV;
2835
2836 if (enable)
2837 orig_hash = &ops->filter_hash;
2838 else
2839 orig_hash = &ops->notrace_hash;
2840
2841 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2842 if (!hash)
2843 return -ENOMEM;
2286 2844
2287 mutex_lock(&ftrace_regex_lock); 2845 mutex_lock(&ftrace_regex_lock);
2288 if (reset) 2846 if (reset)
2289 ftrace_filter_reset(enable); 2847 ftrace_filter_reset(hash);
2290 if (buf) 2848 if (buf)
2291 ftrace_match_records(buf, len, enable); 2849 ftrace_match_records(hash, buf, len);
2850
2851 mutex_lock(&ftrace_lock);
2852 ret = ftrace_hash_move(orig_hash, hash);
2853 mutex_unlock(&ftrace_lock);
2854
2292 mutex_unlock(&ftrace_regex_lock); 2855 mutex_unlock(&ftrace_regex_lock);
2856
2857 free_ftrace_hash(hash);
2858 return ret;
2293} 2859}
2294 2860
2295/** 2861/**
2296 * ftrace_set_filter - set a function to filter on in ftrace 2862 * ftrace_set_filter - set a function to filter on in ftrace
2863 * @ops - the ops to set the filter with
2297 * @buf - the string that holds the function filter text. 2864 * @buf - the string that holds the function filter text.
2298 * @len - the length of the string. 2865 * @len - the length of the string.
2299 * @reset - non zero to reset all filters before applying this filter. 2866 * @reset - non zero to reset all filters before applying this filter.
@@ -2301,13 +2868,16 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
2301 * Filters denote which functions should be enabled when tracing is enabled. 2868 * Filters denote which functions should be enabled when tracing is enabled.
2302 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 2869 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2303 */ 2870 */
2304void ftrace_set_filter(unsigned char *buf, int len, int reset) 2871void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
2872 int len, int reset)
2305{ 2873{
2306 ftrace_set_regex(buf, len, reset, 1); 2874 ftrace_set_regex(ops, buf, len, reset, 1);
2307} 2875}
2876EXPORT_SYMBOL_GPL(ftrace_set_filter);
2308 2877
2309/** 2878/**
2310 * ftrace_set_notrace - set a function to not trace in ftrace 2879 * ftrace_set_notrace - set a function to not trace in ftrace
2880 * @ops - the ops to set the notrace filter with
2311 * @buf - the string that holds the function notrace text. 2881 * @buf - the string that holds the function notrace text.
2312 * @len - the length of the string. 2882 * @len - the length of the string.
2313 * @reset - non zero to reset all filters before applying this filter. 2883 * @reset - non zero to reset all filters before applying this filter.
@@ -2316,10 +2886,44 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset)
2316 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 2886 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2317 * for tracing. 2887 * for tracing.
2318 */ 2888 */
2319void ftrace_set_notrace(unsigned char *buf, int len, int reset) 2889void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
2890 int len, int reset)
2320{ 2891{
2321 ftrace_set_regex(buf, len, reset, 0); 2892 ftrace_set_regex(ops, buf, len, reset, 0);
2322} 2893}
2894EXPORT_SYMBOL_GPL(ftrace_set_notrace);
2895/**
 2896 * ftrace_set_global_filter - set a function to filter on in ftrace
2898 * @buf - the string that holds the function filter text.
2899 * @len - the length of the string.
2900 * @reset - non zero to reset all filters before applying this filter.
2901 *
2902 * Filters denote which functions should be enabled when tracing is enabled.
2903 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2904 */
2905void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
2906{
2907 ftrace_set_regex(&global_ops, buf, len, reset, 1);
2908}
2909EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
2910
2911/**
 2912 * ftrace_set_global_notrace - set a function to not trace in ftrace
2914 * @buf - the string that holds the function notrace text.
2915 * @len - the length of the string.
2916 * @reset - non zero to reset all filters before applying this filter.
2917 *
2918 * Notrace Filters denote which functions should not be enabled when tracing
2919 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2920 * for tracing.
2921 */
2922void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
2923{
2924 ftrace_set_regex(&global_ops, buf, len, reset, 0);
2925}
2926EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
2323 2927
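The exported per-ops variants above are what make private filtering usable from other kernel code: a caller now passes its own struct ftrace_ops instead of sharing one global filter. A hedged, module-style sketch of such a caller, using only the signatures visible in this patch (register_ftrace_function(), ftrace_set_filter(), the two-argument callback; unregister_ftrace_function() is the pre-existing counterpart). The filter target is illustrative, error handling is trimmed, and later kernels changed the callback prototype:

#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/module.h>

/* Called for every traced function that passes this ops' hashes. */
static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
}

static struct ftrace_ops my_ops = {
        .func = my_trace_func,
};

static unsigned char my_filter[] = "schedule";  /* illustrative target */

static int __init my_tracer_init(void)
{
        /* The filter applies to this ops only; last argument resets old filters. */
        ftrace_set_filter(&my_ops, my_filter, sizeof(my_filter) - 1, 1);
        return register_ftrace_function(&my_ops);
}

static void __exit my_tracer_exit(void)
{
        unregister_ftrace_function(&my_ops);
}

module_init(my_tracer_init);
module_exit(my_tracer_exit);
MODULE_LICENSE("GPL");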
2324/* 2928/*
2325 * command line interface to allow users to set filters on boot up. 2929 * command line interface to allow users to set filters on boot up.
@@ -2370,22 +2974,23 @@ static void __init set_ftrace_early_graph(char *buf)
2370} 2974}
2371#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2975#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2372 2976
2373static void __init set_ftrace_early_filter(char *buf, int enable) 2977static void __init
2978set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
2374{ 2979{
2375 char *func; 2980 char *func;
2376 2981
2377 while (buf) { 2982 while (buf) {
2378 func = strsep(&buf, ","); 2983 func = strsep(&buf, ",");
2379 ftrace_set_regex(func, strlen(func), 0, enable); 2984 ftrace_set_regex(ops, func, strlen(func), 0, enable);
2380 } 2985 }
2381} 2986}
2382 2987
2383static void __init set_ftrace_early_filters(void) 2988static void __init set_ftrace_early_filters(void)
2384{ 2989{
2385 if (ftrace_filter_buf[0]) 2990 if (ftrace_filter_buf[0])
2386 set_ftrace_early_filter(ftrace_filter_buf, 1); 2991 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
2387 if (ftrace_notrace_buf[0]) 2992 if (ftrace_notrace_buf[0])
2388 set_ftrace_early_filter(ftrace_notrace_buf, 0); 2993 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
2389#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2994#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2390 if (ftrace_graph_buf[0]) 2995 if (ftrace_graph_buf[0])
2391 set_ftrace_early_graph(ftrace_graph_buf); 2996 set_ftrace_early_graph(ftrace_graph_buf);
@@ -2393,11 +2998,14 @@ static void __init set_ftrace_early_filters(void)
2393} 2998}
2394 2999
2395static int 3000static int
2396ftrace_regex_release(struct inode *inode, struct file *file, int enable) 3001ftrace_regex_release(struct inode *inode, struct file *file)
2397{ 3002{
2398 struct seq_file *m = (struct seq_file *)file->private_data; 3003 struct seq_file *m = (struct seq_file *)file->private_data;
2399 struct ftrace_iterator *iter; 3004 struct ftrace_iterator *iter;
3005 struct ftrace_hash **orig_hash;
2400 struct trace_parser *parser; 3006 struct trace_parser *parser;
3007 int filter_hash;
3008 int ret;
2401 3009
2402 mutex_lock(&ftrace_regex_lock); 3010 mutex_lock(&ftrace_regex_lock);
2403 if (file->f_mode & FMODE_READ) { 3011 if (file->f_mode & FMODE_READ) {
@@ -2410,33 +3018,41 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2410 parser = &iter->parser; 3018 parser = &iter->parser;
2411 if (trace_parser_loaded(parser)) { 3019 if (trace_parser_loaded(parser)) {
2412 parser->buffer[parser->idx] = 0; 3020 parser->buffer[parser->idx] = 0;
2413 ftrace_match_records(parser->buffer, parser->idx, enable); 3021 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
2414 } 3022 }
2415 3023
2416 mutex_lock(&ftrace_lock);
2417 if (ftrace_start_up && ftrace_enabled)
2418 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2419 mutex_unlock(&ftrace_lock);
2420
2421 trace_parser_put(parser); 3024 trace_parser_put(parser);
3025
3026 if (file->f_mode & FMODE_WRITE) {
3027 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3028
3029 if (filter_hash)
3030 orig_hash = &iter->ops->filter_hash;
3031 else
3032 orig_hash = &iter->ops->notrace_hash;
3033
3034 mutex_lock(&ftrace_lock);
3035 /*
3036 * Remove the current set, update the hash and add
3037 * them back.
3038 */
3039 ftrace_hash_rec_disable(iter->ops, filter_hash);
3040 ret = ftrace_hash_move(orig_hash, iter->hash);
3041 if (!ret) {
3042 ftrace_hash_rec_enable(iter->ops, filter_hash);
3043 if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
3044 && ftrace_enabled)
3045 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3046 }
3047 mutex_unlock(&ftrace_lock);
3048 }
3049 free_ftrace_hash(iter->hash);
2422 kfree(iter); 3050 kfree(iter);
2423 3051
2424 mutex_unlock(&ftrace_regex_lock); 3052 mutex_unlock(&ftrace_regex_lock);
2425 return 0; 3053 return 0;
2426} 3054}
2427 3055
2428static int
2429ftrace_filter_release(struct inode *inode, struct file *file)
2430{
2431 return ftrace_regex_release(inode, file, 1);
2432}
2433
2434static int
2435ftrace_notrace_release(struct inode *inode, struct file *file)
2436{
2437 return ftrace_regex_release(inode, file, 0);
2438}
2439
2440static const struct file_operations ftrace_avail_fops = { 3056static const struct file_operations ftrace_avail_fops = {
2441 .open = ftrace_avail_open, 3057 .open = ftrace_avail_open,
2442 .read = seq_read, 3058 .read = seq_read,
@@ -2444,8 +3060,8 @@ static const struct file_operations ftrace_avail_fops = {
2444 .release = seq_release_private, 3060 .release = seq_release_private,
2445}; 3061};
2446 3062
2447static const struct file_operations ftrace_failures_fops = { 3063static const struct file_operations ftrace_enabled_fops = {
2448 .open = ftrace_failures_open, 3064 .open = ftrace_enabled_open,
2449 .read = seq_read, 3065 .read = seq_read,
2450 .llseek = seq_lseek, 3066 .llseek = seq_lseek,
2451 .release = seq_release_private, 3067 .release = seq_release_private,
@@ -2456,7 +3072,7 @@ static const struct file_operations ftrace_filter_fops = {
2456 .read = seq_read, 3072 .read = seq_read,
2457 .write = ftrace_filter_write, 3073 .write = ftrace_filter_write,
2458 .llseek = ftrace_regex_lseek, 3074 .llseek = ftrace_regex_lseek,
2459 .release = ftrace_filter_release, 3075 .release = ftrace_regex_release,
2460}; 3076};
2461 3077
2462static const struct file_operations ftrace_notrace_fops = { 3078static const struct file_operations ftrace_notrace_fops = {
@@ -2464,7 +3080,7 @@ static const struct file_operations ftrace_notrace_fops = {
2464 .read = seq_read, 3080 .read = seq_read,
2465 .write = ftrace_notrace_write, 3081 .write = ftrace_notrace_write,
2466 .llseek = ftrace_regex_lseek, 3082 .llseek = ftrace_regex_lseek,
2467 .release = ftrace_notrace_release, 3083 .release = ftrace_regex_release,
2468}; 3084};
2469 3085
2470#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3086#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -2573,9 +3189,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2573 bool exists; 3189 bool exists;
2574 int i; 3190 int i;
2575 3191
2576 if (ftrace_disabled)
2577 return -ENODEV;
2578
2579 /* decode regex */ 3192 /* decode regex */
2580 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3193 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2581 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3194 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
@@ -2584,12 +3197,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2584 search_len = strlen(search); 3197 search_len = strlen(search);
2585 3198
2586 mutex_lock(&ftrace_lock); 3199 mutex_lock(&ftrace_lock);
3200
3201 if (unlikely(ftrace_disabled)) {
3202 mutex_unlock(&ftrace_lock);
3203 return -ENODEV;
3204 }
3205
2587 do_for_each_ftrace_rec(pg, rec) { 3206 do_for_each_ftrace_rec(pg, rec) {
2588 3207
2589 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 3208 if (rec->flags & FTRACE_FL_FREE)
2590 continue; 3209 continue;
2591 3210
2592 if (ftrace_match_record(rec, search, search_len, type)) { 3211 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
2593 /* if it is in the array */ 3212 /* if it is in the array */
2594 exists = false; 3213 exists = false;
2595 for (i = 0; i < *idx; i++) { 3214 for (i = 0; i < *idx; i++) {
@@ -2679,8 +3298,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2679 trace_create_file("available_filter_functions", 0444, 3298 trace_create_file("available_filter_functions", 0444,
2680 d_tracer, NULL, &ftrace_avail_fops); 3299 d_tracer, NULL, &ftrace_avail_fops);
2681 3300
2682 trace_create_file("failures", 0444, 3301 trace_create_file("enabled_functions", 0444,
2683 d_tracer, NULL, &ftrace_failures_fops); 3302 d_tracer, NULL, &ftrace_enabled_fops);
2684 3303
2685 trace_create_file("set_ftrace_filter", 0644, d_tracer, 3304 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2686 NULL, &ftrace_filter_fops); 3305 NULL, &ftrace_filter_fops);
@@ -2703,7 +3322,6 @@ static int ftrace_process_locs(struct module *mod,
2703{ 3322{
2704 unsigned long *p; 3323 unsigned long *p;
2705 unsigned long addr; 3324 unsigned long addr;
2706 unsigned long flags;
2707 3325
2708 mutex_lock(&ftrace_lock); 3326 mutex_lock(&ftrace_lock);
2709 p = start; 3327 p = start;
@@ -2720,10 +3338,7 @@ static int ftrace_process_locs(struct module *mod,
2720 ftrace_record_ip(addr); 3338 ftrace_record_ip(addr);
2721 } 3339 }
2722 3340
2723 /* disable interrupts to prevent kstop machine */
2724 local_irq_save(flags);
2725 ftrace_update_code(mod); 3341 ftrace_update_code(mod);
2726 local_irq_restore(flags);
2727 mutex_unlock(&ftrace_lock); 3342 mutex_unlock(&ftrace_lock);
2728 3343
2729 return 0; 3344 return 0;
@@ -2735,10 +3350,11 @@ void ftrace_release_mod(struct module *mod)
2735 struct dyn_ftrace *rec; 3350 struct dyn_ftrace *rec;
2736 struct ftrace_page *pg; 3351 struct ftrace_page *pg;
2737 3352
3353 mutex_lock(&ftrace_lock);
3354
2738 if (ftrace_disabled) 3355 if (ftrace_disabled)
2739 return; 3356 goto out_unlock;
2740 3357
2741 mutex_lock(&ftrace_lock);
2742 do_for_each_ftrace_rec(pg, rec) { 3358 do_for_each_ftrace_rec(pg, rec) {
2743 if (within_module_core(rec->ip, mod)) { 3359 if (within_module_core(rec->ip, mod)) {
2744 /* 3360 /*
@@ -2749,6 +3365,7 @@ void ftrace_release_mod(struct module *mod)
2749 ftrace_free_rec(rec); 3365 ftrace_free_rec(rec);
2750 } 3366 }
2751 } while_for_each_ftrace_rec(); 3367 } while_for_each_ftrace_rec();
3368 out_unlock:
2752 mutex_unlock(&ftrace_lock); 3369 mutex_unlock(&ftrace_lock);
2753} 3370}
2754 3371
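Several hunks in this file follow the same shape: the ftrace_disabled check moves inside the ftrace_lock critical section, and the early return becomes a goto to a single unlock label (ftrace_set_func, ftrace_release_mod above, and the register/sysctl paths further down). A minimal userspace sketch of that locking shape, with pthreads and placeholder names (do_update, disabled) standing in for the kernel pieces:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static int disabled;            /* stands in for ftrace_disabled */

        /*
         * Check the shared flag only while holding the lock, and leave
         * through a single unlock label so no error path can leak it.
         */
        static int do_update(int value)
        {
                int ret = -1;

                pthread_mutex_lock(&lock);

                if (disabled)
                        goto out_unlock;

                printf("applying update %d\n", value);
                ret = 0;

        out_unlock:
                pthread_mutex_unlock(&lock);
                return ret;
        }

        int main(void)
        {
                do_update(1);           /* succeeds */
                disabled = 1;
                do_update(2);           /* refused, but the lock is still released */
                return 0;
        }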
@@ -2835,6 +3452,10 @@ void __init ftrace_init(void)
2835 3452
2836#else 3453#else
2837 3454
3455static struct ftrace_ops global_ops = {
3456 .func = ftrace_stub,
3457};
3458
2838static int __init ftrace_nodyn_init(void) 3459static int __init ftrace_nodyn_init(void)
2839{ 3460{
2840 ftrace_enabled = 1; 3461 ftrace_enabled = 1;
@@ -2845,12 +3466,38 @@ device_initcall(ftrace_nodyn_init);
2845static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3466static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
2846static inline void ftrace_startup_enable(int command) { } 3467static inline void ftrace_startup_enable(int command) { }
2847/* Keep as macros so we do not need to define the commands */ 3468/* Keep as macros so we do not need to define the commands */
2848# define ftrace_startup(command) do { } while (0) 3469# define ftrace_startup(ops, command) do { } while (0)
2849# define ftrace_shutdown(command) do { } while (0) 3470# define ftrace_shutdown(ops, command) do { } while (0)
2850# define ftrace_startup_sysctl() do { } while (0) 3471# define ftrace_startup_sysctl() do { } while (0)
2851# define ftrace_shutdown_sysctl() do { } while (0) 3472# define ftrace_shutdown_sysctl() do { } while (0)
3473
3474static inline int
3475ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3476{
3477 return 1;
3478}
3479
2852#endif /* CONFIG_DYNAMIC_FTRACE */ 3480#endif /* CONFIG_DYNAMIC_FTRACE */
2853 3481
3482static void
3483ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3484{
3485 struct ftrace_ops *op;
3486
3487 /*
3488 * Some of the ops may be dynamically allocated,
3489 * they must be freed after a synchronize_sched().
3490 */
3491 preempt_disable_notrace();
3492 op = rcu_dereference_raw(ftrace_ops_list);
3493 while (op != &ftrace_list_end) {
3494 if (ftrace_ops_test(op, ip))
3495 op->func(ip, parent_ip);
3496 op = rcu_dereference_raw(op->next);
3497 };
3498 preempt_enable_notrace();
3499}
3500
2854static void clear_ftrace_swapper(void) 3501static void clear_ftrace_swapper(void)
2855{ 3502{
2856 struct task_struct *p; 3503 struct task_struct *p;
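The new ftrace_ops_list_func() above walks a sentinel-terminated list of ftrace_ops and calls every handler whose filter test matches the instruction pointer. The sketch below models only that data-structure shape in plain userspace C: a sentinel tail (mirroring ftrace_list_end), push-front registration, and a walk that calls each callback. It is an illustration of the dispatch pattern, not kernel code, and the RCU/preemption protection is deliberately left out.

        #include <stdio.h>

        struct ops {
                void (*func)(unsigned long ip, unsigned long parent_ip);
                struct ops *next;
        };

        /* Sentinel tail: the list is never NULL, it ends at &list_end. */
        static struct ops list_end = { .func = NULL, .next = &list_end };
        static struct ops *ops_list = &list_end;

        static void count_calls(unsigned long ip, unsigned long pip)
        {
                printf("handler A: ip=%#lx parent=%#lx\n", ip, pip);
        }

        static void trace_calls(unsigned long ip, unsigned long pip)
        {
                printf("handler B: ip=%#lx parent=%#lx\n", ip, pip);
        }

        static struct ops ops_a = { .func = count_calls };
        static struct ops ops_b = { .func = trace_calls };

        static void register_ops(struct ops *op)
        {
                /* push-front, like adding to the head of ftrace_ops_list */
                op->next = ops_list;
                ops_list = op;
        }

        /* stand-in for ftrace_ops_test(): accept every ip in this model */
        static int ops_test(struct ops *op, unsigned long ip)
        {
                (void)op; (void)ip;
                return 1;
        }

        static void ops_list_func(unsigned long ip, unsigned long parent_ip)
        {
                struct ops *op;

                for (op = ops_list; op != &list_end; op = op->next)
                        if (ops_test(op, ip))
                                op->func(ip, parent_ip);
        }

        int main(void)
        {
                register_ops(&ops_a);
                register_ops(&ops_b);
                ops_list_func(0x1000, 0x2000);  /* both handlers fire */
                return 0;
        }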
@@ -3143,19 +3790,23 @@ void ftrace_kill(void)
3143 */ 3790 */
3144int register_ftrace_function(struct ftrace_ops *ops) 3791int register_ftrace_function(struct ftrace_ops *ops)
3145{ 3792{
3146 int ret; 3793 int ret = -1;
3147
3148 if (unlikely(ftrace_disabled))
3149 return -1;
3150 3794
3151 mutex_lock(&ftrace_lock); 3795 mutex_lock(&ftrace_lock);
3152 3796
3797 if (unlikely(ftrace_disabled))
3798 goto out_unlock;
3799
3153 ret = __register_ftrace_function(ops); 3800 ret = __register_ftrace_function(ops);
3154 ftrace_startup(0); 3801 if (!ret)
3802 ftrace_startup(ops, 0);
3155 3803
3804
3805 out_unlock:
3156 mutex_unlock(&ftrace_lock); 3806 mutex_unlock(&ftrace_lock);
3157 return ret; 3807 return ret;
3158} 3808}
3809EXPORT_SYMBOL_GPL(register_ftrace_function);
3159 3810
3160/** 3811/**
3161 * unregister_ftrace_function - unregister a function for profiling. 3812 * unregister_ftrace_function - unregister a function for profiling.
@@ -3169,25 +3820,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3169 3820
3170 mutex_lock(&ftrace_lock); 3821 mutex_lock(&ftrace_lock);
3171 ret = __unregister_ftrace_function(ops); 3822 ret = __unregister_ftrace_function(ops);
3172 ftrace_shutdown(0); 3823 if (!ret)
3824 ftrace_shutdown(ops, 0);
3173 mutex_unlock(&ftrace_lock); 3825 mutex_unlock(&ftrace_lock);
3174 3826
3175 return ret; 3827 return ret;
3176} 3828}
3829EXPORT_SYMBOL_GPL(unregister_ftrace_function);
3177 3830
3178int 3831int
3179ftrace_enable_sysctl(struct ctl_table *table, int write, 3832ftrace_enable_sysctl(struct ctl_table *table, int write,
3180 void __user *buffer, size_t *lenp, 3833 void __user *buffer, size_t *lenp,
3181 loff_t *ppos) 3834 loff_t *ppos)
3182{ 3835{
3183 int ret; 3836 int ret = -ENODEV;
3184
3185 if (unlikely(ftrace_disabled))
3186 return -ENODEV;
3187 3837
3188 mutex_lock(&ftrace_lock); 3838 mutex_lock(&ftrace_lock);
3189 3839
3190 ret = proc_dointvec(table, write, buffer, lenp, ppos); 3840 if (unlikely(ftrace_disabled))
3841 goto out;
3842
3843 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3191 3844
3192 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3845 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3193 goto out; 3846 goto out;
@@ -3199,11 +3852,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3199 ftrace_startup_sysctl(); 3852 ftrace_startup_sysctl();
3200 3853
3201 /* we are starting ftrace again */ 3854 /* we are starting ftrace again */
3202 if (ftrace_list != &ftrace_list_end) { 3855 if (ftrace_ops_list != &ftrace_list_end) {
3203 if (ftrace_list->next == &ftrace_list_end) 3856 if (ftrace_ops_list->next == &ftrace_list_end)
3204 ftrace_trace_function = ftrace_list->func; 3857 ftrace_trace_function = ftrace_ops_list->func;
3205 else 3858 else
3206 ftrace_trace_function = ftrace_list_func; 3859 ftrace_trace_function = ftrace_ops_list_func;
3207 } 3860 }
3208 3861
3209 } else { 3862 } else {
@@ -3392,7 +4045,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
3392 ftrace_graph_return = retfunc; 4045 ftrace_graph_return = retfunc;
3393 ftrace_graph_entry = entryfunc; 4046 ftrace_graph_entry = entryfunc;
3394 4047
3395 ftrace_startup(FTRACE_START_FUNC_RET); 4048 ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
3396 4049
3397out: 4050out:
3398 mutex_unlock(&ftrace_lock); 4051 mutex_unlock(&ftrace_lock);
@@ -3409,7 +4062,7 @@ void unregister_ftrace_graph(void)
3409 ftrace_graph_active--; 4062 ftrace_graph_active--;
3410 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 4063 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3411 ftrace_graph_entry = ftrace_graph_entry_stub; 4064 ftrace_graph_entry = ftrace_graph_entry_stub;
3412 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 4065 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
3413 unregister_pm_notifier(&ftrace_suspend_notifier); 4066 unregister_pm_notifier(&ftrace_suspend_notifier);
3414 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4067 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3415 4068
@@ -3425,7 +4078,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
3425 atomic_set(&t->tracing_graph_pause, 0); 4078 atomic_set(&t->tracing_graph_pause, 0);
3426 atomic_set(&t->trace_overrun, 0); 4079 atomic_set(&t->trace_overrun, 0);
3427 t->ftrace_timestamp = 0; 4080 t->ftrace_timestamp = 0;
3428 /* make curr_ret_stack visable before we add the ret_stack */ 4081 /* make curr_ret_stack visible before we add the ret_stack */
3429 smp_wmb(); 4082 smp_wmb();
3430 t->ret_stack = ret_stack; 4083 t->ret_stack = ret_stack;
3431} 4084}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d9c8bcafb120..0ef7b4b2a1f7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1478,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1478 return local_read(&bpage->entries) & RB_WRITE_MASK; 1478 return local_read(&bpage->entries) & RB_WRITE_MASK;
1479} 1479}
1480 1480
1481/* Size is determined by what has been commited */ 1481/* Size is determined by what has been committed */
1482static inline unsigned rb_page_size(struct buffer_page *bpage) 1482static inline unsigned rb_page_size(struct buffer_page *bpage)
1483{ 1483{
1484 return rb_page_commit(bpage); 1484 return rb_page_commit(bpage);
@@ -2932,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2932 /* 2932 /*
2933 * cpu_buffer->pages just needs to point to the buffer, it 2933 * cpu_buffer->pages just needs to point to the buffer, it
2934 * has no specific buffer page to point to. Lets move it out 2934 * has no specific buffer page to point to. Lets move it out
2935 * of our way so we don't accidently swap it. 2935 * of our way so we don't accidentally swap it.
2936 */ 2936 */
2937 cpu_buffer->pages = reader->list.prev; 2937 cpu_buffer->pages = reader->list.prev;
2938 2938
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9541c27c1cf2..ee9c921d7f21 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1110,6 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1110 1110
1111 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1112 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1113 entry->padding = 0;
1113 entry->flags = 1114 entry->flags =
1114#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1115#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1115 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1116 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -2013,9 +2014,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2013{ 2014{
2014 enum print_line_t ret; 2015 enum print_line_t ret;
2015 2016
2016 if (iter->lost_events) 2017 if (iter->lost_events &&
2017 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2018 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2018 iter->cpu, iter->lost_events); 2019 iter->cpu, iter->lost_events))
2020 return TRACE_TYPE_PARTIAL_LINE;
2019 2021
2020 if (iter->trace && iter->trace->print_line) { 2022 if (iter->trace && iter->trace->print_line) {
2021 ret = iter->trace->print_line(iter); 2023 ret = iter->trace->print_line(iter);
@@ -3229,6 +3231,14 @@ waitagain:
3229 3231
3230 if (iter->seq.len >= cnt) 3232 if (iter->seq.len >= cnt)
3231 break; 3233 break;
3234
3235 /*
3236 * Setting the full flag means we reached the trace_seq buffer
3237 * size and we should leave by partial output condition above.
3238 * One of the trace_seq_* functions is not used properly.
3239 */
3240 WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
3241 iter->ent->type);
3232 } 3242 }
3233 trace_access_unlock(iter->cpu_file); 3243 trace_access_unlock(iter->cpu_file);
3234 trace_event_read_unlock(); 3244 trace_event_read_unlock();
@@ -3239,7 +3249,7 @@ waitagain:
3239 trace_seq_init(&iter->seq); 3249 trace_seq_init(&iter->seq);
3240 3250
3241 /* 3251 /*
3242 * If there was nothing to send to user, inspite of consuming trace 3252 * If there was nothing to send to user, in spite of consuming trace
3243 * entries, go back to wait for more entries. 3253 * entries, go back to wait for more entries.
3244 */ 3254 */
3245 if (sret == -EBUSY) 3255 if (sret == -EBUSY)
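The trace.c hunks above tighten the trace_seq contract: print_trace_line() now checks the return value of trace_seq_printf() and bails out with TRACE_TYPE_PARTIAL_LINE when the buffer fills, and the read loop warns if an unchecked write left iter->seq.full set. A minimal userspace model of that contract follows; struct seq and seq_printf are illustrative stand-ins, not the kernel types.

        #include <stdarg.h>
        #include <stdio.h>

        struct seq {
                char buf[32];
                size_t len;
                int full;       /* set once a write did not fit */
        };

        /* Returns 1 on success, 0 if the record did not fit; callers must stop. */
        static int seq_printf(struct seq *s, const char *fmt, ...)
        {
                va_list ap;
                int n;

                if (s->full)
                        return 0;

                va_start(ap, fmt);
                n = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
                va_end(ap);

                if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len) {
                        s->full = 1;    /* the partial record is discarded */
                        return 0;
                }
                s->len += n;
                return 1;
        }

        int main(void)
        {
                struct seq s = { .len = 0, .full = 0 };
                int i;

                for (i = 0; i < 10; i++) {
                        if (!seq_printf(&s, "CPU:%d [LOST %d EVENTS]\n", 0, i)) {
                                fprintf(stderr, "partial line at record %d\n", i);
                                break;  /* mirrors TRACE_TYPE_PARTIAL_LINE */
                        }
                }
                printf("%.*s", (int)s.len, s.buf);
                return 0;
        }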
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5e9dfc6286dd..6b69c4bd306f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -419,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]);
419extern unsigned long ftrace_update_tot_cnt; 419extern unsigned long ftrace_update_tot_cnt;
420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
421extern int DYN_FTRACE_TEST_NAME(void); 421extern int DYN_FTRACE_TEST_NAME(void);
422#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
423extern int DYN_FTRACE_TEST_NAME2(void);
422#endif 424#endif
423 425
424extern int ring_buffer_expanded; 426extern int ring_buffer_expanded;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 685a67d55db0..6302747a1398 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void)
46} 46}
47 47
48/* 48/*
49 * trace_clock(): 'inbetween' trace clock. Not completely serialized, 49 * trace_clock(): 'between' trace clock. Not completely serialized,
50 * but not completely incorrect when crossing CPUs either. 50 * but not completely incorrect when crossing CPUs either.
51 * 51 *
52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of 52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 1516cb3ec549..e32744c84d94 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -27,7 +27,7 @@
27 * in the structure. 27 * in the structure.
28 * 28 *
29 * * for structures within structures, the format of the internal 29 * * for structures within structures, the format of the internal
30 * structure is layed out. This allows the internal structure 30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros 31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they 32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the 33 * will create a compile error if it happens. Since the
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e88f74fe1d4c..2fe110341359 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,6 +116,7 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, padding);
119 120
120 return ret; 121 return ret;
121} 122}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 16aee4d44e8f..8d0e1cc4e974 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
149static struct ftrace_ops trace_ops __read_mostly = 149static struct ftrace_ops trace_ops __read_mostly =
150{ 150{
151 .func = function_trace_call, 151 .func = function_trace_call,
152 .flags = FTRACE_OPS_FL_GLOBAL,
152}; 153};
153 154
154static struct ftrace_ops trace_stack_ops __read_mostly = 155static struct ftrace_ops trace_stack_ops __read_mostly =
155{ 156{
156 .func = function_stack_trace_call, 157 .func = function_stack_trace_call,
158 .flags = FTRACE_OPS_FL_GLOBAL,
157}; 159};
158 160
159/* Our two options */ 161/* Our two options */
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 76b05980225c..962cdb24ed81 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
905 * 905 *
906 * returns 1 if 906 * returns 1 if
907 * - we are inside irq code 907 * - we are inside irq code
908 * - we just extered irq code 908 * - we just entered irq code
909 * 909 *
910 * retunns 0 if 910 * retunns 0 if
911 * - funcgraph-interrupts option is set 911 * - funcgraph-interrupts option is set
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 92b6e1e12d98..c77424be284d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = {
80 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
81 * did a maximum and could disturb our measurement with serial console 81 * did a maximum and could disturb our measurement with serial console
82 * printouts, etc. Truly coinciding maximum latencies should be rare 82 * printouts, etc. Truly coinciding maximum latencies should be rare
83 * and what happens together happens separately as well, so this doesnt 83 * and what happens together happens separately as well, so this doesn't
84 * decrease the validity of the maximum found: 84 * decrease the validity of the maximum found:
85 */ 85 */
86static __cacheline_aligned_in_smp unsigned long max_sequence; 86static __cacheline_aligned_in_smp unsigned long max_sequence;
@@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
153static struct ftrace_ops trace_ops __read_mostly = 153static struct ftrace_ops trace_ops __read_mostly =
154{ 154{
155 .func = irqsoff_tracer_call, 155 .func = irqsoff_tracer_call,
156 .flags = FTRACE_OPS_FL_GLOBAL,
156}; 157};
157#endif /* CONFIG_FUNCTION_TRACER */ 158#endif /* CONFIG_FUNCTION_TRACER */
158 159
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8435b43b1782..f925c45f0afa 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -53,7 +53,6 @@ const char *reserved_field_names[] = {
53 "common_preempt_count", 53 "common_preempt_count",
54 "common_pid", 54 "common_pid",
55 "common_tgid", 55 "common_tgid",
56 "common_lock_depth",
57 FIELD_STRING_IP, 56 FIELD_STRING_IP,
58 FIELD_STRING_RETIP, 57 FIELD_STRING_RETIP,
59 FIELD_STRING_FUNC, 58 FIELD_STRING_FUNC,
@@ -1839,7 +1838,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1839 kfree(tp->call.print_fmt); 1838 kfree(tp->call.print_fmt);
1840} 1839}
1841 1840
1842/* Make a debugfs interface for controling probe points */ 1841/* Make a debugfs interface for controlling probe points */
1843static __init int init_kprobe_trace(void) 1842static __init int init_kprobe_trace(void)
1844{ 1843{
1845 struct dentry *d_tracer; 1844 struct dentry *d_tracer;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 456be9063c2d..cf535ccedc86 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -830,6 +830,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
830enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 830enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
831 struct trace_event *event) 831 struct trace_event *event)
832{ 832{
833 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
834 return TRACE_TYPE_PARTIAL_LINE;
835
833 return TRACE_TYPE_HANDLED; 836 return TRACE_TYPE_HANDLED;
834} 837}
835 838
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf0..dff763b7baf1 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex);
32 32
33struct trace_bprintk_fmt { 33struct trace_bprintk_fmt {
34 struct list_head list; 34 struct list_head list;
35 char fmt[0]; 35 const char *fmt;
36}; 36};
37 37
38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) 38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
@@ -49,6 +49,7 @@ static
49void hold_module_trace_bprintk_format(const char **start, const char **end) 49void hold_module_trace_bprintk_format(const char **start, const char **end)
50{ 50{
51 const char **iter; 51 const char **iter;
52 char *fmt;
52 53
53 mutex_lock(&btrace_mutex); 54 mutex_lock(&btrace_mutex);
54 for (iter = start; iter < end; iter++) { 55 for (iter = start; iter < end; iter++) {
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
58 continue; 59 continue;
59 } 60 }
60 61
61 tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) 62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
62 + strlen(*iter) + 1, GFP_KERNEL); 63 if (tb_fmt)
63 if (tb_fmt) { 64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) {
64 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
65 strcpy(tb_fmt->fmt, *iter); 67 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt;
66 *iter = tb_fmt->fmt; 69 *iter = tb_fmt->fmt;
67 } else 70 } else {
71 kfree(tb_fmt);
68 *iter = NULL; 72 *iter = NULL;
73 }
69 } 74 }
70 mutex_unlock(&btrace_mutex); 75 mutex_unlock(&btrace_mutex);
71} 76}
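The hunk above replaces the old single flexible-array allocation with two allocations, one for the trace_bprintk_fmt node and one for the format string, and frees the node again if the string allocation fails so nothing leaks. A small userspace sketch of that allocate-both-or-free-what-you-got pattern, with malloc/strcpy standing in for kmalloc/strcpy and fmt_node as an illustrative name:

        #include <stdlib.h>
        #include <string.h>

        struct fmt_node {
                struct fmt_node *next;
                const char *fmt;        /* a pointer now, no longer a flexible array */
        };

        static struct fmt_node *fmt_list;

        /* Returns the stored copy on success, NULL if either allocation failed. */
        static const char *hold_format(const char *src)
        {
                struct fmt_node *node;
                char *copy = NULL;

                node = malloc(sizeof(*node));
                if (node)
                        copy = malloc(strlen(src) + 1);

                if (!node || !copy) {
                        free(node);     /* free(NULL) is a no-op, so this is safe */
                        free(copy);
                        return NULL;
                }

                strcpy(copy, src);
                node->fmt = copy;
                node->next = fmt_list;
                fmt_list = node;
                return node->fmt;
        }

        int main(void)
        {
                return hold_format("hello %s\n") ? 0 : 1;
        }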
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,
84 return 0; 89 return 0;
85} 90}
86 91
92/*
93 * The debugfs/tracing/printk_formats file maps the addresses with
94 * the ASCII formats that are used in the bprintk events in the
95 * buffer. For userspace tools to be able to decode the events from
96 * the buffer, they need to be able to map the address with the format.
97 *
98 * The addresses of the bprintk formats are in their own section
99 * __trace_printk_fmt. But for modules we copy them into a link list.
100 * The code to print the formats and their addresses passes around the
101 * address of the fmt string. If the fmt address passed into the seq
102 * functions is within the kernel core __trace_printk_fmt section, then
103 * it simply uses the next pointer in the list.
104 *
105 * When the fmt pointer is outside the kernel core __trace_printk_fmt
106 * section, then we need to read the link list pointers. The trick is
107 * we pass the address of the string to the seq function just like
108 * we do for the kernel core formats. To get back the structure that
109 * holds the format, we simply use containerof() and then go to the
110 * next format in the list.
111 */
112static const char **
113find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
114{
115 struct trace_bprintk_fmt *mod_fmt;
116
117 if (list_empty(&trace_bprintk_fmt_list))
118 return NULL;
119
120 /*
121 * v will point to the address of the fmt record from t_next
122 * v will be NULL from t_start.
123 * If this is the first pointer or called from start
124 * then we need to walk the list.
125 */
126 if (!v || start_index == *pos) {
127 struct trace_bprintk_fmt *p;
128
129 /* search the module list */
130 list_for_each_entry(p, &trace_bprintk_fmt_list, list) {
131 if (start_index == *pos)
132 return &p->fmt;
133 start_index++;
134 }
135 /* pos > index */
136 return NULL;
137 }
138
139 /*
140 * v points to the address of the fmt field in the mod list
141 * structure that holds the module print format.
142 */
143 mod_fmt = container_of(v, typeof(*mod_fmt), fmt);
144 if (mod_fmt->list.next == &trace_bprintk_fmt_list)
145 return NULL;
146
147 mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list);
148
149 return &mod_fmt->fmt;
150}
151
152static void format_mod_start(void)
153{
154 mutex_lock(&btrace_mutex);
155}
156
157static void format_mod_stop(void)
158{
159 mutex_unlock(&btrace_mutex);
160}
161
87#else /* !CONFIG_MODULES */ 162#else /* !CONFIG_MODULES */
88__init static int 163__init static int
89module_trace_bprintk_format_notify(struct notifier_block *self, 164module_trace_bprintk_format_notify(struct notifier_block *self,
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self,
91{ 166{
92 return 0; 167 return 0;
93} 168}
169static inline const char **
170find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
171{
172 return NULL;
173}
174static inline void format_mod_start(void) { }
175static inline void format_mod_stop(void) { }
94#endif /* CONFIG_MODULES */ 176#endif /* CONFIG_MODULES */
95 177
96 178
@@ -153,20 +235,33 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
153} 235}
154EXPORT_SYMBOL_GPL(__ftrace_vprintk); 236EXPORT_SYMBOL_GPL(__ftrace_vprintk);
155 237
238static const char **find_next(void *v, loff_t *pos)
239{
240 const char **fmt = v;
241 int start_index;
242
243 if (!fmt)
244 fmt = __start___trace_bprintk_fmt + *pos;
245
246 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
247
248 if (*pos < start_index)
249 return fmt;
250
251 return find_next_mod_format(start_index, v, fmt, pos);
252}
253
156static void * 254static void *
157t_start(struct seq_file *m, loff_t *pos) 255t_start(struct seq_file *m, loff_t *pos)
158{ 256{
159 const char **fmt = __start___trace_bprintk_fmt + *pos; 257 format_mod_start();
160 258 return find_next(NULL, pos);
161 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
162 return NULL;
163 return fmt;
164} 259}
165 260
166static void *t_next(struct seq_file *m, void * v, loff_t *pos) 261static void *t_next(struct seq_file *m, void * v, loff_t *pos)
167{ 262{
168 (*pos)++; 263 (*pos)++;
169 return t_start(m, pos); 264 return find_next(v, pos);
170} 265}
171 266
172static int t_show(struct seq_file *m, void *v) 267static int t_show(struct seq_file *m, void *v)
@@ -205,6 +300,7 @@ static int t_show(struct seq_file *m, void *v)
205 300
206static void t_stop(struct seq_file *m, void *p) 301static void t_stop(struct seq_file *m, void *p)
207{ 302{
303 format_mod_stop();
208} 304}
209 305
210static const struct seq_operations show_format_seq_ops = { 306static const struct seq_operations show_format_seq_ops = {
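The new find_next()/find_next_mod_format() pair above walks the built-in __trace_bprintk_fmt section first and then continues into the module list, handing the seq_file code a pointer to a fmt slot in both cases and using container_of() to recover the list node when stepping forward. Below is a compact userspace model of that two-region walk; the section is faked with a static array, the names mod_fmt/core_fmts are illustrative, and container_of is spelled out from offsetof.

        #include <stddef.h>
        #include <stdio.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        struct mod_fmt {
                struct mod_fmt *next;
                const char *fmt;
        };

        /* "core" formats: stands in for the __trace_bprintk_fmt section */
        static const char *core_fmts[] = { "core: %d\n", "core: %s\n" };
        #define NCORE (sizeof(core_fmts) / sizeof(core_fmts[0]))

        /* "module" formats: a singly linked list built at load time */
        static struct mod_fmt mod_b = { NULL,   "mod: %lu\n" };
        static struct mod_fmt mod_a = { &mod_b, "mod: %p\n"  };
        static struct mod_fmt *mod_list = &mod_a;

        /*
         * Given the previous cursor (a pointer to a fmt slot, or NULL at the
         * start) and a position, return the next fmt slot: array entries
         * first, then list nodes recovered from their fmt member.
         */
        static const char **find_next(const char **v, size_t pos)
        {
                struct mod_fmt *cur;

                if (pos < NCORE)
                        return &core_fmts[pos];

                if (!v || pos == NCORE)         /* first step into the module list */
                        return mod_list ? &mod_list->fmt : NULL;

                cur = container_of(v, struct mod_fmt, fmt);
                return cur->next ? &cur->next->fmt : NULL;
        }

        int main(void)
        {
                const char **v = NULL;
                size_t pos;

                for (pos = 0; (v = find_next(v, pos)); pos++)
                        printf("%s", *v);
                return 0;
        }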
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7319559ed59f..f029dd4fd2ca 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
129static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
130{ 130{
131 .func = wakeup_tracer_call, 131 .func = wakeup_tracer_call,
132 .flags = FTRACE_OPS_FL_GLOBAL,
132}; 133};
133#endif /* CONFIG_FUNCTION_TRACER */ 134#endif /* CONFIG_FUNCTION_TRACER */
134 135
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 659732eba07c..288541f977fb 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
101 101
102#ifdef CONFIG_DYNAMIC_FTRACE 102#ifdef CONFIG_DYNAMIC_FTRACE
103 103
104static int trace_selftest_test_probe1_cnt;
105static void trace_selftest_test_probe1_func(unsigned long ip,
106 unsigned long pip)
107{
108 trace_selftest_test_probe1_cnt++;
109}
110
111static int trace_selftest_test_probe2_cnt;
112static void trace_selftest_test_probe2_func(unsigned long ip,
113 unsigned long pip)
114{
115 trace_selftest_test_probe2_cnt++;
116}
117
118static int trace_selftest_test_probe3_cnt;
119static void trace_selftest_test_probe3_func(unsigned long ip,
120 unsigned long pip)
121{
122 trace_selftest_test_probe3_cnt++;
123}
124
125static int trace_selftest_test_global_cnt;
126static void trace_selftest_test_global_func(unsigned long ip,
127 unsigned long pip)
128{
129 trace_selftest_test_global_cnt++;
130}
131
132static int trace_selftest_test_dyn_cnt;
133static void trace_selftest_test_dyn_func(unsigned long ip,
134 unsigned long pip)
135{
136 trace_selftest_test_dyn_cnt++;
137}
138
139static struct ftrace_ops test_probe1 = {
140 .func = trace_selftest_test_probe1_func,
141};
142
143static struct ftrace_ops test_probe2 = {
144 .func = trace_selftest_test_probe2_func,
145};
146
147static struct ftrace_ops test_probe3 = {
148 .func = trace_selftest_test_probe3_func,
149};
150
151static struct ftrace_ops test_global = {
152 .func = trace_selftest_test_global_func,
153 .flags = FTRACE_OPS_FL_GLOBAL,
154};
155
156static void print_counts(void)
157{
158 printk("(%d %d %d %d %d) ",
159 trace_selftest_test_probe1_cnt,
160 trace_selftest_test_probe2_cnt,
161 trace_selftest_test_probe3_cnt,
162 trace_selftest_test_global_cnt,
163 trace_selftest_test_dyn_cnt);
164}
165
166static void reset_counts(void)
167{
168 trace_selftest_test_probe1_cnt = 0;
169 trace_selftest_test_probe2_cnt = 0;
170 trace_selftest_test_probe3_cnt = 0;
171 trace_selftest_test_global_cnt = 0;
172 trace_selftest_test_dyn_cnt = 0;
173}
174
175static int trace_selftest_ops(int cnt)
176{
177 int save_ftrace_enabled = ftrace_enabled;
178 struct ftrace_ops *dyn_ops;
179 char *func1_name;
180 char *func2_name;
181 int len1;
182 int len2;
183 int ret = -1;
184
185 printk(KERN_CONT "PASSED\n");
186 pr_info("Testing dynamic ftrace ops #%d: ", cnt);
187
188 ftrace_enabled = 1;
189 reset_counts();
190
191 /* Handle PPC64 '.' name */
192 func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
193 func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2);
194 len1 = strlen(func1_name);
195 len2 = strlen(func2_name);
196
197 /*
198 * Probe 1 will trace function 1.
199 * Probe 2 will trace function 2.
200 * Probe 3 will trace functions 1 and 2.
201 */
202 ftrace_set_filter(&test_probe1, func1_name, len1, 1);
203 ftrace_set_filter(&test_probe2, func2_name, len2, 1);
204 ftrace_set_filter(&test_probe3, func1_name, len1, 1);
205 ftrace_set_filter(&test_probe3, func2_name, len2, 0);
206
207 register_ftrace_function(&test_probe1);
208 register_ftrace_function(&test_probe2);
209 register_ftrace_function(&test_probe3);
210 register_ftrace_function(&test_global);
211
212 DYN_FTRACE_TEST_NAME();
213
214 print_counts();
215
216 if (trace_selftest_test_probe1_cnt != 1)
217 goto out;
218 if (trace_selftest_test_probe2_cnt != 0)
219 goto out;
220 if (trace_selftest_test_probe3_cnt != 1)
221 goto out;
222 if (trace_selftest_test_global_cnt == 0)
223 goto out;
224
225 DYN_FTRACE_TEST_NAME2();
226
227 print_counts();
228
229 if (trace_selftest_test_probe1_cnt != 1)
230 goto out;
231 if (trace_selftest_test_probe2_cnt != 1)
232 goto out;
233 if (trace_selftest_test_probe3_cnt != 2)
234 goto out;
235
236 /* Add a dynamic probe */
237 dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL);
238 if (!dyn_ops) {
239 printk("MEMORY ERROR ");
240 goto out;
241 }
242
243 dyn_ops->func = trace_selftest_test_dyn_func;
244
245 register_ftrace_function(dyn_ops);
246
247 trace_selftest_test_global_cnt = 0;
248
249 DYN_FTRACE_TEST_NAME();
250
251 print_counts();
252
253 if (trace_selftest_test_probe1_cnt != 2)
254 goto out_free;
255 if (trace_selftest_test_probe2_cnt != 1)
256 goto out_free;
257 if (trace_selftest_test_probe3_cnt != 3)
258 goto out_free;
259 if (trace_selftest_test_global_cnt == 0)
260 goto out;
261 if (trace_selftest_test_dyn_cnt == 0)
262 goto out_free;
263
264 DYN_FTRACE_TEST_NAME2();
265
266 print_counts();
267
268 if (trace_selftest_test_probe1_cnt != 2)
269 goto out_free;
270 if (trace_selftest_test_probe2_cnt != 2)
271 goto out_free;
272 if (trace_selftest_test_probe3_cnt != 4)
273 goto out_free;
274
275 ret = 0;
276 out_free:
277 unregister_ftrace_function(dyn_ops);
278 kfree(dyn_ops);
279
280 out:
281 /* Purposely unregister in the same order */
282 unregister_ftrace_function(&test_probe1);
283 unregister_ftrace_function(&test_probe2);
284 unregister_ftrace_function(&test_probe3);
285 unregister_ftrace_function(&test_global);
286
287 /* Make sure everything is off */
288 reset_counts();
289 DYN_FTRACE_TEST_NAME();
290 DYN_FTRACE_TEST_NAME();
291
292 if (trace_selftest_test_probe1_cnt ||
293 trace_selftest_test_probe2_cnt ||
294 trace_selftest_test_probe3_cnt ||
295 trace_selftest_test_global_cnt ||
296 trace_selftest_test_dyn_cnt)
297 ret = -1;
298
299 ftrace_enabled = save_ftrace_enabled;
300
301 return ret;
302}
303
104/* Test dynamic code modification and ftrace filters */ 304/* Test dynamic code modification and ftrace filters */
105int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 305int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
106 struct trace_array *tr, 306 struct trace_array *tr,
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
131 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 331 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
132 332
133 /* filter only on our function */ 333 /* filter only on our function */
134 ftrace_set_filter(func_name, strlen(func_name), 1); 334 ftrace_set_global_filter(func_name, strlen(func_name), 1);
135 335
136 /* enable tracing */ 336 /* enable tracing */
137 ret = tracer_init(trace, tr); 337 ret = tracer_init(trace, tr);
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
166 366
167 /* check the trace buffer */ 367 /* check the trace buffer */
168 ret = trace_test_buffer(tr, &count); 368 ret = trace_test_buffer(tr, &count);
169 trace->reset(tr);
170 tracing_start(); 369 tracing_start();
171 370
172 /* we should only have one item */ 371 /* we should only have one item */
173 if (!ret && count != 1) { 372 if (!ret && count != 1) {
373 trace->reset(tr);
174 printk(KERN_CONT ".. filter failed count=%ld ..", count); 374 printk(KERN_CONT ".. filter failed count=%ld ..", count);
175 ret = -1; 375 ret = -1;
176 goto out; 376 goto out;
177 } 377 }
178 378
379 /* Test the ops with global tracing running */
380 ret = trace_selftest_ops(1);
381 trace->reset(tr);
382
179 out: 383 out:
180 ftrace_enabled = save_ftrace_enabled; 384 ftrace_enabled = save_ftrace_enabled;
181 tracer_enabled = save_tracer_enabled; 385 tracer_enabled = save_tracer_enabled;
182 386
183 /* Enable tracing on all functions again */ 387 /* Enable tracing on all functions again */
184 ftrace_set_filter(NULL, 0, 1); 388 ftrace_set_global_filter(NULL, 0, 1);
389
390 /* Test the ops with global tracing off */
391 if (!ret)
392 ret = trace_selftest_ops(2);
185 393
186 return ret; 394 return ret;
187} 395}
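The counter expectations in trace_selftest_ops() above follow directly from which filter each ops carries: probe1 only traces the first test function, probe2 only the second, and probe3 traces both, so after one call to each function the counts must be 1, 1 and 2 (and they double after the second round with the dynamic ops added). A trivial model of that bookkeeping, with hit_func1/hit_func2 as illustrative stand-ins for the filtered callbacks:

        #include <assert.h>

        /* probe1 traces func1, probe2 traces func2, probe3 traces both */
        static int cnt1, cnt2, cnt3;

        static void hit_func1(void) { cnt1++; cnt3++; }
        static void hit_func2(void) { cnt2++; cnt3++; }

        int main(void)
        {
                hit_func1();                    /* DYN_FTRACE_TEST_NAME()  */
                assert(cnt1 == 1 && cnt2 == 0 && cnt3 == 1);

                hit_func2();                    /* DYN_FTRACE_TEST_NAME2() */
                assert(cnt1 == 1 && cnt2 == 1 && cnt3 == 2);
                return 0;
        }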
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 54dd77cce5bf..b4c475a0a48b 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void)
5 /* used to call mcount */ 5 /* used to call mcount */
6 return 0; 6 return 0;
7} 7}
8
9int DYN_FTRACE_TEST_NAME2(void)
10{
11 /* used to call mcount */
12 return 0;
13}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4c5dead0c239..b0b53b8e4c25 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
134{ 134{
135 .func = stack_trace_call, 135 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_GLOBAL,
136}; 137};
137 138
138static ssize_t 139static ssize_t
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 68187af4889e..b219f1449c54 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
251{ 251{
252 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 252 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
253 253
254 if (elem->regfunc && !elem->state && active) 254 if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
255 elem->regfunc(); 255 elem->regfunc();
256 else if (elem->unregfunc && elem->state && !active) 256 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
257 elem->unregfunc(); 257 elem->unregfunc();
258 258
259 /* 259 /*
@@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
264 * is used. 264 * is used.
265 */ 265 */
266 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
267 if (!elem->state && active) { 267 if (active && !jump_label_enabled(&elem->key))
268 jump_label_enable(&elem->state); 268 jump_label_inc(&elem->key);
269 elem->state = active; 269 else if (!active && jump_label_enabled(&elem->key))
270 } else if (elem->state && !active) { 270 jump_label_dec(&elem->key);
271 jump_label_disable(&elem->state);
272 elem->state = active;
273 }
274} 271}
275 272
276/* 273/*
@@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
281 */ 278 */
282static void disable_tracepoint(struct tracepoint *elem) 279static void disable_tracepoint(struct tracepoint *elem)
283{ 280{
284 if (elem->unregfunc && elem->state) 281 if (elem->unregfunc && jump_label_enabled(&elem->key))
285 elem->unregfunc(); 282 elem->unregfunc();
286 283
287 if (elem->state) { 284 if (jump_label_enabled(&elem->key))
288 jump_label_disable(&elem->state); 285 jump_label_dec(&elem->key);
289 elem->state = 0;
290 }
291 rcu_assign_pointer(elem->funcs, NULL); 286 rcu_assign_pointer(elem->funcs, NULL);
292} 287}
293 288
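The tracepoint.c hunk above drops the per-tracepoint state flag plus jump_label_enable/disable in favour of the reference-counted jump_label_inc/dec pair, querying jump_label_enabled() instead of elem->state. The point of the inc/dec interface is that it is a refcount: the key stays enabled until the last user drops it. A userspace model of just that counting behaviour (plain ints, no atomicity, names are placeholders):

        #include <assert.h>

        struct key { int refcount; };

        static void key_inc(struct key *k) { k->refcount++; }

        static void key_dec(struct key *k)
        {
                assert(k->refcount > 0);
                k->refcount--;
        }

        static int key_enabled(const struct key *k) { return k->refcount > 0; }

        int main(void)
        {
                struct key k = { 0 };

                key_inc(&k);                    /* first user enables it     */
                key_inc(&k);                    /* second user piggybacks    */
                key_dec(&k);                    /* first user goes away...   */
                assert(key_enabled(&k));        /* ...key is still enabled   */

                key_dec(&k);                    /* last user gone            */
                assert(!key_enabled(&k));
                return 0;
        }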
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index eb27fd3430a2..92cb706c7fc8 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
20 20
21/* 21/*
22 * Removes a registered user return notifier. Must be called from atomic 22 * Removes a registered user return notifier. Must be called from atomic
23 * context, and from the same cpu registration occured in. 23 * context, and from the same cpu registration occurred in.
24 */ 24 */
25void user_return_notifier_unregister(struct user_return_notifier *urn) 25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{ 26{
diff --git a/kernel/wait.c b/kernel/wait.c
index b0310eb6cc1e..f45ea8d2a1ce 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait);
142 * woken up through the queue. 142 * woken up through the queue.
143 * 143 *
144 * This prevents waiter starvation where an exclusive waiter 144 * This prevents waiter starvation where an exclusive waiter
145 * aborts and is woken up concurrently and noone wakes up 145 * aborts and is woken up concurrently and no one wakes up
146 * the next waiter. 146 * the next waiter.
147 */ 147 */
148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, 148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 140dce750450..14733d4d156b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -430,9 +430,12 @@ static int watchdog_enable(int cpu)
430 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 430 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
431 if (IS_ERR(p)) { 431 if (IS_ERR(p)) {
432 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 432 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
433 if (!err) 433 if (!err) {
434 /* if hardlockup hasn't already set this */ 434 /* if hardlockup hasn't already set this */
435 err = PTR_ERR(p); 435 err = PTR_ERR(p);
436 /* and disable the perf event */
437 watchdog_nmi_disable(cpu);
438 }
436 goto out; 439 goto out;
437 } 440 }
438 kthread_bind(p, cpu); 441 kthread_bind(p, cpu);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 04ef830690ec..e3378e8d3a5c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1291,8 +1291,14 @@ __acquires(&gcwq->lock)
1291 return true; 1291 return true;
1292 spin_unlock_irq(&gcwq->lock); 1292 spin_unlock_irq(&gcwq->lock);
1293 1293
1294 /* CPU has come up inbetween, retry migration */ 1294 /*
1295 * We've raced with CPU hot[un]plug. Give it a breather
1296 * and retry migration. cond_resched() is required here;
1297 * otherwise, we might deadlock against cpu_stop trying to
1298 * bring down the CPU on non-preemptive kernel.
1299 */
1295 cpu_relax(); 1300 cpu_relax();
1301 cond_resched();
1296 } 1302 }
1297} 1303}
1298 1304
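The expanded workqueue.c comment explains why the retry loop needs cond_resched() in addition to cpu_relax(): on a non-preemptive kernel a pure spin could hog the CPU that cpu_stop needs in order to finish bringing the CPU down. A loose userspace analogy of that shape is sketched below; sched_yield() stands in for cond_resched(), the flag is only illustrative, and no claim is made about kernel scheduling semantics.

        #include <pthread.h>
        #include <sched.h>
        #include <stdio.h>

        static volatile int done;

        static void *worker(void *arg)
        {
                (void)arg;
                done = 1;       /* the event the spinning side is waiting for */
                return NULL;
        }

        int main(void)
        {
                pthread_t t;

                pthread_create(&t, NULL, worker, NULL);

                /*
                 * Retry loop: yield on every pass so the other side gets CPU
                 * time even if everything runs on a single processor.
                 */
                while (!done)
                        sched_yield();

                pthread_join(&t, NULL);
                puts("worker finished");
                return 0;
        }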