Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 2
-rw-r--r--  kernel/Kconfig.preempt | 3
-rw-r--r--  kernel/Makefile | 15
-rw-r--r--  kernel/async.c | 12
-rw-r--r--  kernel/audit.c | 31
-rw-r--r--  kernel/audit_tree.c | 8
-rw-r--r--  kernel/auditsc.c | 29
-rw-r--r--  kernel/capability.c | 16
-rw-r--r--  kernel/cgroup.c | 632
-rw-r--r--  kernel/cgroup_freezer.c | 26
-rw-r--r--  kernel/compat.c | 62
-rw-r--r--  kernel/configs.c | 4
-rw-r--r--  kernel/cpuset.c | 119
-rw-r--r--  kernel/cred.c | 26
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/debug/gdbstub.c | 22
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 5
-rw-r--r--  kernel/debug/kdb/kdb_cmds | 4
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 21
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 36
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 3
-rw-r--r--  kernel/delayacct.c | 2
-rw-r--r--  kernel/events/Makefile | 6
-rw-r--r--  kernel/events/core.c (renamed from kernel/perf_event.c) | 1020
-rw-r--r--  kernel/events/hw_breakpoint.c (renamed from kernel/hw_breakpoint.c) | 10
-rw-r--r--  kernel/events/internal.h | 96
-rw-r--r--  kernel/events/ring_buffer.c | 380
-rw-r--r--  kernel/exit.c | 229
-rw-r--r--  kernel/extable.c | 18
-rw-r--r--  kernel/fork.c | 220
-rw-r--r--  kernel/freezer.c | 4
-rw-r--r--  kernel/futex.c | 60
-rw-r--r--  kernel/gcov/Kconfig | 3
-rw-r--r--  kernel/hrtimer.c | 164
-rw-r--r--  kernel/hung_task.c | 2
-rw-r--r--  kernel/irq/Kconfig | 8
-rw-r--r--  kernel/irq/Makefile | 2
-rw-r--r--  kernel/irq/chip.c | 3
-rw-r--r--  kernel/irq/debug.h | 1
-rw-r--r--  kernel/irq/devres.c | 2
-rw-r--r--  kernel/irq/generic-chip.c | 368
-rw-r--r--  kernel/irq/handle.c | 6
-rw-r--r--  kernel/irq/irqdesc.c | 36
-rw-r--r--  kernel/irq/irqdomain.c | 180
-rw-r--r--  kernel/irq/manage.c | 30
-rw-r--r--  kernel/irq/proc.c | 55
-rw-r--r--  kernel/irq/settings.h | 17
-rw-r--r--  kernel/irq/spurious.c | 31
-rw-r--r--  kernel/jump_label.c | 551
-rw-r--r--  kernel/kexec.c | 11
-rw-r--r--  kernel/kmod.c | 126
-rw-r--r--  kernel/kprobes.c | 33
-rw-r--r--  kernel/ksysfs.c | 10
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/lockdep.c | 276
-rw-r--r--  kernel/module.c | 189
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex-debug.h | 2
-rw-r--r--  kernel/mutex.c | 34
-rw-r--r--  kernel/mutex.h | 2
-rw-r--r--  kernel/notifier.c | 31
-rw-r--r--  kernel/ns_cgroup.c | 118
-rw-r--r--  kernel/nsproxy.c | 50
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 41
-rw-r--r--  kernel/pid.c | 1
-rw-r--r--  kernel/pm_qos_params.c | 78
-rw-r--r--  kernel/posix-cpu-timers.c | 4
-rw-r--r--  kernel/posix-timers.c | 27
-rw-r--r--  kernel/power/Kconfig | 18
-rw-r--r--  kernel/power/hibernate.c | 278
-rw-r--r--  kernel/power/main.c | 6
-rw-r--r--  kernel/power/power.h | 4
-rw-r--r--  kernel/power/snapshot.c | 39
-rw-r--r--  kernel/power/suspend.c | 32
-rw-r--r--  kernel/power/user.c | 9
-rw-r--r--  kernel/printk.c | 117
-rw-r--r--  kernel/profile.c | 22
-rw-r--r--  kernel/ptrace.c | 291
-rw-r--r--  kernel/rcupdate.c | 34
-rw-r--r--  kernel/rcutiny.c | 46
-rw-r--r--  kernel/rcutiny_plugin.h | 203
-rw-r--r--  kernel/rcutorture.c | 30
-rw-r--r--  kernel/rcutree.c | 353
-rw-r--r--  kernel/rcutree.h | 128
-rw-r--r--  kernel/rcutree_plugin.h | 1022
-rw-r--r--  kernel/rcutree_trace.c | 210
-rw-r--r--  kernel/resource.c | 137
-rw-r--r--  kernel/rtmutex.c | 2
-rw-r--r--  kernel/rwsem.c | 18
-rw-r--r--  kernel/sched.c | 2117
-rw-r--r--  kernel/sched_autogroup.h | 1
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 295
-rw-r--r--  kernel/sched_features.h | 12
-rw-r--r--  kernel/sched_idletask.c | 2
-rw-r--r--  kernel/sched_rt.c | 115
-rw-r--r--  kernel/sched_stats.h | 4
-rw-r--r--  kernel/sched_stoptask.c | 5
-rw-r--r--  kernel/signal.c | 955
-rw-r--r--  kernel/smp.c | 5
-rw-r--r--  kernel/softirq.c | 14
-rw-r--r--  kernel/stacktrace.c | 12
-rw-r--r--  kernel/stop_machine.c | 80
-rw-r--r--  kernel/sys.c | 52
-rw-r--r--  kernel/sys_ni.c | 9
-rw-r--r--  kernel/sysctl.c | 37
-rw-r--r--  kernel/taskstats.c | 17
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 720
-rw-r--r--  kernel/time/clockevents.c | 69
-rw-r--r--  kernel/time/clocksource.c | 66
-rw-r--r--  kernel/time/tick-broadcast.c | 28
-rw-r--r--  kernel/time/timekeeping.c | 101
-rw-r--r--  kernel/timer.c | 15
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/ftrace.c | 1386
-rw-r--r--  kernel/trace/ring_buffer.c | 76
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 2
-rw-r--r--  kernel/trace/trace.c | 341
-rw-r--r--  kernel/trace/trace.h | 50
-rw-r--r--  kernel/trace/trace_entries.h | 3
-rw-r--r--  kernel/trace/trace_events.c | 146
-rw-r--r--  kernel/trace/trace_events_filter.c | 6
-rw-r--r--  kernel/trace/trace_functions.c | 5
-rw-r--r--  kernel/trace/trace_functions_graph.c | 225
-rw-r--r--  kernel/trace/trace_irqsoff.c | 5
-rw-r--r--  kernel/trace/trace_kprobe.c | 333
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 2
-rw-r--r--  kernel/trace/trace_output.c | 41
-rw-r--r--  kernel/trace/trace_printk.c | 117
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 5
-rw-r--r--  kernel/trace/trace_selftest.c | 214
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 6
-rw-r--r--  kernel/trace/trace_stack.c | 14
-rw-r--r--  kernel/tracepoint.c | 23
-rw-r--r--  kernel/utsname.c | 39
-rw-r--r--  kernel/watchdog.c | 69
-rw-r--r--  kernel/workqueue.c | 85
140 files changed, 11146 insertions, 5414 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE 199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
200 200
201config MUTEX_SPIN_ON_OWNER 201config MUTEX_SPIN_ON_OWNER
202 def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES 202 def_bool SMP && !DEBUG_MUTEXES
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b356..24e7cb0ba26a 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
35 35
36config PREEMPT 36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)" 37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 select PREEMPT_COUNT
38 help 39 help
39 This option reduces the latency of the kernel by making 40 This option reduces the latency of the kernel by making
40 all kernel code (that is not executing in a critical section) 41 all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
52 53
53endchoice 54endchoice
54 55
56config PREEMPT_COUNT
57 bool \ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb31e73e..eca595e2fd52 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o jump_label.o 13 async.o range.o
14obj-y += groups.o 14obj-y += groups.o
15 15
16ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_perf_event.o = -pg
25CFLAGS_REMOVE_irq_work.o = -pg 24CFLAGS_REMOVE_irq_work.o = -pg
26endif 25endif
27 26
@@ -62,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
62obj-$(CONFIG_CGROUPS) += cgroup.o 61obj-$(CONFIG_CGROUPS) += cgroup.o
63obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
64obj-$(CONFIG_CPUSETS) += cpuset.o 63obj-$(CONFIG_CPUSETS) += cpuset.o
65obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
66obj-$(CONFIG_UTS_NS) += utsname.o 64obj-$(CONFIG_UTS_NS) += utsname.o
67obj-$(CONFIG_USER_NS) += user_namespace.o 65obj-$(CONFIG_USER_NS) += user_namespace.o
68obj-$(CONFIG_PID_NS) += pid_namespace.o 66obj-$(CONFIG_PID_NS) += pid_namespace.o
@@ -103,11 +101,13 @@ obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
104obj-$(CONFIG_SMP) += sched_cpupri.o 102obj-$(CONFIG_SMP) += sched_cpupri.o
105obj-$(CONFIG_IRQ_WORK) += irq_work.o 103obj-$(CONFIG_IRQ_WORK) += irq_work.o
106obj-$(CONFIG_PERF_EVENTS) += perf_event.o 104
107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 105obj-$(CONFIG_PERF_EVENTS) += events/
106
108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
109obj-$(CONFIG_PADATA) += padata.o 108obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
111 111
112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -126,11 +126,10 @@ targets += config_data.gz
126$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE 126$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
127 $(call if_changed,gzip) 127 $(call if_changed,gzip)
128 128
129quiet_cmd_ikconfiggz = IKCFG $@ 129 filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
130 cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
131targets += config_data.h 130targets += config_data.h
132$(obj)/config_data.h: $(obj)/config_data.gz FORCE 131$(obj)/config_data.h: $(obj)/config_data.gz FORCE
133 $(call if_changed,ikconfiggz) 132 $(call filechk,ikconfiggz)
134 133
135$(obj)/time.o: $(obj)/timeconst.h 134$(obj)/time.o: $(obj)/timeconst.h
136 135
diff --git a/kernel/async.c b/kernel/async.c
index cd9dbb913c77..d5fe7af0de2e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h>
53#include <linux/ktime.h>
52#include <linux/module.h> 54#include <linux/module.h>
53#include <linux/wait.h> 55#include <linux/wait.h>
54#include <linux/sched.h> 56#include <linux/sched.h>
55#include <linux/slab.h> 57#include <linux/slab.h>
56#include <linux/workqueue.h> 58#include <linux/workqueue.h>
57#include <asm/atomic.h>
58 59
59static async_cookie_t next_cookie = 1; 60static async_cookie_t next_cookie = 1;
60 61
@@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work)
128 129
129 /* 2) run (and print duration) */ 130 /* 2) run (and print duration) */
130 if (initcall_debug && system_state == SYSTEM_BOOTING) { 131 if (initcall_debug && system_state == SYSTEM_BOOTING) {
131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 132 printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
133 (long long)entry->cookie,
132 entry->func, task_pid_nr(current)); 134 entry->func, task_pid_nr(current));
133 calltime = ktime_get(); 135 calltime = ktime_get();
134 } 136 }
@@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work)
136 if (initcall_debug && system_state == SYSTEM_BOOTING) { 138 if (initcall_debug && system_state == SYSTEM_BOOTING) {
137 rettime = ktime_get(); 139 rettime = ktime_get();
138 delta = ktime_sub(rettime, calltime); 140 delta = ktime_sub(rettime, calltime);
139 printk("initcall %lli_%pF returned 0 after %lld usecs\n", 141 printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
140 (long long)entry->cookie, 142 (long long)entry->cookie,
141 entry->func, 143 entry->func,
142 (long long)ktime_to_ns(delta) >> 10); 144 (long long)ktime_to_ns(delta) >> 10);
@@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
270 ktime_t starttime, delta, endtime; 272 ktime_t starttime, delta, endtime;
271 273
272 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
273 printk("async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
274 starttime = ktime_get(); 276 starttime = ktime_get();
275 } 277 }
276 278
@@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
280 endtime = ktime_get(); 282 endtime = ktime_get();
281 delta = ktime_sub(endtime, starttime); 283 delta = ktime_sub(endtime, starttime);
282 284
283 printk("async_continuing @ %i after %lli usec\n", 285 printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
284 task_pid_nr(current), 286 task_pid_nr(current),
285 (long long)ktime_to_ns(delta) >> 10); 287 (long long)ktime_to_ns(delta) >> 10);
286 } 288 }
diff --git a/kernel/audit.c b/kernel/audit.c
index 939500317066..0a1355ca3d79 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -43,7 +43,7 @@
43 43
44#include <linux/init.h> 44#include <linux/init.h>
45#include <asm/types.h> 45#include <asm/types.h>
46#include <asm/atomic.h> 46#include <linux/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
@@ -55,6 +55,9 @@
55#include <net/sock.h> 55#include <net/sock.h>
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#ifdef CONFIG_SECURITY
59#include <linux/security.h>
60#endif
58#include <linux/netlink.h> 61#include <linux/netlink.h>
59#include <linux/freezer.h> 62#include <linux/freezer.h>
60#include <linux/tty.h> 63#include <linux/tty.h>
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
1502 } 1505 }
1503} 1506}
1504 1507
1508#ifdef CONFIG_SECURITY
1509/**
1510 * audit_log_secctx - Converts and logs SELinux context
1511 * @ab: audit_buffer
1512 * @secid: security number
1513 *
1514 * This is a helper function that calls security_secid_to_secctx to convert
1515 * secid to secctx and then adds the (converted) SELinux context to the audit
1516 * log by calling audit_log_format, thus also preventing leak of internal secid
1517 * to userspace. If secid cannot be converted audit_panic is called.
1518 */
1519void audit_log_secctx(struct audit_buffer *ab, u32 secid)
1520{
1521 u32 len;
1522 char *secctx;
1523
1524 if (security_secid_to_secctx(secid, &secctx, &len)) {
1525 audit_panic("Cannot convert secid to context");
1526 } else {
1527 audit_log_format(ab, " obj=%s", secctx);
1528 security_release_secctx(secctx, len);
1529 }
1530}
1531EXPORT_SYMBOL(audit_log_secctx);
1532#endif
1533
1505EXPORT_SYMBOL(audit_log_start); 1534EXPORT_SYMBOL(audit_log_start);
1506EXPORT_SYMBOL(audit_log_end); 1535EXPORT_SYMBOL(audit_log_end);
1507EXPORT_SYMBOL(audit_log_format); 1536EXPORT_SYMBOL(audit_log_format);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e99dda04b126..5bf0790497e7 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree)
93 atomic_inc(&tree->count); 93 atomic_inc(&tree->count);
94} 94}
95 95
96static void __put_tree(struct rcu_head *rcu)
97{
98 struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
99 kfree(tree);
100}
101
102static inline void put_tree(struct audit_tree *tree) 96static inline void put_tree(struct audit_tree *tree)
103{ 97{
104 if (atomic_dec_and_test(&tree->count)) 98 if (atomic_dec_and_test(&tree->count))
105 call_rcu(&tree->head, __put_tree); 99 kfree_rcu(tree, head);
106} 100}
107 101
108/* to avoid bringing the entire thing in audit.h */ 102/* to avoid bringing the entire thing in audit.h */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b33513a08beb..ce4b054acee5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -44,7 +44,7 @@
44 44
45#include <linux/init.h> 45#include <linux/init.h>
46#include <asm/types.h> 46#include <asm/types.h>
47#include <asm/atomic.h> 47#include <linux/atomic.h>
48#include <linux/fs.h> 48#include <linux/fs.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
@@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
443 443
444/* Determine if any context name data matches a rule's watch data */ 444/* Determine if any context name data matches a rule's watch data */
445/* Compare a task_struct with an audit_rule. Return 1 on match, 0 445/* Compare a task_struct with an audit_rule. Return 1 on match, 0
446 * otherwise. */ 446 * otherwise.
447 *
448 * If task_creation is true, this is an explicit indication that we are
449 * filtering a task rule at task creation time. This and tsk == current are
450 * the only situations where tsk->cred may be accessed without an rcu read lock.
451 */
447static int audit_filter_rules(struct task_struct *tsk, 452static int audit_filter_rules(struct task_struct *tsk,
448 struct audit_krule *rule, 453 struct audit_krule *rule,
449 struct audit_context *ctx, 454 struct audit_context *ctx,
450 struct audit_names *name, 455 struct audit_names *name,
451 enum audit_state *state) 456 enum audit_state *state,
457 bool task_creation)
452{ 458{
453 const struct cred *cred = get_task_cred(tsk); 459 const struct cred *cred;
454 int i, j, need_sid = 1; 460 int i, j, need_sid = 1;
455 u32 sid; 461 u32 sid;
456 462
463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
464
457 for (i = 0; i < rule->field_count; i++) { 465 for (i = 0; i < rule->field_count; i++) {
458 struct audit_field *f = &rule->fields[i]; 466 struct audit_field *f = &rule->fields[i];
459 int result = 0; 467 int result = 0;
@@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,
637 break; 645 break;
638 } 646 }
639 647
640 if (!result) { 648 if (!result)
641 put_cred(cred);
642 return 0; 649 return 0;
643 }
644 } 650 }
645 651
646 if (ctx) { 652 if (ctx) {
@@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,
656 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 662 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
657 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 663 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
658 } 664 }
659 put_cred(cred);
660 return 1; 665 return 1;
661} 666}
662 667
@@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
671 676
672 rcu_read_lock(); 677 rcu_read_lock();
673 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 678 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
674 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { 679 if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
680 &state, true)) {
675 if (state == AUDIT_RECORD_CONTEXT) 681 if (state == AUDIT_RECORD_CONTEXT)
676 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); 682 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
677 rcu_read_unlock(); 683 rcu_read_unlock();
@@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
705 list_for_each_entry_rcu(e, list, list) { 711 list_for_each_entry_rcu(e, list, list) {
706 if ((e->rule.mask[word] & bit) == bit && 712 if ((e->rule.mask[word] & bit) == bit &&
707 audit_filter_rules(tsk, &e->rule, ctx, NULL, 713 audit_filter_rules(tsk, &e->rule, ctx, NULL,
708 &state)) { 714 &state, false)) {
709 rcu_read_unlock(); 715 rcu_read_unlock();
710 ctx->current_state = state; 716 ctx->current_state = state;
711 return state; 717 return state;
@@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
743 749
744 list_for_each_entry_rcu(e, list, list) { 750 list_for_each_entry_rcu(e, list, list) {
745 if ((e->rule.mask[word] & bit) == bit && 751 if ((e->rule.mask[word] & bit) == bit &&
746 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { 752 audit_filter_rules(tsk, &e->rule, ctx, n,
753 &state, false)) {
747 rcu_read_unlock(); 754 rcu_read_unlock();
748 ctx->current_state = state; 755 ctx->current_state = state;
749 return; 756 return;
diff --git a/kernel/capability.c b/kernel/capability.c
index bf0c734d0c12..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -22,12 +22,8 @@
22 */ 22 */
23 23
24const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; 24const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
25const kernel_cap_t __cap_full_set = CAP_FULL_SET;
26const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
27 25
28EXPORT_SYMBOL(__cap_empty_set); 26EXPORT_SYMBOL(__cap_empty_set);
29EXPORT_SYMBOL(__cap_full_set);
30EXPORT_SYMBOL(__cap_init_eff_set);
31 27
32int file_caps_enabled = 1; 28int file_caps_enabled = 1;
33 29
@@ -399,3 +395,15 @@ bool task_ns_capable(struct task_struct *t, int cap)
399 return ns_capable(task_cred_xxx(t, user)->user_ns, cap); 395 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
400} 396}
401EXPORT_SYMBOL(task_ns_capable); 397EXPORT_SYMBOL(task_ns_capable);
398
399/**
400 * nsown_capable - Check superior capability to one's own user_ns
401 * @cap: The capability in question
402 *
403 * Return true if the current task has the given superior capability
404 * targeted at its own user namespace.
405 */
406bool nsown_capable(int cap)
407{
408 return ns_capable(current_user_ns(), cap);
409}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 25c7eb52de1a..1d2b6ceea95d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -27,9 +27,11 @@
27 */ 27 */
28 28
29#include <linux/cgroup.h> 29#include <linux/cgroup.h>
30#include <linux/cred.h>
30#include <linux/ctype.h> 31#include <linux/ctype.h>
31#include <linux/errno.h> 32#include <linux/errno.h>
32#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/init_task.h>
33#include <linux/kernel.h> 35#include <linux/kernel.h>
34#include <linux/list.h> 36#include <linux/list.h>
35#include <linux/mm.h> 37#include <linux/mm.h>
@@ -57,8 +59,9 @@
57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 59#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58#include <linux/eventfd.h> 60#include <linux/eventfd.h>
59#include <linux/poll.h> 61#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */
60 63
61#include <asm/atomic.h> 64#include <linux/atomic.h>
62 65
63static DEFINE_MUTEX(cgroup_mutex); 66static DEFINE_MUTEX(cgroup_mutex);
64 67
@@ -326,12 +329,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
326 return &css_set_table[index]; 329 return &css_set_table[index];
327} 330}
328 331
329static void free_css_set_rcu(struct rcu_head *obj)
330{
331 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
332 kfree(cg);
333}
334
335/* We don't maintain the lists running through each css_set to its 332/* We don't maintain the lists running through each css_set to its
336 * task until after the first call to cgroup_iter_start(). This 333 * task until after the first call to cgroup_iter_start(). This
337 * reduces the fork()/exit() overhead for people who have cgroups 334 * reduces the fork()/exit() overhead for people who have cgroups
@@ -375,7 +372,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
375 } 372 }
376 373
377 write_unlock(&css_set_lock); 374 write_unlock(&css_set_lock);
378 call_rcu(&cg->rcu_head, free_css_set_rcu); 375 kfree_rcu(cg, rcu_head);
379} 376}
380 377
381/* 378/*
@@ -812,13 +809,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
812 return ret; 809 return ret;
813} 810}
814 811
815static void free_cgroup_rcu(struct rcu_head *obj)
816{
817 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
818
819 kfree(cgrp);
820}
821
822static void cgroup_diput(struct dentry *dentry, struct inode *inode) 812static void cgroup_diput(struct dentry *dentry, struct inode *inode)
823{ 813{
824 /* is dentry a directory ? if so, kfree() associated cgroup */ 814 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -856,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
856 */ 846 */
857 BUG_ON(!list_empty(&cgrp->pidlists)); 847 BUG_ON(!list_empty(&cgrp->pidlists));
858 848
859 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 849 kfree_rcu(cgrp, rcu_head);
860 } 850 }
861 iput(inode); 851 iput(inode);
862} 852}
@@ -1526,6 +1516,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1526 struct cgroup *root_cgrp = &root->top_cgroup; 1516 struct cgroup *root_cgrp = &root->top_cgroup;
1527 struct inode *inode; 1517 struct inode *inode;
1528 struct cgroupfs_root *existing_root; 1518 struct cgroupfs_root *existing_root;
1519 const struct cred *cred;
1529 int i; 1520 int i;
1530 1521
1531 BUG_ON(sb->s_root != NULL); 1522 BUG_ON(sb->s_root != NULL);
@@ -1605,7 +1596,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1605 BUG_ON(!list_empty(&root_cgrp->children)); 1596 BUG_ON(!list_empty(&root_cgrp->children));
1606 BUG_ON(root->number_of_cgroups != 1); 1597 BUG_ON(root->number_of_cgroups != 1);
1607 1598
1599 cred = override_creds(&init_cred);
1608 cgroup_populate_dir(root_cgrp); 1600 cgroup_populate_dir(root_cgrp);
1601 revert_creds(cred);
1609 mutex_unlock(&cgroup_mutex); 1602 mutex_unlock(&cgroup_mutex);
1610 mutex_unlock(&inode->i_mutex); 1603 mutex_unlock(&inode->i_mutex);
1611 } else { 1604 } else {
@@ -1709,7 +1702,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1709{ 1702{
1710 char *start; 1703 char *start;
1711 struct dentry *dentry = rcu_dereference_check(cgrp->dentry, 1704 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1712 rcu_read_lock_held() ||
1713 cgroup_lock_is_held()); 1705 cgroup_lock_is_held());
1714 1706
1715 if (!dentry || cgrp == dummytop) { 1707 if (!dentry || cgrp == dummytop) {
@@ -1735,7 +1727,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1735 break; 1727 break;
1736 1728
1737 dentry = rcu_dereference_check(cgrp->dentry, 1729 dentry = rcu_dereference_check(cgrp->dentry,
1738 rcu_read_lock_held() ||
1739 cgroup_lock_is_held()); 1730 cgroup_lock_is_held());
1740 if (!cgrp->parent) 1731 if (!cgrp->parent)
1741 continue; 1732 continue;
@@ -1748,6 +1739,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1748} 1739}
1749EXPORT_SYMBOL_GPL(cgroup_path); 1740EXPORT_SYMBOL_GPL(cgroup_path);
1750 1741
1742/*
1743 * cgroup_task_migrate - move a task from one cgroup to another.
1744 *
1745 * 'guarantee' is set if the caller promises that a new css_set for the task
1746 * will already exist. If not set, this function might sleep, and can fail with
1747 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
1748 */
1749static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1750 struct task_struct *tsk, bool guarantee)
1751{
1752 struct css_set *oldcg;
1753 struct css_set *newcg;
1754
1755 /*
1756 * get old css_set. we need to take task_lock and refcount it, because
1757 * an exiting task can change its css_set to init_css_set and drop its
1758 * old one without taking cgroup_mutex.
1759 */
1760 task_lock(tsk);
1761 oldcg = tsk->cgroups;
1762 get_css_set(oldcg);
1763 task_unlock(tsk);
1764
1765 /* locate or allocate a new css_set for this task. */
1766 if (guarantee) {
1767 /* we know the css_set we want already exists. */
1768 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1769 read_lock(&css_set_lock);
1770 newcg = find_existing_css_set(oldcg, cgrp, template);
1771 BUG_ON(!newcg);
1772 get_css_set(newcg);
1773 read_unlock(&css_set_lock);
1774 } else {
1775 might_sleep();
1776 /* find_css_set will give us newcg already referenced. */
1777 newcg = find_css_set(oldcg, cgrp);
1778 if (!newcg) {
1779 put_css_set(oldcg);
1780 return -ENOMEM;
1781 }
1782 }
1783 put_css_set(oldcg);
1784
1785 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1786 task_lock(tsk);
1787 if (tsk->flags & PF_EXITING) {
1788 task_unlock(tsk);
1789 put_css_set(newcg);
1790 return -ESRCH;
1791 }
1792 rcu_assign_pointer(tsk->cgroups, newcg);
1793 task_unlock(tsk);
1794
1795 /* Update the css_set linked lists if we're using them */
1796 write_lock(&css_set_lock);
1797 if (!list_empty(&tsk->cg_list))
1798 list_move(&tsk->cg_list, &newcg->tasks);
1799 write_unlock(&css_set_lock);
1800
1801 /*
1802 * We just gained a reference on oldcg by taking it from the task. As
1803 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1804 * it here; it will be freed under RCU.
1805 */
1806 put_css_set(oldcg);
1807
1808 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1809 return 0;
1810}
1811
1751/** 1812/**
1752 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1813 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1753 * @cgrp: the cgroup the task is attaching to 1814 * @cgrp: the cgroup the task is attaching to
@@ -1758,11 +1819,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1758 */ 1819 */
1759int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1820int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1760{ 1821{
1761 int retval = 0; 1822 int retval;
1762 struct cgroup_subsys *ss, *failed_ss = NULL; 1823 struct cgroup_subsys *ss, *failed_ss = NULL;
1763 struct cgroup *oldcgrp; 1824 struct cgroup *oldcgrp;
1764 struct css_set *cg;
1765 struct css_set *newcg;
1766 struct cgroupfs_root *root = cgrp->root; 1825 struct cgroupfs_root *root = cgrp->root;
1767 1826
1768 /* Nothing to do if the task is already in that cgroup */ 1827 /* Nothing to do if the task is already in that cgroup */
@@ -1772,7 +1831,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1772 1831
1773 for_each_subsys(root, ss) { 1832 for_each_subsys(root, ss) {
1774 if (ss->can_attach) { 1833 if (ss->can_attach) {
1775 retval = ss->can_attach(ss, cgrp, tsk, false); 1834 retval = ss->can_attach(ss, cgrp, tsk);
1776 if (retval) { 1835 if (retval) {
1777 /* 1836 /*
1778 * Remember on which subsystem the can_attach() 1837 * Remember on which subsystem the can_attach()
@@ -1784,46 +1843,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1784 goto out; 1843 goto out;
1785 } 1844 }
1786 } 1845 }
1846 if (ss->can_attach_task) {
1847 retval = ss->can_attach_task(cgrp, tsk);
1848 if (retval) {
1849 failed_ss = ss;
1850 goto out;
1851 }
1852 }
1787 } 1853 }
1788 1854
1789 task_lock(tsk); 1855 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1790 cg = tsk->cgroups; 1856 if (retval)
1791 get_css_set(cg);
1792 task_unlock(tsk);
1793 /*
1794 * Locate or allocate a new css_set for this task,
1795 * based on its final set of cgroups
1796 */
1797 newcg = find_css_set(cg, cgrp);
1798 put_css_set(cg);
1799 if (!newcg) {
1800 retval = -ENOMEM;
1801 goto out; 1857 goto out;
1802 }
1803
1804 task_lock(tsk);
1805 if (tsk->flags & PF_EXITING) {
1806 task_unlock(tsk);
1807 put_css_set(newcg);
1808 retval = -ESRCH;
1809 goto out;
1810 }
1811 rcu_assign_pointer(tsk->cgroups, newcg);
1812 task_unlock(tsk);
1813
1814 /* Update the css_set linked lists if we're using them */
1815 write_lock(&css_set_lock);
1816 if (!list_empty(&tsk->cg_list))
1817 list_move(&tsk->cg_list, &newcg->tasks);
1818 write_unlock(&css_set_lock);
1819 1858
1820 for_each_subsys(root, ss) { 1859 for_each_subsys(root, ss) {
1860 if (ss->pre_attach)
1861 ss->pre_attach(cgrp);
1862 if (ss->attach_task)
1863 ss->attach_task(cgrp, tsk);
1821 if (ss->attach) 1864 if (ss->attach)
1822 ss->attach(ss, cgrp, oldcgrp, tsk, false); 1865 ss->attach(ss, cgrp, oldcgrp, tsk);
1823 } 1866 }
1824 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1867
1825 synchronize_rcu(); 1868 synchronize_rcu();
1826 put_css_set(cg);
1827 1869
1828 /* 1870 /*
1829 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1871 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1842,7 +1884,7 @@ out:
1842 */ 1884 */
1843 break; 1885 break;
1844 if (ss->cancel_attach) 1886 if (ss->cancel_attach)
1845 ss->cancel_attach(ss, cgrp, tsk, false); 1887 ss->cancel_attach(ss, cgrp, tsk);
1846 } 1888 }
1847 } 1889 }
1848 return retval; 1890 return retval;
@@ -1873,49 +1915,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1873EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1915EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1874 1916
1875/* 1917/*
1876 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1918 * cgroup_attach_proc works in two stages, the first of which prefetches all
1877 * held. May take task_lock of task 1919 * new css_sets needed (to make sure we have enough memory before committing
1920 * to the move) and stores them in a list of entries of the following type.
1921 * TODO: possible optimization: use css_set->rcu_head for chaining instead
1922 */
1923struct cg_list_entry {
1924 struct css_set *cg;
1925 struct list_head links;
1926};
1927
1928static bool css_set_check_fetched(struct cgroup *cgrp,
1929 struct task_struct *tsk, struct css_set *cg,
1930 struct list_head *newcg_list)
1931{
1932 struct css_set *newcg;
1933 struct cg_list_entry *cg_entry;
1934 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1935
1936 read_lock(&css_set_lock);
1937 newcg = find_existing_css_set(cg, cgrp, template);
1938 if (newcg)
1939 get_css_set(newcg);
1940 read_unlock(&css_set_lock);
1941
1942 /* doesn't exist at all? */
1943 if (!newcg)
1944 return false;
1945 /* see if it's already in the list */
1946 list_for_each_entry(cg_entry, newcg_list, links) {
1947 if (cg_entry->cg == newcg) {
1948 put_css_set(newcg);
1949 return true;
1950 }
1951 }
1952
1953 /* not found */
1954 put_css_set(newcg);
1955 return false;
1956}
1957
1958/*
1959 * Find the new css_set and store it in the list in preparation for moving the
1960 * given task to the given cgroup. Returns 0 or -ENOMEM.
1961 */
1962static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1963 struct list_head *newcg_list)
1964{
1965 struct css_set *newcg;
1966 struct cg_list_entry *cg_entry;
1967
1968 /* ensure a new css_set will exist for this thread */
1969 newcg = find_css_set(cg, cgrp);
1970 if (!newcg)
1971 return -ENOMEM;
1972 /* add it to the list */
1973 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
1974 if (!cg_entry) {
1975 put_css_set(newcg);
1976 return -ENOMEM;
1977 }
1978 cg_entry->cg = newcg;
1979 list_add(&cg_entry->links, newcg_list);
1980 return 0;
1981}
1982
1983/**
1984 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
1985 * @cgrp: the cgroup to attach to
1986 * @leader: the threadgroup leader task_struct of the group to be attached
1987 *
1988 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
1989 * take task_lock of each thread in leader's threadgroup individually in turn.
1990 */
1991int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1992{
1993 int retval, i, group_size;
1994 struct cgroup_subsys *ss, *failed_ss = NULL;
1995 bool cancel_failed_ss = false;
1996 /* guaranteed to be initialized later, but the compiler needs this */
1997 struct cgroup *oldcgrp = NULL;
1998 struct css_set *oldcg;
1999 struct cgroupfs_root *root = cgrp->root;
2000 /* threadgroup list cursor and array */
2001 struct task_struct *tsk;
2002 struct flex_array *group;
2003 /*
2004 * we need to make sure we have css_sets for all the tasks we're
2005 * going to move -before- we actually start moving them, so that in
2006 * case we get an ENOMEM we can bail out before making any changes.
2007 */
2008 struct list_head newcg_list;
2009 struct cg_list_entry *cg_entry, *temp_nobe;
2010
2011 /*
2012 * step 0: in order to do expensive, possibly blocking operations for
2013 * every thread, we cannot iterate the thread group list, since it needs
2014 * rcu or tasklist locked. instead, build an array of all threads in the
2015 * group - threadgroup_fork_lock prevents new threads from appearing,
2016 * and if threads exit, this will just be an over-estimate.
2017 */
2018 group_size = get_nr_threads(leader);
2019 /* flex_array supports very large thread-groups better than kmalloc. */
2020 group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2021 GFP_KERNEL);
2022 if (!group)
2023 return -ENOMEM;
2024 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2025 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
2026 if (retval)
2027 goto out_free_group_list;
2028
2029 /* prevent changes to the threadgroup list while we take a snapshot. */
2030 rcu_read_lock();
2031 if (!thread_group_leader(leader)) {
2032 /*
2033 * a race with de_thread from another thread's exec() may strip
2034 * us of our leadership, making while_each_thread unsafe to use
2035 * on this task. if this happens, there is no choice but to
2036 * throw this task away and try again (from cgroup_procs_write);
2037 * this is "double-double-toil-and-trouble-check locking".
2038 */
2039 rcu_read_unlock();
2040 retval = -EAGAIN;
2041 goto out_free_group_list;
2042 }
2043 /* take a reference on each task in the group to go in the array. */
2044 tsk = leader;
2045 i = 0;
2046 do {
2047 /* as per above, nr_threads may decrease, but not increase. */
2048 BUG_ON(i >= group_size);
2049 get_task_struct(tsk);
2050 /*
2051 * saying GFP_ATOMIC has no effect here because we did prealloc
2052 * earlier, but it's good form to communicate our expectations.
2053 */
2054 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2055 BUG_ON(retval != 0);
2056 i++;
2057 } while_each_thread(leader, tsk);
2058 /* remember the number of threads in the array for later. */
2059 group_size = i;
2060 rcu_read_unlock();
2061
2062 /*
2063 * step 1: check that we can legitimately attach to the cgroup.
2064 */
2065 for_each_subsys(root, ss) {
2066 if (ss->can_attach) {
2067 retval = ss->can_attach(ss, cgrp, leader);
2068 if (retval) {
2069 failed_ss = ss;
2070 goto out_cancel_attach;
2071 }
2072 }
2073 /* a callback to be run on every thread in the threadgroup. */
2074 if (ss->can_attach_task) {
2075 /* run on each task in the threadgroup. */
2076 for (i = 0; i < group_size; i++) {
2077 tsk = flex_array_get_ptr(group, i);
2078 retval = ss->can_attach_task(cgrp, tsk);
2079 if (retval) {
2080 failed_ss = ss;
2081 cancel_failed_ss = true;
2082 goto out_cancel_attach;
2083 }
2084 }
2085 }
2086 }
2087
2088 /*
2089 * step 2: make sure css_sets exist for all threads to be migrated.
2090 * we use find_css_set, which allocates a new one if necessary.
2091 */
2092 INIT_LIST_HEAD(&newcg_list);
2093 for (i = 0; i < group_size; i++) {
2094 tsk = flex_array_get_ptr(group, i);
2095 /* nothing to do if this task is already in the cgroup */
2096 oldcgrp = task_cgroup_from_root(tsk, root);
2097 if (cgrp == oldcgrp)
2098 continue;
2099 /* get old css_set pointer */
2100 task_lock(tsk);
2101 if (tsk->flags & PF_EXITING) {
2102 /* ignore this task if it's going away */
2103 task_unlock(tsk);
2104 continue;
2105 }
2106 oldcg = tsk->cgroups;
2107 get_css_set(oldcg);
2108 task_unlock(tsk);
2109 /* see if the new one for us is already in the list? */
2110 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2111 /* was already there, nothing to do. */
2112 put_css_set(oldcg);
2113 } else {
2114 /* we don't already have it. get new one. */
2115 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2116 put_css_set(oldcg);
2117 if (retval)
2118 goto out_list_teardown;
2119 }
2120 }
2121
2122 /*
2123 * step 3: now that we're guaranteed success wrt the css_sets, proceed
2124 * to move all tasks to the new cgroup, calling ss->attach_task for each
2125 * one along the way. there are no failure cases after here, so this is
2126 * the commit point.
2127 */
2128 for_each_subsys(root, ss) {
2129 if (ss->pre_attach)
2130 ss->pre_attach(cgrp);
2131 }
2132 for (i = 0; i < group_size; i++) {
2133 tsk = flex_array_get_ptr(group, i);
2134 /* leave current thread as it is if it's already there */
2135 oldcgrp = task_cgroup_from_root(tsk, root);
2136 if (cgrp == oldcgrp)
2137 continue;
2138 /* attach each task to each subsystem */
2139 for_each_subsys(root, ss) {
2140 if (ss->attach_task)
2141 ss->attach_task(cgrp, tsk);
2142 }
2143 /* if the thread is PF_EXITING, it can just get skipped. */
2144 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2145 BUG_ON(retval != 0 && retval != -ESRCH);
2146 }
2147 /* nothing is sensitive to fork() after this point. */
2148
2149 /*
2150 * step 4: do expensive, non-thread-specific subsystem callbacks.
2151 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2152 * being moved, this call will need to be reworked to communicate that.
2153 */
2154 for_each_subsys(root, ss) {
2155 if (ss->attach)
2156 ss->attach(ss, cgrp, oldcgrp, leader);
2157 }
2158
2159 /*
2160 * step 5: success! and cleanup
2161 */
2162 synchronize_rcu();
2163 cgroup_wakeup_rmdir_waiter(cgrp);
2164 retval = 0;
2165out_list_teardown:
2166 /* clean up the list of prefetched css_sets. */
2167 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2168 list_del(&cg_entry->links);
2169 put_css_set(cg_entry->cg);
2170 kfree(cg_entry);
2171 }
2172out_cancel_attach:
2173 /* same deal as in cgroup_attach_task */
2174 if (retval) {
2175 for_each_subsys(root, ss) {
2176 if (ss == failed_ss) {
2177 if (cancel_failed_ss && ss->cancel_attach)
2178 ss->cancel_attach(ss, cgrp, leader);
2179 break;
2180 }
2181 if (ss->cancel_attach)
2182 ss->cancel_attach(ss, cgrp, leader);
2183 }
2184 }
2185 /* clean up the array of referenced threads in the group. */
2186 for (i = 0; i < group_size; i++) {
2187 tsk = flex_array_get_ptr(group, i);
2188 put_task_struct(tsk);
2189 }
2190out_free_group_list:
2191 flex_array_free(group);
2192 return retval;
2193}
2194
2195/*
2196 * Find the task_struct of the task to attach by vpid and pass it along to the
2197 * function to attach either it or all tasks in its threadgroup. Will take
2198 * cgroup_mutex; may take task_lock of task.
1878 */ 2199 */
1879static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) 2200static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
1880{ 2201{
1881 struct task_struct *tsk; 2202 struct task_struct *tsk;
1882 const struct cred *cred = current_cred(), *tcred; 2203 const struct cred *cred = current_cred(), *tcred;
1883 int ret; 2204 int ret;
1884 2205
2206 if (!cgroup_lock_live_group(cgrp))
2207 return -ENODEV;
2208
1885 if (pid) { 2209 if (pid) {
1886 rcu_read_lock(); 2210 rcu_read_lock();
1887 tsk = find_task_by_vpid(pid); 2211 tsk = find_task_by_vpid(pid);
1888 if (!tsk || tsk->flags & PF_EXITING) { 2212 if (!tsk) {
1889 rcu_read_unlock(); 2213 rcu_read_unlock();
2214 cgroup_unlock();
2215 return -ESRCH;
2216 }
2217 if (threadgroup) {
2218 /*
2219 * RCU protects this access, since tsk was found in the
2220 * tid map. a race with de_thread may cause group_leader
2221 * to stop being the leader, but cgroup_attach_proc will
2222 * detect it later.
2223 */
2224 tsk = tsk->group_leader;
2225 } else if (tsk->flags & PF_EXITING) {
2226 /* optimization for the single-task-only case */
2227 rcu_read_unlock();
2228 cgroup_unlock();
1890 return -ESRCH; 2229 return -ESRCH;
1891 } 2230 }
1892 2231
2232 /*
2233 * even if we're attaching all tasks in the thread group, we
2234 * only need to check permissions on one of them.
2235 */
1893 tcred = __task_cred(tsk); 2236 tcred = __task_cred(tsk);
1894 if (cred->euid && 2237 if (cred->euid &&
1895 cred->euid != tcred->uid && 2238 cred->euid != tcred->uid &&
1896 cred->euid != tcred->suid) { 2239 cred->euid != tcred->suid) {
1897 rcu_read_unlock(); 2240 rcu_read_unlock();
2241 cgroup_unlock();
1898 return -EACCES; 2242 return -EACCES;
1899 } 2243 }
1900 get_task_struct(tsk); 2244 get_task_struct(tsk);
1901 rcu_read_unlock(); 2245 rcu_read_unlock();
1902 } else { 2246 } else {
1903 tsk = current; 2247 if (threadgroup)
2248 tsk = current->group_leader;
2249 else
2250 tsk = current;
1904 get_task_struct(tsk); 2251 get_task_struct(tsk);
1905 } 2252 }
1906 2253
1907 ret = cgroup_attach_task(cgrp, tsk); 2254 if (threadgroup) {
2255 threadgroup_fork_write_lock(tsk);
2256 ret = cgroup_attach_proc(cgrp, tsk);
2257 threadgroup_fork_write_unlock(tsk);
2258 } else {
2259 ret = cgroup_attach_task(cgrp, tsk);
2260 }
1908 put_task_struct(tsk); 2261 put_task_struct(tsk);
2262 cgroup_unlock();
1909 return ret; 2263 return ret;
1910} 2264}
1911 2265
1912static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2266static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1913{ 2267{
2268 return attach_task_by_pid(cgrp, pid, false);
2269}
2270
2271static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2272{
1914 int ret; 2273 int ret;
1915 if (!cgroup_lock_live_group(cgrp)) 2274 do {
1916 return -ENODEV; 2275 /*
1917 ret = attach_task_by_pid(cgrp, pid); 2276 * attach_proc fails with -EAGAIN if threadgroup leadership
1918 cgroup_unlock(); 2277 * changes in the middle of the operation, in which case we need
2278 * to find the task_struct for the new leader and start over.
2279 */
2280 ret = attach_task_by_pid(cgrp, tgid, true);
2281 } while (ret == -EAGAIN);
1919 return ret; 2282 return ret;
1920} 2283}
1921 2284
@@ -3182,7 +3545,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3182 } 3545 }
3183 3546
3184 /* the process need read permission on control file */ 3547 /* the process need read permission on control file */
3185 ret = file_permission(cfile, MAY_READ); 3548 /* AV: shouldn't we check that it's been opened for read instead? */
3549 ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
3186 if (ret < 0) 3550 if (ret < 0)
3187 goto fail; 3551 goto fail;
3188 3552
@@ -3272,9 +3636,9 @@ static struct cftype files[] = {
3272 { 3636 {
3273 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 3637 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
3274 .open = cgroup_procs_open, 3638 .open = cgroup_procs_open,
3275 /* .write_u64 = cgroup_procs_write, TODO */ 3639 .write_u64 = cgroup_procs_write,
3276 .release = cgroup_pidlist_release, 3640 .release = cgroup_pidlist_release,
3277 .mode = S_IRUGO, 3641 .mode = S_IRUGO | S_IWUSR,
3278 }, 3642 },
3279 { 3643 {
3280 .name = "notify_on_release", 3644 .name = "notify_on_release",
@@ -4270,122 +4634,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4270} 4634}
4271 4635
4272/** 4636/**
4273 * cgroup_clone - clone the cgroup the given subsystem is attached to
4274 * @tsk: the task to be moved
4275 * @subsys: the given subsystem
4276 * @nodename: the name for the new cgroup
4277 *
4278 * Duplicate the current cgroup in the hierarchy that the given
4279 * subsystem is attached to, and move this task into the new
4280 * child.
4281 */
4282int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
4283 char *nodename)
4284{
4285 struct dentry *dentry;
4286 int ret = 0;
4287 struct cgroup *parent, *child;
4288 struct inode *inode;
4289 struct css_set *cg;
4290 struct cgroupfs_root *root;
4291 struct cgroup_subsys *ss;
4292
4293 /* We shouldn't be called by an unregistered subsystem */
4294 BUG_ON(!subsys->active);
4295
4296 /* First figure out what hierarchy and cgroup we're dealing
4297 * with, and pin them so we can drop cgroup_mutex */
4298 mutex_lock(&cgroup_mutex);
4299 again:
4300 root = subsys->root;
4301 if (root == &rootnode) {
4302 mutex_unlock(&cgroup_mutex);
4303 return 0;
4304 }
4305
4306 /* Pin the hierarchy */
4307 if (!atomic_inc_not_zero(&root->sb->s_active)) {
4308 /* We race with the final deactivate_super() */
4309 mutex_unlock(&cgroup_mutex);
4310 return 0;
4311 }
4312
4313 /* Keep the cgroup alive */
4314 task_lock(tsk);
4315 parent = task_cgroup(tsk, subsys->subsys_id);
4316 cg = tsk->cgroups;
4317 get_css_set(cg);
4318 task_unlock(tsk);
4319
4320 mutex_unlock(&cgroup_mutex);
4321
4322 /* Now do the VFS work to create a cgroup */
4323 inode = parent->dentry->d_inode;
4324
4325 /* Hold the parent directory mutex across this operation to
4326 * stop anyone else deleting the new cgroup */
4327 mutex_lock(&inode->i_mutex);
4328 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
4329 if (IS_ERR(dentry)) {
4330 printk(KERN_INFO
4331 "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
4332 PTR_ERR(dentry));
4333 ret = PTR_ERR(dentry);
4334 goto out_release;
4335 }
4336
4337 /* Create the cgroup directory, which also creates the cgroup */
4338 ret = vfs_mkdir(inode, dentry, 0755);
4339 child = __d_cgrp(dentry);
4340 dput(dentry);
4341 if (ret) {
4342 printk(KERN_INFO
4343 "Failed to create cgroup %s: %d\n", nodename,
4344 ret);
4345 goto out_release;
4346 }
4347
4348 /* The cgroup now exists. Retake cgroup_mutex and check
4349 * that we're still in the same state that we thought we
4350 * were. */
4351 mutex_lock(&cgroup_mutex);
4352 if ((root != subsys->root) ||
4353 (parent != task_cgroup(tsk, subsys->subsys_id))) {
4354 /* Aargh, we raced ... */
4355 mutex_unlock(&inode->i_mutex);
4356 put_css_set(cg);
4357
4358 deactivate_super(root->sb);
4359 /* The cgroup is still accessible in the VFS, but
4360 * we're not going to try to rmdir() it at this
4361 * point. */
4362 printk(KERN_INFO
4363 "Race in cgroup_clone() - leaking cgroup %s\n",
4364 nodename);
4365 goto again;
4366 }
4367
4368 /* do any required auto-setup */
4369 for_each_subsys(root, ss) {
4370 if (ss->post_clone)
4371 ss->post_clone(ss, child);
4372 }
4373
4374 /* All seems fine. Finish by moving the task into the new cgroup */
4375 ret = cgroup_attach_task(child, tsk);
4376 mutex_unlock(&cgroup_mutex);
4377
4378 out_release:
4379 mutex_unlock(&inode->i_mutex);
4380
4381 mutex_lock(&cgroup_mutex);
4382 put_css_set(cg);
4383 mutex_unlock(&cgroup_mutex);
4384 deactivate_super(root->sb);
4385 return ret;
4386}
4387
4388/**
4389 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp 4637 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
4390 * @cgrp: the cgroup in question 4638 * @cgrp: the cgroup in question
4391 * @task: the task in question 4639 * @task: the task in question
@@ -4569,8 +4817,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
4569 * on this or this is under rcu_read_lock(). Once css->id is allocated, 4817 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4570 * it's unchanged until freed. 4818 * it's unchanged until freed.
4571 */ 4819 */
4572 cssid = rcu_dereference_check(css->id, 4820 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4573 rcu_read_lock_held() || atomic_read(&css->refcnt));
4574 4821
4575 if (cssid) 4822 if (cssid)
4576 return cssid->id; 4823 return cssid->id;
@@ -4582,8 +4829,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4582{ 4829{
4583 struct css_id *cssid; 4830 struct css_id *cssid;
4584 4831
4585 cssid = rcu_dereference_check(css->id, 4832 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4586 rcu_read_lock_held() || atomic_read(&css->refcnt));
4587 4833
4588 if (cssid) 4834 if (cssid)
4589 return cssid->depth; 4835 return cssid->depth;
@@ -4623,14 +4869,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
4623 return ret; 4869 return ret;
4624} 4870}
4625 4871
4626static void __free_css_id_cb(struct rcu_head *head)
4627{
4628 struct css_id *id;
4629
4630 id = container_of(head, struct css_id, rcu_head);
4631 kfree(id);
4632}
4633
4634void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4872void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4635{ 4873{
4636 struct css_id *id = css->id; 4874 struct css_id *id = css->id;
@@ -4645,7 +4883,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4645 spin_lock(&ss->id_lock); 4883 spin_lock(&ss->id_lock);
4646 idr_remove(&ss->idr, id->id); 4884 idr_remove(&ss->idr, id->id);
4647 spin_unlock(&ss->id_lock); 4885 spin_unlock(&ss->id_lock);
4648 call_rcu(&id->rcu_head, __free_css_id_cb); 4886 kfree_rcu(id, rcu_head);
4649} 4887}
4650EXPORT_SYMBOL_GPL(free_css_id); 4888EXPORT_SYMBOL_GPL(free_css_id);
4651 4889
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7c6c38..e691818d7e45 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
160 */ 160 */
161static int freezer_can_attach(struct cgroup_subsys *ss, 161static int freezer_can_attach(struct cgroup_subsys *ss,
162 struct cgroup *new_cgroup, 162 struct cgroup *new_cgroup,
163 struct task_struct *task, bool threadgroup) 163 struct task_struct *task)
164{ 164{
165 struct freezer *freezer; 165 struct freezer *freezer;
166 166
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
172 if (freezer->state != CGROUP_THAWED) 172 if (freezer->state != CGROUP_THAWED)
173 return -EBUSY; 173 return -EBUSY;
174 174
175 return 0;
176}
177
178static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
179{
175 rcu_read_lock(); 180 rcu_read_lock();
176 if (__cgroup_freezing_or_frozen(task)) { 181 if (__cgroup_freezing_or_frozen(tsk)) {
177 rcu_read_unlock(); 182 rcu_read_unlock();
178 return -EBUSY; 183 return -EBUSY;
179 } 184 }
180 rcu_read_unlock(); 185 rcu_read_unlock();
181
182 if (threadgroup) {
183 struct task_struct *c;
184
185 rcu_read_lock();
186 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
187 if (__cgroup_freezing_or_frozen(c)) {
188 rcu_read_unlock();
189 return -EBUSY;
190 }
191 }
192 rcu_read_unlock();
193 }
194
195 return 0; 186 return 0;
196} 187}
197 188
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
390 .populate = freezer_populate, 381 .populate = freezer_populate,
391 .subsys_id = freezer_subsys_id, 382 .subsys_id = freezer_subsys_id,
392 .can_attach = freezer_can_attach, 383 .can_attach = freezer_can_attach,
384 .can_attach_task = freezer_can_attach_task,
385 .pre_attach = NULL,
386 .attach_task = NULL,
393 .attach = NULL, 387 .attach = NULL,
394 .fork = freezer_fork, 388 .fork = freezer_fork,
395 .exit = NULL, 389 .exit = NULL,
diff --git a/kernel/compat.c b/kernel/compat.c
index 38b1d2c1cbe8..e2435ee9993a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -158,6 +158,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
158 __put_user(ts->tv_sec, &cts->tv_sec) || 158 __put_user(ts->tv_sec, &cts->tv_sec) ||
159 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 159 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
160} 160}
161EXPORT_SYMBOL_GPL(put_compat_timespec);
161 162
162static long compat_nanosleep_restart(struct restart_block *restart) 163static long compat_nanosleep_restart(struct restart_block *restart)
163{ 164{
@@ -293,6 +294,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
293 return compat_jiffies_to_clock_t(jiffies); 294 return compat_jiffies_to_clock_t(jiffies);
294} 295}
295 296
297#ifdef __ARCH_WANT_SYS_SIGPENDING
298
296/* 299/*
297 * Assumption: old_sigset_t and compat_old_sigset_t are both 300 * Assumption: old_sigset_t and compat_old_sigset_t are both
298 * types that can be passed to put_user()/get_user(). 301 * types that can be passed to put_user()/get_user().
@@ -312,6 +315,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
312 return ret; 315 return ret;
313} 316}
314 317
318#endif
319
320#ifdef __ARCH_WANT_SYS_SIGPROCMASK
321
315asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, 322asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
316 compat_old_sigset_t __user *oset) 323 compat_old_sigset_t __user *oset)
317{ 324{
@@ -333,6 +340,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
333 return ret; 340 return ret;
334} 341}
335 342
343#endif
344
336asmlinkage long compat_sys_setrlimit(unsigned int resource, 345asmlinkage long compat_sys_setrlimit(unsigned int resource,
337 struct compat_rlimit __user *rlim) 346 struct compat_rlimit __user *rlim)
338{ 347{
@@ -882,6 +891,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
882 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 891 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
883 } 892 }
884} 893}
894EXPORT_SYMBOL_GPL(sigset_from_compat);
885 895
886asmlinkage long 896asmlinkage long
887compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 897compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
@@ -890,10 +900,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
890{ 900{
891 compat_sigset_t s32; 901 compat_sigset_t s32;
892 sigset_t s; 902 sigset_t s;
893 int sig;
894 struct timespec t; 903 struct timespec t;
895 siginfo_t info; 904 siginfo_t info;
896 long ret, timeout = 0; 905 long ret;
897 906
898 if (sigsetsize != sizeof(sigset_t)) 907 if (sigsetsize != sizeof(sigset_t))
899 return -EINVAL; 908 return -EINVAL;
@@ -901,51 +910,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
901 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) 910 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
902 return -EFAULT; 911 return -EFAULT;
903 sigset_from_compat(&s, &s32); 912 sigset_from_compat(&s, &s32);
904 sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
905 signotset(&s);
906 913
907 if (uts) { 914 if (uts) {
908 if (get_compat_timespec (&t, uts)) 915 if (get_compat_timespec(&t, uts))
909 return -EFAULT; 916 return -EFAULT;
910 if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
911 || t.tv_sec < 0)
912 return -EINVAL;
913 } 917 }
914 918
915 spin_lock_irq(&current->sighand->siglock); 919 ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
916 sig = dequeue_signal(current, &s, &info);
917 if (!sig) {
918 timeout = MAX_SCHEDULE_TIMEOUT;
919 if (uts)
920 timeout = timespec_to_jiffies(&t)
921 +(t.tv_sec || t.tv_nsec);
922 if (timeout) {
923 current->real_blocked = current->blocked;
924 sigandsets(&current->blocked, &current->blocked, &s);
925
926 recalc_sigpending();
927 spin_unlock_irq(&current->sighand->siglock);
928
929 timeout = schedule_timeout_interruptible(timeout);
930
931 spin_lock_irq(&current->sighand->siglock);
932 sig = dequeue_signal(current, &s, &info);
933 current->blocked = current->real_blocked;
934 siginitset(&current->real_blocked, 0);
935 recalc_sigpending();
936 }
937 }
938 spin_unlock_irq(&current->sighand->siglock);
939 920
940 if (sig) { 921 if (ret > 0 && uinfo) {
941 ret = sig; 922 if (copy_siginfo_to_user32(uinfo, &info))
942 if (uinfo) { 923 ret = -EFAULT;
943 if (copy_siginfo_to_user32(uinfo, &info))
944 ret = -EFAULT;
945 }
946 }else {
947 ret = timeout?-EINTR:-EAGAIN;
948 } 924 }
925
949 return ret; 926 return ret;
950 927
951} 928}
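
The hunk above strips the open-coded wait loop (siglock, dequeue_signal(), schedule_timeout_interruptible(), the mask juggling) out of compat_sys_rt_sigtimedwait() and leaves only the 32-bit conversions around a single do_sigtimedwait() call. For reference, a small userspace program showing the semantics that path provides to callers, written against the POSIX sigtimedwait() interface (illustrative only, not kernel code):

#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec timeout = { .tv_sec = 2, .tv_nsec = 0 };

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	/* Block SIGUSR1 so it is consumed by sigtimedwait() rather than a handler. */
	sigprocmask(SIG_BLOCK, &set, NULL);

	int sig = sigtimedwait(&set, &info, &timeout);
	if (sig == SIGUSR1)
		printf("got SIGUSR1 from pid %d\n", (int)info.si_pid);
	else if (sig == -1 && errno == EAGAIN)
		printf("timed out after 2s\n");	/* the -EAGAIN case of the old loop */
	else
		perror("sigtimedwait");
	return 0;
}

A positive return value is the delivered signal (copied out via copy_siginfo_to_user32() in the compat path); a timeout surfaces as EAGAIN, matching what the removed open-coded loop used to compute by hand.
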
@@ -1016,11 +993,8 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
1016 sigset_from_compat(&newset, &newset32); 993 sigset_from_compat(&newset, &newset32);
1017 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 994 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1018 995
1019 spin_lock_irq(&current->sighand->siglock);
1020 current->saved_sigmask = current->blocked; 996 current->saved_sigmask = current->blocked;
1021 current->blocked = newset; 997 set_current_blocked(&newset);
1022 recalc_sigpending();
1023 spin_unlock_irq(&current->sighand->siglock);
1024 998
1025 current->state = TASK_INTERRUPTIBLE; 999 current->state = TASK_INTERRUPTIBLE;
1026 schedule(); 1000 schedule();
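
Likewise, compat_sys_rt_sigsuspend() above now installs the new mask through set_current_blocked() instead of taking siglock and calling recalc_sigpending() itself; SIGKILL and SIGSTOP are still stripped from the mask first. A loose userspace analogue of what the syscall gives its caller, namely atomically swapping the signal mask and sleeping until a signal arrives, using plain POSIX calls (not kernel code):

#define _POSIX_C_SOURCE 200809L
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void handler(int sig) { (void)sig; /* just interrupt sigsuspend() */ }

int main(void)
{
	sigset_t block, waitmask;
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = handler;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);

	/* Normally keep SIGUSR1 blocked... */
	sigemptyset(&block);
	sigaddset(&block, SIGUSR1);
	sigprocmask(SIG_BLOCK, &block, NULL);

	/* ...then atomically install an empty mask and sleep, as rt_sigsuspend
	 * does; the original mask is restored when sigsuspend() returns. */
	sigemptyset(&waitmask);
	printf("pid %d waiting, send SIGUSR1\n", (int)getpid());
	sigsuspend(&waitmask);
	puts("woken by a signal; old mask restored");
	return 0;
}
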
diff --git a/kernel/configs.c b/kernel/configs.c
index b4066b44a99d..42e8fa075eed 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -92,8 +92,8 @@ static void __exit ikconfig_cleanup(void)
92module_init(ikconfig_init); 92module_init(ikconfig_init);
93module_exit(ikconfig_cleanup); 93module_exit(ikconfig_cleanup);
94 94
95#endif /* CONFIG_IKCONFIG_PROC */
96
95MODULE_LICENSE("GPL"); 97MODULE_LICENSE("GPL");
96MODULE_AUTHOR("Randy Dunlap"); 98MODULE_AUTHOR("Randy Dunlap");
97MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); 99MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
98
99#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 33eee16addb8..10131fdaff70 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -55,7 +55,7 @@
55#include <linux/sort.h> 55#include <linux/sort.h>
56 56
57#include <asm/uaccess.h> 57#include <asm/uaccess.h>
58#include <asm/atomic.h> 58#include <linux/atomic.h>
59#include <linux/mutex.h> 59#include <linux/mutex.h>
60#include <linux/workqueue.h> 60#include <linux/workqueue.h>
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
@@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
1159static int update_relax_domain_level(struct cpuset *cs, s64 val) 1159static int update_relax_domain_level(struct cpuset *cs, s64 val)
1160{ 1160{
1161#ifdef CONFIG_SMP 1161#ifdef CONFIG_SMP
1162 if (val < -1 || val >= SD_LV_MAX) 1162 if (val < -1 || val >= sched_domain_level_max)
1163 return -EINVAL; 1163 return -EINVAL;
1164#endif 1164#endif
1165 1165
@@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1367 return val; 1367 return val;
1368} 1368}
1369 1369
1370/* Protected by cgroup_lock */
1371static cpumask_var_t cpus_attach;
1372
1373/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1370/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1374static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1371static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1375 struct task_struct *tsk, bool threadgroup) 1372 struct task_struct *tsk)
1376{ 1373{
1377 int ret;
1378 struct cpuset *cs = cgroup_cs(cont); 1374 struct cpuset *cs = cgroup_cs(cont);
1379 1375
1380 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1391 if (tsk->flags & PF_THREAD_BOUND) 1387 if (tsk->flags & PF_THREAD_BOUND)
1392 return -EINVAL; 1388 return -EINVAL;
1393 1389
1394 ret = security_task_setscheduler(tsk);
1395 if (ret)
1396 return ret;
1397 if (threadgroup) {
1398 struct task_struct *c;
1399
1400 rcu_read_lock();
1401 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1402 ret = security_task_setscheduler(c);
1403 if (ret) {
1404 rcu_read_unlock();
1405 return ret;
1406 }
1407 }
1408 rcu_read_unlock();
1409 }
1410 return 0; 1390 return 0;
1411} 1391}
1412 1392
1413static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, 1393static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1414 struct cpuset *cs) 1394{
1395 return security_task_setscheduler(task);
1396}
1397
1398/*
1399 * Protected by cgroup_lock. The nodemasks must be stored globally because
1400 * dynamically allocating them is not allowed in pre_attach, and they must
1401 * persist among pre_attach, attach_task, and attach.
1402 */
1403static cpumask_var_t cpus_attach;
1404static nodemask_t cpuset_attach_nodemask_from;
1405static nodemask_t cpuset_attach_nodemask_to;
1406
1407/* Set-up work for before attaching each task. */
1408static void cpuset_pre_attach(struct cgroup *cont)
1409{
1410 struct cpuset *cs = cgroup_cs(cont);
1411
1412 if (cs == &top_cpuset)
1413 cpumask_copy(cpus_attach, cpu_possible_mask);
1414 else
1415 guarantee_online_cpus(cs, cpus_attach);
1416
1417 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1418}
1419
1420/* Per-thread attachment work. */
1421static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1415{ 1422{
1416 int err; 1423 int err;
1424 struct cpuset *cs = cgroup_cs(cont);
1425
1417 /* 1426 /*
1418 * can_attach beforehand should guarantee that this doesn't fail. 1427 * can_attach beforehand should guarantee that this doesn't fail.
1419 * TODO: have a better way to handle failure here 1428 * TODO: have a better way to handle failure here
@@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1421 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1430 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1422 WARN_ON_ONCE(err); 1431 WARN_ON_ONCE(err);
1423 1432
1424 cpuset_change_task_nodemask(tsk, to); 1433 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1425 cpuset_update_task_spread_flag(cs, tsk); 1434 cpuset_update_task_spread_flag(cs, tsk);
1426
1427} 1435}
1428 1436
1429static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1437static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1430 struct cgroup *oldcont, struct task_struct *tsk, 1438 struct cgroup *oldcont, struct task_struct *tsk)
1431 bool threadgroup)
1432{ 1439{
1433 struct mm_struct *mm; 1440 struct mm_struct *mm;
1434 struct cpuset *cs = cgroup_cs(cont); 1441 struct cpuset *cs = cgroup_cs(cont);
1435 struct cpuset *oldcs = cgroup_cs(oldcont); 1442 struct cpuset *oldcs = cgroup_cs(oldcont);
1436 static nodemask_t to; /* protected by cgroup_mutex */
1437
1438 if (cs == &top_cpuset) {
1439 cpumask_copy(cpus_attach, cpu_possible_mask);
1440 } else {
1441 guarantee_online_cpus(cs, cpus_attach);
1442 }
1443 guarantee_online_mems(cs, &to);
1444
1445 /* do per-task migration stuff possibly for each in the threadgroup */
1446 cpuset_attach_task(tsk, &to, cs);
1447 if (threadgroup) {
1448 struct task_struct *c;
1449 rcu_read_lock();
1450 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1451 cpuset_attach_task(c, &to, cs);
1452 }
1453 rcu_read_unlock();
1454 }
1455 1443
1456 /* change mm; only needs to be done once even if threadgroup */ 1444 /*
1457 to = cs->mems_allowed; 1445 * Change mm, possibly for multiple threads in a threadgroup. This is
1446 * expensive and may sleep.
1447 */
1448 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1449 cpuset_attach_nodemask_to = cs->mems_allowed;
1458 mm = get_task_mm(tsk); 1450 mm = get_task_mm(tsk);
1459 if (mm) { 1451 if (mm) {
1460 mpol_rebind_mm(mm, &to); 1452 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1461 if (is_memory_migrate(cs)) 1453 if (is_memory_migrate(cs))
1462 cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); 1454 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
1455 &cpuset_attach_nodemask_to);
1463 mmput(mm); 1456 mmput(mm);
1464 } 1457 }
1465} 1458}
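
The cpuset hunks above split attachment into pre_attach() (resolve cpus_attach and the target nodemask once per operation, using pre-allocated file-scope storage because pre_attach may not allocate), attach_task() (cheap per-thread work), and attach() (the one-off, possibly sleeping mm rebind/migration). A standalone sketch of that call order, with every name below invented purely for illustration:

#include <stdio.h>

struct thread { int tid; };
struct group  { const char *name; struct thread threads[3]; int nr; };

/* Stand-ins for cpus_attach / cpuset_attach_nodemask_to: file-scope, mirroring
 * the "no allocation in pre_attach, must persist until attach" rule above. */
static unsigned attach_cpus;
static unsigned attach_mems;

static void pre_attach(struct group *g)
{
	attach_cpus = 0xf;	/* pretend guarantee_online_cpus() result */
	attach_mems = 0x3;	/* pretend guarantee_online_mems() result */
	printf("pre_attach(%s)\n", g->name);
}

static void attach_task(struct thread *t)
{
	printf("  attach_task(tid=%d): cpus=%#x mems=%#x\n",
	       t->tid, attach_cpus, attach_mems);
}

static void attach(struct group *g)
{
	printf("attach(%s): rebind/migrate the mm once\n", g->name);
}

int main(void)
{
	struct group g = { "demo", { { 101 }, { 102 }, { 103 } }, 3 };

	pre_attach(&g);				/* once per attach operation */
	for (int i = 0; i < g.nr; i++)
		attach_task(&g.threads[i]);	/* once per thread */
	attach(&g);				/* once, after all threads */
	return 0;
}
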
@@ -1809,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1809} 1802}
1810 1803
1811/* 1804/*
1812 * post_clone() is called at the end of cgroup_clone(). 1805 * post_clone() is called during cgroup_create() when the
1813 * 'cgroup' was just created automatically as a result of 1806 * clone_children mount argument was specified. The cgroup
1814 * a cgroup_clone(), and the current task is about to 1807 * can not yet have any tasks.
1815 * be moved into 'cgroup'.
1816 * 1808 *
1817 * Currently we refuse to set up the cgroup - thereby 1809 * Currently we refuse to set up the cgroup - thereby
1818 * refusing the task to be entered, and as a result refusing 1810 * refusing the task to be entered, and as a result refusing
@@ -1911,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
1911 .create = cpuset_create, 1903 .create = cpuset_create,
1912 .destroy = cpuset_destroy, 1904 .destroy = cpuset_destroy,
1913 .can_attach = cpuset_can_attach, 1905 .can_attach = cpuset_can_attach,
1906 .can_attach_task = cpuset_can_attach_task,
1907 .pre_attach = cpuset_pre_attach,
1908 .attach_task = cpuset_attach_task,
1914 .attach = cpuset_attach, 1909 .attach = cpuset_attach,
1915 .populate = cpuset_populate, 1910 .populate = cpuset_populate,
1916 .post_clone = cpuset_post_clone, 1911 .post_clone = cpuset_post_clone,
@@ -2195,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2195 rcu_read_lock(); 2190 rcu_read_lock();
2196 cs = task_cs(tsk); 2191 cs = task_cs(tsk);
2197 if (cs) 2192 if (cs)
2198 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); 2193 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2199 rcu_read_unlock(); 2194 rcu_read_unlock();
2200 2195
2201 /* 2196 /*
@@ -2222,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2222 * Like above we can temporary set any mask and rely on 2217 * Like above we can temporary set any mask and rely on
2223 * set_cpus_allowed_ptr() as synchronization point. 2218 * set_cpus_allowed_ptr() as synchronization point.
2224 */ 2219 */
2225 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); 2220 do_set_cpus_allowed(tsk, cpu_possible_mask);
2226 cpu = cpumask_any(cpu_active_mask); 2221 cpu = cpumask_any(cpu_active_mask);
2227 } 2222 }
2228 2223
@@ -2465,11 +2460,19 @@ static int cpuset_spread_node(int *rotor)
2465 2460
2466int cpuset_mem_spread_node(void) 2461int cpuset_mem_spread_node(void)
2467{ 2462{
2463 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
2464 current->cpuset_mem_spread_rotor =
2465 node_random(&current->mems_allowed);
2466
2468 return cpuset_spread_node(&current->cpuset_mem_spread_rotor); 2467 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2469} 2468}
2470 2469
2471int cpuset_slab_spread_node(void) 2470int cpuset_slab_spread_node(void)
2472{ 2471{
2472 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
2473 current->cpuset_slab_spread_rotor =
2474 node_random(&current->mems_allowed);
2475
2473 return cpuset_spread_node(&current->cpuset_slab_spread_rotor); 2476 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2474} 2477}
2475 2478
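
cpuset_mem_spread_node() and cpuset_slab_spread_node() above now seed an uninitialised rotor (NUMA_NO_NODE) with node_random(&current->mems_allowed), a random node taken from the allowed mask, so different tasks do not all start spreading from the same node. A toy userspace sketch of picking a random set bit from a mask (illustrative only; node_random() itself is a nodemask helper, not this code):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Pick a random set bit from 'mask', or -1 if the mask is empty
 * (a toy stand-in for node_random(&mems_allowed)). */
static int mask_random(unsigned long mask)
{
	int set[sizeof(mask) * 8];
	int n = 0;

	for (int bit = 0; bit < (int)(sizeof(mask) * 8); bit++)
		if (mask & (1UL << bit))
			set[n++] = bit;
	return n ? set[rand() % n] : -1;
}

int main(void)
{
	unsigned long mems_allowed = 0x2c;	/* nodes 2, 3 and 5 */

	srand((unsigned)time(NULL));
	for (int i = 0; i < 4; i++)
		printf("rotor seeded at node %d\n", mask_random(mems_allowed));
	return 0;
}
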
diff --git a/kernel/cred.c b/kernel/cred.c
index 5557b55048df..8ef31f53c44c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
1/* Task credentials management - see Documentation/credentials.txt 1/* Task credentials management - see Documentation/security/credentials.txt
2 * 2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -49,11 +49,12 @@ struct cred init_cred = {
49 .magic = CRED_MAGIC, 49 .magic = CRED_MAGIC,
50#endif 50#endif
51 .securebits = SECUREBITS_DEFAULT, 51 .securebits = SECUREBITS_DEFAULT,
52 .cap_inheritable = CAP_INIT_INH_SET, 52 .cap_inheritable = CAP_EMPTY_SET,
53 .cap_permitted = CAP_FULL_SET, 53 .cap_permitted = CAP_FULL_SET,
54 .cap_effective = CAP_INIT_EFF_SET, 54 .cap_effective = CAP_FULL_SET,
55 .cap_bset = CAP_INIT_BSET, 55 .cap_bset = CAP_FULL_SET,
56 .user = INIT_USER, 56 .user = INIT_USER,
57 .user_ns = &init_user_ns,
57 .group_info = &init_groups, 58 .group_info = &init_groups,
58#ifdef CONFIG_KEYS 59#ifdef CONFIG_KEYS
59 .tgcred = &init_tgcred, 60 .tgcred = &init_tgcred,
@@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
410 goto error_put; 411 goto error_put;
411 } 412 }
412 413
414 /* cache user_ns in cred. Doesn't need a refcount because it will
415 * stay pinned by cred->user
416 */
417 new->user_ns = new->user->user_ns;
418
413#ifdef CONFIG_KEYS 419#ifdef CONFIG_KEYS
414 /* new threads get their own thread keyrings if their parent already 420 /* new threads get their own thread keyrings if their parent already
415 * had one */ 421 * had one */
@@ -502,10 +508,8 @@ int commit_creds(struct cred *new)
502 key_fsgid_changed(task); 508 key_fsgid_changed(task);
503 509
504 /* do it 510 /* do it
505 * - What if a process setreuid()'s and this brings the 511 * RLIMIT_NPROC limits on user->processes have already been checked
506 * new uid over his NPROC rlimit? We can check this now 512 * in set_user().
507 * cheaply with the new uid cache, so if it matters
508 * we should be checking for it. -DaveM
509 */ 513 */
510 alter_cred_subscribers(new, 2); 514 alter_cred_subscribers(new, 2);
511 if (new->user != old->user) 515 if (new->user != old->user)
@@ -741,12 +745,6 @@ int set_create_files_as(struct cred *new, struct inode *inode)
741} 745}
742EXPORT_SYMBOL(set_create_files_as); 746EXPORT_SYMBOL(set_create_files_as);
743 747
744struct user_namespace *current_user_ns(void)
745{
746 return _current_user_ns();
747}
748EXPORT_SYMBOL(current_user_ns);
749
750#ifdef CONFIG_DEBUG_CREDENTIALS 748#ifdef CONFIG_DEBUG_CREDENTIALS
751 749
752bool creds_are_invalid(const struct cred *cred) 750bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index bad6786dee88..0d7c08784efb 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -51,7 +51,7 @@
51 51
52#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
53#include <asm/byteorder.h> 53#include <asm/byteorder.h>
54#include <asm/atomic.h> 54#include <linux/atomic.h>
55#include <asm/system.h> 55#include <asm/system.h>
56 56
57#include "debug_core.h" 57#include "debug_core.h"
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index a11db956dd62..34872482315e 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -42,6 +42,8 @@
42/* Our I/O buffers. */ 42/* Our I/O buffers. */
43static char remcom_in_buffer[BUFMAX]; 43static char remcom_in_buffer[BUFMAX];
44static char remcom_out_buffer[BUFMAX]; 44static char remcom_out_buffer[BUFMAX];
45static int gdbstub_use_prev_in_buf;
46static int gdbstub_prev_in_buf_pos;
45 47
46/* Storage for the registers, in GDB format. */ 48/* Storage for the registers, in GDB format. */
47static unsigned long gdb_regs[(NUMREGBYTES + 49static unsigned long gdb_regs[(NUMREGBYTES +
@@ -58,6 +60,13 @@ static int gdbstub_read_wait(void)
58 int ret = -1; 60 int ret = -1;
59 int i; 61 int i;
60 62
63 if (unlikely(gdbstub_use_prev_in_buf)) {
64 if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf)
65 return remcom_in_buffer[gdbstub_prev_in_buf_pos++];
66 else
67 gdbstub_use_prev_in_buf = 0;
68 }
69
61 /* poll any additional I/O interfaces that are defined */ 70 /* poll any additional I/O interfaces that are defined */
62 while (ret < 0) 71 while (ret < 0)
63 for (i = 0; kdb_poll_funcs[i] != NULL; i++) { 72 for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
@@ -109,7 +118,6 @@ static void get_packet(char *buffer)
109 buffer[count] = ch; 118 buffer[count] = ch;
110 count = count + 1; 119 count = count + 1;
111 } 120 }
112 buffer[count] = 0;
113 121
114 if (ch == '#') { 122 if (ch == '#') {
115 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; 123 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
@@ -124,6 +132,7 @@ static void get_packet(char *buffer)
124 if (dbg_io_ops->flush) 132 if (dbg_io_ops->flush)
125 dbg_io_ops->flush(); 133 dbg_io_ops->flush();
126 } 134 }
135 buffer[count] = 0;
127 } while (checksum != xmitcsum); 136 } while (checksum != xmitcsum);
128} 137}
129 138
@@ -1082,12 +1091,11 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1082 case 'c': 1091 case 'c':
1083 strcpy(remcom_in_buffer, cmd); 1092 strcpy(remcom_in_buffer, cmd);
1084 return 0; 1093 return 0;
1085 case '?': 1094 case '$':
1086 gdb_cmd_status(ks); 1095 strcpy(remcom_in_buffer, cmd);
1087 break; 1096 gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
1088 case '\0': 1097 gdbstub_prev_in_buf_pos = 0;
1089 strcpy(remcom_out_buffer, ""); 1098 return 0;
1090 break;
1091 } 1099 }
1092 dbg_io_ops->write_char('+'); 1100 dbg_io_ops->write_char('+');
1093 put_packet(remcom_out_buffer); 1101 put_packet(remcom_out_buffer);
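
gdbstub_read_wait() can now replay a packet that kdb already consumed: the new '$' case in gdbstub_state() copies it into remcom_in_buffer and sets gdbstub_use_prev_in_buf to its length, so subsequent reads are served from that buffer before any real I/O is polled. A minimal userspace model of that replay path, all names invented:

#include <stdio.h>
#include <string.h>

static char in_buf[64];
static int use_prev;	/* length of a previously captured packet, 0 = none */
static int prev_pos;

static int read_wait(void)
{
	if (use_prev) {
		if (prev_pos < use_prev)
			return in_buf[prev_pos++];
		use_prev = 0;	/* buffer drained, fall back to real input */
	}
	return getchar();	/* stand-in for polling dbg_io_ops */
}

int main(void)
{
	strcpy(in_buf, "$?#3f");	/* packet kdb captured before handing over */
	use_prev = (int)strlen(in_buf);
	prev_pos = 0;

	/* The first strlen("$?#3f") reads come from the replay buffer. */
	for (int i = 0; i < 5; i++)
		putchar(read_wait());
	putchar('\n');
	return 0;
}
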
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 2f62fe85f16a..7179eac7b41c 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -112,9 +112,8 @@ kdb_bt(int argc, const char **argv)
112 unsigned long addr; 112 unsigned long addr;
113 long offset; 113 long offset;
114 114
115 kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ 115 /* Prompt after each proc in bta */
116 kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each 116 kdbgetintenv("BTAPROMPT", &btaprompt);
117 * proc in bta */
118 117
119 if (strcmp(argv[0], "bta") == 0) { 118 if (strcmp(argv[0], "bta") == 0) {
120 struct task_struct *g, *p; 119 struct task_struct *g, *p;
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
index 56c88e4db309..9834ad303ab6 100644
--- a/kernel/debug/kdb/kdb_cmds
+++ b/kernel/debug/kdb/kdb_cmds
@@ -18,16 +18,12 @@ defcmd dumpcommon "" "Common kdb debugging"
18endefcmd 18endefcmd
19 19
20defcmd dumpall "" "First line debugging" 20defcmd dumpall "" "First line debugging"
21 set BTSYMARG 1
22 set BTARGS 9
23 pid R 21 pid R
24 -dumpcommon 22 -dumpcommon
25 -bta 23 -bta
26endefcmd 24endefcmd
27 25
28defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" 26defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
29 set BTSYMARG 1
30 set BTARGS 9
31 pid R 27 pid R
32 -dumpcommon 28 -dumpcommon
33 -btc 29 -btc
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index dd0b1b7dd02c..d9ca9aa481ec 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(kdb_poll_funcs);
30int kdb_poll_idx = 1; 30int kdb_poll_idx = 1;
31EXPORT_SYMBOL_GPL(kdb_poll_idx); 31EXPORT_SYMBOL_GPL(kdb_poll_idx);
32 32
33static struct kgdb_state *kdb_ks;
34
33int kdb_stub(struct kgdb_state *ks) 35int kdb_stub(struct kgdb_state *ks)
34{ 36{
35 int error = 0; 37 int error = 0;
@@ -39,6 +41,7 @@ int kdb_stub(struct kgdb_state *ks)
39 kdb_dbtrap_t db_result = KDB_DB_NOBPT; 41 kdb_dbtrap_t db_result = KDB_DB_NOBPT;
40 int i; 42 int i;
41 43
44 kdb_ks = ks;
42 if (KDB_STATE(REENTRY)) { 45 if (KDB_STATE(REENTRY)) {
43 reason = KDB_REASON_SWITCH; 46 reason = KDB_REASON_SWITCH;
44 KDB_STATE_CLEAR(REENTRY); 47 KDB_STATE_CLEAR(REENTRY);
@@ -123,20 +126,8 @@ int kdb_stub(struct kgdb_state *ks)
123 KDB_STATE_CLEAR(PAGER); 126 KDB_STATE_CLEAR(PAGER);
124 kdbnearsym_cleanup(); 127 kdbnearsym_cleanup();
125 if (error == KDB_CMD_KGDB) { 128 if (error == KDB_CMD_KGDB) {
126 if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { 129 if (KDB_STATE(DOING_KGDB))
127 /*
128 * This inteface glue which allows kdb to transition in into
129 * the gdb stub. In order to do this the '?' or '' gdb serial
130 * packet response is processed here. And then control is
131 * passed to the gdbstub.
132 */
133 if (KDB_STATE(DOING_KGDB))
134 gdbstub_state(ks, "?");
135 else
136 gdbstub_state(ks, "");
137 KDB_STATE_CLEAR(DOING_KGDB); 130 KDB_STATE_CLEAR(DOING_KGDB);
138 KDB_STATE_CLEAR(DOING_KGDB2);
139 }
140 return DBG_PASS_EVENT; 131 return DBG_PASS_EVENT;
141 } 132 }
142 kdb_bp_install(ks->linux_regs); 133 kdb_bp_install(ks->linux_regs);
@@ -166,3 +157,7 @@ int kdb_stub(struct kgdb_state *ks)
166 return kgdb_info[ks->cpu].ret_state; 157 return kgdb_info[ks->cpu].ret_state;
167} 158}
168 159
160void kdb_gdb_state_pass(char *buf)
161{
162 gdbstub_state(kdb_ks, buf);
163}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 96fdaac46a80..4802eb5840e1 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -31,15 +31,21 @@ char kdb_prompt_str[CMD_BUFLEN];
31 31
32int kdb_trap_printk; 32int kdb_trap_printk;
33 33
34static void kgdb_transition_check(char *buffer) 34static int kgdb_transition_check(char *buffer)
35{ 35{
36 int slen = strlen(buffer); 36 if (buffer[0] != '+' && buffer[0] != '$') {
37 if (strncmp(buffer, "$?#3f", slen) != 0 &&
38 strncmp(buffer, "$qSupported#37", slen) != 0 &&
39 strncmp(buffer, "+$qSupported#37", slen) != 0) {
40 KDB_STATE_SET(KGDB_TRANS); 37 KDB_STATE_SET(KGDB_TRANS);
41 kdb_printf("%s", buffer); 38 kdb_printf("%s", buffer);
39 } else {
40 int slen = strlen(buffer);
41 if (slen > 3 && buffer[slen - 3] == '#') {
42 kdb_gdb_state_pass(buffer);
43 strcpy(buffer, "kgdb");
44 KDB_STATE_SET(DOING_KGDB);
45 return 1;
46 }
42 } 47 }
48 return 0;
43} 49}
44 50
45static int kdb_read_get_key(char *buffer, size_t bufsize) 51static int kdb_read_get_key(char *buffer, size_t bufsize)
@@ -251,6 +257,10 @@ poll_again:
251 case 13: /* enter */ 257 case 13: /* enter */
252 *lastchar++ = '\n'; 258 *lastchar++ = '\n';
253 *lastchar++ = '\0'; 259 *lastchar++ = '\0';
260 if (!KDB_STATE(KGDB_TRANS)) {
261 KDB_STATE_SET(KGDB_TRANS);
262 kdb_printf("%s", buffer);
263 }
254 kdb_printf("\n"); 264 kdb_printf("\n");
255 return buffer; 265 return buffer;
256 case 4: /* Del */ 266 case 4: /* Del */
@@ -382,22 +392,26 @@ poll_again:
382 * printed characters if we think that 392 * printed characters if we think that
383 * kgdb is connecting, until the check 393 * kgdb is connecting, until the check
384 * fails */ 394 * fails */
385 if (!KDB_STATE(KGDB_TRANS)) 395 if (!KDB_STATE(KGDB_TRANS)) {
386 kgdb_transition_check(buffer); 396 if (kgdb_transition_check(buffer))
387 else 397 return buffer;
398 } else {
388 kdb_printf("%c", key); 399 kdb_printf("%c", key);
400 }
389 } 401 }
390 /* Special escape to kgdb */ 402 /* Special escape to kgdb */
391 if (lastchar - buffer >= 5 && 403 if (lastchar - buffer >= 5 &&
392 strcmp(lastchar - 5, "$?#3f") == 0) { 404 strcmp(lastchar - 5, "$?#3f") == 0) {
405 kdb_gdb_state_pass(lastchar - 5);
393 strcpy(buffer, "kgdb"); 406 strcpy(buffer, "kgdb");
394 KDB_STATE_SET(DOING_KGDB); 407 KDB_STATE_SET(DOING_KGDB);
395 return buffer; 408 return buffer;
396 } 409 }
397 if (lastchar - buffer >= 14 && 410 if (lastchar - buffer >= 11 &&
398 strcmp(lastchar - 14, "$qSupported#37") == 0) { 411 strcmp(lastchar - 11, "$qSupported") == 0) {
412 kdb_gdb_state_pass(lastchar - 11);
399 strcpy(buffer, "kgdb"); 413 strcpy(buffer, "kgdb");
400 KDB_STATE_SET(DOING_KGDB2); 414 KDB_STATE_SET(DOING_KGDB);
401 return buffer; 415 return buffer;
402 } 416 }
403 } 417 }
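
kgdb_transition_check() above no longer matches a fixed list of packets; any buffer starting with '+' or '$' and ending in '#' plus a two-digit checksum is treated as a gdb remote-protocol packet and handed to kdb_gdb_state_pass(). The framing it keys on is "$<payload>#<two hex checksum digits>", the checksum being the payload bytes summed modulo 256, which is where the literal strings "$?#3f" and "$qSupported#37" in the code come from. A small standalone check of that framing (illustrative only):

#include <stdio.h>
#include <string.h>

/* Build a gdb remote-protocol packet: $<payload>#<checksum>, where the
 * checksum is the sum of the payload bytes modulo 256. */
static void make_packet(const char *payload, char *out, size_t outsz)
{
	unsigned sum = 0;

	for (const char *p = payload; *p; p++)
		sum += (unsigned char)*p;
	snprintf(out, outsz, "$%s#%02x", payload, sum & 0xff);
}

int main(void)
{
	char pkt[128];

	make_packet("?", pkt, sizeof(pkt));
	printf("%s\n", pkt);		/* prints "$?#3f" */
	make_packet("qSupported", pkt, sizeof(pkt));
	printf("%s\n", pkt);		/* prints "$qSupported#37" */
	return 0;
}
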
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index be14779bcef6..63786e71a3cd 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -145,7 +145,6 @@ static char *__env[] = {
145#endif 145#endif
146 "RADIX=16", 146 "RADIX=16",
147 "MDCOUNT=8", /* lines of md output */ 147 "MDCOUNT=8", /* lines of md output */
148 "BTARGS=9", /* 9 possible args in bt */
149 KDB_PLATFORM_ENV, 148 KDB_PLATFORM_ENV,
150 "DTABCOUNT=30", 149 "DTABCOUNT=30",
151 "NOSECT=1", 150 "NOSECT=1",
@@ -172,6 +171,7 @@ static char *__env[] = {
172 (char *)0, 171 (char *)0,
173 (char *)0, 172 (char *)0,
174 (char *)0, 173 (char *)0,
174 (char *)0,
175}; 175};
176 176
177static const int __nenv = (sizeof(__env) / sizeof(char *)); 177static const int __nenv = (sizeof(__env) / sizeof(char *));
@@ -1386,7 +1386,7 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1386 } 1386 }
1387 1387
1388 if (result == KDB_CMD_KGDB) { 1388 if (result == KDB_CMD_KGDB) {
1389 if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) 1389 if (!KDB_STATE(DOING_KGDB))
1390 kdb_printf("Entering please attach debugger " 1390 kdb_printf("Entering please attach debugger "
1391 "or use $D#44+ or $3#33\n"); 1391 "or use $D#44+ or $3#33\n");
1392 break; 1392 break;
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 35d69ed1dfb5..e381d105b40b 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -21,7 +21,6 @@
21#define KDB_CMD_SS (-1003) 21#define KDB_CMD_SS (-1003)
22#define KDB_CMD_SSB (-1004) 22#define KDB_CMD_SSB (-1004)
23#define KDB_CMD_KGDB (-1005) 23#define KDB_CMD_KGDB (-1005)
24#define KDB_CMD_KGDB2 (-1006)
25 24
26/* Internal debug flags */ 25/* Internal debug flags */
27#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ 26#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
@@ -146,7 +145,6 @@ extern int kdb_state;
146 * keyboard on this cpu */ 145 * keyboard on this cpu */
147#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ 146#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
148#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ 147#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
149#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
150#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ 148#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
151#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch 149#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
152 * specific use */ 150 * specific use */
@@ -218,6 +216,7 @@ extern void kdb_print_nameval(const char *name, unsigned long val);
218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 216extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
219extern void kdb_meminfo_proc_show(void); 217extern void kdb_meminfo_proc_show(void);
220extern char *kdb_getstr(char *, size_t, char *); 218extern char *kdb_getstr(char *, size_t, char *);
219extern void kdb_gdb_state_pass(char *buf);
221 220
222/* Defines for kdb_symbol_print */ 221/* Defines for kdb_symbol_print */
223#define KDB_SP_SPACEB 0x0001 /* Space before string */ 222#define KDB_SP_SPACEB 0x0001 /* Space before string */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa71..418b3f7053aa 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/delayacct.h> 21#include <linux/delayacct.h>
22#include <linux/module.h>
22 23
23int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 24int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
25EXPORT_SYMBOL_GPL(delayacct_on);
24struct kmem_cache *delayacct_cache; 26struct kmem_cache *delayacct_cache;
25 27
26static int __init delayacct_setup_disable(char *str) 28static int __init delayacct_setup_disable(char *str)
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000000..89e5e8aa4c36
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,6 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg
3endif
4
5obj-y := core.o ring_buffer.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/perf_event.c b/kernel/events/core.c
index 8e81a9860a0d..b8785e26ee1c 100644
--- a/kernel/perf_event.c
+++ b/kernel/events/core.c
@@ -2,8 +2,8 @@
2 * Performance events core code: 2 * Performance events core code:
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
@@ -36,13 +36,15 @@
36#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h> 37#include <linux/hw_breakpoint.h>
38 38
39#include "internal.h"
40
39#include <asm/irq_regs.h> 41#include <asm/irq_regs.h>
40 42
41struct remote_function_call { 43struct remote_function_call {
42 struct task_struct *p; 44 struct task_struct *p;
43 int (*func)(void *info); 45 int (*func)(void *info);
44 void *info; 46 void *info;
45 int ret; 47 int ret;
46}; 48};
47 49
48static void remote_function(void *data) 50static void remote_function(void *data)
@@ -76,10 +78,10 @@ static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info) 78task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{ 79{
78 struct remote_function_call data = { 80 struct remote_function_call data = {
79 .p = p, 81 .p = p,
80 .func = func, 82 .func = func,
81 .info = info, 83 .info = info,
82 .ret = -ESRCH, /* No such (running) process */ 84 .ret = -ESRCH, /* No such (running) process */
83 }; 85 };
84 86
85 if (task_curr(p)) 87 if (task_curr(p))
@@ -100,10 +102,10 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info) 102static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{ 103{
102 struct remote_function_call data = { 104 struct remote_function_call data = {
103 .p = NULL, 105 .p = NULL,
104 .func = func, 106 .func = func,
105 .info = info, 107 .info = info,
106 .ret = -ENXIO, /* No such CPU */ 108 .ret = -ENXIO, /* No such CPU */
107 }; 109 };
108 110
109 smp_call_function_single(cpu, remote_function, &data, 1); 111 smp_call_function_single(cpu, remote_function, &data, 1);
@@ -125,7 +127,7 @@ enum event_type_t {
125 * perf_sched_events : >0 events exist 127 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 128 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */ 129 */
128atomic_t perf_sched_events __read_mostly; 130struct jump_label_key perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 131static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130 132
131static atomic_t nr_mmap_events __read_mostly; 133static atomic_t nr_mmap_events __read_mostly;
@@ -200,6 +202,22 @@ __get_cpu_context(struct perf_event_context *ctx)
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 202 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201} 203}
202 204
205static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
206 struct perf_event_context *ctx)
207{
208 raw_spin_lock(&cpuctx->ctx.lock);
209 if (ctx)
210 raw_spin_lock(&ctx->lock);
211}
212
213static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
214 struct perf_event_context *ctx)
215{
216 if (ctx)
217 raw_spin_unlock(&ctx->lock);
218 raw_spin_unlock(&cpuctx->ctx.lock);
219}
220
203#ifdef CONFIG_CGROUP_PERF 221#ifdef CONFIG_CGROUP_PERF
204 222
205/* 223/*
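
perf_ctx_lock()/perf_ctx_unlock() added above fix one lock order for the rest of the file: the per-CPU context lock is always taken first, the (optional) task context lock is nested inside it, and they are dropped in the reverse order. A generic userspace sketch of the same helper pattern with pthread mutexes (purely illustrative, nothing perf-specific):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cpuctx_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t taskctx_lock = PTHREAD_MUTEX_INITIALIZER;

/* Outer lock first, inner lock (if any) nested inside it... */
static void ctx_lock(pthread_mutex_t *outer, pthread_mutex_t *inner)
{
	pthread_mutex_lock(outer);
	if (inner)
		pthread_mutex_lock(inner);
}

/* ...and released in the opposite order, so every path nests the same way. */
static void ctx_unlock(pthread_mutex_t *outer, pthread_mutex_t *inner)
{
	if (inner)
		pthread_mutex_unlock(inner);
	pthread_mutex_unlock(outer);
}

int main(void)
{
	ctx_lock(&cpuctx_lock, &taskctx_lock);
	puts("both contexts locked, safe to reschedule events");
	ctx_unlock(&cpuctx_lock, &taskctx_lock);

	ctx_lock(&cpuctx_lock, NULL);	/* path with no task context */
	puts("cpu context only");
	ctx_unlock(&cpuctx_lock, NULL);
	return 0;
}
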
@@ -340,11 +358,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
340 rcu_read_lock(); 358 rcu_read_lock();
341 359
342 list_for_each_entry_rcu(pmu, &pmus, entry) { 360 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 361 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345 362
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /* 363 /*
349 * perf_cgroup_events says at least one 364 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events. 365 * context on this CPU has cgroup events.
@@ -353,6 +368,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
353 * events for a context. 368 * events for a context.
354 */ 369 */
355 if (cpuctx->ctx.nr_cgroups > 0) { 370 if (cpuctx->ctx.nr_cgroups > 0) {
371 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
372 perf_pmu_disable(cpuctx->ctx.pmu);
356 373
357 if (mode & PERF_CGROUP_SWOUT) { 374 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 375 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -372,9 +389,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
372 cpuctx->cgrp = perf_cgroup_from_task(task); 389 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 390 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 } 391 }
392 perf_pmu_enable(cpuctx->ctx.pmu);
393 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
375 } 394 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 } 395 }
379 396
380 rcu_read_unlock(); 397 rcu_read_unlock();
@@ -586,14 +603,6 @@ static void get_ctx(struct perf_event_context *ctx)
586 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 603 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
587} 604}
588 605
589static void free_ctx(struct rcu_head *head)
590{
591 struct perf_event_context *ctx;
592
593 ctx = container_of(head, struct perf_event_context, rcu_head);
594 kfree(ctx);
595}
596
597static void put_ctx(struct perf_event_context *ctx) 606static void put_ctx(struct perf_event_context *ctx)
598{ 607{
599 if (atomic_dec_and_test(&ctx->refcount)) { 608 if (atomic_dec_and_test(&ctx->refcount)) {
@@ -601,7 +610,7 @@ static void put_ctx(struct perf_event_context *ctx)
601 put_ctx(ctx->parent_ctx); 610 put_ctx(ctx->parent_ctx);
602 if (ctx->task) 611 if (ctx->task)
603 put_task_struct(ctx->task); 612 put_task_struct(ctx->task);
604 call_rcu(&ctx->rcu_head, free_ctx); 613 kfree_rcu(ctx, rcu_head);
605 } 614 }
606} 615}
607 616
@@ -739,6 +748,7 @@ static u64 perf_event_time(struct perf_event *event)
739 748
740/* 749/*
741 * Update the total_time_enabled and total_time_running fields for a event. 750 * Update the total_time_enabled and total_time_running fields for a event.
751 * The caller of this function needs to hold the ctx->lock.
742 */ 752 */
743static void update_event_times(struct perf_event *event) 753static void update_event_times(struct perf_event *event)
744{ 754{
@@ -1113,6 +1123,10 @@ static int __perf_remove_from_context(void *info)
1113 raw_spin_lock(&ctx->lock); 1123 raw_spin_lock(&ctx->lock);
1114 event_sched_out(event, cpuctx, ctx); 1124 event_sched_out(event, cpuctx, ctx);
1115 list_del_event(event, ctx); 1125 list_del_event(event, ctx);
1126 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1127 ctx->is_active = 0;
1128 cpuctx->task_ctx = NULL;
1129 }
1116 raw_spin_unlock(&ctx->lock); 1130 raw_spin_unlock(&ctx->lock);
1117 1131
1118 return 0; 1132 return 0;
@@ -1462,8 +1476,24 @@ static void add_event_to_ctx(struct perf_event *event,
1462 event->tstamp_stopped = tstamp; 1476 event->tstamp_stopped = tstamp;
1463} 1477}
1464 1478
1465static void perf_event_context_sched_in(struct perf_event_context *ctx, 1479static void task_ctx_sched_out(struct perf_event_context *ctx);
1466 struct task_struct *tsk); 1480static void
1481ctx_sched_in(struct perf_event_context *ctx,
1482 struct perf_cpu_context *cpuctx,
1483 enum event_type_t event_type,
1484 struct task_struct *task);
1485
1486static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1487 struct perf_event_context *ctx,
1488 struct task_struct *task)
1489{
1490 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1491 if (ctx)
1492 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1493 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1494 if (ctx)
1495 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1496}
1467 1497
1468/* 1498/*
1469 * Cross CPU call to install and enable a performance event 1499 * Cross CPU call to install and enable a performance event
@@ -1474,20 +1504,37 @@ static int __perf_install_in_context(void *info)
1474{ 1504{
1475 struct perf_event *event = info; 1505 struct perf_event *event = info;
1476 struct perf_event_context *ctx = event->ctx; 1506 struct perf_event_context *ctx = event->ctx;
1477 struct perf_event *leader = event->group_leader;
1478 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1507 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1479 int err; 1508 struct perf_event_context *task_ctx = cpuctx->task_ctx;
1509 struct task_struct *task = current;
1510
1511 perf_ctx_lock(cpuctx, task_ctx);
1512 perf_pmu_disable(cpuctx->ctx.pmu);
1480 1513
1481 /* 1514 /*
1482 * In case we're installing a new context to an already running task, 1515 * If there was an active task_ctx schedule it out.
1483 * could also happen before perf_event_task_sched_in() on architectures
1484 * which do context switches with IRQs enabled.
1485 */ 1516 */
1486 if (ctx->task && !cpuctx->task_ctx) 1517 if (task_ctx)
1487 perf_event_context_sched_in(ctx, ctx->task); 1518 task_ctx_sched_out(task_ctx);
1519
1520 /*
1521 * If the context we're installing events in is not the
1522 * active task_ctx, flip them.
1523 */
1524 if (ctx->task && task_ctx != ctx) {
1525 if (task_ctx)
1526 raw_spin_unlock(&task_ctx->lock);
1527 raw_spin_lock(&ctx->lock);
1528 task_ctx = ctx;
1529 }
1530
1531 if (task_ctx) {
1532 cpuctx->task_ctx = task_ctx;
1533 task = task_ctx->task;
1534 }
1535
1536 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1488 1537
1489 raw_spin_lock(&ctx->lock);
1490 ctx->is_active = 1;
1491 update_context_time(ctx); 1538 update_context_time(ctx);
1492 /* 1539 /*
1493 * update cgrp time only if current cgrp 1540 * update cgrp time only if current cgrp
@@ -1498,43 +1545,13 @@ static int __perf_install_in_context(void *info)
1498 1545
1499 add_event_to_ctx(event, ctx); 1546 add_event_to_ctx(event, ctx);
1500 1547
1501 if (!event_filter_match(event))
1502 goto unlock;
1503
1504 /* 1548 /*
1505 * Don't put the event on if it is disabled or if 1549 * Schedule everything back in
1506 * it is in a group and the group isn't on.
1507 */ 1550 */
1508 if (event->state != PERF_EVENT_STATE_INACTIVE || 1551 perf_event_sched_in(cpuctx, task_ctx, task);
1509 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
1510 goto unlock;
1511
1512 /*
1513 * An exclusive event can't go on if there are already active
1514 * hardware events, and no hardware event can go on if there
1515 * is already an exclusive event on.
1516 */
1517 if (!group_can_go_on(event, cpuctx, 1))
1518 err = -EEXIST;
1519 else
1520 err = event_sched_in(event, cpuctx, ctx);
1521
1522 if (err) {
1523 /*
1524 * This event couldn't go on. If it is in a group
1525 * then we have to pull the whole group off.
1526 * If the event group is pinned then put it in error state.
1527 */
1528 if (leader != event)
1529 group_sched_out(leader, cpuctx, ctx);
1530 if (leader->attr.pinned) {
1531 update_group_times(leader);
1532 leader->state = PERF_EVENT_STATE_ERROR;
1533 }
1534 }
1535 1552
1536unlock: 1553 perf_pmu_enable(cpuctx->ctx.pmu);
1537 raw_spin_unlock(&ctx->lock); 1554 perf_ctx_unlock(cpuctx, task_ctx);
1538 1555
1539 return 0; 1556 return 0;
1540} 1557}
@@ -1747,7 +1764,7 @@ out:
1747 raw_spin_unlock_irq(&ctx->lock); 1764 raw_spin_unlock_irq(&ctx->lock);
1748} 1765}
1749 1766
1750static int perf_event_refresh(struct perf_event *event, int refresh) 1767int perf_event_refresh(struct perf_event *event, int refresh)
1751{ 1768{
1752 /* 1769 /*
1753 * not supported on inherited events 1770 * not supported on inherited events
@@ -1760,36 +1777,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1760 1777
1761 return 0; 1778 return 0;
1762} 1779}
1780EXPORT_SYMBOL_GPL(perf_event_refresh);
1763 1781
1764static void ctx_sched_out(struct perf_event_context *ctx, 1782static void ctx_sched_out(struct perf_event_context *ctx,
1765 struct perf_cpu_context *cpuctx, 1783 struct perf_cpu_context *cpuctx,
1766 enum event_type_t event_type) 1784 enum event_type_t event_type)
1767{ 1785{
1768 struct perf_event *event; 1786 struct perf_event *event;
1787 int is_active = ctx->is_active;
1769 1788
1770 raw_spin_lock(&ctx->lock); 1789 ctx->is_active &= ~event_type;
1771 perf_pmu_disable(ctx->pmu);
1772 ctx->is_active = 0;
1773 if (likely(!ctx->nr_events)) 1790 if (likely(!ctx->nr_events))
1774 goto out; 1791 return;
1792
1775 update_context_time(ctx); 1793 update_context_time(ctx);
1776 update_cgrp_time_from_cpuctx(cpuctx); 1794 update_cgrp_time_from_cpuctx(cpuctx);
1777
1778 if (!ctx->nr_active) 1795 if (!ctx->nr_active)
1779 goto out; 1796 return;
1780 1797
1781 if (event_type & EVENT_PINNED) { 1798 perf_pmu_disable(ctx->pmu);
1799 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
1782 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1800 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1783 group_sched_out(event, cpuctx, ctx); 1801 group_sched_out(event, cpuctx, ctx);
1784 } 1802 }
1785 1803
1786 if (event_type & EVENT_FLEXIBLE) { 1804 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
1787 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1805 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1788 group_sched_out(event, cpuctx, ctx); 1806 group_sched_out(event, cpuctx, ctx);
1789 } 1807 }
1790out:
1791 perf_pmu_enable(ctx->pmu); 1808 perf_pmu_enable(ctx->pmu);
1792 raw_spin_unlock(&ctx->lock);
1793} 1809}
1794 1810
1795/* 1811/*
@@ -1937,8 +1953,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1937 rcu_read_unlock(); 1953 rcu_read_unlock();
1938 1954
1939 if (do_switch) { 1955 if (do_switch) {
1956 raw_spin_lock(&ctx->lock);
1940 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 1957 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1941 cpuctx->task_ctx = NULL; 1958 cpuctx->task_ctx = NULL;
1959 raw_spin_unlock(&ctx->lock);
1942 } 1960 }
1943} 1961}
1944 1962
@@ -1973,8 +1991,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
1973 perf_cgroup_sched_out(task); 1991 perf_cgroup_sched_out(task);
1974} 1992}
1975 1993
1976static void task_ctx_sched_out(struct perf_event_context *ctx, 1994static void task_ctx_sched_out(struct perf_event_context *ctx)
1977 enum event_type_t event_type)
1978{ 1995{
1979 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1996 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1980 1997
@@ -1984,7 +2001,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1984 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2001 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1985 return; 2002 return;
1986 2003
1987 ctx_sched_out(ctx, cpuctx, event_type); 2004 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1988 cpuctx->task_ctx = NULL; 2005 cpuctx->task_ctx = NULL;
1989} 2006}
1990 2007
@@ -2063,11 +2080,11 @@ ctx_sched_in(struct perf_event_context *ctx,
2063 struct task_struct *task) 2080 struct task_struct *task)
2064{ 2081{
2065 u64 now; 2082 u64 now;
2083 int is_active = ctx->is_active;
2066 2084
2067 raw_spin_lock(&ctx->lock); 2085 ctx->is_active |= event_type;
2068 ctx->is_active = 1;
2069 if (likely(!ctx->nr_events)) 2086 if (likely(!ctx->nr_events))
2070 goto out; 2087 return;
2071 2088
2072 now = perf_clock(); 2089 now = perf_clock();
2073 ctx->timestamp = now; 2090 ctx->timestamp = now;
@@ -2076,15 +2093,12 @@ ctx_sched_in(struct perf_event_context *ctx,
2076 * First go through the list and put on any pinned groups 2093 * First go through the list and put on any pinned groups
2077 * in order to give them the best chance of going on. 2094 * in order to give them the best chance of going on.
2078 */ 2095 */
2079 if (event_type & EVENT_PINNED) 2096 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2080 ctx_pinned_sched_in(ctx, cpuctx); 2097 ctx_pinned_sched_in(ctx, cpuctx);
2081 2098
2082 /* Then walk through the lower prio flexible groups */ 2099 /* Then walk through the lower prio flexible groups */
2083 if (event_type & EVENT_FLEXIBLE) 2100 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2084 ctx_flexible_sched_in(ctx, cpuctx); 2101 ctx_flexible_sched_in(ctx, cpuctx);
2085
2086out:
2087 raw_spin_unlock(&ctx->lock);
2088} 2102}
2089 2103
2090static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2104static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -2096,19 +2110,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2096 ctx_sched_in(ctx, cpuctx, event_type, task); 2110 ctx_sched_in(ctx, cpuctx, event_type, task);
2097} 2111}
2098 2112
2099static void task_ctx_sched_in(struct perf_event_context *ctx,
2100 enum event_type_t event_type)
2101{
2102 struct perf_cpu_context *cpuctx;
2103
2104 cpuctx = __get_cpu_context(ctx);
2105 if (cpuctx->task_ctx == ctx)
2106 return;
2107
2108 ctx_sched_in(ctx, cpuctx, event_type, NULL);
2109 cpuctx->task_ctx = ctx;
2110}
2111
2112static void perf_event_context_sched_in(struct perf_event_context *ctx, 2113static void perf_event_context_sched_in(struct perf_event_context *ctx,
2113 struct task_struct *task) 2114 struct task_struct *task)
2114{ 2115{
@@ -2118,6 +2119,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2118 if (cpuctx->task_ctx == ctx) 2119 if (cpuctx->task_ctx == ctx)
2119 return; 2120 return;
2120 2121
2122 perf_ctx_lock(cpuctx, ctx);
2121 perf_pmu_disable(ctx->pmu); 2123 perf_pmu_disable(ctx->pmu);
2122 /* 2124 /*
2123 * We want to keep the following priority order: 2125 * We want to keep the following priority order:
@@ -2126,18 +2128,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2126 */ 2128 */
2127 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2129 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2128 2130
2129 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); 2131 perf_event_sched_in(cpuctx, ctx, task);
2130 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2131 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2132 2132
2133 cpuctx->task_ctx = ctx; 2133 cpuctx->task_ctx = ctx;
2134 2134
2135 perf_pmu_enable(ctx->pmu);
2136 perf_ctx_unlock(cpuctx, ctx);
2137
2135 /* 2138 /*
2136 * Since these rotations are per-cpu, we need to ensure the 2139 * Since these rotations are per-cpu, we need to ensure the
2137 * cpu-context we got scheduled on is actually rotating. 2140 * cpu-context we got scheduled on is actually rotating.
2138 */ 2141 */
2139 perf_pmu_rotate_start(ctx->pmu); 2142 perf_pmu_rotate_start(ctx->pmu);
2140 perf_pmu_enable(ctx->pmu);
2141} 2143}
2142 2144
2143/* 2145/*
@@ -2277,7 +2279,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2277 u64 interrupts, now; 2279 u64 interrupts, now;
2278 s64 delta; 2280 s64 delta;
2279 2281
2280 raw_spin_lock(&ctx->lock);
2281 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2282 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2282 if (event->state != PERF_EVENT_STATE_ACTIVE) 2283 if (event->state != PERF_EVENT_STATE_ACTIVE)
2283 continue; 2284 continue;
@@ -2309,7 +2310,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2309 if (delta > 0) 2310 if (delta > 0)
2310 perf_adjust_period(event, period, delta); 2311 perf_adjust_period(event, period, delta);
2311 } 2312 }
2312 raw_spin_unlock(&ctx->lock);
2313} 2313}
2314 2314
2315/* 2315/*
@@ -2317,16 +2317,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2317 */ 2317 */
2318static void rotate_ctx(struct perf_event_context *ctx) 2318static void rotate_ctx(struct perf_event_context *ctx)
2319{ 2319{
2320 raw_spin_lock(&ctx->lock);
2321
2322 /* 2320 /*
2323 * Rotate the first entry last of non-pinned groups. Rotation might be 2321 * Rotate the first entry last of non-pinned groups. Rotation might be
2324 * disabled by the inheritance code. 2322 * disabled by the inheritance code.
2325 */ 2323 */
2326 if (!ctx->rotate_disable) 2324 if (!ctx->rotate_disable)
2327 list_rotate_left(&ctx->flexible_groups); 2325 list_rotate_left(&ctx->flexible_groups);
2328
2329 raw_spin_unlock(&ctx->lock);
2330} 2326}
2331 2327
2332/* 2328/*
@@ -2353,6 +2349,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2353 rotate = 1; 2349 rotate = 1;
2354 } 2350 }
2355 2351
2352 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2356 perf_pmu_disable(cpuctx->ctx.pmu); 2353 perf_pmu_disable(cpuctx->ctx.pmu);
2357 perf_ctx_adjust_freq(&cpuctx->ctx, interval); 2354 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2358 if (ctx) 2355 if (ctx)
@@ -2363,21 +2360,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2363 2360
2364 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2361 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2365 if (ctx) 2362 if (ctx)
2366 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 2363 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2367 2364
2368 rotate_ctx(&cpuctx->ctx); 2365 rotate_ctx(&cpuctx->ctx);
2369 if (ctx) 2366 if (ctx)
2370 rotate_ctx(ctx); 2367 rotate_ctx(ctx);
2371 2368
2372 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); 2369 perf_event_sched_in(cpuctx, ctx, current);
2373 if (ctx)
2374 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
2375 2370
2376done: 2371done:
2377 if (remove) 2372 if (remove)
2378 list_del_init(&cpuctx->rotation_list); 2373 list_del_init(&cpuctx->rotation_list);
2379 2374
2380 perf_pmu_enable(cpuctx->ctx.pmu); 2375 perf_pmu_enable(cpuctx->ctx.pmu);
2376 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2381} 2377}
2382 2378
2383void perf_event_task_tick(void) 2379void perf_event_task_tick(void)
@@ -2432,9 +2428,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2432 * in. 2428 * in.
2433 */ 2429 */
2434 perf_cgroup_sched_out(current); 2430 perf_cgroup_sched_out(current);
2435 task_ctx_sched_out(ctx, EVENT_ALL);
2436 2431
2437 raw_spin_lock(&ctx->lock); 2432 raw_spin_lock(&ctx->lock);
2433 task_ctx_sched_out(ctx);
2438 2434
2439 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2435 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2440 ret = event_enable_on_exec(event, ctx); 2436 ret = event_enable_on_exec(event, ctx);
@@ -2843,16 +2839,12 @@ retry:
2843 unclone_ctx(ctx); 2839 unclone_ctx(ctx);
2844 ++ctx->pin_count; 2840 ++ctx->pin_count;
2845 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2841 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2846 } 2842 } else {
2847
2848 if (!ctx) {
2849 ctx = alloc_perf_context(pmu, task); 2843 ctx = alloc_perf_context(pmu, task);
2850 err = -ENOMEM; 2844 err = -ENOMEM;
2851 if (!ctx) 2845 if (!ctx)
2852 goto errout; 2846 goto errout;
2853 2847
2854 get_ctx(ctx);
2855
2856 err = 0; 2848 err = 0;
2857 mutex_lock(&task->perf_event_mutex); 2849 mutex_lock(&task->perf_event_mutex);
2858 /* 2850 /*
@@ -2864,14 +2856,14 @@ retry:
2864 else if (task->perf_event_ctxp[ctxn]) 2856 else if (task->perf_event_ctxp[ctxn])
2865 err = -EAGAIN; 2857 err = -EAGAIN;
2866 else { 2858 else {
2859 get_ctx(ctx);
2867 ++ctx->pin_count; 2860 ++ctx->pin_count;
2868 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2861 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2869 } 2862 }
2870 mutex_unlock(&task->perf_event_mutex); 2863 mutex_unlock(&task->perf_event_mutex);
2871 2864
2872 if (unlikely(err)) { 2865 if (unlikely(err)) {
2873 put_task_struct(task); 2866 put_ctx(ctx);
2874 kfree(ctx);
2875 2867
2876 if (err == -EAGAIN) 2868 if (err == -EAGAIN)
2877 goto retry; 2869 goto retry;
@@ -2898,7 +2890,7 @@ static void free_event_rcu(struct rcu_head *head)
2898 kfree(event); 2890 kfree(event);
2899} 2891}
2900 2892
2901static void perf_buffer_put(struct perf_buffer *buffer); 2893static void ring_buffer_put(struct ring_buffer *rb);
2902 2894
2903static void free_event(struct perf_event *event) 2895static void free_event(struct perf_event *event)
2904{ 2896{
@@ -2921,9 +2913,9 @@ static void free_event(struct perf_event *event)
2921 } 2913 }
2922 } 2914 }
2923 2915
2924 if (event->buffer) { 2916 if (event->rb) {
2925 perf_buffer_put(event->buffer); 2917 ring_buffer_put(event->rb);
2926 event->buffer = NULL; 2918 event->rb = NULL;
2927 } 2919 }
2928 2920
2929 if (is_cgroup_event(event)) 2921 if (is_cgroup_event(event))
@@ -2942,12 +2934,6 @@ int perf_event_release_kernel(struct perf_event *event)
2942{ 2934{
2943 struct perf_event_context *ctx = event->ctx; 2935 struct perf_event_context *ctx = event->ctx;
2944 2936
2945 /*
2946 * Remove from the PMU, can't get re-enabled since we got
2947 * here because the last ref went.
2948 */
2949 perf_event_disable(event);
2950
2951 WARN_ON_ONCE(ctx->parent_ctx); 2937 WARN_ON_ONCE(ctx->parent_ctx);
2952 /* 2938 /*
2953 * There are two ways this annotation is useful: 2939 * There are two ways this annotation is useful:
@@ -2964,8 +2950,8 @@ int perf_event_release_kernel(struct perf_event *event)
2964 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 2950 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2965 raw_spin_lock_irq(&ctx->lock); 2951 raw_spin_lock_irq(&ctx->lock);
2966 perf_group_detach(event); 2952 perf_group_detach(event);
2967 list_del_event(event, ctx);
2968 raw_spin_unlock_irq(&ctx->lock); 2953 raw_spin_unlock_irq(&ctx->lock);
2954 perf_remove_from_context(event);
2969 mutex_unlock(&ctx->mutex); 2955 mutex_unlock(&ctx->mutex);
2970 2956
2971 free_event(event); 2957 free_event(event);
@@ -3157,13 +3143,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3157static unsigned int perf_poll(struct file *file, poll_table *wait) 3143static unsigned int perf_poll(struct file *file, poll_table *wait)
3158{ 3144{
3159 struct perf_event *event = file->private_data; 3145 struct perf_event *event = file->private_data;
3160 struct perf_buffer *buffer; 3146 struct ring_buffer *rb;
3161 unsigned int events = POLL_HUP; 3147 unsigned int events = POLL_HUP;
3162 3148
3163 rcu_read_lock(); 3149 rcu_read_lock();
3164 buffer = rcu_dereference(event->buffer); 3150 rb = rcu_dereference(event->rb);
3165 if (buffer) 3151 if (rb)
3166 events = atomic_xchg(&buffer->poll, 0); 3152 events = atomic_xchg(&rb->poll, 0);
3167 rcu_read_unlock(); 3153 rcu_read_unlock();
3168 3154
3169 poll_wait(file, &event->waitq, wait); 3155 poll_wait(file, &event->waitq, wait);
@@ -3366,6 +3352,18 @@ static int perf_event_index(struct perf_event *event)
3366 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; 3352 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3367} 3353}
3368 3354
3355static void calc_timer_values(struct perf_event *event,
3356 u64 *running,
3357 u64 *enabled)
3358{
3359 u64 now, ctx_time;
3360
3361 now = perf_clock();
3362 ctx_time = event->shadow_ctx_time + now;
3363 *enabled = ctx_time - event->tstamp_enabled;
3364 *running = ctx_time - event->tstamp_running;
3365}
3366
3369/* 3367/*
3370 * Callers need to ensure there can be no nesting of this function, otherwise 3368 * Callers need to ensure there can be no nesting of this function, otherwise
3371 * the seqlock logic goes bad. We can not serialize this because the arch 3369 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3374,14 +3372,25 @@ static int perf_event_index(struct perf_event *event)
3374void perf_event_update_userpage(struct perf_event *event) 3372void perf_event_update_userpage(struct perf_event *event)
3375{ 3373{
3376 struct perf_event_mmap_page *userpg; 3374 struct perf_event_mmap_page *userpg;
3377 struct perf_buffer *buffer; 3375 struct ring_buffer *rb;
3376 u64 enabled, running;
3378 3377
3379 rcu_read_lock(); 3378 rcu_read_lock();
3380 buffer = rcu_dereference(event->buffer); 3379 /*
3381 if (!buffer) 3380 * compute total_time_enabled, total_time_running
3381 * based on snapshot values taken when the event
3382 * was last scheduled in.
3383 *
3384 * we cannot simply call update_context_time()
3385 * because of locking issues, as we can be called in
3386 * NMI context
3387 */
3388 calc_timer_values(event, &enabled, &running);
3389 rb = rcu_dereference(event->rb);
3390 if (!rb)
3382 goto unlock; 3391 goto unlock;
3383 3392
3384 userpg = buffer->user_page; 3393 userpg = rb->user_page;
3385 3394
3386 /* 3395 /*
3387 * Disable preemption so as to not let the corresponding user-space 3396 * Disable preemption so as to not let the corresponding user-space
@@ -3395,10 +3404,10 @@ void perf_event_update_userpage(struct perf_event *event)
3395 if (event->state == PERF_EVENT_STATE_ACTIVE) 3404 if (event->state == PERF_EVENT_STATE_ACTIVE)
3396 userpg->offset -= local64_read(&event->hw.prev_count); 3405 userpg->offset -= local64_read(&event->hw.prev_count);
3397 3406
3398 userpg->time_enabled = event->total_time_enabled + 3407 userpg->time_enabled = enabled +
3399 atomic64_read(&event->child_total_time_enabled); 3408 atomic64_read(&event->child_total_time_enabled);
3400 3409
3401 userpg->time_running = event->total_time_running + 3410 userpg->time_running = running +
3402 atomic64_read(&event->child_total_time_running); 3411 atomic64_read(&event->child_total_time_running);
3403 3412
3404 barrier(); 3413 barrier();
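
The user page updated above is read locklessly from user space: the kernel bumps a sequence counter around the update and the reader retries until it sees a stable snapshot. A minimal user-space sketch of that reader, assuming a perf_event_mmap_page obtained by mmap()ing the event fd (field names from the uapi header; the barrier choice is illustrative and not part of this patch):

    #include <stdint.h>
    #include <linux/perf_event.h>

    /* Snapshot the timing fields published by perf_event_update_userpage(). */
    static void read_times(volatile struct perf_event_mmap_page *pc,
                           uint64_t *enabled, uint64_t *running)
    {
            uint32_t seq;

            do {
                    seq = pc->lock;                 /* sequence count, bumped around updates */
                    __sync_synchronize();
                    *enabled = pc->time_enabled;
                    *running = pc->time_running;
                    __sync_synchronize();
            } while (pc->lock != seq);              /* retry if an update raced with us */
    }
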
@@ -3408,220 +3417,10 @@ unlock:
3408 rcu_read_unlock(); 3417 rcu_read_unlock();
3409} 3418}
3410 3419
3411static unsigned long perf_data_size(struct perf_buffer *buffer);
3412
3413static void
3414perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
3415{
3416 long max_size = perf_data_size(buffer);
3417
3418 if (watermark)
3419 buffer->watermark = min(max_size, watermark);
3420
3421 if (!buffer->watermark)
3422 buffer->watermark = max_size / 2;
3423
3424 if (flags & PERF_BUFFER_WRITABLE)
3425 buffer->writable = 1;
3426
3427 atomic_set(&buffer->refcount, 1);
3428}
3429
3430#ifndef CONFIG_PERF_USE_VMALLOC
3431
3432/*
3433 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
3434 */
3435
3436static struct page *
3437perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3438{
3439 if (pgoff > buffer->nr_pages)
3440 return NULL;
3441
3442 if (pgoff == 0)
3443 return virt_to_page(buffer->user_page);
3444
3445 return virt_to_page(buffer->data_pages[pgoff - 1]);
3446}
3447
3448static void *perf_mmap_alloc_page(int cpu)
3449{
3450 struct page *page;
3451 int node;
3452
3453 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
3454 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
3455 if (!page)
3456 return NULL;
3457
3458 return page_address(page);
3459}
3460
3461static struct perf_buffer *
3462perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3463{
3464 struct perf_buffer *buffer;
3465 unsigned long size;
3466 int i;
3467
3468 size = sizeof(struct perf_buffer);
3469 size += nr_pages * sizeof(void *);
3470
3471 buffer = kzalloc(size, GFP_KERNEL);
3472 if (!buffer)
3473 goto fail;
3474
3475 buffer->user_page = perf_mmap_alloc_page(cpu);
3476 if (!buffer->user_page)
3477 goto fail_user_page;
3478
3479 for (i = 0; i < nr_pages; i++) {
3480 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
3481 if (!buffer->data_pages[i])
3482 goto fail_data_pages;
3483 }
3484
3485 buffer->nr_pages = nr_pages;
3486
3487 perf_buffer_init(buffer, watermark, flags);
3488
3489 return buffer;
3490
3491fail_data_pages:
3492 for (i--; i >= 0; i--)
3493 free_page((unsigned long)buffer->data_pages[i]);
3494
3495 free_page((unsigned long)buffer->user_page);
3496
3497fail_user_page:
3498 kfree(buffer);
3499
3500fail:
3501 return NULL;
3502}
3503
3504static void perf_mmap_free_page(unsigned long addr)
3505{
3506 struct page *page = virt_to_page((void *)addr);
3507
3508 page->mapping = NULL;
3509 __free_page(page);
3510}
3511
3512static void perf_buffer_free(struct perf_buffer *buffer)
3513{
3514 int i;
3515
3516 perf_mmap_free_page((unsigned long)buffer->user_page);
3517 for (i = 0; i < buffer->nr_pages; i++)
3518 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
3519 kfree(buffer);
3520}
3521
3522static inline int page_order(struct perf_buffer *buffer)
3523{
3524 return 0;
3525}
3526
3527#else
3528
3529/*
3530 * Back perf_mmap() with vmalloc memory.
3531 *
3532 * Required for architectures that have d-cache aliasing issues.
3533 */
3534
3535static inline int page_order(struct perf_buffer *buffer)
3536{
3537 return buffer->page_order;
3538}
3539
3540static struct page *
3541perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3542{
3543 if (pgoff > (1UL << page_order(buffer)))
3544 return NULL;
3545
3546 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
3547}
3548
3549static void perf_mmap_unmark_page(void *addr)
3550{
3551 struct page *page = vmalloc_to_page(addr);
3552
3553 page->mapping = NULL;
3554}
3555
3556static void perf_buffer_free_work(struct work_struct *work)
3557{
3558 struct perf_buffer *buffer;
3559 void *base;
3560 int i, nr;
3561
3562 buffer = container_of(work, struct perf_buffer, work);
3563 nr = 1 << page_order(buffer);
3564
3565 base = buffer->user_page;
3566 for (i = 0; i < nr + 1; i++)
3567 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
3568
3569 vfree(base);
3570 kfree(buffer);
3571}
3572
3573static void perf_buffer_free(struct perf_buffer *buffer)
3574{
3575 schedule_work(&buffer->work);
3576}
3577
3578static struct perf_buffer *
3579perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3580{
3581 struct perf_buffer *buffer;
3582 unsigned long size;
3583 void *all_buf;
3584
3585 size = sizeof(struct perf_buffer);
3586 size += sizeof(void *);
3587
3588 buffer = kzalloc(size, GFP_KERNEL);
3589 if (!buffer)
3590 goto fail;
3591
3592 INIT_WORK(&buffer->work, perf_buffer_free_work);
3593
3594 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
3595 if (!all_buf)
3596 goto fail_all_buf;
3597
3598 buffer->user_page = all_buf;
3599 buffer->data_pages[0] = all_buf + PAGE_SIZE;
3600 buffer->page_order = ilog2(nr_pages);
3601 buffer->nr_pages = 1;
3602
3603 perf_buffer_init(buffer, watermark, flags);
3604
3605 return buffer;
3606
3607fail_all_buf:
3608 kfree(buffer);
3609
3610fail:
3611 return NULL;
3612}
3613
3614#endif
3615
3616static unsigned long perf_data_size(struct perf_buffer *buffer)
3617{
3618 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
3619}
3620
3621static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 3420static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3622{ 3421{
3623 struct perf_event *event = vma->vm_file->private_data; 3422 struct perf_event *event = vma->vm_file->private_data;
3624 struct perf_buffer *buffer; 3423 struct ring_buffer *rb;
3625 int ret = VM_FAULT_SIGBUS; 3424 int ret = VM_FAULT_SIGBUS;
3626 3425
3627 if (vmf->flags & FAULT_FLAG_MKWRITE) { 3426 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -3631,14 +3430,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3631 } 3430 }
3632 3431
3633 rcu_read_lock(); 3432 rcu_read_lock();
3634 buffer = rcu_dereference(event->buffer); 3433 rb = rcu_dereference(event->rb);
3635 if (!buffer) 3434 if (!rb)
3636 goto unlock; 3435 goto unlock;
3637 3436
3638 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 3437 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3639 goto unlock; 3438 goto unlock;
3640 3439
3641 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); 3440 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3642 if (!vmf->page) 3441 if (!vmf->page)
3643 goto unlock; 3442 goto unlock;
3644 3443
@@ -3653,35 +3452,35 @@ unlock:
3653 return ret; 3452 return ret;
3654} 3453}
3655 3454
3656static void perf_buffer_free_rcu(struct rcu_head *rcu_head) 3455static void rb_free_rcu(struct rcu_head *rcu_head)
3657{ 3456{
3658 struct perf_buffer *buffer; 3457 struct ring_buffer *rb;
3659 3458
3660 buffer = container_of(rcu_head, struct perf_buffer, rcu_head); 3459 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3661 perf_buffer_free(buffer); 3460 rb_free(rb);
3662} 3461}
3663 3462
3664static struct perf_buffer *perf_buffer_get(struct perf_event *event) 3463static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3665{ 3464{
3666 struct perf_buffer *buffer; 3465 struct ring_buffer *rb;
3667 3466
3668 rcu_read_lock(); 3467 rcu_read_lock();
3669 buffer = rcu_dereference(event->buffer); 3468 rb = rcu_dereference(event->rb);
3670 if (buffer) { 3469 if (rb) {
3671 if (!atomic_inc_not_zero(&buffer->refcount)) 3470 if (!atomic_inc_not_zero(&rb->refcount))
3672 buffer = NULL; 3471 rb = NULL;
3673 } 3472 }
3674 rcu_read_unlock(); 3473 rcu_read_unlock();
3675 3474
3676 return buffer; 3475 return rb;
3677} 3476}
3678 3477
3679static void perf_buffer_put(struct perf_buffer *buffer) 3478static void ring_buffer_put(struct ring_buffer *rb)
3680{ 3479{
3681 if (!atomic_dec_and_test(&buffer->refcount)) 3480 if (!atomic_dec_and_test(&rb->refcount))
3682 return; 3481 return;
3683 3482
3684 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); 3483 call_rcu(&rb->rcu_head, rb_free_rcu);
3685} 3484}
3686 3485
3687static void perf_mmap_open(struct vm_area_struct *vma) 3486static void perf_mmap_open(struct vm_area_struct *vma)
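
ring_buffer_get()/ring_buffer_put() are the usual RCU-plus-refcount pair: readers take a temporary reference under rcu_read_lock() with atomic_inc_not_zero() so a buffer whose last reference is already gone can never be revived, and the final put defers the actual free to an RCU callback. A hypothetical helper inside this file (not part of the patch) would use them like this:

    /* Sketch only: peek at the current write position of an event's buffer. */
    static u64 peek_data_head(struct perf_event *event)
    {
            struct ring_buffer *rb;
            u64 head = 0;

            rb = ring_buffer_get(event);    /* NULL if there is no buffer left */
            if (rb) {
                    head = ACCESS_ONCE(rb->user_page->data_head);
                    ring_buffer_put(rb);    /* last put frees via rb_free_rcu() */
            }
            return head;
    }
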
@@ -3696,16 +3495,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3696 struct perf_event *event = vma->vm_file->private_data; 3495 struct perf_event *event = vma->vm_file->private_data;
3697 3496
3698 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 3497 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
3699 unsigned long size = perf_data_size(event->buffer); 3498 unsigned long size = perf_data_size(event->rb);
3700 struct user_struct *user = event->mmap_user; 3499 struct user_struct *user = event->mmap_user;
3701 struct perf_buffer *buffer = event->buffer; 3500 struct ring_buffer *rb = event->rb;
3702 3501
3703 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3502 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3704 vma->vm_mm->locked_vm -= event->mmap_locked; 3503 vma->vm_mm->locked_vm -= event->mmap_locked;
3705 rcu_assign_pointer(event->buffer, NULL); 3504 rcu_assign_pointer(event->rb, NULL);
3706 mutex_unlock(&event->mmap_mutex); 3505 mutex_unlock(&event->mmap_mutex);
3707 3506
3708 perf_buffer_put(buffer); 3507 ring_buffer_put(rb);
3709 free_uid(user); 3508 free_uid(user);
3710 } 3509 }
3711} 3510}
@@ -3723,7 +3522,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3723 unsigned long user_locked, user_lock_limit; 3522 unsigned long user_locked, user_lock_limit;
3724 struct user_struct *user = current_user(); 3523 struct user_struct *user = current_user();
3725 unsigned long locked, lock_limit; 3524 unsigned long locked, lock_limit;
3726 struct perf_buffer *buffer; 3525 struct ring_buffer *rb;
3727 unsigned long vma_size; 3526 unsigned long vma_size;
3728 unsigned long nr_pages; 3527 unsigned long nr_pages;
3729 long user_extra, extra; 3528 long user_extra, extra;
@@ -3732,7 +3531,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3732 /* 3531 /*
3733 * Don't allow mmap() of inherited per-task counters. This would 3532 * Don't allow mmap() of inherited per-task counters. This would
3734 * create a performance issue due to all children writing to the 3533 * create a performance issue due to all children writing to the
3735 * same buffer. 3534 * same rb.
3736 */ 3535 */
3737 if (event->cpu == -1 && event->attr.inherit) 3536 if (event->cpu == -1 && event->attr.inherit)
3738 return -EINVAL; 3537 return -EINVAL;
@@ -3744,7 +3543,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3744 nr_pages = (vma_size / PAGE_SIZE) - 1; 3543 nr_pages = (vma_size / PAGE_SIZE) - 1;
3745 3544
3746 /* 3545 /*
3747 * If we have buffer pages ensure they're a power-of-two number, so we 3546 * If we have rb pages ensure they're a power-of-two number, so we
3748 * can do bitmasks instead of modulo. 3547 * can do bitmasks instead of modulo.
3749 */ 3548 */
3750 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 3549 if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -3758,9 +3557,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3758 3557
3759 WARN_ON_ONCE(event->ctx->parent_ctx); 3558 WARN_ON_ONCE(event->ctx->parent_ctx);
3760 mutex_lock(&event->mmap_mutex); 3559 mutex_lock(&event->mmap_mutex);
3761 if (event->buffer) { 3560 if (event->rb) {
3762 if (event->buffer->nr_pages == nr_pages) 3561 if (event->rb->nr_pages == nr_pages)
3763 atomic_inc(&event->buffer->refcount); 3562 atomic_inc(&event->rb->refcount);
3764 else 3563 else
3765 ret = -EINVAL; 3564 ret = -EINVAL;
3766 goto unlock; 3565 goto unlock;
@@ -3790,18 +3589,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3790 goto unlock; 3589 goto unlock;
3791 } 3590 }
3792 3591
3793 WARN_ON(event->buffer); 3592 WARN_ON(event->rb);
3794 3593
3795 if (vma->vm_flags & VM_WRITE) 3594 if (vma->vm_flags & VM_WRITE)
3796 flags |= PERF_BUFFER_WRITABLE; 3595 flags |= RING_BUFFER_WRITABLE;
3596
3597 rb = rb_alloc(nr_pages,
3598 event->attr.watermark ? event->attr.wakeup_watermark : 0,
3599 event->cpu, flags);
3797 3600
3798 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, 3601 if (!rb) {
3799 event->cpu, flags);
3800 if (!buffer) {
3801 ret = -ENOMEM; 3602 ret = -ENOMEM;
3802 goto unlock; 3603 goto unlock;
3803 } 3604 }
3804 rcu_assign_pointer(event->buffer, buffer); 3605 rcu_assign_pointer(event->rb, rb);
3805 3606
3806 atomic_long_add(user_extra, &user->locked_vm); 3607 atomic_long_add(user_extra, &user->locked_vm);
3807 event->mmap_locked = extra; 3608 event->mmap_locked = extra;
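
The sizing rules enforced above are: one metadata page plus a power-of-two number of data pages, with VM_WRITE deciding whether the buffer is marked writable. A user-space sketch of the matching open/mmap sequence (raw syscall because glibc has no perf_event_open() wrapper; error handling trimmed):

    #include <string.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/perf_event.h>

    int main(void)
    {
            struct perf_event_attr attr;
            long page = sysconf(_SC_PAGESIZE);
            int nr_pages = 8;                       /* data pages: must be a power of two */
            void *base;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000;
            attr.sample_type = PERF_SAMPLE_IP;
            attr.wakeup_events = 16;                /* see the wakeup accounting further down */

            fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            /* 1 control page + 2^n data pages; PROT_WRITE maps the buffer writable
             * so user space can update data_tail and the kernel will honour it. */
            base = mmap(NULL, (nr_pages + 1) * page, PROT_READ | PROT_WRITE,
                        MAP_SHARED, fd, 0);

            return (fd < 0 || base == MAP_FAILED);
    }
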
@@ -3900,117 +3701,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3900} 3701}
3901EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 3702EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3902 3703
3903/*
3904 * Output
3905 */
3906static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
3907 unsigned long offset, unsigned long head)
3908{
3909 unsigned long mask;
3910
3911 if (!buffer->writable)
3912 return true;
3913
3914 mask = perf_data_size(buffer) - 1;
3915
3916 offset = (offset - tail) & mask;
3917 head = (head - tail) & mask;
3918
3919 if ((int)(head - offset) < 0)
3920 return false;
3921
3922 return true;
3923}
3924
3925static void perf_output_wakeup(struct perf_output_handle *handle)
3926{
3927 atomic_set(&handle->buffer->poll, POLL_IN);
3928
3929 if (handle->nmi) {
3930 handle->event->pending_wakeup = 1;
3931 irq_work_queue(&handle->event->pending);
3932 } else
3933 perf_event_wakeup(handle->event);
3934}
3935
3936/*
3937 * We need to ensure a later event_id doesn't publish a head when a former
3938 * event isn't done writing. However since we need to deal with NMIs we
3939 * cannot fully serialize things.
3940 *
3941 * We only publish the head (and generate a wakeup) when the outer-most
3942 * event completes.
3943 */
3944static void perf_output_get_handle(struct perf_output_handle *handle)
3945{
3946 struct perf_buffer *buffer = handle->buffer;
3947
3948 preempt_disable();
3949 local_inc(&buffer->nest);
3950 handle->wakeup = local_read(&buffer->wakeup);
3951}
3952
3953static void perf_output_put_handle(struct perf_output_handle *handle)
3954{
3955 struct perf_buffer *buffer = handle->buffer;
3956 unsigned long head;
3957
3958again:
3959 head = local_read(&buffer->head);
3960
3961 /*
3962 * IRQ/NMI can happen here, which means we can miss a head update.
3963 */
3964
3965 if (!local_dec_and_test(&buffer->nest))
3966 goto out;
3967
3968 /*
3969 * Publish the known good head. Rely on the full barrier implied
3970 * by atomic_dec_and_test() order the buffer->head read and this
3971 * write.
3972 */
3973 buffer->user_page->data_head = head;
3974
3975 /*
3976 * Now check if we missed an update, rely on the (compiler)
3977 * barrier in atomic_dec_and_test() to re-read buffer->head.
3978 */
3979 if (unlikely(head != local_read(&buffer->head))) {
3980 local_inc(&buffer->nest);
3981 goto again;
3982 }
3983
3984 if (handle->wakeup != local_read(&buffer->wakeup))
3985 perf_output_wakeup(handle);
3986
3987out:
3988 preempt_enable();
3989}
3990
3991__always_inline void perf_output_copy(struct perf_output_handle *handle,
3992 const void *buf, unsigned int len)
3993{
3994 do {
3995 unsigned long size = min_t(unsigned long, handle->size, len);
3996
3997 memcpy(handle->addr, buf, size);
3998
3999 len -= size;
4000 handle->addr += size;
4001 buf += size;
4002 handle->size -= size;
4003 if (!handle->size) {
4004 struct perf_buffer *buffer = handle->buffer;
4005
4006 handle->page++;
4007 handle->page &= buffer->nr_pages - 1;
4008 handle->addr = buffer->data_pages[handle->page];
4009 handle->size = PAGE_SIZE << page_order(buffer);
4010 }
4011 } while (len);
4012}
4013
4014static void __perf_event_header__init_id(struct perf_event_header *header, 3704static void __perf_event_header__init_id(struct perf_event_header *header,
4015 struct perf_sample_data *data, 3705 struct perf_sample_data *data,
4016 struct perf_event *event) 3706 struct perf_event *event)
@@ -4041,9 +3731,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4041 } 3731 }
4042} 3732}
4043 3733
4044static void perf_event_header__init_id(struct perf_event_header *header, 3734void perf_event_header__init_id(struct perf_event_header *header,
4045 struct perf_sample_data *data, 3735 struct perf_sample_data *data,
4046 struct perf_event *event) 3736 struct perf_event *event)
4047{ 3737{
4048 if (event->attr.sample_id_all) 3738 if (event->attr.sample_id_all)
4049 __perf_event_header__init_id(header, data, event); 3739 __perf_event_header__init_id(header, data, event);
@@ -4070,121 +3760,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4070 perf_output_put(handle, data->cpu_entry); 3760 perf_output_put(handle, data->cpu_entry);
4071} 3761}
4072 3762
4073static void perf_event__output_id_sample(struct perf_event *event, 3763void perf_event__output_id_sample(struct perf_event *event,
4074 struct perf_output_handle *handle, 3764 struct perf_output_handle *handle,
4075 struct perf_sample_data *sample) 3765 struct perf_sample_data *sample)
4076{ 3766{
4077 if (event->attr.sample_id_all) 3767 if (event->attr.sample_id_all)
4078 __perf_event__output_id_sample(handle, sample); 3768 __perf_event__output_id_sample(handle, sample);
4079} 3769}
4080 3770
4081int perf_output_begin(struct perf_output_handle *handle,
4082 struct perf_event *event, unsigned int size,
4083 int nmi, int sample)
4084{
4085 struct perf_buffer *buffer;
4086 unsigned long tail, offset, head;
4087 int have_lost;
4088 struct perf_sample_data sample_data;
4089 struct {
4090 struct perf_event_header header;
4091 u64 id;
4092 u64 lost;
4093 } lost_event;
4094
4095 rcu_read_lock();
4096 /*
4097 * For inherited events we send all the output towards the parent.
4098 */
4099 if (event->parent)
4100 event = event->parent;
4101
4102 buffer = rcu_dereference(event->buffer);
4103 if (!buffer)
4104 goto out;
4105
4106 handle->buffer = buffer;
4107 handle->event = event;
4108 handle->nmi = nmi;
4109 handle->sample = sample;
4110
4111 if (!buffer->nr_pages)
4112 goto out;
4113
4114 have_lost = local_read(&buffer->lost);
4115 if (have_lost) {
4116 lost_event.header.size = sizeof(lost_event);
4117 perf_event_header__init_id(&lost_event.header, &sample_data,
4118 event);
4119 size += lost_event.header.size;
4120 }
4121
4122 perf_output_get_handle(handle);
4123
4124 do {
4125 /*
4126 * Userspace could choose to issue a mb() before updating the
4127 * tail pointer. So that all reads will be completed before the
4128 * write is issued.
4129 */
4130 tail = ACCESS_ONCE(buffer->user_page->data_tail);
4131 smp_rmb();
4132 offset = head = local_read(&buffer->head);
4133 head += size;
4134 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
4135 goto fail;
4136 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
4137
4138 if (head - local_read(&buffer->wakeup) > buffer->watermark)
4139 local_add(buffer->watermark, &buffer->wakeup);
4140
4141 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
4142 handle->page &= buffer->nr_pages - 1;
4143 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
4144 handle->addr = buffer->data_pages[handle->page];
4145 handle->addr += handle->size;
4146 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
4147
4148 if (have_lost) {
4149 lost_event.header.type = PERF_RECORD_LOST;
4150 lost_event.header.misc = 0;
4151 lost_event.id = event->id;
4152 lost_event.lost = local_xchg(&buffer->lost, 0);
4153
4154 perf_output_put(handle, lost_event);
4155 perf_event__output_id_sample(event, handle, &sample_data);
4156 }
4157
4158 return 0;
4159
4160fail:
4161 local_inc(&buffer->lost);
4162 perf_output_put_handle(handle);
4163out:
4164 rcu_read_unlock();
4165
4166 return -ENOSPC;
4167}
4168
4169void perf_output_end(struct perf_output_handle *handle)
4170{
4171 struct perf_event *event = handle->event;
4172 struct perf_buffer *buffer = handle->buffer;
4173
4174 int wakeup_events = event->attr.wakeup_events;
4175
4176 if (handle->sample && wakeup_events) {
4177 int events = local_inc_return(&buffer->events);
4178 if (events >= wakeup_events) {
4179 local_sub(wakeup_events, &buffer->events);
4180 local_inc(&buffer->wakeup);
4181 }
4182 }
4183
4184 perf_output_put_handle(handle);
4185 rcu_read_unlock();
4186}
4187
4188static void perf_output_read_one(struct perf_output_handle *handle, 3771static void perf_output_read_one(struct perf_output_handle *handle,
4189 struct perf_event *event, 3772 struct perf_event *event,
4190 u64 enabled, u64 running) 3773 u64 enabled, u64 running)
@@ -4205,7 +3788,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
4205 if (read_format & PERF_FORMAT_ID) 3788 if (read_format & PERF_FORMAT_ID)
4206 values[n++] = primary_event_id(event); 3789 values[n++] = primary_event_id(event);
4207 3790
4208 perf_output_copy(handle, values, n * sizeof(u64)); 3791 __output_copy(handle, values, n * sizeof(u64));
4209} 3792}
4210 3793
4211/* 3794/*
@@ -4235,7 +3818,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4235 if (read_format & PERF_FORMAT_ID) 3818 if (read_format & PERF_FORMAT_ID)
4236 values[n++] = primary_event_id(leader); 3819 values[n++] = primary_event_id(leader);
4237 3820
4238 perf_output_copy(handle, values, n * sizeof(u64)); 3821 __output_copy(handle, values, n * sizeof(u64));
4239 3822
4240 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 3823 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4241 n = 0; 3824 n = 0;
@@ -4247,7 +3830,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4247 if (read_format & PERF_FORMAT_ID) 3830 if (read_format & PERF_FORMAT_ID)
4248 values[n++] = primary_event_id(sub); 3831 values[n++] = primary_event_id(sub);
4249 3832
4250 perf_output_copy(handle, values, n * sizeof(u64)); 3833 __output_copy(handle, values, n * sizeof(u64));
4251 } 3834 }
4252} 3835}
4253 3836
@@ -4257,7 +3840,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4257static void perf_output_read(struct perf_output_handle *handle, 3840static void perf_output_read(struct perf_output_handle *handle,
4258 struct perf_event *event) 3841 struct perf_event *event)
4259{ 3842{
4260 u64 enabled = 0, running = 0, now, ctx_time; 3843 u64 enabled = 0, running = 0;
4261 u64 read_format = event->attr.read_format; 3844 u64 read_format = event->attr.read_format;
4262 3845
4263 /* 3846 /*
@@ -4269,12 +3852,8 @@ static void perf_output_read(struct perf_output_handle *handle,
4269 * because of locking issues, as we are called in 3852
4270 * NMI context 3853 * NMI context
4271 */ 3854 */
4272 if (read_format & PERF_FORMAT_TOTAL_TIMES) { 3855 if (read_format & PERF_FORMAT_TOTAL_TIMES)
4273 now = perf_clock(); 3856 calc_timer_values(event, &enabled, &running);
4274 ctx_time = event->shadow_ctx_time + now;
4275 enabled = ctx_time - event->tstamp_enabled;
4276 running = ctx_time - event->tstamp_running;
4277 }
4278 3857
4279 if (event->attr.read_format & PERF_FORMAT_GROUP) 3858 if (event->attr.read_format & PERF_FORMAT_GROUP)
4280 perf_output_read_group(handle, event, enabled, running); 3859 perf_output_read_group(handle, event, enabled, running);
@@ -4327,7 +3906,7 @@ void perf_output_sample(struct perf_output_handle *handle,
4327 3906
4328 size *= sizeof(u64); 3907 size *= sizeof(u64);
4329 3908
4330 perf_output_copy(handle, data->callchain, size); 3909 __output_copy(handle, data->callchain, size);
4331 } else { 3910 } else {
4332 u64 nr = 0; 3911 u64 nr = 0;
4333 perf_output_put(handle, nr); 3912 perf_output_put(handle, nr);
@@ -4337,8 +3916,8 @@ void perf_output_sample(struct perf_output_handle *handle,
4337 if (sample_type & PERF_SAMPLE_RAW) { 3916 if (sample_type & PERF_SAMPLE_RAW) {
4338 if (data->raw) { 3917 if (data->raw) {
4339 perf_output_put(handle, data->raw->size); 3918 perf_output_put(handle, data->raw->size);
4340 perf_output_copy(handle, data->raw->data, 3919 __output_copy(handle, data->raw->data,
4341 data->raw->size); 3920 data->raw->size);
4342 } else { 3921 } else {
4343 struct { 3922 struct {
4344 u32 size; 3923 u32 size;
@@ -4350,6 +3929,20 @@ void perf_output_sample(struct perf_output_handle *handle,
4350 perf_output_put(handle, raw); 3929 perf_output_put(handle, raw);
4351 } 3930 }
4352 } 3931 }
3932
3933 if (!event->attr.watermark) {
3934 int wakeup_events = event->attr.wakeup_events;
3935
3936 if (wakeup_events) {
3937 struct ring_buffer *rb = handle->rb;
3938 int events = local_inc_return(&rb->events);
3939
3940 if (events >= wakeup_events) {
3941 local_sub(wakeup_events, &rb->events);
3942 local_inc(&rb->wakeup);
3943 }
3944 }
3945 }
4353} 3946}
4354 3947
4355void perf_prepare_sample(struct perf_event_header *header, 3948void perf_prepare_sample(struct perf_event_header *header,
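
The block added above moves the wakeup_events accounting from perf_output_end() into the sample path: when the event is not using a wakeup watermark, every wakeup_events samples bump rb->wakeup, and the next perf_output_put_handle() turns that into a poll wakeup. On the user-space side this pairs with an ordinary poll() on the event fd; a sketch, assuming a file descriptor set up as in the earlier mmap example:

    #include <poll.h>

    /* Block until roughly attr.wakeup_events new samples have been written. */
    static int wait_for_samples(int perf_fd)
    {
            struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };

            return poll(&pfd, 1, -1);       /* perf_poll() reports readiness via rb->poll */
    }
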
@@ -4394,7 +3987,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4394 } 3987 }
4395} 3988}
4396 3989
4397static void perf_event_output(struct perf_event *event, int nmi, 3990static void perf_event_output(struct perf_event *event,
4398 struct perf_sample_data *data, 3991 struct perf_sample_data *data,
4399 struct pt_regs *regs) 3992 struct pt_regs *regs)
4400{ 3993{
@@ -4406,7 +3999,7 @@ static void perf_event_output(struct perf_event *event, int nmi,
4406 3999
4407 perf_prepare_sample(&header, data, event, regs); 4000 perf_prepare_sample(&header, data, event, regs);
4408 4001
4409 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 4002 if (perf_output_begin(&handle, event, header.size))
4410 goto exit; 4003 goto exit;
4411 4004
4412 perf_output_sample(&handle, &header, data, event); 4005 perf_output_sample(&handle, &header, data, event);
@@ -4446,7 +4039,7 @@ perf_event_read_event(struct perf_event *event,
4446 int ret; 4039 int ret;
4447 4040
4448 perf_event_header__init_id(&read_event.header, &sample, event); 4041 perf_event_header__init_id(&read_event.header, &sample, event);
4449 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 4042 ret = perf_output_begin(&handle, event, read_event.header.size);
4450 if (ret) 4043 if (ret)
4451 return; 4044 return;
4452 4045
@@ -4489,7 +4082,7 @@ static void perf_event_task_output(struct perf_event *event,
4489 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4082 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4490 4083
4491 ret = perf_output_begin(&handle, event, 4084 ret = perf_output_begin(&handle, event,
4492 task_event->event_id.header.size, 0, 0); 4085 task_event->event_id.header.size);
4493 if (ret) 4086 if (ret)
4494 goto out; 4087 goto out;
4495 4088
@@ -4626,7 +4219,7 @@ static void perf_event_comm_output(struct perf_event *event,
4626 4219
4627 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4220 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4628 ret = perf_output_begin(&handle, event, 4221 ret = perf_output_begin(&handle, event,
4629 comm_event->event_id.header.size, 0, 0); 4222 comm_event->event_id.header.size);
4630 4223
4631 if (ret) 4224 if (ret)
4632 goto out; 4225 goto out;
@@ -4635,7 +4228,7 @@ static void perf_event_comm_output(struct perf_event *event,
4635 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4228 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4636 4229
4637 perf_output_put(&handle, comm_event->event_id); 4230 perf_output_put(&handle, comm_event->event_id);
4638 perf_output_copy(&handle, comm_event->comm, 4231 __output_copy(&handle, comm_event->comm,
4639 comm_event->comm_size); 4232 comm_event->comm_size);
4640 4233
4641 perf_event__output_id_sample(event, &handle, &sample); 4234 perf_event__output_id_sample(event, &handle, &sample);
@@ -4773,7 +4366,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4773 4366
4774 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 4367 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4775 ret = perf_output_begin(&handle, event, 4368 ret = perf_output_begin(&handle, event,
4776 mmap_event->event_id.header.size, 0, 0); 4369 mmap_event->event_id.header.size);
4777 if (ret) 4370 if (ret)
4778 goto out; 4371 goto out;
4779 4372
@@ -4781,7 +4374,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4781 mmap_event->event_id.tid = perf_event_tid(event, current); 4374 mmap_event->event_id.tid = perf_event_tid(event, current);
4782 4375
4783 perf_output_put(&handle, mmap_event->event_id); 4376 perf_output_put(&handle, mmap_event->event_id);
4784 perf_output_copy(&handle, mmap_event->file_name, 4377 __output_copy(&handle, mmap_event->file_name,
4785 mmap_event->file_size); 4378 mmap_event->file_size);
4786 4379
4787 perf_event__output_id_sample(event, &handle, &sample); 4380 perf_event__output_id_sample(event, &handle, &sample);
@@ -4837,7 +4430,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4837 4430
4838 if (file) { 4431 if (file) {
4839 /* 4432 /*
4840 * d_path works from the end of the buffer backwards, so we 4433 * d_path works from the end of the rb backwards, so we
4841 * need to add enough zero bytes after the string to handle 4434 * need to add enough zero bytes after the string to handle
4842 * the 64bit alignment we do later. 4435 * the 64bit alignment we do later.
4843 */ 4436 */
@@ -4968,7 +4561,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4968 perf_event_header__init_id(&throttle_event.header, &sample, event); 4561 perf_event_header__init_id(&throttle_event.header, &sample, event);
4969 4562
4970 ret = perf_output_begin(&handle, event, 4563 ret = perf_output_begin(&handle, event,
4971 throttle_event.header.size, 1, 0); 4564 throttle_event.header.size);
4972 if (ret) 4565 if (ret)
4973 return; 4566 return;
4974 4567
@@ -4981,7 +4574,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4981 * Generic event overflow handling, sampling. 4574 * Generic event overflow handling, sampling.
4982 */ 4575 */
4983 4576
4984static int __perf_event_overflow(struct perf_event *event, int nmi, 4577static int __perf_event_overflow(struct perf_event *event,
4985 int throttle, struct perf_sample_data *data, 4578 int throttle, struct perf_sample_data *data,
4986 struct pt_regs *regs) 4579 struct pt_regs *regs)
4987{ 4580{
@@ -5024,26 +4617,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
5024 if (events && atomic_dec_and_test(&event->event_limit)) { 4617 if (events && atomic_dec_and_test(&event->event_limit)) {
5025 ret = 1; 4618 ret = 1;
5026 event->pending_kill = POLL_HUP; 4619 event->pending_kill = POLL_HUP;
5027 if (nmi) { 4620 event->pending_disable = 1;
5028 event->pending_disable = 1; 4621 irq_work_queue(&event->pending);
5029 irq_work_queue(&event->pending);
5030 } else
5031 perf_event_disable(event);
5032 } 4622 }
5033 4623
5034 if (event->overflow_handler) 4624 if (event->overflow_handler)
5035 event->overflow_handler(event, nmi, data, regs); 4625 event->overflow_handler(event, data, regs);
5036 else 4626 else
5037 perf_event_output(event, nmi, data, regs); 4627 perf_event_output(event, data, regs);
4628
4629 if (event->fasync && event->pending_kill) {
4630 event->pending_wakeup = 1;
4631 irq_work_queue(&event->pending);
4632 }
5038 4633
5039 return ret; 4634 return ret;
5040} 4635}
5041 4636
5042int perf_event_overflow(struct perf_event *event, int nmi, 4637int perf_event_overflow(struct perf_event *event,
5043 struct perf_sample_data *data, 4638 struct perf_sample_data *data,
5044 struct pt_regs *regs) 4639 struct pt_regs *regs)
5045{ 4640{
5046 return __perf_event_overflow(event, nmi, 1, data, regs); 4641 return __perf_event_overflow(event, 1, data, regs);
5047} 4642}
5048 4643
5049/* 4644/*
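
With the nmi argument gone, __perf_event_overflow() no longer chooses between calling perf_event_disable() directly and deferring it: it always sets event->pending_disable and queues event->pending, and the irq_work runs the heavy part from a safe context. The same deferral pattern in isolation, as a generic sketch (names invented, not tied to this file):

    #include <linux/init.h>
    #include <linux/irq_work.h>

    static struct irq_work my_work;

    static void my_work_func(struct irq_work *work)
    {
            /* Runs in IRQ context shortly after being queued; real work goes here. */
    }

    /* Safe to call from NMI context: only marks the work pending and raises an IPI. */
    static void from_nmi_path(void)
    {
            irq_work_queue(&my_work);
    }

    static int __init my_setup(void)
    {
            init_irq_work(&my_work, my_work_func);
            return 0;
    }
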
@@ -5092,7 +4687,7 @@ again:
5092} 4687}
5093 4688
5094static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 4689static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5095 int nmi, struct perf_sample_data *data, 4690 struct perf_sample_data *data,
5096 struct pt_regs *regs) 4691 struct pt_regs *regs)
5097{ 4692{
5098 struct hw_perf_event *hwc = &event->hw; 4693 struct hw_perf_event *hwc = &event->hw;
@@ -5106,7 +4701,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5106 return; 4701 return;
5107 4702
5108 for (; overflow; overflow--) { 4703 for (; overflow; overflow--) {
5109 if (__perf_event_overflow(event, nmi, throttle, 4704 if (__perf_event_overflow(event, throttle,
5110 data, regs)) { 4705 data, regs)) {
5111 /* 4706 /*
5112 * We inhibit the overflow from happening when 4707 * We inhibit the overflow from happening when
@@ -5119,7 +4714,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5119} 4714}
5120 4715
5121static void perf_swevent_event(struct perf_event *event, u64 nr, 4716static void perf_swevent_event(struct perf_event *event, u64 nr,
5122 int nmi, struct perf_sample_data *data, 4717 struct perf_sample_data *data,
5123 struct pt_regs *regs) 4718 struct pt_regs *regs)
5124{ 4719{
5125 struct hw_perf_event *hwc = &event->hw; 4720 struct hw_perf_event *hwc = &event->hw;
@@ -5133,12 +4728,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
5133 return; 4728 return;
5134 4729
5135 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4730 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5136 return perf_swevent_overflow(event, 1, nmi, data, regs); 4731 return perf_swevent_overflow(event, 1, data, regs);
5137 4732
5138 if (local64_add_negative(nr, &hwc->period_left)) 4733 if (local64_add_negative(nr, &hwc->period_left))
5139 return; 4734 return;
5140 4735
5141 perf_swevent_overflow(event, 0, nmi, data, regs); 4736 perf_swevent_overflow(event, 0, data, regs);
5142} 4737}
5143 4738
5144static int perf_exclude_event(struct perf_event *event, 4739static int perf_exclude_event(struct perf_event *event,
@@ -5226,7 +4821,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5226} 4821}
5227 4822
5228static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4823static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5229 u64 nr, int nmi, 4824 u64 nr,
5230 struct perf_sample_data *data, 4825 struct perf_sample_data *data,
5231 struct pt_regs *regs) 4826 struct pt_regs *regs)
5232{ 4827{
@@ -5242,7 +4837,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5242 4837
5243 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4838 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5244 if (perf_swevent_match(event, type, event_id, data, regs)) 4839 if (perf_swevent_match(event, type, event_id, data, regs))
5245 perf_swevent_event(event, nr, nmi, data, regs); 4840 perf_swevent_event(event, nr, data, regs);
5246 } 4841 }
5247end: 4842end:
5248 rcu_read_unlock(); 4843 rcu_read_unlock();
@@ -5263,8 +4858,7 @@ inline void perf_swevent_put_recursion_context(int rctx)
5263 put_recursion_context(swhash->recursion, rctx); 4858 put_recursion_context(swhash->recursion, rctx);
5264} 4859}
5265 4860
5266void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4861void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5267 struct pt_regs *regs, u64 addr)
5268{ 4862{
5269 struct perf_sample_data data; 4863 struct perf_sample_data data;
5270 int rctx; 4864 int rctx;
@@ -5276,7 +4870,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5276 4870
5277 perf_sample_data_init(&data, addr); 4871 perf_sample_data_init(&data, addr);
5278 4872
5279 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4873 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5280 4874
5281 perf_swevent_put_recursion_context(rctx); 4875 perf_swevent_put_recursion_context(rctx);
5282 preempt_enable_notrace(); 4876 preempt_enable_notrace();
@@ -5331,14 +4925,6 @@ swevent_hlist_deref(struct swevent_htable *swhash)
5331 lockdep_is_held(&swhash->hlist_mutex)); 4925 lockdep_is_held(&swhash->hlist_mutex));
5332} 4926}
5333 4927
5334static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
5335{
5336 struct swevent_hlist *hlist;
5337
5338 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
5339 kfree(hlist);
5340}
5341
5342static void swevent_hlist_release(struct swevent_htable *swhash) 4928static void swevent_hlist_release(struct swevent_htable *swhash)
5343{ 4929{
5344 struct swevent_hlist *hlist = swevent_hlist_deref(swhash); 4930 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
@@ -5347,7 +4933,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
5347 return; 4933 return;
5348 4934
5349 rcu_assign_pointer(swhash->swevent_hlist, NULL); 4935 rcu_assign_pointer(swhash->swevent_hlist, NULL);
5350 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4936 kfree_rcu(hlist, rcu_head);
5351} 4937}
5352 4938
5353static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 4939static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
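
The dedicated RCU callback whose only job was to kfree() the hlist is gone; kfree_rcu() only needs the name of the rcu_head member embedded in the structure being freed. The shape of the conversion, shown with a made-up struct:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {
            int data;
            struct rcu_head rcu_head;
    };

    /* Before: a callback that exists only to free the container. */
    static void foo_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct foo, rcu_head));
    }

    static void foo_release_old(struct foo *p)
    {
            call_rcu(&p->rcu_head, foo_free_rcu);
    }

    /* After: one line, the member name is all kfree_rcu() needs. */
    static void foo_release_new(struct foo *p)
    {
            kfree_rcu(p, rcu_head);
    }
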
@@ -5429,7 +5015,7 @@ fail:
5429 return err; 5015 return err;
5430} 5016}
5431 5017
5432atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5018struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5433 5019
5434static void sw_perf_event_destroy(struct perf_event *event) 5020static void sw_perf_event_destroy(struct perf_event *event)
5435{ 5021{
@@ -5532,7 +5118,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5532 5118
5533 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5119 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5534 if (perf_tp_event_match(event, &data, regs)) 5120 if (perf_tp_event_match(event, &data, regs))
5535 perf_swevent_event(event, count, 1, &data, regs); 5121 perf_swevent_event(event, count, &data, regs);
5536 } 5122 }
5537 5123
5538 perf_swevent_put_recursion_context(rctx); 5124 perf_swevent_put_recursion_context(rctx);
@@ -5625,7 +5211,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5625 perf_sample_data_init(&sample, bp->attr.bp_addr); 5211 perf_sample_data_init(&sample, bp->attr.bp_addr);
5626 5212
5627 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5213 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5628 perf_swevent_event(bp, 1, 1, &sample, regs); 5214 perf_swevent_event(bp, 1, &sample, regs);
5629} 5215}
5630#endif 5216#endif
5631 5217
@@ -5654,7 +5240,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5654 5240
5655 if (regs && !perf_exclude_event(event, regs)) { 5241 if (regs && !perf_exclude_event(event, regs)) {
5656 if (!(event->attr.exclude_idle && current->pid == 0)) 5242 if (!(event->attr.exclude_idle && current->pid == 0))
5657 if (perf_event_overflow(event, 0, &data, regs)) 5243 if (perf_event_overflow(event, &data, regs))
5658 ret = HRTIMER_NORESTART; 5244 ret = HRTIMER_NORESTART;
5659 } 5245 }
5660 5246
@@ -5994,6 +5580,7 @@ free_dev:
5994} 5580}
5995 5581
5996static struct lock_class_key cpuctx_mutex; 5582static struct lock_class_key cpuctx_mutex;
5583static struct lock_class_key cpuctx_lock;
5997 5584
5998int perf_pmu_register(struct pmu *pmu, char *name, int type) 5585int perf_pmu_register(struct pmu *pmu, char *name, int type)
5999{ 5586{
@@ -6044,6 +5631,7 @@ skip_type:
6044 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5631 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6045 __perf_event_init_context(&cpuctx->ctx); 5632 __perf_event_init_context(&cpuctx->ctx);
6046 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 5633 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5634 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6047 cpuctx->ctx.type = cpu_context; 5635 cpuctx->ctx.type = cpu_context;
6048 cpuctx->ctx.pmu = pmu; 5636 cpuctx->ctx.pmu = pmu;
6049 cpuctx->jiffies_interval = 1; 5637 cpuctx->jiffies_interval = 1;
@@ -6158,7 +5746,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6158 struct task_struct *task, 5746 struct task_struct *task,
6159 struct perf_event *group_leader, 5747 struct perf_event *group_leader,
6160 struct perf_event *parent_event, 5748 struct perf_event *parent_event,
6161 perf_overflow_handler_t overflow_handler) 5749 perf_overflow_handler_t overflow_handler,
5750 void *context)
6162{ 5751{
6163 struct pmu *pmu; 5752 struct pmu *pmu;
6164 struct perf_event *event; 5753 struct perf_event *event;
@@ -6216,10 +5805,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6216#endif 5805#endif
6217 } 5806 }
6218 5807
6219 if (!overflow_handler && parent_event) 5808 if (!overflow_handler && parent_event) {
6220 overflow_handler = parent_event->overflow_handler; 5809 overflow_handler = parent_event->overflow_handler;
5810 context = parent_event->overflow_handler_context;
5811 }
6221 5812
6222 event->overflow_handler = overflow_handler; 5813 event->overflow_handler = overflow_handler;
5814 event->overflow_handler_context = context;
6223 5815
6224 if (attr->disabled) 5816 if (attr->disabled)
6225 event->state = PERF_EVENT_STATE_OFF; 5817 event->state = PERF_EVENT_STATE_OFF;
@@ -6334,13 +5926,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6334 if (ret) 5926 if (ret)
6335 return -EFAULT; 5927 return -EFAULT;
6336 5928
6337 /*
6338 * If the type exists, the corresponding creation will verify
6339 * the attr->config.
6340 */
6341 if (attr->type >= PERF_TYPE_MAX)
6342 return -EINVAL;
6343
6344 if (attr->__reserved_1) 5929 if (attr->__reserved_1)
6345 return -EINVAL; 5930 return -EINVAL;
6346 5931
@@ -6362,7 +5947,7 @@ err_size:
6362static int 5947static int
6363perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5948perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6364{ 5949{
6365 struct perf_buffer *buffer = NULL, *old_buffer = NULL; 5950 struct ring_buffer *rb = NULL, *old_rb = NULL;
6366 int ret = -EINVAL; 5951 int ret = -EINVAL;
6367 5952
6368 if (!output_event) 5953 if (!output_event)
@@ -6379,7 +5964,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6379 goto out; 5964 goto out;
6380 5965
6381 /* 5966 /*
6382 * If it's not a per-cpu buffer, it must be the same task. 5967 * If it's not a per-cpu rb, it must be the same task.
6383 */ 5968 */
6384 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 5969 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6385 goto out; 5970 goto out;
@@ -6391,20 +5976,20 @@ set:
6391 goto unlock; 5976 goto unlock;
6392 5977
6393 if (output_event) { 5978 if (output_event) {
6394 /* get the buffer we want to redirect to */ 5979 /* get the rb we want to redirect to */
6395 buffer = perf_buffer_get(output_event); 5980 rb = ring_buffer_get(output_event);
6396 if (!buffer) 5981 if (!rb)
6397 goto unlock; 5982 goto unlock;
6398 } 5983 }
6399 5984
6400 old_buffer = event->buffer; 5985 old_rb = event->rb;
6401 rcu_assign_pointer(event->buffer, buffer); 5986 rcu_assign_pointer(event->rb, rb);
6402 ret = 0; 5987 ret = 0;
6403unlock: 5988unlock:
6404 mutex_unlock(&event->mmap_mutex); 5989 mutex_unlock(&event->mmap_mutex);
6405 5990
6406 if (old_buffer) 5991 if (old_rb)
6407 perf_buffer_put(old_buffer); 5992 ring_buffer_put(old_rb);
6408out: 5993out:
6409 return ret; 5994 return ret;
6410} 5995}
@@ -6486,7 +6071,8 @@ SYSCALL_DEFINE5(perf_event_open,
6486 } 6071 }
6487 } 6072 }
6488 6073
6489 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); 6074 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6075 NULL, NULL);
6490 if (IS_ERR(event)) { 6076 if (IS_ERR(event)) {
6491 err = PTR_ERR(event); 6077 err = PTR_ERR(event);
6492 goto err_task; 6078 goto err_task;
@@ -6671,7 +6257,8 @@ err_fd:
6671struct perf_event * 6257struct perf_event *
6672perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 6258perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6673 struct task_struct *task, 6259 struct task_struct *task,
6674 perf_overflow_handler_t overflow_handler) 6260 perf_overflow_handler_t overflow_handler,
6261 void *context)
6675{ 6262{
6676 struct perf_event_context *ctx; 6263 struct perf_event_context *ctx;
6677 struct perf_event *event; 6264 struct perf_event *event;
@@ -6681,7 +6268,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6681 * Get the target context (task or percpu): 6268 * Get the target context (task or percpu):
6682 */ 6269 */
6683 6270
6684 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); 6271 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
6272 overflow_handler, context);
6685 if (IS_ERR(event)) { 6273 if (IS_ERR(event)) {
6686 err = PTR_ERR(event); 6274 err = PTR_ERR(event);
6687 goto err; 6275 goto err;
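
perf_event_create_kernel_counter() now takes the overflow handler and an opaque context pointer together, and the handler itself has lost its nmi argument. A hypothetical in-kernel caller (struct and names invented) would look like:

    #include <linux/perf_event.h>

    struct my_state {                       /* whatever the caller wants handed back */
            u64 hits;
    };

    static void my_overflow(struct perf_event *event,
                            struct perf_sample_data *data, struct pt_regs *regs)
    {
            struct my_state *st = event->overflow_handler_context;

            st->hits++;                     /* can run from NMI: bookkeeping only */
    }

    static struct perf_event *start_counter(struct perf_event_attr *attr, int cpu,
                                            struct my_state *st)
    {
            /* task == NULL: count on this CPU for all tasks; tear down later
             * with perf_event_release_kernel(). */
            return perf_event_create_kernel_counter(attr, cpu, NULL, my_overflow, st);
    }
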
@@ -6788,7 +6376,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6788 * our context. 6376 * our context.
6789 */ 6377 */
6790 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 6378 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6791 task_ctx_sched_out(child_ctx, EVENT_ALL);
6792 6379
6793 /* 6380 /*
6794 * Take the context lock here so that if find_get_context is 6381 * Take the context lock here so that if find_get_context is
@@ -6796,6 +6383,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6796 * incremented the context's refcount before we do put_ctx below. 6383 * incremented the context's refcount before we do put_ctx below.
6797 */ 6384 */
6798 raw_spin_lock(&child_ctx->lock); 6385 raw_spin_lock(&child_ctx->lock);
6386 task_ctx_sched_out(child_ctx);
6799 child->perf_event_ctxp[ctxn] = NULL; 6387 child->perf_event_ctxp[ctxn] = NULL;
6800 /* 6388 /*
6801 * If this context is a clone; unclone it so it can't get 6389 * If this context is a clone; unclone it so it can't get
@@ -6965,7 +6553,7 @@ inherit_event(struct perf_event *parent_event,
6965 parent_event->cpu, 6553 parent_event->cpu,
6966 child, 6554 child,
6967 group_leader, parent_event, 6555 group_leader, parent_event,
6968 NULL); 6556 NULL, NULL);
6969 if (IS_ERR(child_event)) 6557 if (IS_ERR(child_event))
6970 return child_event; 6558 return child_event;
6971 get_ctx(child_ctx); 6559 get_ctx(child_ctx);
@@ -6992,6 +6580,8 @@ inherit_event(struct perf_event *parent_event,
6992 6580
6993 child_event->ctx = child_ctx; 6581 child_event->ctx = child_ctx;
6994 child_event->overflow_handler = parent_event->overflow_handler; 6582 child_event->overflow_handler = parent_event->overflow_handler;
6583 child_event->overflow_handler_context
6584 = parent_event->overflow_handler_context;
6995 6585
6996 /* 6586 /*
6997 * Precalculate sample_data sizes 6587 * Precalculate sample_data sizes
@@ -7410,26 +7000,12 @@ static int __perf_cgroup_move(void *info)
7410 return 0; 7000 return 0;
7411} 7001}
7412 7002
7413static void perf_cgroup_move(struct task_struct *task) 7003static void
7004perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
7414{ 7005{
7415 task_function_call(task, __perf_cgroup_move, task); 7006 task_function_call(task, __perf_cgroup_move, task);
7416} 7007}
7417 7008
7418static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7419 struct cgroup *old_cgrp, struct task_struct *task,
7420 bool threadgroup)
7421{
7422 perf_cgroup_move(task);
7423 if (threadgroup) {
7424 struct task_struct *c;
7425 rcu_read_lock();
7426 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7427 perf_cgroup_move(c);
7428 }
7429 rcu_read_unlock();
7430 }
7431}
7432
7433static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7009static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7434 struct cgroup *old_cgrp, struct task_struct *task) 7010 struct cgroup *old_cgrp, struct task_struct *task)
7435{ 7011{
@@ -7441,15 +7017,15 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7441 if (!(task->flags & PF_EXITING)) 7017 if (!(task->flags & PF_EXITING))
7442 return; 7018 return;
7443 7019
7444 perf_cgroup_move(task); 7020 perf_cgroup_attach_task(cgrp, task);
7445} 7021}
7446 7022
7447struct cgroup_subsys perf_subsys = { 7023struct cgroup_subsys perf_subsys = {
7448 .name = "perf_event", 7024 .name = "perf_event",
7449 .subsys_id = perf_subsys_id, 7025 .subsys_id = perf_subsys_id,
7450 .create = perf_cgroup_create, 7026 .create = perf_cgroup_create,
7451 .destroy = perf_cgroup_destroy, 7027 .destroy = perf_cgroup_destroy,
7452 .exit = perf_cgroup_exit, 7028 .exit = perf_cgroup_exit,
7453 .attach = perf_cgroup_attach, 7029 .attach_task = perf_cgroup_attach_task,
7454}; 7030};
7455#endif /* CONFIG_CGROUP_PERF */ 7031#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..b7971d6f38bf 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
431struct perf_event * 431struct perf_event *
432register_user_hw_breakpoint(struct perf_event_attr *attr, 432register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 void *context,
434 struct task_struct *tsk) 435 struct task_struct *tsk)
435{ 436{
436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered); 437 return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
438 context);
437} 439}
438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 440EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
439 441
@@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
502 */ 504 */
503struct perf_event * __percpu * 505struct perf_event * __percpu *
504register_wide_hw_breakpoint(struct perf_event_attr *attr, 506register_wide_hw_breakpoint(struct perf_event_attr *attr,
505 perf_overflow_handler_t triggered) 507 perf_overflow_handler_t triggered,
508 void *context)
506{ 509{
507 struct perf_event * __percpu *cpu_events, **pevent, *bp; 510 struct perf_event * __percpu *cpu_events, **pevent, *bp;
508 long err; 511 long err;
@@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
515 get_online_cpus(); 518 get_online_cpus();
516 for_each_online_cpu(cpu) { 519 for_each_online_cpu(cpu) {
517 pevent = per_cpu_ptr(cpu_events, cpu); 520 pevent = per_cpu_ptr(cpu_events, cpu);
518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); 521 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
522 triggered, context);
519 523
520 *pevent = bp; 524 *pevent = bp;
521 525
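
Both breakpoint helpers grow the same context parameter and simply forward it to perf_event_create_kernel_counter(). A module-style sketch of the wide variant, loosely following samples/hw_breakpoint (the watched symbol, handler and cookie are illustrative):

    #include <linux/err.h>
    #include <linux/hw_breakpoint.h>
    #include <linux/init.h>
    #include <linux/kallsyms.h>
    #include <linux/kernel.h>
    #include <linux/perf_event.h>

    static struct perf_event * __percpu *wp;
    static int cookie;                              /* demo context payload */

    static void wp_handler(struct perf_event *bp,
                           struct perf_sample_data *data, struct pt_regs *regs)
    {
            pr_info("write hit, context=%p\n", bp->overflow_handler_context);
    }

    static int __init wp_init(void)
    {
            struct perf_event_attr attr;

            hw_breakpoint_init(&attr);
            attr.bp_addr = kallsyms_lookup_name("jiffies");
            attr.bp_len = HW_BREAKPOINT_LEN_4;
            attr.bp_type = HW_BREAKPOINT_W;

            wp = register_wide_hw_breakpoint(&attr, wp_handler, &cookie);
            return IS_ERR((void __force *)wp) ? PTR_ERR((void __force *)wp) : 0;
    }
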
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 000000000000..09097dd8116c
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,96 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H
3
4#define RING_BUFFER_WRITABLE 0x01
5
6struct ring_buffer {
7 atomic_t refcount;
8 struct rcu_head rcu_head;
9#ifdef CONFIG_PERF_USE_VMALLOC
10 struct work_struct work;
11 int page_order; /* allocation order */
12#endif
13 int nr_pages; /* nr of data pages */
14 int writable; /* are we writable */
15
16 atomic_t poll; /* POLL_ for wakeups */
17
18 local_t head; /* write position */
19 local_t nest; /* nested writers */
20 local_t events; /* event limit */
21 local_t wakeup; /* wakeup stamp */
22 local_t lost; /* nr records lost */
23
24 long watermark; /* wakeup watermark */
25
26 struct perf_event_mmap_page *user_page;
27 void *data_pages[0];
28};
29
30extern void rb_free(struct ring_buffer *rb);
31extern struct ring_buffer *
32rb_alloc(int nr_pages, long watermark, int cpu, int flags);
33extern void perf_event_wakeup(struct perf_event *event);
34
35extern void
36perf_event_header__init_id(struct perf_event_header *header,
37 struct perf_sample_data *data,
38 struct perf_event *event);
39extern void
40perf_event__output_id_sample(struct perf_event *event,
41 struct perf_output_handle *handle,
42 struct perf_sample_data *sample);
43
44extern struct page *
45perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
46
47#ifdef CONFIG_PERF_USE_VMALLOC
48/*
49 * Back perf_mmap() with vmalloc memory.
50 *
51 * Required for architectures that have d-cache aliasing issues.
52 */
53
54static inline int page_order(struct ring_buffer *rb)
55{
56 return rb->page_order;
57}
58
59#else
60
61static inline int page_order(struct ring_buffer *rb)
62{
63 return 0;
64}
65#endif
66
67static unsigned long perf_data_size(struct ring_buffer *rb)
68{
69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
70}
71
72static inline void
73__output_copy(struct perf_output_handle *handle,
74 const void *buf, unsigned int len)
75{
76 do {
77 unsigned long size = min_t(unsigned long, handle->size, len);
78
79 memcpy(handle->addr, buf, size);
80
81 len -= size;
82 handle->addr += size;
83 buf += size;
84 handle->size -= size;
85 if (!handle->size) {
86 struct ring_buffer *rb = handle->rb;
87
88 handle->page++;
89 handle->page &= rb->nr_pages - 1;
90 handle->addr = rb->data_pages[handle->page];
91 handle->size = PAGE_SIZE << page_order(rb);
92 }
93 } while (len);
94}
95
96#endif /* _KERNEL_EVENTS_INTERNAL_H */
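
The two mmap backends agree on the arithmetic in perf_data_size(): the regular allocator keeps one entry per data page, the vmalloc backend records a single virtually contiguous block and encodes its size in page_order, and __output_copy() masking handle->page with nr_pages - 1 works for both. For example (4 KiB pages):

    /* Same 32 KiB of data, two representations:
     *   page backend:    nr_pages = 8, page_order = 0  ->  8 << (12 + 0) = 32768
     *   vmalloc backend: nr_pages = 1, page_order = 3  ->  1 << (12 + 3) = 32768
     */
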
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 000000000000..a2a29205cc0f
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,380 @@
1/*
2 * Performance events ring-buffer code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15
16#include "internal.h"
17
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long mask;
22
23 if (!rb->writable)
24 return true;
25
26 mask = perf_data_size(rb) - 1;
27
28 offset = (offset - tail) & mask;
29 head = (head - tail) & mask;
30
31 if ((int)(head - offset) < 0)
32 return false;
33
34 return true;
35}
36
37static void perf_output_wakeup(struct perf_output_handle *handle)
38{
39 atomic_set(&handle->rb->poll, POLL_IN);
40
41 handle->event->pending_wakeup = 1;
42 irq_work_queue(&handle->event->pending);
43}
44
45/*
46 * We need to ensure a later event_id doesn't publish a head when a former
47 * event isn't done writing. However since we need to deal with NMIs we
48 * cannot fully serialize things.
49 *
50 * We only publish the head (and generate a wakeup) when the outer-most
51 * event completes.
52 */
53static void perf_output_get_handle(struct perf_output_handle *handle)
54{
55 struct ring_buffer *rb = handle->rb;
56
57 preempt_disable();
58 local_inc(&rb->nest);
59 handle->wakeup = local_read(&rb->wakeup);
60}
61
62static void perf_output_put_handle(struct perf_output_handle *handle)
63{
64 struct ring_buffer *rb = handle->rb;
65 unsigned long head;
66
67again:
68 head = local_read(&rb->head);
69
70 /*
71 * IRQ/NMI can happen here, which means we can miss a head update.
72 */
73
74 if (!local_dec_and_test(&rb->nest))
75 goto out;
76
77 /*
78 * Publish the known good head. Rely on the full barrier implied
 79 * by atomic_dec_and_test() to order the rb->head read and this
80 * write.
81 */
82 rb->user_page->data_head = head;
83
84 /*
 85 * Now check if we missed an update; rely on the (compiler)
86 * barrier in atomic_dec_and_test() to re-read rb->head.
87 */
88 if (unlikely(head != local_read(&rb->head))) {
89 local_inc(&rb->nest);
90 goto again;
91 }
92
93 if (handle->wakeup != local_read(&rb->wakeup))
94 perf_output_wakeup(handle);
95
96out:
97 preempt_enable();
98}
99
100int perf_output_begin(struct perf_output_handle *handle,
101 struct perf_event *event, unsigned int size)
102{
103 struct ring_buffer *rb;
104 unsigned long tail, offset, head;
105 int have_lost;
106 struct perf_sample_data sample_data;
107 struct {
108 struct perf_event_header header;
109 u64 id;
110 u64 lost;
111 } lost_event;
112
113 rcu_read_lock();
114 /*
115 * For inherited events we send all the output towards the parent.
116 */
117 if (event->parent)
118 event = event->parent;
119
120 rb = rcu_dereference(event->rb);
121 if (!rb)
122 goto out;
123
124 handle->rb = rb;
125 handle->event = event;
126
127 if (!rb->nr_pages)
128 goto out;
129
130 have_lost = local_read(&rb->lost);
131 if (have_lost) {
132 lost_event.header.size = sizeof(lost_event);
133 perf_event_header__init_id(&lost_event.header, &sample_data,
134 event);
135 size += lost_event.header.size;
136 }
137
138 perf_output_get_handle(handle);
139
140 do {
141 /*
142 * Userspace could choose to issue a mb() before updating the
 143 * tail pointer, so that all reads will be completed before the
144 * write is issued.
145 */
146 tail = ACCESS_ONCE(rb->user_page->data_tail);
147 smp_rmb();
148 offset = head = local_read(&rb->head);
149 head += size;
150 if (unlikely(!perf_output_space(rb, tail, offset, head)))
151 goto fail;
152 } while (local_cmpxchg(&rb->head, offset, head) != offset);
153
154 if (head - local_read(&rb->wakeup) > rb->watermark)
155 local_add(rb->watermark, &rb->wakeup);
156
157 handle->page = offset >> (PAGE_SHIFT + page_order(rb));
158 handle->page &= rb->nr_pages - 1;
159 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
160 handle->addr = rb->data_pages[handle->page];
161 handle->addr += handle->size;
162 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
163
164 if (have_lost) {
165 lost_event.header.type = PERF_RECORD_LOST;
166 lost_event.header.misc = 0;
167 lost_event.id = event->id;
168 lost_event.lost = local_xchg(&rb->lost, 0);
169
170 perf_output_put(handle, lost_event);
171 perf_event__output_id_sample(event, handle, &sample_data);
172 }
173
174 return 0;
175
176fail:
177 local_inc(&rb->lost);
178 perf_output_put_handle(handle);
179out:
180 rcu_read_unlock();
181
182 return -ENOSPC;
183}
184
185void perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len)
187{
188 __output_copy(handle, buf, len);
189}
190
191void perf_output_end(struct perf_output_handle *handle)
192{
193 perf_output_put_handle(handle);
194 rcu_read_unlock();
195}
196
197static void
198ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
199{
200 long max_size = perf_data_size(rb);
201
202 if (watermark)
203 rb->watermark = min(max_size, watermark);
204
205 if (!rb->watermark)
206 rb->watermark = max_size / 2;
207
208 if (flags & RING_BUFFER_WRITABLE)
209 rb->writable = 1;
210
211 atomic_set(&rb->refcount, 1);
212}
213
214#ifndef CONFIG_PERF_USE_VMALLOC
215
216/*
217 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
218 */
219
220struct page *
221perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
222{
223 if (pgoff > rb->nr_pages)
224 return NULL;
225
226 if (pgoff == 0)
227 return virt_to_page(rb->user_page);
228
229 return virt_to_page(rb->data_pages[pgoff - 1]);
230}
231
232static void *perf_mmap_alloc_page(int cpu)
233{
234 struct page *page;
235 int node;
236
237 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
238 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
239 if (!page)
240 return NULL;
241
242 return page_address(page);
243}
244
245struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
246{
247 struct ring_buffer *rb;
248 unsigned long size;
249 int i;
250
251 size = sizeof(struct ring_buffer);
252 size += nr_pages * sizeof(void *);
253
254 rb = kzalloc(size, GFP_KERNEL);
255 if (!rb)
256 goto fail;
257
258 rb->user_page = perf_mmap_alloc_page(cpu);
259 if (!rb->user_page)
260 goto fail_user_page;
261
262 for (i = 0; i < nr_pages; i++) {
263 rb->data_pages[i] = perf_mmap_alloc_page(cpu);
264 if (!rb->data_pages[i])
265 goto fail_data_pages;
266 }
267
268 rb->nr_pages = nr_pages;
269
270 ring_buffer_init(rb, watermark, flags);
271
272 return rb;
273
274fail_data_pages:
275 for (i--; i >= 0; i--)
276 free_page((unsigned long)rb->data_pages[i]);
277
278 free_page((unsigned long)rb->user_page);
279
280fail_user_page:
281 kfree(rb);
282
283fail:
284 return NULL;
285}
286
287static void perf_mmap_free_page(unsigned long addr)
288{
289 struct page *page = virt_to_page((void *)addr);
290
291 page->mapping = NULL;
292 __free_page(page);
293}
294
295void rb_free(struct ring_buffer *rb)
296{
297 int i;
298
299 perf_mmap_free_page((unsigned long)rb->user_page);
300 for (i = 0; i < rb->nr_pages; i++)
301 perf_mmap_free_page((unsigned long)rb->data_pages[i]);
302 kfree(rb);
303}
304
305#else
306
307struct page *
308perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
309{
310 if (pgoff > (1UL << page_order(rb)))
311 return NULL;
312
313 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
314}
315
316static void perf_mmap_unmark_page(void *addr)
317{
318 struct page *page = vmalloc_to_page(addr);
319
320 page->mapping = NULL;
321}
322
323static void rb_free_work(struct work_struct *work)
324{
325 struct ring_buffer *rb;
326 void *base;
327 int i, nr;
328
329 rb = container_of(work, struct ring_buffer, work);
330 nr = 1 << page_order(rb);
331
332 base = rb->user_page;
333 for (i = 0; i < nr + 1; i++)
334 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
335
336 vfree(base);
337 kfree(rb);
338}
339
340void rb_free(struct ring_buffer *rb)
341{
342 schedule_work(&rb->work);
343}
344
345struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
346{
347 struct ring_buffer *rb;
348 unsigned long size;
349 void *all_buf;
350
351 size = sizeof(struct ring_buffer);
352 size += sizeof(void *);
353
354 rb = kzalloc(size, GFP_KERNEL);
355 if (!rb)
356 goto fail;
357
358 INIT_WORK(&rb->work, rb_free_work);
359
360 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
361 if (!all_buf)
362 goto fail_all_buf;
363
364 rb->user_page = all_buf;
365 rb->data_pages[0] = all_buf + PAGE_SIZE;
366 rb->page_order = ilog2(nr_pages);
367 rb->nr_pages = 1;
368
369 ring_buffer_init(rb, watermark, flags);
370
371 return rb;
372
373fail_all_buf:
374 kfree(rb);
375
376fail:
377 return NULL;
378}
379
380#endif
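Taken together, the new file gives record writers a three-step API: reserve space with perf_output_begin(), copy the payload with perf_output_put()/perf_output_copy(), and publish it with perf_output_end(). A minimal sketch of a hypothetical caller follows; the function name and the record layout are assumptions made for illustration, not something added by this patch.

static int my_event_output(struct perf_event *event, u64 value)
{
	struct perf_output_handle handle;
	struct {
		struct perf_event_header header;
		u64 value;
	} rec;
	int ret;

	rec.header.type = PERF_RECORD_SAMPLE;	/* illustrative record type */
	rec.header.misc = 0;
	rec.header.size = sizeof(rec);
	rec.value = value;

	/* Reserves rec.header.size bytes; returns -ENOSPC when the buffer is full. */
	ret = perf_output_begin(&handle, event, rec.header.size);
	if (ret)
		return ret;

	perf_output_put(&handle, rec);	/* __output_copy() of sizeof(rec) bytes */
	perf_output_end(&handle);	/* publishes data_head, may wake readers */
	return 0;
}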
diff --git a/kernel/exit.c b/kernel/exit.c
index 8dd874181542..2913b3509d42 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
85 struct tty_struct *uninitialized_var(tty); 85 struct tty_struct *uninitialized_var(tty);
86 86
87 sighand = rcu_dereference_check(tsk->sighand, 87 sighand = rcu_dereference_check(tsk->sighand,
88 rcu_read_lock_held() ||
89 lockdep_tasklist_lock_is_held()); 88 lockdep_tasklist_lock_is_held());
90 spin_lock(&sighand->siglock); 89 spin_lock(&sighand->siglock);
91 90
@@ -169,7 +168,6 @@ void release_task(struct task_struct * p)
169 struct task_struct *leader; 168 struct task_struct *leader;
170 int zap_leader; 169 int zap_leader;
171repeat: 170repeat:
172 tracehook_prepare_release_task(p);
173 /* don't need to get the RCU readlock here - the process is dead and 171 /* don't need to get the RCU readlock here - the process is dead and
174 * can't be modifying its own credentials. But shut RCU-lockdep up */ 172 * can't be modifying its own credentials. But shut RCU-lockdep up */
175 rcu_read_lock(); 173 rcu_read_lock();
@@ -179,7 +177,7 @@ repeat:
179 proc_flush_task(p); 177 proc_flush_task(p);
180 178
181 write_lock_irq(&tasklist_lock); 179 write_lock_irq(&tasklist_lock);
182 tracehook_finish_release_task(p); 180 ptrace_release_task(p);
183 __exit_signal(p); 181 __exit_signal(p);
184 182
185 /* 183 /*
@@ -190,22 +188,12 @@ repeat:
190 zap_leader = 0; 188 zap_leader = 0;
191 leader = p->group_leader; 189 leader = p->group_leader;
192 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 190 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
193 BUG_ON(task_detached(leader));
194 do_notify_parent(leader, leader->exit_signal);
195 /* 191 /*
196 * If we were the last child thread and the leader has 192 * If we were the last child thread and the leader has
197 * exited already, and the leader's parent ignores SIGCHLD, 193 * exited already, and the leader's parent ignores SIGCHLD,
198 * then we are the one who should release the leader. 194 * then we are the one who should release the leader.
199 *
200 * do_notify_parent() will have marked it self-reaping in
201 * that case.
202 */
203 zap_leader = task_detached(leader);
204
205 /*
206 * This maintains the invariant that release_task()
207 * only runs on a task in EXIT_DEAD, just for sanity.
208 */ 195 */
196 zap_leader = do_notify_parent(leader, leader->exit_signal);
209 if (zap_leader) 197 if (zap_leader)
210 leader->exit_state = EXIT_DEAD; 198 leader->exit_state = EXIT_DEAD;
211 } 199 }
@@ -277,18 +265,16 @@ int is_current_pgrp_orphaned(void)
277 return retval; 265 return retval;
278} 266}
279 267
280static int has_stopped_jobs(struct pid *pgrp) 268static bool has_stopped_jobs(struct pid *pgrp)
281{ 269{
282 int retval = 0;
283 struct task_struct *p; 270 struct task_struct *p;
284 271
285 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 272 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
286 if (!task_is_stopped(p)) 273 if (p->signal->flags & SIGNAL_STOP_STOPPED)
287 continue; 274 return true;
288 retval = 1;
289 break;
290 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 275 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
291 return retval; 276
277 return false;
292} 278}
293 279
294/* 280/*
@@ -561,29 +547,28 @@ void exit_files(struct task_struct *tsk)
561 547
562#ifdef CONFIG_MM_OWNER 548#ifdef CONFIG_MM_OWNER
563/* 549/*
564 * Task p is exiting and it owned mm, lets find a new owner for it 550 * A task is exiting. If it owned this mm, find a new owner for the mm.
565 */ 551 */
566static inline int
567mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
568{
569 /*
570 * If there are other users of the mm and the owner (us) is exiting
571 * we need to find a new owner to take on the responsibility.
572 */
573 if (atomic_read(&mm->mm_users) <= 1)
574 return 0;
575 if (mm->owner != p)
576 return 0;
577 return 1;
578}
579
580void mm_update_next_owner(struct mm_struct *mm) 552void mm_update_next_owner(struct mm_struct *mm)
581{ 553{
582 struct task_struct *c, *g, *p = current; 554 struct task_struct *c, *g, *p = current;
583 555
584retry: 556retry:
585 if (!mm_need_new_owner(mm, p)) 557 /*
558 * If the exiting or execing task is not the owner, it's
559 * someone else's problem.
560 */
561 if (mm->owner != p)
586 return; 562 return;
563 /*
564 * The current owner is exiting/execing and there are no other
565 * candidates. Do not leave the mm pointing to a possibly
566 * freed task structure.
567 */
568 if (atomic_read(&mm->mm_users) <= 1) {
569 mm->owner = NULL;
570 return;
571 }
587 572
588 read_lock(&tasklist_lock); 573 read_lock(&tasklist_lock);
589 /* 574 /*
@@ -752,7 +737,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
752{ 737{
753 list_move_tail(&p->sibling, &p->real_parent->children); 738 list_move_tail(&p->sibling, &p->real_parent->children);
754 739
755 if (task_detached(p)) 740 if (p->exit_state == EXIT_DEAD)
756 return; 741 return;
757 /* 742 /*
758 * If this is a threaded reparent there is no need to 743 * If this is a threaded reparent there is no need to
@@ -765,10 +750,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
765 p->exit_signal = SIGCHLD; 750 p->exit_signal = SIGCHLD;
766 751
767 /* If it has exited notify the new parent about this child's death. */ 752 /* If it has exited notify the new parent about this child's death. */
768 if (!task_ptrace(p) && 753 if (!p->ptrace &&
769 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 754 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
770 do_notify_parent(p, p->exit_signal); 755 if (do_notify_parent(p, p->exit_signal)) {
771 if (task_detached(p)) {
772 p->exit_state = EXIT_DEAD; 756 p->exit_state = EXIT_DEAD;
773 list_move_tail(&p->sibling, dead); 757 list_move_tail(&p->sibling, dead);
774 } 758 }
@@ -795,7 +779,7 @@ static void forget_original_parent(struct task_struct *father)
795 do { 779 do {
796 t->real_parent = reaper; 780 t->real_parent = reaper;
797 if (t->parent == father) { 781 if (t->parent == father) {
798 BUG_ON(task_ptrace(t)); 782 BUG_ON(t->ptrace);
799 t->parent = t->real_parent; 783 t->parent = t->real_parent;
800 } 784 }
801 if (t->pdeath_signal) 785 if (t->pdeath_signal)
@@ -820,8 +804,7 @@ static void forget_original_parent(struct task_struct *father)
820 */ 804 */
821static void exit_notify(struct task_struct *tsk, int group_dead) 805static void exit_notify(struct task_struct *tsk, int group_dead)
822{ 806{
823 int signal; 807 bool autoreap;
824 void *cookie;
825 808
826 /* 809 /*
827 * This does two things: 810 * This does two things:
@@ -852,26 +835,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
852 * we have changed execution domain as these two values started 835 * we have changed execution domain as these two values started
853 * the same after a fork. 836 * the same after a fork.
854 */ 837 */
855 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && 838 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
856 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 839 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
857 tsk->self_exec_id != tsk->parent_exec_id)) 840 tsk->self_exec_id != tsk->parent_exec_id))
858 tsk->exit_signal = SIGCHLD; 841 tsk->exit_signal = SIGCHLD;
859 842
860 signal = tracehook_notify_death(tsk, &cookie, group_dead); 843 if (unlikely(tsk->ptrace)) {
861 if (signal >= 0) 844 int sig = thread_group_leader(tsk) &&
862 signal = do_notify_parent(tsk, signal); 845 thread_group_empty(tsk) &&
846 !ptrace_reparented(tsk) ?
847 tsk->exit_signal : SIGCHLD;
848 autoreap = do_notify_parent(tsk, sig);
849 } else if (thread_group_leader(tsk)) {
850 autoreap = thread_group_empty(tsk) &&
851 do_notify_parent(tsk, tsk->exit_signal);
852 } else {
853 autoreap = true;
854 }
863 855
864 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 856 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
865 857
866 /* mt-exec, de_thread() is waiting for group leader */ 858 /* mt-exec, de_thread() is waiting for group leader */
867 if (unlikely(tsk->signal->notify_count < 0)) 859 if (unlikely(tsk->signal->notify_count < 0))
868 wake_up_process(tsk->signal->group_exit_task); 860 wake_up_process(tsk->signal->group_exit_task);
869 write_unlock_irq(&tasklist_lock); 861 write_unlock_irq(&tasklist_lock);
870 862
871 tracehook_report_death(tsk, signal, cookie, group_dead);
872
873 /* If the process is dead, release it - nobody will wait for it */ 863 /* If the process is dead, release it - nobody will wait for it */
874 if (signal == DEATH_REAP) 864 if (autoreap)
875 release_task(tsk); 865 release_task(tsk);
876} 866}
877 867
@@ -907,7 +897,6 @@ NORET_TYPE void do_exit(long code)
907 897
908 profile_task_exit(tsk); 898 profile_task_exit(tsk);
909 899
910 WARN_ON(atomic_read(&tsk->fs_excl));
911 WARN_ON(blk_needs_flush_plug(tsk)); 900 WARN_ON(blk_needs_flush_plug(tsk));
912 901
913 if (unlikely(in_interrupt())) 902 if (unlikely(in_interrupt()))
@@ -924,7 +913,7 @@ NORET_TYPE void do_exit(long code)
924 */ 913 */
925 set_fs(USER_DS); 914 set_fs(USER_DS);
926 915
927 tracehook_report_exit(&code); 916 ptrace_event(PTRACE_EVENT_EXIT, code);
928 917
929 validate_creds_for_do_exit(tsk); 918 validate_creds_for_do_exit(tsk);
930 919
@@ -991,6 +980,7 @@ NORET_TYPE void do_exit(long code)
991 trace_sched_process_exit(tsk); 980 trace_sched_process_exit(tsk);
992 981
993 exit_sem(tsk); 982 exit_sem(tsk);
983 exit_shm(tsk);
994 exit_files(tsk); 984 exit_files(tsk);
995 exit_fs(tsk); 985 exit_fs(tsk);
996 check_stack_usage(); 986 check_stack_usage();
@@ -1236,9 +1226,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1236 traced = ptrace_reparented(p); 1226 traced = ptrace_reparented(p);
1237 /* 1227 /*
1238 * It can be ptraced but not reparented, check 1228 * It can be ptraced but not reparented, check
1239 * !task_detached() to filter out sub-threads. 1229 * thread_group_leader() to filter out sub-threads.
1240 */ 1230 */
1241 if (likely(!traced) && likely(!task_detached(p))) { 1231 if (likely(!traced) && thread_group_leader(p)) {
1242 struct signal_struct *psig; 1232 struct signal_struct *psig;
1243 struct signal_struct *sig; 1233 struct signal_struct *sig;
1244 unsigned long maxrss; 1234 unsigned long maxrss;
@@ -1346,16 +1336,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1346 /* We dropped tasklist, ptracer could die and untrace */ 1336 /* We dropped tasklist, ptracer could die and untrace */
1347 ptrace_unlink(p); 1337 ptrace_unlink(p);
1348 /* 1338 /*
1349 * If this is not a detached task, notify the parent. 1339 * If this is not a sub-thread, notify the parent.
1350 * If it's still not detached after that, don't release 1340 * If parent wants a zombie, don't release it now.
1351 * it now.
1352 */ 1341 */
1353 if (!task_detached(p)) { 1342 if (thread_group_leader(p) &&
1354 do_notify_parent(p, p->exit_signal); 1343 !do_notify_parent(p, p->exit_signal)) {
1355 if (!task_detached(p)) { 1344 p->exit_state = EXIT_ZOMBIE;
1356 p->exit_state = EXIT_ZOMBIE; 1345 p = NULL;
1357 p = NULL;
1358 }
1359 } 1346 }
1360 write_unlock_irq(&tasklist_lock); 1347 write_unlock_irq(&tasklist_lock);
1361 } 1348 }
@@ -1368,7 +1355,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1368static int *task_stopped_code(struct task_struct *p, bool ptrace) 1355static int *task_stopped_code(struct task_struct *p, bool ptrace)
1369{ 1356{
1370 if (ptrace) { 1357 if (ptrace) {
1371 if (task_is_stopped_or_traced(p)) 1358 if (task_is_stopped_or_traced(p) &&
1359 !(p->jobctl & JOBCTL_LISTENING))
1372 return &p->exit_code; 1360 return &p->exit_code;
1373 } else { 1361 } else {
1374 if (p->signal->flags & SIGNAL_STOP_STOPPED) 1362 if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1377,11 +1365,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1377 return NULL; 1365 return NULL;
1378} 1366}
1379 1367
1380/* 1368/**
1381 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold 1369 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1382 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1370 * @wo: wait options
1383 * the lock and this task is uninteresting. If we return nonzero, we have 1371 * @ptrace: is the wait for ptrace
1384 * released the lock and the system call should return. 1372 * @p: task to wait for
1373 *
1374 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1375 *
1376 * CONTEXT:
1377 * read_lock(&tasklist_lock), which is released if return value is
1378 * non-zero. Also, grabs and releases @p->sighand->siglock.
1379 *
1380 * RETURNS:
1381 * 0 if wait condition didn't exist and search for other wait conditions
1382 * should continue. Non-zero return, -errno on failure and @p's pid on
1383 * success, implies that tasklist_lock is released and wait condition
1384 * search should terminate.
1385 */ 1385 */
1386static int wait_task_stopped(struct wait_opts *wo, 1386static int wait_task_stopped(struct wait_opts *wo,
1387 int ptrace, struct task_struct *p) 1387 int ptrace, struct task_struct *p)
@@ -1397,6 +1397,9 @@ static int wait_task_stopped(struct wait_opts *wo,
1397 if (!ptrace && !(wo->wo_flags & WUNTRACED)) 1397 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1398 return 0; 1398 return 0;
1399 1399
1400 if (!task_stopped_code(p, ptrace))
1401 return 0;
1402
1400 exit_code = 0; 1403 exit_code = 0;
1401 spin_lock_irq(&p->sighand->siglock); 1404 spin_lock_irq(&p->sighand->siglock);
1402 1405
@@ -1538,33 +1541,83 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1538 return 0; 1541 return 0;
1539 } 1542 }
1540 1543
1541 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1544 /* dead body doesn't have much to contribute */
1545 if (p->exit_state == EXIT_DEAD)
1546 return 0;
1547
1548 /* slay zombie? */
1549 if (p->exit_state == EXIT_ZOMBIE) {
1550 /*
1551 * A zombie ptracee is only visible to its ptracer.
1552 * Notification and reaping will be cascaded to the real
1553 * parent when the ptracer detaches.
1554 */
1555 if (likely(!ptrace) && unlikely(p->ptrace)) {
1556 /* it will become visible, clear notask_error */
1557 wo->notask_error = 0;
1558 return 0;
1559 }
1560
1561 /* we don't reap group leaders with subthreads */
1562 if (!delay_group_leader(p))
1563 return wait_task_zombie(wo, p);
1564
1565 /*
1566 * Allow access to stopped/continued state via zombie by
1567 * falling through. Clearing of notask_error is complex.
1568 *
1569 * When !@ptrace:
1570 *
1571 * If WEXITED is set, notask_error should naturally be
1572 * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
1573 * so, if there are live subthreads, there are events to
1574 * wait for. If all subthreads are dead, it's still safe
 1575 * to clear - this function will be called again in a finite
 1576 * amount of time once all the subthreads are released and
1577 * will then return without clearing.
1578 *
1579 * When @ptrace:
1580 *
1581 * Stopped state is per-task and thus can't change once the
1582 * target task dies. Only continued and exited can happen.
1583 * Clear notask_error if WCONTINUED | WEXITED.
1584 */
1585 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1586 wo->notask_error = 0;
1587 } else {
1588 /*
1589 * If @p is ptraced by a task in its real parent's group,
1590 * hide group stop/continued state when looking at @p as
1591 * the real parent; otherwise, a single stop can be
1592 * reported twice as group and ptrace stops.
1593 *
1594 * If a ptracer wants to distinguish the two events for its
1595 * own children, it should create a separate process which
1596 * takes the role of real parent.
1597 */
1598 if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
1599 return 0;
1600
1542 /* 1601 /*
1543 * This child is hidden by ptrace. 1602 * @p is alive and it's gonna stop, continue or exit, so
1544 * We aren't allowed to see it now, but eventually we will. 1603 * there always is something to wait for.
1545 */ 1604 */
1546 wo->notask_error = 0; 1605 wo->notask_error = 0;
1547 return 0;
1548 } 1606 }
1549 1607
1550 if (p->exit_state == EXIT_DEAD)
1551 return 0;
1552
1553 /* 1608 /*
1554 * We don't reap group leaders with subthreads. 1609 * Wait for stopped. Depending on @ptrace, different stopped state
1610 * is used and the two don't interact with each other.
1555 */ 1611 */
1556 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1612 ret = wait_task_stopped(wo, ptrace, p);
1557 return wait_task_zombie(wo, p); 1613 if (ret)
1614 return ret;
1558 1615
1559 /* 1616 /*
1560 * It's stopped or running now, so it might 1617 * Wait for continued. There's only one continued state and the
1561 * later continue, exit, or stop again. 1618 * ptracer can consume it which can confuse the real parent. Don't
1619 * use WCONTINUED from ptracer. You don't need or want it.
1562 */ 1620 */
1563 wo->notask_error = 0;
1564
1565 if (task_stopped_code(p, ptrace))
1566 return wait_task_stopped(wo, ptrace, p);
1567
1568 return wait_task_continued(wo, p); 1621 return wait_task_continued(wo, p);
1569} 1622}
1570 1623
diff --git a/kernel/extable.c b/kernel/extable.c
index 7f8f263f8524..5339705b8241 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -72,6 +72,24 @@ int core_kernel_text(unsigned long addr)
72 return 0; 72 return 0;
73} 73}
74 74
75/**
76 * core_kernel_data - tell if addr points to kernel data
77 * @addr: address to test
78 *
79 * Returns true if @addr passed in is from the core kernel data
80 * section.
81 *
82 * Note: On some archs it may return true for core RODATA, and false
83 * for others. But will always be true for core RW data.
84 */
85int core_kernel_data(unsigned long addr)
86{
87 if (addr >= (unsigned long)_sdata &&
88 addr < (unsigned long)_edata)
89 return 1;
90 return 0;
91}
92
75int __kernel_text_address(unsigned long addr) 93int __kernel_text_address(unsigned long addr)
76{ 94{
77 if (core_kernel_text(addr)) 95 if (core_kernel_text(addr))
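The new core_kernel_data() is only a bounds check against the _sdata.._edata section markers. One plausible use, shown purely as an assumption (the caller below is not part of this patch), is telling objects built into the kernel image apart from heap-allocated ones:

static bool obj_is_static(const void *obj)
{
	/*
	 * Anything inside the core .data section was compiled into the
	 * image and must not be handed to kfree().
	 */
	return core_kernel_data((unsigned long)obj);
}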
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548dee636b..8e6b6f4fb272 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
37#include <linux/swap.h> 37#include <linux/swap.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/jiffies.h> 39#include <linux/jiffies.h>
40#include <linux/tracehook.h>
41#include <linux/futex.h> 40#include <linux/futex.h>
42#include <linux/compat.h> 41#include <linux/compat.h>
43#include <linux/kthread.h> 42#include <linux/kthread.h>
@@ -59,7 +58,6 @@
59#include <linux/taskstats_kern.h> 58#include <linux/taskstats_kern.h>
60#include <linux/random.h> 59#include <linux/random.h>
61#include <linux/tty.h> 60#include <linux/tty.h>
62#include <linux/proc_fs.h>
63#include <linux/blkdev.h> 61#include <linux/blkdev.h>
64#include <linux/fs_struct.h> 62#include <linux/fs_struct.h>
65#include <linux/magic.h> 63#include <linux/magic.h>
@@ -82,7 +80,7 @@
82 * Protected counters by write_lock_irq(&tasklist_lock) 80 * Protected counters by write_lock_irq(&tasklist_lock)
83 */ 81 */
84unsigned long total_forks; /* Handle normal Linux uptimes. */ 82unsigned long total_forks; /* Handle normal Linux uptimes. */
85int nr_threads; /* The idle threads do not count.. */ 83int nr_threads; /* The idle threads do not count.. */
86 84
87int max_threads; /* tunable limit on nr_threads */ 85int max_threads; /* tunable limit on nr_threads */
88 86
@@ -234,7 +232,7 @@ void __init fork_init(unsigned long mempages)
234 /* 232 /*
235 * we need to allow at least 20 threads to boot a system 233 * we need to allow at least 20 threads to boot a system
236 */ 234 */
237 if(max_threads < 20) 235 if (max_threads < 20)
238 max_threads = 20; 236 max_threads = 20;
239 237
240 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; 238 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
@@ -270,7 +268,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
270 return NULL; 268 return NULL;
271 } 269 }
272 270
273 err = arch_dup_task_struct(tsk, orig); 271 err = arch_dup_task_struct(tsk, orig);
274 if (err) 272 if (err)
275 goto out; 273 goto out;
276 274
@@ -290,9 +288,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
290 tsk->stack_canary = get_random_int(); 288 tsk->stack_canary = get_random_int();
291#endif 289#endif
292 290
293 /* One for us, one for whoever does the "release_task()" (usually parent) */ 291 /*
294 atomic_set(&tsk->usage,2); 292 * One for us, one for whoever does the "release_task()" (usually
295 atomic_set(&tsk->fs_excl, 0); 293 * parent)
294 */
295 atomic_set(&tsk->usage, 2);
296#ifdef CONFIG_BLK_DEV_IO_TRACE 296#ifdef CONFIG_BLK_DEV_IO_TRACE
297 tsk->btrace_seq = 0; 297 tsk->btrace_seq = 0;
298#endif 298#endif
@@ -383,15 +383,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
383 get_file(file); 383 get_file(file);
384 if (tmp->vm_flags & VM_DENYWRITE) 384 if (tmp->vm_flags & VM_DENYWRITE)
385 atomic_dec(&inode->i_writecount); 385 atomic_dec(&inode->i_writecount);
386 spin_lock(&mapping->i_mmap_lock); 386 mutex_lock(&mapping->i_mmap_mutex);
387 if (tmp->vm_flags & VM_SHARED) 387 if (tmp->vm_flags & VM_SHARED)
388 mapping->i_mmap_writable++; 388 mapping->i_mmap_writable++;
389 tmp->vm_truncate_count = mpnt->vm_truncate_count;
390 flush_dcache_mmap_lock(mapping); 389 flush_dcache_mmap_lock(mapping);
391 /* insert tmp into the share list, just after mpnt */ 390 /* insert tmp into the share list, just after mpnt */
392 vma_prio_tree_add(tmp, mpnt); 391 vma_prio_tree_add(tmp, mpnt);
393 flush_dcache_mmap_unlock(mapping); 392 flush_dcache_mmap_unlock(mapping);
394 spin_unlock(&mapping->i_mmap_lock); 393 mutex_unlock(&mapping->i_mmap_mutex);
395 } 394 }
396 395
397 /* 396 /*
@@ -441,7 +440,7 @@ fail_nomem:
441 goto out; 440 goto out;
442} 441}
443 442
444static inline int mm_alloc_pgd(struct mm_struct * mm) 443static inline int mm_alloc_pgd(struct mm_struct *mm)
445{ 444{
446 mm->pgd = pgd_alloc(mm); 445 mm->pgd = pgd_alloc(mm);
447 if (unlikely(!mm->pgd)) 446 if (unlikely(!mm->pgd))
@@ -449,7 +448,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
449 return 0; 448 return 0;
450} 449}
451 450
452static inline void mm_free_pgd(struct mm_struct * mm) 451static inline void mm_free_pgd(struct mm_struct *mm)
453{ 452{
454 pgd_free(mm, mm->pgd); 453 pgd_free(mm, mm->pgd);
455} 454}
@@ -486,7 +485,7 @@ static void mm_init_aio(struct mm_struct *mm)
486#endif 485#endif
487} 486}
488 487
489static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 488static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
490{ 489{
491 atomic_set(&mm->mm_users, 1); 490 atomic_set(&mm->mm_users, 1);
492 atomic_set(&mm->mm_count, 1); 491 atomic_set(&mm->mm_count, 1);
@@ -517,16 +516,17 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
517/* 516/*
518 * Allocate and initialize an mm_struct. 517 * Allocate and initialize an mm_struct.
519 */ 518 */
520struct mm_struct * mm_alloc(void) 519struct mm_struct *mm_alloc(void)
521{ 520{
522 struct mm_struct * mm; 521 struct mm_struct *mm;
523 522
524 mm = allocate_mm(); 523 mm = allocate_mm();
525 if (mm) { 524 if (!mm)
526 memset(mm, 0, sizeof(*mm)); 525 return NULL;
527 mm = mm_init(mm, current); 526
528 } 527 memset(mm, 0, sizeof(*mm));
529 return mm; 528 mm_init_cpumask(mm);
529 return mm_init(mm, current);
530} 530}
531 531
532/* 532/*
@@ -573,6 +573,57 @@ void mmput(struct mm_struct *mm)
573} 573}
574EXPORT_SYMBOL_GPL(mmput); 574EXPORT_SYMBOL_GPL(mmput);
575 575
576/*
577 * We added or removed a vma mapping the executable. The vmas are only mapped
578 * during exec and are not mapped with the mmap system call.
579 * Callers must hold down_write() on the mm's mmap_sem for these
580 */
581void added_exe_file_vma(struct mm_struct *mm)
582{
583 mm->num_exe_file_vmas++;
584}
585
586void removed_exe_file_vma(struct mm_struct *mm)
587{
588 mm->num_exe_file_vmas--;
589 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
590 fput(mm->exe_file);
591 mm->exe_file = NULL;
592 }
593
594}
595
596void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
597{
598 if (new_exe_file)
599 get_file(new_exe_file);
600 if (mm->exe_file)
601 fput(mm->exe_file);
602 mm->exe_file = new_exe_file;
603 mm->num_exe_file_vmas = 0;
604}
605
606struct file *get_mm_exe_file(struct mm_struct *mm)
607{
608 struct file *exe_file;
609
610 /* We need mmap_sem to protect against races with removal of
611 * VM_EXECUTABLE vmas */
612 down_read(&mm->mmap_sem);
613 exe_file = mm->exe_file;
614 if (exe_file)
615 get_file(exe_file);
616 up_read(&mm->mmap_sem);
617 return exe_file;
618}
619
620static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
621{
622 /* It's safe to write the exe_file pointer without exe_file_lock because
623 * this is called during fork when the task is not yet in /proc */
624 newmm->exe_file = get_mm_exe_file(oldmm);
625}
626
576/** 627/**
577 * get_task_mm - acquire a reference to the task's mm 628 * get_task_mm - acquire a reference to the task's mm
578 * 629 *
@@ -679,6 +730,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
679 goto fail_nomem; 730 goto fail_nomem;
680 731
681 memcpy(mm, oldmm, sizeof(*mm)); 732 memcpy(mm, oldmm, sizeof(*mm));
733 mm_init_cpumask(mm);
682 734
683 /* Initializing for Swap token stuff */ 735 /* Initializing for Swap token stuff */
684 mm->token_priority = 0; 736 mm->token_priority = 0;
@@ -726,9 +778,9 @@ fail_nocontext:
726 return NULL; 778 return NULL;
727} 779}
728 780
729static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) 781static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
730{ 782{
731 struct mm_struct * mm, *oldmm; 783 struct mm_struct *mm, *oldmm;
732 int retval; 784 int retval;
733 785
734 tsk->min_flt = tsk->maj_flt = 0; 786 tsk->min_flt = tsk->maj_flt = 0;
@@ -795,7 +847,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
795 return 0; 847 return 0;
796} 848}
797 849
798static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 850static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
799{ 851{
800 struct files_struct *oldf, *newf; 852 struct files_struct *oldf, *newf;
801 int error = 0; 853 int error = 0;
@@ -927,6 +979,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
927 tty_audit_fork(sig); 979 tty_audit_fork(sig);
928 sched_autogroup_fork(sig); 980 sched_autogroup_fork(sig);
929 981
982#ifdef CONFIG_CGROUPS
983 init_rwsem(&sig->threadgroup_fork_lock);
984#endif
985
930 sig->oom_adj = current->signal->oom_adj; 986 sig->oom_adj = current->signal->oom_adj;
931 sig->oom_score_adj = current->signal->oom_score_adj; 987 sig->oom_score_adj = current->signal->oom_score_adj;
932 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 988 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -958,7 +1014,7 @@ static void rt_mutex_init_task(struct task_struct *p)
958{ 1014{
959 raw_spin_lock_init(&p->pi_lock); 1015 raw_spin_lock_init(&p->pi_lock);
960#ifdef CONFIG_RT_MUTEXES 1016#ifdef CONFIG_RT_MUTEXES
961 plist_head_init_raw(&p->pi_waiters, &p->pi_lock); 1017 plist_head_init(&p->pi_waiters);
962 p->pi_blocked_on = NULL; 1018 p->pi_blocked_on = NULL;
963#endif 1019#endif
964} 1020}
@@ -1055,6 +1111,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1055 p->real_cred->user != INIT_USER) 1111 p->real_cred->user != INIT_USER)
1056 goto bad_fork_free; 1112 goto bad_fork_free;
1057 } 1113 }
1114 current->flags &= ~PF_NPROC_EXCEEDED;
1058 1115
1059 retval = copy_creds(p, clone_flags); 1116 retval = copy_creds(p, clone_flags);
1060 if (retval < 0) 1117 if (retval < 0)
@@ -1103,22 +1160,27 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1103 1160
1104 posix_cpu_timers_init(p); 1161 posix_cpu_timers_init(p);
1105 1162
1106 p->lock_depth = -1; /* -1 = no lock */
1107 do_posix_clock_monotonic_gettime(&p->start_time); 1163 do_posix_clock_monotonic_gettime(&p->start_time);
1108 p->real_start_time = p->start_time; 1164 p->real_start_time = p->start_time;
1109 monotonic_to_bootbased(&p->real_start_time); 1165 monotonic_to_bootbased(&p->real_start_time);
1110 p->io_context = NULL; 1166 p->io_context = NULL;
1111 p->audit_context = NULL; 1167 p->audit_context = NULL;
1168 if (clone_flags & CLONE_THREAD)
1169 threadgroup_fork_read_lock(current);
1112 cgroup_fork(p); 1170 cgroup_fork(p);
1113#ifdef CONFIG_NUMA 1171#ifdef CONFIG_NUMA
1114 p->mempolicy = mpol_dup(p->mempolicy); 1172 p->mempolicy = mpol_dup(p->mempolicy);
1115 if (IS_ERR(p->mempolicy)) { 1173 if (IS_ERR(p->mempolicy)) {
1116 retval = PTR_ERR(p->mempolicy); 1174 retval = PTR_ERR(p->mempolicy);
1117 p->mempolicy = NULL; 1175 p->mempolicy = NULL;
1118 goto bad_fork_cleanup_cgroup; 1176 goto bad_fork_cleanup_cgroup;
1119 } 1177 }
1120 mpol_fix_fork_child_flag(p); 1178 mpol_fix_fork_child_flag(p);
1121#endif 1179#endif
1180#ifdef CONFIG_CPUSETS
1181 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1182 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1183#endif
1122#ifdef CONFIG_TRACE_IRQFLAGS 1184#ifdef CONFIG_TRACE_IRQFLAGS
1123 p->irq_events = 0; 1185 p->irq_events = 0;
1124#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1186#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
@@ -1153,30 +1215,38 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1153#endif 1215#endif
1154 1216
1155 /* Perform scheduler related setup. Assign this task to a CPU. */ 1217 /* Perform scheduler related setup. Assign this task to a CPU. */
1156 sched_fork(p, clone_flags); 1218 sched_fork(p);
1157 1219
1158 retval = perf_event_init_task(p); 1220 retval = perf_event_init_task(p);
1159 if (retval) 1221 if (retval)
1160 goto bad_fork_cleanup_policy; 1222 goto bad_fork_cleanup_policy;
1161 1223 retval = audit_alloc(p);
1162 if ((retval = audit_alloc(p))) 1224 if (retval)
1163 goto bad_fork_cleanup_policy; 1225 goto bad_fork_cleanup_policy;
1164 /* copy all the process information */ 1226 /* copy all the process information */
1165 if ((retval = copy_semundo(clone_flags, p))) 1227 retval = copy_semundo(clone_flags, p);
1228 if (retval)
1166 goto bad_fork_cleanup_audit; 1229 goto bad_fork_cleanup_audit;
1167 if ((retval = copy_files(clone_flags, p))) 1230 retval = copy_files(clone_flags, p);
1231 if (retval)
1168 goto bad_fork_cleanup_semundo; 1232 goto bad_fork_cleanup_semundo;
1169 if ((retval = copy_fs(clone_flags, p))) 1233 retval = copy_fs(clone_flags, p);
1234 if (retval)
1170 goto bad_fork_cleanup_files; 1235 goto bad_fork_cleanup_files;
1171 if ((retval = copy_sighand(clone_flags, p))) 1236 retval = copy_sighand(clone_flags, p);
1237 if (retval)
1172 goto bad_fork_cleanup_fs; 1238 goto bad_fork_cleanup_fs;
1173 if ((retval = copy_signal(clone_flags, p))) 1239 retval = copy_signal(clone_flags, p);
1240 if (retval)
1174 goto bad_fork_cleanup_sighand; 1241 goto bad_fork_cleanup_sighand;
1175 if ((retval = copy_mm(clone_flags, p))) 1242 retval = copy_mm(clone_flags, p);
1243 if (retval)
1176 goto bad_fork_cleanup_signal; 1244 goto bad_fork_cleanup_signal;
1177 if ((retval = copy_namespaces(clone_flags, p))) 1245 retval = copy_namespaces(clone_flags, p);
1246 if (retval)
1178 goto bad_fork_cleanup_mm; 1247 goto bad_fork_cleanup_mm;
1179 if ((retval = copy_io(clone_flags, p))) 1248 retval = copy_io(clone_flags, p);
1249 if (retval)
1180 goto bad_fork_cleanup_namespaces; 1250 goto bad_fork_cleanup_namespaces;
1181 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); 1251 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1182 if (retval) 1252 if (retval)
@@ -1194,17 +1264,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1194 if (clone_flags & CLONE_THREAD) 1264 if (clone_flags & CLONE_THREAD)
1195 p->tgid = current->tgid; 1265 p->tgid = current->tgid;
1196 1266
1197 if (current->nsproxy != p->nsproxy) {
1198 retval = ns_cgroup_clone(p, pid);
1199 if (retval)
1200 goto bad_fork_free_pid;
1201 }
1202
1203 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1267 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1204 /* 1268 /*
1205 * Clear TID on mm_release()? 1269 * Clear TID on mm_release()?
1206 */ 1270 */
1207 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1271 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
1208#ifdef CONFIG_BLOCK 1272#ifdef CONFIG_BLOCK
1209 p->plug = NULL; 1273 p->plug = NULL;
1210#endif 1274#endif
@@ -1272,7 +1336,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1272 * it's process group. 1336 * it's process group.
1273 * A fatal signal pending means that current will exit, so the new 1337 * A fatal signal pending means that current will exit, so the new
1274 * thread can't slip out of an OOM kill (or normal SIGKILL). 1338 * thread can't slip out of an OOM kill (or normal SIGKILL).
1275 */ 1339 */
1276 recalc_sigpending(); 1340 recalc_sigpending();
1277 if (signal_pending(current)) { 1341 if (signal_pending(current)) {
1278 spin_unlock(&current->sighand->siglock); 1342 spin_unlock(&current->sighand->siglock);
@@ -1290,7 +1354,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1290 } 1354 }
1291 1355
1292 if (likely(p->pid)) { 1356 if (likely(p->pid)) {
1293 tracehook_finish_clone(p, clone_flags, trace); 1357 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1294 1358
1295 if (thread_group_leader(p)) { 1359 if (thread_group_leader(p)) {
1296 if (is_child_reaper(pid)) 1360 if (is_child_reaper(pid))
@@ -1313,6 +1377,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1313 write_unlock_irq(&tasklist_lock); 1377 write_unlock_irq(&tasklist_lock);
1314 proc_fork_connector(p); 1378 proc_fork_connector(p);
1315 cgroup_post_fork(p); 1379 cgroup_post_fork(p);
1380 if (clone_flags & CLONE_THREAD)
1381 threadgroup_fork_read_unlock(current);
1316 perf_event_fork(p); 1382 perf_event_fork(p);
1317 return p; 1383 return p;
1318 1384
@@ -1351,6 +1417,8 @@ bad_fork_cleanup_policy:
1351 mpol_put(p->mempolicy); 1417 mpol_put(p->mempolicy);
1352bad_fork_cleanup_cgroup: 1418bad_fork_cleanup_cgroup:
1353#endif 1419#endif
1420 if (clone_flags & CLONE_THREAD)
1421 threadgroup_fork_read_unlock(current);
1354 cgroup_exit(p, cgroup_callbacks_done); 1422 cgroup_exit(p, cgroup_callbacks_done);
1355 delayacct_tsk_free(p); 1423 delayacct_tsk_free(p);
1356 module_put(task_thread_info(p)->exec_domain->module); 1424 module_put(task_thread_info(p)->exec_domain->module);
@@ -1427,10 +1495,22 @@ long do_fork(unsigned long clone_flags,
1427 } 1495 }
1428 1496
1429 /* 1497 /*
1430 * When called from kernel_thread, don't do user tracing stuff. 1498 * Determine whether and which event to report to ptracer. When
1499 * called from kernel_thread or CLONE_UNTRACED is explicitly
1500 * requested, no event is reported; otherwise, report if the event
1501 * for the type of forking is enabled.
1431 */ 1502 */
1432 if (likely(user_mode(regs))) 1503 if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
1433 trace = tracehook_prepare_clone(clone_flags); 1504 if (clone_flags & CLONE_VFORK)
1505 trace = PTRACE_EVENT_VFORK;
1506 else if ((clone_flags & CSIGNAL) != SIGCHLD)
1507 trace = PTRACE_EVENT_CLONE;
1508 else
1509 trace = PTRACE_EVENT_FORK;
1510
1511 if (likely(!ptrace_event_enabled(current, trace)))
1512 trace = 0;
1513 }
1434 1514
1435 p = copy_process(clone_flags, stack_start, regs, stack_size, 1515 p = copy_process(clone_flags, stack_start, regs, stack_size,
1436 child_tidptr, NULL, trace); 1516 child_tidptr, NULL, trace);
@@ -1454,26 +1534,26 @@ long do_fork(unsigned long clone_flags,
1454 } 1534 }
1455 1535
1456 audit_finish_fork(p); 1536 audit_finish_fork(p);
1457 tracehook_report_clone(regs, clone_flags, nr, p);
1458 1537
1459 /* 1538 /*
1460 * We set PF_STARTING at creation in case tracing wants to 1539 * We set PF_STARTING at creation in case tracing wants to
1461 * use this to distinguish a fully live task from one that 1540 * use this to distinguish a fully live task from one that
1462 * hasn't gotten to tracehook_report_clone() yet. Now we 1541 * hasn't finished SIGSTOP raising yet. Now we clear it
1463 * clear it and set the child going. 1542 * and set the child going.
1464 */ 1543 */
1465 p->flags &= ~PF_STARTING; 1544 p->flags &= ~PF_STARTING;
1466 1545
1467 wake_up_new_task(p, clone_flags); 1546 wake_up_new_task(p);
1468 1547
1469 tracehook_report_clone_complete(trace, regs, 1548 /* forking complete and child started to run, tell ptracer */
1470 clone_flags, nr, p); 1549 if (unlikely(trace))
1550 ptrace_event(trace, nr);
1471 1551
1472 if (clone_flags & CLONE_VFORK) { 1552 if (clone_flags & CLONE_VFORK) {
1473 freezer_do_not_count(); 1553 freezer_do_not_count();
1474 wait_for_completion(&vfork); 1554 wait_for_completion(&vfork);
1475 freezer_count(); 1555 freezer_count();
1476 tracehook_report_vfork_done(p, nr); 1556 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1477 } 1557 }
1478 } else { 1558 } else {
1479 nr = PTR_ERR(p); 1559 nr = PTR_ERR(p);
@@ -1508,11 +1588,19 @@ void __init proc_caches_init(void)
1508 fs_cachep = kmem_cache_create("fs_cache", 1588 fs_cachep = kmem_cache_create("fs_cache",
1509 sizeof(struct fs_struct), 0, 1589 sizeof(struct fs_struct), 0,
1510 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1590 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1591 /*
1592 * FIXME! The "sizeof(struct mm_struct)" currently includes the
1593 * whole struct cpumask for the OFFSTACK case. We could change
1594 * this to *only* allocate as much of it as required by the
1595 * maximum number of CPU's we can ever have. The cpumask_allocation
1596 * is at the end of the structure, exactly for that reason.
1597 */
1511 mm_cachep = kmem_cache_create("mm_struct", 1598 mm_cachep = kmem_cache_create("mm_struct",
1512 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1599 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1513 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1600 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1514 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1601 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1515 mmap_init(); 1602 mmap_init();
1603 nsproxy_cache_init();
1516} 1604}
1517 1605
1518/* 1606/*
@@ -1609,12 +1697,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1609 */ 1697 */
1610 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1698 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1611 do_sysvsem = 1; 1699 do_sysvsem = 1;
1612 if ((err = unshare_fs(unshare_flags, &new_fs))) 1700 err = unshare_fs(unshare_flags, &new_fs);
1701 if (err)
1613 goto bad_unshare_out; 1702 goto bad_unshare_out;
1614 if ((err = unshare_fd(unshare_flags, &new_fd))) 1703 err = unshare_fd(unshare_flags, &new_fd);
1704 if (err)
1615 goto bad_unshare_cleanup_fs; 1705 goto bad_unshare_cleanup_fs;
1616 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1706 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
1617 new_fs))) 1707 if (err)
1618 goto bad_unshare_cleanup_fd; 1708 goto bad_unshare_cleanup_fd;
1619 1709
1620 if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1710 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
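The exe_file helpers consolidated into fork.c above follow the usual get/put convention: get_mm_exe_file() takes mmap_sem for reading, grabs a reference on mm->exe_file and returns it, and the caller is responsible for the matching fput(). A hedged sketch of a consumer; the function name and the d_path() formatting are illustrative assumptions, not code from the patch.

static void report_exe_path(struct mm_struct *mm)
{
	struct file *exe_file = get_mm_exe_file(mm);	/* takes a reference, or NULL */
	char buf[256];
	char *path;

	if (!exe_file)
		return;

	path = d_path(&exe_file->f_path, buf, sizeof(buf));
	if (!IS_ERR(path))
		printk(KERN_DEBUG "exe: %s\n", path);

	fput(exe_file);					/* drop the reference */
}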
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 66ecd2ead215..7b01de98bb6a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -17,7 +17,7 @@ static inline void frozen_process(void)
17{ 17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN; 19 current->flags |= PF_FROZEN;
20 wmb(); 20 smp_wmb();
21 } 21 }
22 clear_freeze_flag(current); 22 clear_freeze_flag(current);
23} 23}
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only)
93 * the task as frozen and next clears its TIF_FREEZE. 93 * the task as frozen and next clears its TIF_FREEZE.
94 */ 94 */
95 if (!freezing(p)) { 95 if (!freezing(p)) {
96 rmb(); 96 smp_rmb();
97 if (frozen(p)) 97 if (frozen(p))
98 return false; 98 return false;
99 99
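The freezer hunks only swap mandatory barriers for SMP-conditional ones; the store/load pairing they rely on is unchanged. Below is a generic sketch of that pairing, with flag_a and flag_b standing in for PF_FROZEN and TIF_FREEZE (the names are illustrative, not taken from the freezer code).

static int flag_a, flag_b;

static void writer(void)
{
	flag_a = 1;
	smp_wmb();		/* order the flag_a store before the flag_b store */
	flag_b = 1;
}

static int reader(void)
{
	if (ACCESS_ONCE(flag_b)) {
		smp_rmb();	/* pairs with the smp_wmb() in writer() */
		return flag_a;	/* guaranteed to observe flag_a == 1 */
	}
	return 0;
}

On UP kernels both smp_wmb() and smp_rmb() reduce to a compiler barrier, which is all the single-CPU case needs; avoiding the heavier mandatory wmb()/rmb() is the point of the change.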
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc282eae..11cbe052b2e8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -218,6 +218,8 @@ static void drop_futex_key_refs(union futex_key *key)
218 * @uaddr: virtual address of the futex 218 * @uaddr: virtual address of the futex
219 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 219 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
220 * @key: address where result is stored. 220 * @key: address where result is stored.
221 * @rw: mapping needs to be read/write (values: VERIFY_READ,
222 * VERIFY_WRITE)
221 * 223 *
222 * Returns a negative error code or 0 224 * Returns a negative error code or 0
223 * The key words are stored in *key on success. 225 * The key words are stored in *key on success.
@@ -229,12 +231,12 @@ static void drop_futex_key_refs(union futex_key *key)
229 * lock_page() might sleep, the caller should not hold a spinlock. 231 * lock_page() might sleep, the caller should not hold a spinlock.
230 */ 232 */
231static int 233static int
232get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) 234get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
233{ 235{
234 unsigned long address = (unsigned long)uaddr; 236 unsigned long address = (unsigned long)uaddr;
235 struct mm_struct *mm = current->mm; 237 struct mm_struct *mm = current->mm;
236 struct page *page, *page_head; 238 struct page *page, *page_head;
237 int err; 239 int err, ro = 0;
238 240
239 /* 241 /*
240 * The futex address must be "naturally" aligned. 242 * The futex address must be "naturally" aligned.
@@ -262,8 +264,18 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
262 264
263again: 265again:
264 err = get_user_pages_fast(address, 1, 1, &page); 266 err = get_user_pages_fast(address, 1, 1, &page);
267 /*
 268 * If write access is not required (e.g. FUTEX_WAIT), try
 269 * to get read-only access.
270 */
271 if (err == -EFAULT && rw == VERIFY_READ) {
272 err = get_user_pages_fast(address, 1, 0, &page);
273 ro = 1;
274 }
265 if (err < 0) 275 if (err < 0)
266 return err; 276 return err;
277 else
278 err = 0;
267 279
268#ifdef CONFIG_TRANSPARENT_HUGEPAGE 280#ifdef CONFIG_TRANSPARENT_HUGEPAGE
269 page_head = page; 281 page_head = page;
@@ -305,6 +317,13 @@ again:
305 if (!page_head->mapping) { 317 if (!page_head->mapping) {
306 unlock_page(page_head); 318 unlock_page(page_head);
307 put_page(page_head); 319 put_page(page_head);
320 /*
321 * ZERO_PAGE pages don't have a mapping. Avoid a busy loop
322 * trying to find one. RW mapping would have COW'd (and thus
323 * have a mapping) so this page is RO and won't ever change.
324 */
325 if ((page_head == ZERO_PAGE(address)))
326 return -EFAULT;
308 goto again; 327 goto again;
309 } 328 }
310 329
@@ -316,6 +335,15 @@ again:
316 * the object not the particular process. 335 * the object not the particular process.
317 */ 336 */
318 if (PageAnon(page_head)) { 337 if (PageAnon(page_head)) {
338 /*
339 * A RO anonymous page will never change and thus doesn't make
340 * sense for futex operations.
341 */
342 if (ro) {
343 err = -EFAULT;
344 goto out;
345 }
346
319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 347 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
320 key->private.mm = mm; 348 key->private.mm = mm;
321 key->private.address = address; 349 key->private.address = address;
@@ -327,9 +355,10 @@ again:
327 355
328 get_futex_key_refs(key); 356 get_futex_key_refs(key);
329 357
358out:
330 unlock_page(page_head); 359 unlock_page(page_head);
331 put_page(page_head); 360 put_page(page_head);
332 return 0; 361 return err;
333} 362}
334 363
335static inline void put_futex_key(union futex_key *key) 364static inline void put_futex_key(union futex_key *key)
@@ -355,8 +384,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
355 int ret; 384 int ret;
356 385
357 down_read(&mm->mmap_sem); 386 down_read(&mm->mmap_sem);
358 ret = get_user_pages(current, mm, (unsigned long)uaddr, 387 ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
359 1, 1, 0, NULL, NULL); 388 FAULT_FLAG_WRITE);
360 up_read(&mm->mmap_sem); 389 up_read(&mm->mmap_sem);
361 390
362 return ret < 0 ? ret : 0; 391 return ret < 0 ? ret : 0;
@@ -940,7 +969,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
940 if (!bitset) 969 if (!bitset)
941 return -EINVAL; 970 return -EINVAL;
942 971
943 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 972 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
944 if (unlikely(ret != 0)) 973 if (unlikely(ret != 0))
945 goto out; 974 goto out;
946 975
@@ -986,10 +1015,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
986 int ret, op_ret; 1015 int ret, op_ret;
987 1016
988retry: 1017retry:
989 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); 1018 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
990 if (unlikely(ret != 0)) 1019 if (unlikely(ret != 0))
991 goto out; 1020 goto out;
992 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 1021 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
993 if (unlikely(ret != 0)) 1022 if (unlikely(ret != 0))
994 goto out_put_key1; 1023 goto out_put_key1;
995 1024
@@ -1243,10 +1272,11 @@ retry:
1243 pi_state = NULL; 1272 pi_state = NULL;
1244 } 1273 }
1245 1274
1246 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); 1275 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1247 if (unlikely(ret != 0)) 1276 if (unlikely(ret != 0))
1248 goto out; 1277 goto out;
1249 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 1278 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1279 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1250 if (unlikely(ret != 0)) 1280 if (unlikely(ret != 0))
1251 goto out_put_key1; 1281 goto out_put_key1;
1252 1282
@@ -1790,7 +1820,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1790 * while the syscall executes. 1820 * while the syscall executes.
1791 */ 1821 */
1792retry: 1822retry:
1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); 1823 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
1794 if (unlikely(ret != 0)) 1824 if (unlikely(ret != 0))
1795 return ret; 1825 return ret;
1796 1826
@@ -1941,7 +1971,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1941 } 1971 }
1942 1972
1943retry: 1973retry:
1944 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); 1974 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
1945 if (unlikely(ret != 0)) 1975 if (unlikely(ret != 0))
1946 goto out; 1976 goto out;
1947 1977
@@ -2060,7 +2090,7 @@ retry:
2060 if ((uval & FUTEX_TID_MASK) != vpid) 2090 if ((uval & FUTEX_TID_MASK) != vpid)
2061 return -EPERM; 2091 return -EPERM;
2062 2092
2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 2093 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2064 if (unlikely(ret != 0)) 2094 if (unlikely(ret != 0))
2065 goto out; 2095 goto out;
2066 2096
@@ -2249,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2249 debug_rt_mutex_init_waiter(&rt_waiter); 2279 debug_rt_mutex_init_waiter(&rt_waiter);
2250 rt_waiter.task = NULL; 2280 rt_waiter.task = NULL;
2251 2281
2252 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 2282 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2253 if (unlikely(ret != 0)) 2283 if (unlikely(ret != 0))
2254 goto out; 2284 goto out;
2255 2285
@@ -2697,7 +2727,7 @@ static int __init futex_init(void)
2697 futex_cmpxchg_enabled = 1; 2727 futex_cmpxchg_enabled = 1;
2698 2728
2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2729 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2700 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); 2730 plist_head_init(&futex_queues[i].chain);
2701 spin_lock_init(&futex_queues[i].lock); 2731 spin_lock_init(&futex_queues[i].lock);
2702 } 2732 }
2703 2733
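From userspace, the visible effect of the new rw argument is that operations which only need to read the futex word, such as FUTEX_WAIT, can be used on mappings the process cannot write, per the read-only fallback added to get_futex_key(). A hypothetical demonstration; the file name, helper name, and error handling are illustrative assumptions.

#include <fcntl.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sleep on a futex word inside a read-only, file-backed shared mapping. */
static int wait_on_ro_futex(const char *path)
{
	uint32_t *futex_word;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	futex_word = mmap(NULL, sizeof(*futex_word), PROT_READ, MAP_SHARED, fd, 0);
	close(fd);
	if (futex_word == MAP_FAILED)
		return -1;

	/* Blocks while *futex_word == 0; a task with a writable mapping wakes us. */
	return syscall(SYS_futex, futex_word, FUTEX_WAIT, 0, NULL, NULL, 0);
}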
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index b8cadf70b1fb..a92028196cc1 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling"
2 2
3config GCOV_KERNEL 3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling" 4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS && CONSTRUCTORS 5 depends on DEBUG_FS
6 select CONSTRUCTORS if !UML
6 default n 7 default n
7 ---help--- 8 ---help---
8 This option enables gcov-based code profiling (e.g. for code coverage 9 This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 87fdb3f8db14..a9205e32a059 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -64,24 +64,27 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
64 .clock_base = 64 .clock_base =
65 { 65 {
66 { 66 {
67 .index = CLOCK_REALTIME, 67 .index = HRTIMER_BASE_MONOTONIC,
68 .get_time = &ktime_get_real, 68 .clockid = CLOCK_MONOTONIC,
69 .get_time = &ktime_get,
69 .resolution = KTIME_LOW_RES, 70 .resolution = KTIME_LOW_RES,
70 }, 71 },
71 { 72 {
72 .index = CLOCK_MONOTONIC, 73 .index = HRTIMER_BASE_REALTIME,
73 .get_time = &ktime_get, 74 .clockid = CLOCK_REALTIME,
75 .get_time = &ktime_get_real,
74 .resolution = KTIME_LOW_RES, 76 .resolution = KTIME_LOW_RES,
75 }, 77 },
76 { 78 {
77 .index = CLOCK_BOOTTIME, 79 .index = HRTIMER_BASE_BOOTTIME,
80 .clockid = CLOCK_BOOTTIME,
78 .get_time = &ktime_get_boottime, 81 .get_time = &ktime_get_boottime,
79 .resolution = KTIME_LOW_RES, 82 .resolution = KTIME_LOW_RES,
80 }, 83 },
81 } 84 }
82}; 85};
83 86
84static int hrtimer_clock_to_base_table[MAX_CLOCKS] = { 87static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
85 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, 88 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
86 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, 89 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
87 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, 90 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
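
After this hunk each clock base carries both its fixed array slot (HRTIMER_BASE_*, stored in ->index) and the POSIX clock it serves (->clockid), and the clockid-to-base table becomes const. The conversion helper this implies is presumably a plain table lookup along these lines (sketch matching the table above):

	static inline int hrtimer_clockid_to_base(clockid_t clock_id)
	{
		return hrtimer_clock_to_base_table[clock_id];
	}
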
@@ -196,7 +199,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
196 struct hrtimer_cpu_base *new_cpu_base; 199 struct hrtimer_cpu_base *new_cpu_base;
197 int this_cpu = smp_processor_id(); 200 int this_cpu = smp_processor_id();
198 int cpu = hrtimer_get_target(this_cpu, pinned); 201 int cpu = hrtimer_get_target(this_cpu, pinned);
199 int basenum = hrtimer_clockid_to_base(base->index); 202 int basenum = base->index;
200 203
201again: 204again:
202 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 205 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -621,66 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer,
621 return res; 624 return res;
622} 625}
623 626
624
625/*
626 * Retrigger next event is called after clock was set
627 *
628 * Called with interrupts disabled via on_each_cpu()
629 */
630static void retrigger_next_event(void *arg)
631{
632 struct hrtimer_cpu_base *base;
633 struct timespec realtime_offset, wtm, sleep;
634
635 if (!hrtimer_hres_active())
636 return;
637
638 get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
639 &sleep);
640 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
641
642 base = &__get_cpu_var(hrtimer_bases);
643
644 /* Adjust CLOCK_REALTIME offset */
645 raw_spin_lock(&base->lock);
646 base->clock_base[HRTIMER_BASE_REALTIME].offset =
647 timespec_to_ktime(realtime_offset);
648 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
649 timespec_to_ktime(sleep);
650
651 hrtimer_force_reprogram(base, 0);
652 raw_spin_unlock(&base->lock);
653}
654
655/*
656 * Clock realtime was set
657 *
658 * Change the offset of the realtime clock vs. the monotonic
659 * clock.
660 *
661 * We might have to reprogram the high resolution timer interrupt. On
662 * SMP we call the architecture specific code to retrigger _all_ high
663 * resolution timer interrupts. On UP we just disable interrupts and
664 * call the high resolution interrupt code.
665 */
666void clock_was_set(void)
667{
668 /* Retrigger the CPU local events everywhere */
669 on_each_cpu(retrigger_next_event, NULL, 1);
670}
671
672/*
673 * During resume we might have to reprogram the high resolution timer
674 * interrupt (on the local CPU):
675 */
676void hres_timers_resume(void)
677{
678 WARN_ONCE(!irqs_disabled(),
679 KERN_INFO "hres_timers_resume() called with IRQs enabled!");
680
681 retrigger_next_event(NULL);
682}
683
684/* 627/*
685 * Initialize the high resolution related parts of cpu_base 628 * Initialize the high resolution related parts of cpu_base
686 */ 629 */
@@ -715,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
715} 658}
716 659
717/* 660/*
661 * Retrigger next event is called after clock was set
662 *
663 * Called with interrupts disabled via on_each_cpu()
664 */
665static void retrigger_next_event(void *arg)
666{
667 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
668 struct timespec realtime_offset, xtim, wtm, sleep;
669
670 if (!hrtimer_hres_active())
671 return;
672
673 /* Optimized out for !HIGH_RES */
674 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
675 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
676
677 /* Adjust CLOCK_REALTIME offset */
678 raw_spin_lock(&base->lock);
679 base->clock_base[HRTIMER_BASE_REALTIME].offset =
680 timespec_to_ktime(realtime_offset);
681 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
682 timespec_to_ktime(sleep);
683
684 hrtimer_force_reprogram(base, 0);
685 raw_spin_unlock(&base->lock);
686}
687
688/*
718 * Switch to high resolution mode 689 * Switch to high resolution mode
719 */ 690 */
720static int hrtimer_switch_to_hres(void) 691static int hrtimer_switch_to_hres(void)
721{ 692{
722 int cpu = smp_processor_id(); 693 int i, cpu = smp_processor_id();
723 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); 694 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
724 unsigned long flags; 695 unsigned long flags;
725 696
@@ -735,9 +706,8 @@ static int hrtimer_switch_to_hres(void)
735 return 0; 706 return 0;
736 } 707 }
737 base->hres_active = 1; 708 base->hres_active = 1;
738 base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; 709 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
739 base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; 710 base->clock_base[i].resolution = KTIME_HIGH_RES;
740 base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
741 711
742 tick_setup_sched_timer(); 712 tick_setup_sched_timer();
743 713
@@ -761,9 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
761 return 0; 731 return 0;
762} 732}
763static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 733static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
734static inline void retrigger_next_event(void *arg) { }
764 735
765#endif /* CONFIG_HIGH_RES_TIMERS */ 736#endif /* CONFIG_HIGH_RES_TIMERS */
766 737
738/*
739 * Clock realtime was set
740 *
741 * Change the offset of the realtime clock vs. the monotonic
742 * clock.
743 *
744 * We might have to reprogram the high resolution timer interrupt. On
745 * SMP we call the architecture specific code to retrigger _all_ high
746 * resolution timer interrupts. On UP we just disable interrupts and
747 * call the high resolution interrupt code.
748 */
749void clock_was_set(void)
750{
751#ifdef CONFIG_HIGH_RES_TIMERS
752 /* Retrigger the CPU local events everywhere */
753 on_each_cpu(retrigger_next_event, NULL, 1);
754#endif
755 timerfd_clock_was_set();
756}
757
758/*
759 * During resume we might have to reprogram the high resolution timer
760 * interrupt (on the local CPU):
761 */
762void hrtimers_resume(void)
763{
764 WARN_ONCE(!irqs_disabled(),
765 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
766
767 retrigger_next_event(NULL);
768 timerfd_clock_was_set();
769}
770
767static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) 771static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
768{ 772{
769#ifdef CONFIG_TIMER_STATS 773#ifdef CONFIG_TIMER_STATS
@@ -856,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
856 debug_activate(timer); 860 debug_activate(timer);
857 861
858 timerqueue_add(&base->active, &timer->node); 862 timerqueue_add(&base->active, &timer->node);
863 base->cpu_base->active_bases |= 1 << base->index;
859 864
860 /* 865 /*
861 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 866 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
@@ -897,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
897#endif 902#endif
898 } 903 }
899 timerqueue_del(&base->active, &timer->node); 904 timerqueue_del(&base->active, &timer->node);
905 if (!timerqueue_getnext(&base->active))
906 base->cpu_base->active_bases &= ~(1 << base->index);
900out: 907out:
901 timer->state = newstate; 908 timer->state = newstate;
902} 909}
@@ -1234,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1234void hrtimer_interrupt(struct clock_event_device *dev) 1241void hrtimer_interrupt(struct clock_event_device *dev)
1235{ 1242{
1236 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1243 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1237 struct hrtimer_clock_base *base;
1238 ktime_t expires_next, now, entry_time, delta; 1244 ktime_t expires_next, now, entry_time, delta;
1239 int i, retries = 0; 1245 int i, retries = 0;
1240 1246
@@ -1256,12 +1262,15 @@ retry:
1256 */ 1262 */
1257 cpu_base->expires_next.tv64 = KTIME_MAX; 1263 cpu_base->expires_next.tv64 = KTIME_MAX;
1258 1264
1259 base = cpu_base->clock_base;
1260
1261 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1265 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1262 ktime_t basenow; 1266 struct hrtimer_clock_base *base;
1263 struct timerqueue_node *node; 1267 struct timerqueue_node *node;
1268 ktime_t basenow;
1269
1270 if (!(cpu_base->active_bases & (1 << i)))
1271 continue;
1264 1272
1273 base = cpu_base->clock_base + i;
1265 basenow = ktime_add(now, base->offset); 1274 basenow = ktime_add(now, base->offset);
1266 1275
1267 while ((node = timerqueue_getnext(&base->active))) { 1276 while ((node = timerqueue_getnext(&base->active))) {
@@ -1294,7 +1303,6 @@ retry:
1294 1303
1295 __run_hrtimer(timer, &basenow); 1304 __run_hrtimer(timer, &basenow);
1296 } 1305 }
1297 base++;
1298 } 1306 }
1299 1307
1300 /* 1308 /*
@@ -1525,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1525 struct timespec __user *rmtp; 1533 struct timespec __user *rmtp;
1526 int ret = 0; 1534 int ret = 0;
1527 1535
1528 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, 1536 hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
1529 HRTIMER_MODE_ABS); 1537 HRTIMER_MODE_ABS);
1530 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); 1538 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1531 1539
@@ -1577,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1577 1585
1578 restart = &current_thread_info()->restart_block; 1586 restart = &current_thread_info()->restart_block;
1579 restart->fn = hrtimer_nanosleep_restart; 1587 restart->fn = hrtimer_nanosleep_restart;
1580 restart->nanosleep.index = t.timer.base->index; 1588 restart->nanosleep.clockid = t.timer.base->clockid;
1581 restart->nanosleep.rmtp = rmtp; 1589 restart->nanosleep.rmtp = rmtp;
1582 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); 1590 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1583 1591
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 53ead174da2f..ea640120ab86 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
33/* 33/*
34 * Zero means infinite timeout - no checking done: 34 * Zero means infinite timeout - no checking done:
35 */ 35 */
36unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 36unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
37 37
38unsigned long __read_mostly sysctl_hung_task_warnings = 10; 38unsigned long __read_mostly sysctl_hung_task_warnings = 10;
39 39
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index c574f9a12c48..5a38bf4de641 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -48,6 +48,14 @@ config IRQ_PREFLOW_FASTEOI
48config IRQ_EDGE_EOI_HANDLER 48config IRQ_EDGE_EOI_HANDLER
49 bool 49 bool
50 50
51# Generic configurable interrupt chip implementation
52config GENERIC_IRQ_CHIP
53 bool
54
55# Generic irq_domain hw <--> linux irq number translation
56config IRQ_DOMAIN
57 bool
58
51# Support forced irq threading 59# Support forced irq threading
52config IRQ_FORCED_THREADING 60config IRQ_FORCED_THREADING
53 bool 61 bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 54329cd7b3ee..fff17381f0af 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,6 +1,8 @@
1 1
2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
5obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
4obj-$(CONFIG_PROC_FS) += proc.o 6obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_PM_SLEEP) += pm.o 8obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 4af1e2b244cb..d5a3009da71a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -310,6 +310,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
310out_unlock: 310out_unlock:
311 raw_spin_unlock(&desc->lock); 311 raw_spin_unlock(&desc->lock);
312} 312}
313EXPORT_SYMBOL_GPL(handle_simple_irq);
313 314
314/** 315/**
315 * handle_level_irq - Level type irq handler 316 * handle_level_irq - Level type irq handler
@@ -573,6 +574,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
573 if (handle != handle_bad_irq && is_chained) { 574 if (handle != handle_bad_irq && is_chained) {
574 irq_settings_set_noprobe(desc); 575 irq_settings_set_noprobe(desc);
575 irq_settings_set_norequest(desc); 576 irq_settings_set_norequest(desc);
577 irq_settings_set_nothread(desc);
576 irq_startup(desc); 578 irq_startup(desc);
577 } 579 }
578out: 580out:
@@ -612,6 +614,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
612 614
613 irq_put_desc_unlock(desc, flags); 615 irq_put_desc_unlock(desc, flags);
614} 616}
617EXPORT_SYMBOL_GPL(irq_modify_status);
615 618
616/** 619/**
617 * irq_cpu_online - Invoke all irq_cpu_online functions. 620 * irq_cpu_online - Invoke all irq_cpu_online functions.
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 306cba37e9a5..97a8bfadc88a 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -27,6 +27,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
27 P(IRQ_PER_CPU); 27 P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE); 28 P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST); 29 P(IRQ_NOREQUEST);
30 P(IRQ_NOTHREAD);
30 P(IRQ_NOAUTOEN); 31 P(IRQ_NOAUTOEN);
31 32
32 PS(IRQS_AUTODETECT); 33 PS(IRQS_AUTODETECT);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1ef4ffcdfa55..bd8e788d71e0 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
88 struct irq_devres match_data = { irq, dev_id }; 88 struct irq_devres match_data = { irq, dev_id };
89 89
90 free_irq(irq, dev_id);
91 WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, 90 WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
92 &match_data)); 91 &match_data));
92 free_irq(irq, dev_id);
93} 93}
94EXPORT_SYMBOL(devm_free_irq); 94EXPORT_SYMBOL(devm_free_irq);
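
devm_free_irq() now removes the devres entry before calling free_irq(), but the driver-visible API is unchanged. A minimal usage sketch, assuming a made-up device and handler (the common case never calls devm_free_irq() at all; the IRQ is released automatically on driver detach):

	#include <linux/device.h>
	#include <linux/interrupt.h>

	static irqreturn_t demo_isr(int irq, void *dev_id)
	{
		return IRQ_HANDLED;
	}

	static int demo_setup_irq(struct device *dev, unsigned int irq)
	{
		/* Managed request: freed automatically when the driver detaches. */
		return devm_request_irq(dev, irq, demo_isr, 0, "demo", dev);
	}
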
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
new file mode 100644
index 000000000000..3a2cab407b93
--- /dev/null
+++ b/kernel/irq/generic-chip.c
@@ -0,0 +1,368 @@
1/*
2 * Library implementing the most common irq chip callback functions
3 *
4 * Copyright (C) 2011, Thomas Gleixner
5 */
6#include <linux/io.h>
7#include <linux/irq.h>
8#include <linux/slab.h>
9#include <linux/interrupt.h>
10#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h>
12
13#include "internals.h"
14
15static LIST_HEAD(gc_list);
16static DEFINE_RAW_SPINLOCK(gc_lock);
17
18static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
19{
20 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
21}
22
23/**
24 * irq_gc_noop - NOOP function
25 * @d: irq_data
26 */
27void irq_gc_noop(struct irq_data *d)
28{
29}
30
31/**
32 * irq_gc_mask_disable_reg - Mask chip via disable register
33 * @d: irq_data
34 *
35 * Chip has separate enable/disable registers instead of a single mask
36 * register.
37 */
38void irq_gc_mask_disable_reg(struct irq_data *d)
39{
40 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
41 u32 mask = 1 << (d->irq - gc->irq_base);
42
43 irq_gc_lock(gc);
44 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable);
45 gc->mask_cache &= ~mask;
46 irq_gc_unlock(gc);
47}
48
49/**
 50 * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
51 * @d: irq_data
52 *
53 * Chip has a single mask register. Values of this register are cached
54 * and protected by gc->lock
55 */
56void irq_gc_mask_set_bit(struct irq_data *d)
57{
58 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
59 u32 mask = 1 << (d->irq - gc->irq_base);
60
61 irq_gc_lock(gc);
62 gc->mask_cache |= mask;
63 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
64 irq_gc_unlock(gc);
65}
66
67/**
 68 * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
69 * @d: irq_data
70 *
71 * Chip has a single mask register. Values of this register are cached
72 * and protected by gc->lock
73 */
74void irq_gc_mask_clr_bit(struct irq_data *d)
75{
76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
77 u32 mask = 1 << (d->irq - gc->irq_base);
78
79 irq_gc_lock(gc);
80 gc->mask_cache &= ~mask;
81 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
82 irq_gc_unlock(gc);
83}
84
85/**
86 * irq_gc_unmask_enable_reg - Unmask chip via enable register
87 * @d: irq_data
88 *
89 * Chip has separate enable/disable registers instead of a single mask
90 * register.
91 */
92void irq_gc_unmask_enable_reg(struct irq_data *d)
93{
94 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
95 u32 mask = 1 << (d->irq - gc->irq_base);
96
97 irq_gc_lock(gc);
98 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable);
99 gc->mask_cache |= mask;
100 irq_gc_unlock(gc);
101}
102
103/**
104 * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
105 * @d: irq_data
106 */
107void irq_gc_ack_set_bit(struct irq_data *d)
108{
109 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
110 u32 mask = 1 << (d->irq - gc->irq_base);
111
112 irq_gc_lock(gc);
113 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
114 irq_gc_unlock(gc);
115}
116
117/**
118 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
119 * @d: irq_data
120 */
121void irq_gc_ack_clr_bit(struct irq_data *d)
122{
123 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
124 u32 mask = ~(1 << (d->irq - gc->irq_base));
125
126 irq_gc_lock(gc);
127 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
128 irq_gc_unlock(gc);
129}
130
131/**
132 * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
133 * @d: irq_data
134 */
135void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
136{
137 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
138 u32 mask = 1 << (d->irq - gc->irq_base);
139
140 irq_gc_lock(gc);
141 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
143 irq_gc_unlock(gc);
144}
145
146/**
147 * irq_gc_eoi - EOI interrupt
148 * @d: irq_data
149 */
150void irq_gc_eoi(struct irq_data *d)
151{
152 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
153 u32 mask = 1 << (d->irq - gc->irq_base);
154
155 irq_gc_lock(gc);
156 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi);
157 irq_gc_unlock(gc);
158}
159
160/**
161 * irq_gc_set_wake - Set/clr wake bit for an interrupt
162 * @d: irq_data
163 *
164 * For chips where the wake from suspend functionality is not
165 * configured in a separate register and the wakeup active state is
166 * just stored in a bitmask.
167 */
168int irq_gc_set_wake(struct irq_data *d, unsigned int on)
169{
170 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
171 u32 mask = 1 << (d->irq - gc->irq_base);
172
173 if (!(mask & gc->wake_enabled))
174 return -EINVAL;
175
176 irq_gc_lock(gc);
177 if (on)
178 gc->wake_active |= mask;
179 else
180 gc->wake_active &= ~mask;
181 irq_gc_unlock(gc);
182 return 0;
183}
184
185/**
186 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
187 * @name: Name of the irq chip
188 * @num_ct: Number of irq_chip_type instances associated with this chip
189 * @irq_base: Interrupt base nr for this chip
190 * @reg_base: Register base address (virtual)
191 * @handler: Default flow handler associated with this chip
192 *
193 * Returns an initialized irq_chip_generic structure. The chip defaults
194 * to the primary (index 0) irq_chip_type and @handler
195 */
196struct irq_chip_generic *
197irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
198 void __iomem *reg_base, irq_flow_handler_t handler)
199{
200 struct irq_chip_generic *gc;
201 unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
202
203 gc = kzalloc(sz, GFP_KERNEL);
204 if (gc) {
205 raw_spin_lock_init(&gc->lock);
206 gc->num_ct = num_ct;
207 gc->irq_base = irq_base;
208 gc->reg_base = reg_base;
209 gc->chip_types->chip.name = name;
210 gc->chip_types->handler = handler;
211 }
212 return gc;
213}
214
215/*
216 * Separate lockdep class for interrupt chip which can nest irq_desc
217 * lock.
218 */
219static struct lock_class_key irq_nested_lock_class;
220
221/**
222 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
223 * @gc: Generic irq chip holding all data
224 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
225 * @flags: Flags for initialization
226 * @clr: IRQ_* bits to clear
227 * @set: IRQ_* bits to set
228 *
229 * Set up max. 32 interrupts starting from gc->irq_base. Note, this
230 * initializes all interrupts to the primary irq_chip_type and its
231 * associated handler.
232 */
233void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
234 enum irq_gc_flags flags, unsigned int clr,
235 unsigned int set)
236{
237 struct irq_chip_type *ct = gc->chip_types;
238 unsigned int i;
239
240 raw_spin_lock(&gc_lock);
241 list_add_tail(&gc->list, &gc_list);
242 raw_spin_unlock(&gc_lock);
243
244 /* Init mask cache ? */
245 if (flags & IRQ_GC_INIT_MASK_CACHE)
246 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
247
248 for (i = gc->irq_base; msk; msk >>= 1, i++) {
249		if (!(msk & 0x01))
250 continue;
251
252 if (flags & IRQ_GC_INIT_NESTED_LOCK)
253 irq_set_lockdep_class(i, &irq_nested_lock_class);
254
255 irq_set_chip_and_handler(i, &ct->chip, ct->handler);
256 irq_set_chip_data(i, gc);
257 irq_modify_status(i, clr, set);
258 }
259 gc->irq_cnt = i - gc->irq_base;
260}
261
262/**
263 * irq_setup_alt_chip - Switch to alternative chip
264 * @d: irq_data for this interrupt
265 * @type: Flow type to be initialized
266 *
267 * Only to be called from chip->irq_set_type() callbacks.
268 */
269int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
270{
271 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
272 struct irq_chip_type *ct = gc->chip_types;
273 unsigned int i;
274
275 for (i = 0; i < gc->num_ct; i++, ct++) {
276 if (ct->type & type) {
277 d->chip = &ct->chip;
278 irq_data_to_desc(d)->handle_irq = ct->handler;
279 return 0;
280 }
281 }
282 return -EINVAL;
283}
284
285/**
286 * irq_remove_generic_chip - Remove a chip
287 * @gc: Generic irq chip holding all data
288 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
289 * @clr: IRQ_* bits to clear
290 * @set: IRQ_* bits to set
291 *
292 * Remove up to 32 interrupts starting from gc->irq_base.
293 */
294void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
295 unsigned int clr, unsigned int set)
296{
297 unsigned int i = gc->irq_base;
298
299 raw_spin_lock(&gc_lock);
300 list_del(&gc->list);
301 raw_spin_unlock(&gc_lock);
302
303 for (; msk; msk >>= 1, i++) {
304		if (!(msk & 0x01))
305 continue;
306
307 /* Remove handler first. That will mask the irq line */
308 irq_set_handler(i, NULL);
309 irq_set_chip(i, &no_irq_chip);
310 irq_set_chip_data(i, NULL);
311 irq_modify_status(i, clr, set);
312 }
313}
314
315#ifdef CONFIG_PM
316static int irq_gc_suspend(void)
317{
318 struct irq_chip_generic *gc;
319
320 list_for_each_entry(gc, &gc_list, list) {
321 struct irq_chip_type *ct = gc->chip_types;
322
323 if (ct->chip.irq_suspend)
324 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base));
325 }
326 return 0;
327}
328
329static void irq_gc_resume(void)
330{
331 struct irq_chip_generic *gc;
332
333 list_for_each_entry(gc, &gc_list, list) {
334 struct irq_chip_type *ct = gc->chip_types;
335
336 if (ct->chip.irq_resume)
337 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base));
338 }
339}
340#else
341#define irq_gc_suspend NULL
342#define irq_gc_resume NULL
343#endif
344
345static void irq_gc_shutdown(void)
346{
347 struct irq_chip_generic *gc;
348
349 list_for_each_entry(gc, &gc_list, list) {
350 struct irq_chip_type *ct = gc->chip_types;
351
352 if (ct->chip.irq_pm_shutdown)
353 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base));
354 }
355}
356
357static struct syscore_ops irq_gc_syscore_ops = {
358 .suspend = irq_gc_suspend,
359 .resume = irq_gc_resume,
360 .shutdown = irq_gc_shutdown,
361};
362
363static int __init irq_gc_init_ops(void)
364{
365 register_syscore_ops(&irq_gc_syscore_ops);
366 return 0;
367}
368device_initcall(irq_gc_init_ops);
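
The new file above supplies the common mask/ack/eoi/wake callbacks plus allocation and setup helpers for a generic chip covering a linear block of interrupts. A hedged sketch of how an interrupt-controller driver might use it; the register offsets, names and irq numbers are invented for illustration:

	#include <linux/irq.h>

	static void __init demo_init_irq_chip(void __iomem *reg_base,
					      unsigned int irq_base)
	{
		struct irq_chip_generic *gc;
		struct irq_chip_type *ct;

		/* One irq_chip_type, 32 interrupts starting at irq_base. */
		gc = irq_alloc_generic_chip("DEMO", 1, irq_base, reg_base,
					    handle_level_irq);
		if (!gc)
			return;

		ct = gc->chip_types;
		ct->regs.mask = 0x10;		/* hypothetical mask register offset */
		ct->regs.ack  = 0x14;		/* hypothetical ack register offset */
		ct->chip.irq_mask   = irq_gc_mask_set_bit;
		ct->chip.irq_unmask = irq_gc_mask_clr_bit;
		ct->chip.irq_ack    = irq_gc_ack_set_bit;

		/* Install the chip on all 32 irqs and prime the mask cache. */
		irq_setup_generic_chip(gc, 0xffffffff, IRQ_GC_INIT_MASK_CACHE,
				       IRQ_NOREQUEST, 0);
	}
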
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 90cb55f6d7eb..470d08c82bbe 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,12 +133,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
133 switch (res) { 133 switch (res) {
134 case IRQ_WAKE_THREAD: 134 case IRQ_WAKE_THREAD:
135 /* 135 /*
136 * Set result to handled so the spurious check
137 * does not trigger.
138 */
139 res = IRQ_HANDLED;
140
141 /*
142 * Catch drivers which return WAKE_THREAD but 136 * Catch drivers which return WAKE_THREAD but
143 * did not set up a thread function 137 * did not set up a thread function
144 */ 138 */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2c039c9b9383..4c60a50e66b2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -22,7 +22,7 @@
22 */ 22 */
23static struct lock_class_key irq_desc_lock_class; 23static struct lock_class_key irq_desc_lock_class;
24 24
25#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) 25#if defined(CONFIG_SMP)
26static void __init init_irq_default_affinity(void) 26static void __init init_irq_default_affinity(void)
27{ 27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); 28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
@@ -257,13 +257,11 @@ int __init early_irq_init(void)
257 count = ARRAY_SIZE(irq_desc); 257 count = ARRAY_SIZE(irq_desc);
258 258
259 for (i = 0; i < count; i++) { 259 for (i = 0; i < count; i++) {
260 desc[i].irq_data.irq = i;
261 desc[i].irq_data.chip = &no_irq_chip;
262 desc[i].kstat_irqs = alloc_percpu(unsigned int); 260 desc[i].kstat_irqs = alloc_percpu(unsigned int);
263 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); 261 alloc_masks(&desc[i], GFP_KERNEL, node);
264 alloc_masks(desc + i, GFP_KERNEL, node); 262 raw_spin_lock_init(&desc[i].lock);
265 desc_smp_init(desc + i, node);
266 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 263 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
264 desc_set_defaults(i, &desc[i], node);
267 } 265 }
268 return arch_early_irq_init(); 266 return arch_early_irq_init();
269} 267}
@@ -290,6 +288,22 @@ static int irq_expand_nr_irqs(unsigned int nr)
290 288
291#endif /* !CONFIG_SPARSE_IRQ */ 289#endif /* !CONFIG_SPARSE_IRQ */
292 290
291/**
292 * generic_handle_irq - Invoke the handler for a particular irq
293 * @irq: The irq number to handle
294 *
295 */
296int generic_handle_irq(unsigned int irq)
297{
298 struct irq_desc *desc = irq_to_desc(irq);
299
300 if (!desc)
301 return -EINVAL;
302 generic_handle_irq_desc(irq, desc);
303 return 0;
304}
305EXPORT_SYMBOL_GPL(generic_handle_irq);
306
293/* Dynamic interrupt handling */ 307/* Dynamic interrupt handling */
294 308
295/** 309/**
@@ -311,6 +325,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
311 bitmap_clear(allocated_irqs, from, cnt); 325 bitmap_clear(allocated_irqs, from, cnt);
312 mutex_unlock(&sparse_irq_lock); 326 mutex_unlock(&sparse_irq_lock);
313} 327}
328EXPORT_SYMBOL_GPL(irq_free_descs);
314 329
315/** 330/**
316 * irq_alloc_descs - allocate and initialize a range of irq descriptors 331 * irq_alloc_descs - allocate and initialize a range of irq descriptors
@@ -329,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
329 if (!cnt) 344 if (!cnt)
330 return -EINVAL; 345 return -EINVAL;
331 346
347 if (irq >= 0) {
348 if (from > irq)
349 return -EINVAL;
350 from = irq;
351 }
352
332 mutex_lock(&sparse_irq_lock); 353 mutex_lock(&sparse_irq_lock);
333 354
334 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, 355 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
@@ -351,6 +372,7 @@ err:
351 mutex_unlock(&sparse_irq_lock); 372 mutex_unlock(&sparse_irq_lock);
352 return ret; 373 return ret;
353} 374}
375EXPORT_SYMBOL_GPL(irq_alloc_descs);
354 376
355/** 377/**
356 * irq_reserve_irqs - mark irqs allocated 378 * irq_reserve_irqs - mark irqs allocated
@@ -430,7 +452,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
430 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 452 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
431} 453}
432 454
433#ifdef CONFIG_GENERIC_HARDIRQS
434unsigned int kstat_irqs(unsigned int irq) 455unsigned int kstat_irqs(unsigned int irq)
435{ 456{
436 struct irq_desc *desc = irq_to_desc(irq); 457 struct irq_desc *desc = irq_to_desc(irq);
@@ -443,4 +464,3 @@ unsigned int kstat_irqs(unsigned int irq)
443 sum += *per_cpu_ptr(desc->kstat_irqs, cpu); 464 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
444 return sum; 465 return sum;
445} 466}
446#endif /* CONFIG_GENERIC_HARDIRQS */
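
generic_handle_irq() is now a real exported function (and irq_alloc_descs()/irq_free_descs() are exported alongside it, with handle_simple_irq() and irq_modify_status() exported earlier in this series), so a demultiplexing driver, even a modular one, can dispatch its sub-interrupts from a chained handler. A sketch with invented driver state and register layout:

	#include <linux/io.h>
	#include <linux/irq.h>

	struct demo_demux {			/* hypothetical driver state */
		void __iomem	*regs;
		unsigned int	irq_base;	/* first linux irq of the sub-range */
	};

	static void demo_demux_handler(unsigned int irq, struct irq_desc *desc)
	{
		struct demo_demux *d = irq_desc_get_handler_data(desc);
		u32 pending = readl(d->regs + 0x08);	/* hypothetical pending register */

		while (pending) {
			unsigned int bit = __ffs(pending);

			generic_handle_irq(d->irq_base + bit);
			pending &= ~(1U << bit);
		}
	}

	/*
	 * Setup (elsewhere in the driver):
	 *   irq_set_handler_data(parent_irq, d);
	 *   irq_set_chained_handler(parent_irq, demo_demux_handler);
	 * with the sub-irqs registered via irq_set_chip_and_handler(...,
	 * handle_simple_irq).
	 */
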
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
new file mode 100644
index 000000000000..d5828da3fd38
--- /dev/null
+++ b/kernel/irq/irqdomain.c
@@ -0,0 +1,180 @@
1#include <linux/irq.h>
2#include <linux/irqdomain.h>
3#include <linux/module.h>
4#include <linux/mutex.h>
5#include <linux/of.h>
6#include <linux/of_address.h>
7#include <linux/slab.h>
8
9static LIST_HEAD(irq_domain_list);
10static DEFINE_MUTEX(irq_domain_mutex);
11
12/**
13 * irq_domain_add() - Register an irq_domain
14 * @domain: ptr to initialized irq_domain structure
15 *
16 * Registers an irq_domain structure. The irq_domain must at a minimum be
17 * initialized with an ops structure pointer, and either a ->to_irq hook or
18 * a valid irq_base value. Everything else is optional.
19 */
20void irq_domain_add(struct irq_domain *domain)
21{
22 struct irq_data *d;
23 int hwirq;
24
25 /*
26 * This assumes that the irq_domain owner has already allocated
27 * the irq_descs. This block will be removed when support for dynamic
28 * allocation of irq_descs is added to irq_domain.
29 */
30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
 32		if (!d || d->domain) {
33 /* things are broken; just report, don't clean up */
34 WARN(1, "error: irq_desc already assigned to a domain");
35 return;
36 }
37 d->domain = domain;
38 d->hwirq = hwirq;
39 }
40
41 mutex_lock(&irq_domain_mutex);
42 list_add(&domain->list, &irq_domain_list);
43 mutex_unlock(&irq_domain_mutex);
44}
45
46/**
47 * irq_domain_del() - Unregister an irq_domain
48 * @domain: ptr to registered irq_domain.
49 */
50void irq_domain_del(struct irq_domain *domain)
51{
52 struct irq_data *d;
53 int hwirq;
54
55 mutex_lock(&irq_domain_mutex);
56 list_del(&domain->list);
57 mutex_unlock(&irq_domain_mutex);
58
59 /* Clear the irq_domain assignments */
60 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
61 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
62 d->domain = NULL;
63 }
64}
65
66#if defined(CONFIG_OF_IRQ)
67/**
68 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec
69 *
70 * Used by the device tree interrupt mapping code to translate a device tree
71 * interrupt specifier to a valid linux irq number. Returns either a valid
72 * linux IRQ number or 0.
73 *
 74 * When the caller no longer needs the irq number returned by this function, it
75 * should arrange to call irq_dispose_mapping().
76 */
77unsigned int irq_create_of_mapping(struct device_node *controller,
78 const u32 *intspec, unsigned int intsize)
79{
80 struct irq_domain *domain;
81 unsigned long hwirq;
82 unsigned int irq, type;
83 int rc = -EINVAL;
84
85 /* Find a domain which can translate the irq spec */
86 mutex_lock(&irq_domain_mutex);
87 list_for_each_entry(domain, &irq_domain_list, list) {
88 if (!domain->ops->dt_translate)
89 continue;
90 rc = domain->ops->dt_translate(domain, controller,
91 intspec, intsize, &hwirq, &type);
92 if (rc == 0)
93 break;
94 }
95 mutex_unlock(&irq_domain_mutex);
96
97 if (rc != 0)
98 return 0;
99
100 irq = irq_domain_to_irq(domain, hwirq);
101 if (type != IRQ_TYPE_NONE)
102 irq_set_irq_type(irq, type);
103 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n",
104 controller->full_name, (int)hwirq, irq, type);
105 return irq;
106}
107EXPORT_SYMBOL_GPL(irq_create_of_mapping);
108
109/**
110 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping()
111 * @irq: linux irq number to be discarded
112 *
113 * Calling this function indicates the caller no longer needs a reference to
114 * the linux irq number returned by a prior call to irq_create_of_mapping().
115 */
116void irq_dispose_mapping(unsigned int irq)
117{
118 /*
119 * nothing yet; will be filled when support for dynamic allocation of
120 * irq_descs is added to irq_domain
121 */
122}
123EXPORT_SYMBOL_GPL(irq_dispose_mapping);
124
125int irq_domain_simple_dt_translate(struct irq_domain *d,
126 struct device_node *controller,
127 const u32 *intspec, unsigned int intsize,
128 unsigned long *out_hwirq, unsigned int *out_type)
129{
130 if (d->of_node != controller)
131 return -EINVAL;
132 if (intsize < 1)
133 return -EINVAL;
134
135 *out_hwirq = intspec[0];
136 *out_type = IRQ_TYPE_NONE;
137 if (intsize > 1)
138 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
139 return 0;
140}
141
142struct irq_domain_ops irq_domain_simple_ops = {
143 .dt_translate = irq_domain_simple_dt_translate,
144};
145EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
146
147/**
148 * irq_domain_add_simple() - Set up a 'simple' translation range
149 */
150void irq_domain_add_simple(struct device_node *controller, int irq_base)
151{
152 struct irq_domain *domain;
153
154 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
155 if (!domain) {
156 WARN_ON(1);
157 return;
158 }
159
160 domain->irq_base = irq_base;
161 domain->of_node = of_node_get(controller);
162 domain->ops = &irq_domain_simple_ops;
163 irq_domain_add(domain);
164}
165EXPORT_SYMBOL_GPL(irq_domain_add_simple);
166
167void irq_domain_generate_simple(const struct of_device_id *match,
168 u64 phys_base, unsigned int irq_start)
169{
170 struct device_node *node;
171 pr_info("looking for phys_base=%llx, irq_start=%i\n",
172 (unsigned long long) phys_base, (int) irq_start);
173 node = of_find_matching_node_by_address(NULL, match, phys_base);
174 if (node)
175 irq_domain_add_simple(node, irq_start);
176 else
177 pr_info("no node found\n");
178}
179EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
180#endif /* CONFIG_OF_IRQ */
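
At this stage irq_domain only translates between hardware irq numbers and pre-allocated linux irq descriptors (dynamic allocation is explicitly left for later in the comments above). A sketch of registering a linear range for a device-tree controller, using only fields and ops shown above; the names are illustrative:

	static struct irq_domain demo_domain;	/* hypothetical controller domain */

	static void __init demo_add_domain(struct device_node *np,
					   unsigned int irq_base,
					   unsigned int nr_irq)
	{
		/* The irq_descs for irq_base .. irq_base + nr_irq - 1 must already exist. */
		demo_domain.irq_base = irq_base;
		demo_domain.nr_irq   = nr_irq;
		demo_domain.of_node  = of_node_get(np);
		demo_domain.ops      = &irq_domain_simple_ops;
		irq_domain_add(&demo_domain);
	}
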
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 07c1611f3899..0a7840aeb0fb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -491,6 +491,9 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
492 int ret = 0; 492 int ret = 0;
493 493
494 if (!desc)
495 return -EINVAL;
496
494 /* wakeup-capable irqs can be shared between drivers that 497 /* wakeup-capable irqs can be shared between drivers that
495 * don't need to have the same sleep mode behaviors. 498 * don't need to have the same sleep mode behaviors.
496 */ 499 */
@@ -723,13 +726,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
723 * context. So we need to disable bh here to avoid deadlocks and other 726 * context. So we need to disable bh here to avoid deadlocks and other
724 * side effects. 727 * side effects.
725 */ 728 */
726static void 729static irqreturn_t
727irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) 730irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
728{ 731{
732 irqreturn_t ret;
733
729 local_bh_disable(); 734 local_bh_disable();
730 action->thread_fn(action->irq, action->dev_id); 735 ret = action->thread_fn(action->irq, action->dev_id);
731 irq_finalize_oneshot(desc, action, false); 736 irq_finalize_oneshot(desc, action, false);
732 local_bh_enable(); 737 local_bh_enable();
738 return ret;
733} 739}
734 740
735/* 741/*
@@ -737,10 +743,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
737 * preemtible - many of them need to sleep and wait for slow busses to 743 * preemtible - many of them need to sleep and wait for slow busses to
738 * complete. 744 * complete.
739 */ 745 */
740static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) 746static irqreturn_t irq_thread_fn(struct irq_desc *desc,
747 struct irqaction *action)
741{ 748{
742 action->thread_fn(action->irq, action->dev_id); 749 irqreturn_t ret;
750
751 ret = action->thread_fn(action->irq, action->dev_id);
743 irq_finalize_oneshot(desc, action, false); 752 irq_finalize_oneshot(desc, action, false);
753 return ret;
744} 754}
745 755
746/* 756/*
@@ -753,7 +763,8 @@ static int irq_thread(void *data)
753 }; 763 };
754 struct irqaction *action = data; 764 struct irqaction *action = data;
755 struct irq_desc *desc = irq_to_desc(action->irq); 765 struct irq_desc *desc = irq_to_desc(action->irq);
756 void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); 766 irqreturn_t (*handler_fn)(struct irq_desc *desc,
767 struct irqaction *action);
757 int wake; 768 int wake;
758 769
759 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, 770 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
@@ -783,8 +794,12 @@ static int irq_thread(void *data)
783 desc->istate |= IRQS_PENDING; 794 desc->istate |= IRQS_PENDING;
784 raw_spin_unlock_irq(&desc->lock); 795 raw_spin_unlock_irq(&desc->lock);
785 } else { 796 } else {
797 irqreturn_t action_ret;
798
786 raw_spin_unlock_irq(&desc->lock); 799 raw_spin_unlock_irq(&desc->lock);
787 handler_fn(desc, action); 800 action_ret = handler_fn(desc, action);
801 if (!noirqdebug)
802 note_interrupt(action->irq, desc, action_ret);
788 } 803 }
789 804
790 wake = atomic_dec_and_test(&desc->threads_active); 805 wake = atomic_dec_and_test(&desc->threads_active);
@@ -900,7 +915,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
900 */ 915 */
901 new->handler = irq_nested_primary_handler; 916 new->handler = irq_nested_primary_handler;
902 } else { 917 } else {
903 irq_setup_forced_threading(new); 918 if (irq_settings_can_thread(desc))
919 irq_setup_forced_threading(new);
904 } 920 }
905 921
906 /* 922 /*
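
With the manage.c changes above, the return value of a threaded handler now reaches note_interrupt(), so a thread_fn should report IRQ_HANDLED or IRQ_NONE just like a primary handler; spurious-interrupt detection covers the threaded case too. A minimal sketch with made-up names:

	#include <linux/interrupt.h>

	static bool demo_device_has_work(void *dev_id)	/* hypothetical check */
	{
		return true;
	}

	static irqreturn_t demo_hardirq(int irq, void *dev_id)
	{
		/* Quick, non-sleeping check; defer the real work to the thread. */
		return IRQ_WAKE_THREAD;
	}

	static irqreturn_t demo_thread_fn(int irq, void *dev_id)
	{
		if (!demo_device_has_work(dev_id))
			return IRQ_NONE;	/* feeds the spurious detector */

		/* ... sleepable processing ... */
		return IRQ_HANDLED;
	}

	/* request_threaded_irq(irq, demo_hardirq, demo_thread_fn, 0, "demo", dev); */
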
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 834899f2500f..4bd4faa6323a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir;
19 19
20#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
21 21
22static int irq_affinity_proc_show(struct seq_file *m, void *v) 22static int show_irq_affinity(int type, struct seq_file *m, void *v)
23{ 23{
24 struct irq_desc *desc = irq_to_desc((long)m->private); 24 struct irq_desc *desc = irq_to_desc((long)m->private);
25 const struct cpumask *mask = desc->irq_data.affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
@@ -28,7 +28,10 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
28 if (irqd_is_setaffinity_pending(&desc->irq_data)) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
29 mask = desc->pending_mask; 29 mask = desc->pending_mask;
30#endif 30#endif
31 seq_cpumask(m, mask); 31 if (type)
32 seq_cpumask_list(m, mask);
33 else
34 seq_cpumask(m, mask);
32 seq_putc(m, '\n'); 35 seq_putc(m, '\n');
33 return 0; 36 return 0;
34} 37}
@@ -59,7 +62,18 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
59#endif 62#endif
60 63
61int no_irq_affinity; 64int no_irq_affinity;
62static ssize_t irq_affinity_proc_write(struct file *file, 65static int irq_affinity_proc_show(struct seq_file *m, void *v)
66{
67 return show_irq_affinity(0, m, v);
68}
69
70static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
71{
72 return show_irq_affinity(1, m, v);
73}
74
75
76static ssize_t write_irq_affinity(int type, struct file *file,
63 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
64{ 78{
65 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 79 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
@@ -72,7 +86,10 @@ static ssize_t irq_affinity_proc_write(struct file *file,
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 86 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
73 return -ENOMEM; 87 return -ENOMEM;
74 88
75 err = cpumask_parse_user(buffer, count, new_value); 89 if (type)
90 err = cpumask_parselist_user(buffer, count, new_value);
91 else
92 err = cpumask_parse_user(buffer, count, new_value);
76 if (err) 93 if (err)
77 goto free_cpumask; 94 goto free_cpumask;
78 95
@@ -100,11 +117,28 @@ free_cpumask:
100 return err; 117 return err;
101} 118}
102 119
120static ssize_t irq_affinity_proc_write(struct file *file,
121 const char __user *buffer, size_t count, loff_t *pos)
122{
123 return write_irq_affinity(0, file, buffer, count, pos);
124}
125
126static ssize_t irq_affinity_list_proc_write(struct file *file,
127 const char __user *buffer, size_t count, loff_t *pos)
128{
129 return write_irq_affinity(1, file, buffer, count, pos);
130}
131
103static int irq_affinity_proc_open(struct inode *inode, struct file *file) 132static int irq_affinity_proc_open(struct inode *inode, struct file *file)
104{ 133{
105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 134 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
106} 135}
107 136
137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
138{
139 return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
140}
141
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) 142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{ 143{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); 144 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = {
125 .release = single_release, 159 .release = single_release,
126}; 160};
127 161
162static const struct file_operations irq_affinity_list_proc_fops = {
163 .open = irq_affinity_list_proc_open,
164 .read = seq_read,
165 .llseek = seq_lseek,
166 .release = single_release,
167 .write = irq_affinity_list_proc_write,
168};
169
128static int default_affinity_show(struct seq_file *m, void *v) 170static int default_affinity_show(struct seq_file *m, void *v)
129{ 171{
130 seq_cpumask(m, irq_default_affinity); 172 seq_cpumask(m, irq_default_affinity);
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
289 proc_create_data("affinity_hint", 0400, desc->dir, 331 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq); 332 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291 333
334 /* create /proc/irq/<irq>/smp_affinity_list */
335 proc_create_data("smp_affinity_list", 0600, desc->dir,
336 &irq_affinity_list_proc_fops, (void *)(long)irq);
337
292 proc_create_data("node", 0444, desc->dir, 338 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq); 339 &irq_node_proc_fops, (void *)(long)irq);
294#endif 340#endif
@@ -306,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
306#ifdef CONFIG_SMP 352#ifdef CONFIG_SMP
307 remove_proc_entry("smp_affinity", desc->dir); 353 remove_proc_entry("smp_affinity", desc->dir);
308 remove_proc_entry("affinity_hint", desc->dir); 354 remove_proc_entry("affinity_hint", desc->dir);
355 remove_proc_entry("smp_affinity_list", desc->dir);
309 remove_proc_entry("node", desc->dir); 356 remove_proc_entry("node", desc->dir);
310#endif 357#endif
311 remove_proc_entry("spurious", desc->dir); 358 remove_proc_entry("spurious", desc->dir);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 0d91730b6330..f1667833d444 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -8,6 +8,7 @@ enum {
8 _IRQ_LEVEL = IRQ_LEVEL, 8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE, 9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST, 10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOTHREAD = IRQ_NOTHREAD,
11 _IRQ_NOAUTOEN = IRQ_NOAUTOEN, 12 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
12 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, 13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
13 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
@@ -20,6 +21,7 @@ enum {
20#define IRQ_LEVEL GOT_YOU_MORON 21#define IRQ_LEVEL GOT_YOU_MORON
21#define IRQ_NOPROBE GOT_YOU_MORON 22#define IRQ_NOPROBE GOT_YOU_MORON
22#define IRQ_NOREQUEST GOT_YOU_MORON 23#define IRQ_NOREQUEST GOT_YOU_MORON
24#define IRQ_NOTHREAD GOT_YOU_MORON
23#define IRQ_NOAUTOEN GOT_YOU_MORON 25#define IRQ_NOAUTOEN GOT_YOU_MORON
24#define IRQ_NESTED_THREAD GOT_YOU_MORON 26#define IRQ_NESTED_THREAD GOT_YOU_MORON
25#undef IRQF_MODIFY_MASK 27#undef IRQF_MODIFY_MASK
@@ -94,6 +96,21 @@ static inline void irq_settings_set_norequest(struct irq_desc *desc)
94 desc->status_use_accessors |= _IRQ_NOREQUEST; 96 desc->status_use_accessors |= _IRQ_NOREQUEST;
95} 97}
96 98
99static inline bool irq_settings_can_thread(struct irq_desc *desc)
100{
101 return !(desc->status_use_accessors & _IRQ_NOTHREAD);
102}
103
104static inline void irq_settings_clr_nothread(struct irq_desc *desc)
105{
106 desc->status_use_accessors &= ~_IRQ_NOTHREAD;
107}
108
109static inline void irq_settings_set_nothread(struct irq_desc *desc)
110{
111 desc->status_use_accessors |= _IRQ_NOTHREAD;
112}
113
97static inline bool irq_settings_can_probe(struct irq_desc *desc) 114static inline bool irq_settings_can_probe(struct irq_desc *desc)
98{ 115{
99 return !(desc->status_use_accessors & _IRQ_NOPROBE); 116 return !(desc->status_use_accessors & _IRQ_NOPROBE);
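
The new IRQ_NOTHREAD setting lets an interrupt opt out of forced irq threading: __irq_set_handler() applies it to chained interrupts above, and __setup_irq() only forces a thread while irq_settings_can_thread() is still true. Assuming IRQ_NOTHREAD is accepted by irq_modify_status()'s modify mask (IRQF_MODIFY_MASK), other code could request the same through the status interface; a one-line sketch:

	/* Keep this line un-threaded even under forced irq threading. */
	irq_modify_status(irq, 0, IRQ_NOTHREAD);
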
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dfbd550401b2..aa57d5da18c1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -167,6 +167,13 @@ out:
167 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 167 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
168} 168}
169 169
170static inline int bad_action_ret(irqreturn_t action_ret)
171{
172 if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
173 return 0;
174 return 1;
175}
176
170/* 177/*
171 * If 99,900 of the previous 100,000 interrupts have not been handled 178 * If 99,900 of the previous 100,000 interrupts have not been handled
172 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 179 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
182 struct irqaction *action; 189 struct irqaction *action;
183 unsigned long flags; 190 unsigned long flags;
184 191
185 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { 192 if (bad_action_ret(action_ret)) {
186 printk(KERN_ERR "irq event %d: bogus return value %x\n", 193 printk(KERN_ERR "irq event %d: bogus return value %x\n",
187 irq, action_ret); 194 irq, action_ret);
188 } else { 195 } else {
@@ -201,10 +208,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
201 raw_spin_lock_irqsave(&desc->lock, flags); 208 raw_spin_lock_irqsave(&desc->lock, flags);
202 action = desc->action; 209 action = desc->action;
203 while (action) { 210 while (action) {
204 printk(KERN_ERR "[<%p>]", action->handler); 211 printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
205 print_symbol(" (%s)", 212 if (action->thread_fn)
206 (unsigned long)action->handler); 213 printk(KERN_CONT " threaded [<%p>] %pf",
207 printk("\n"); 214 action->thread_fn, action->thread_fn);
215 printk(KERN_CONT "\n");
208 action = action->next; 216 action = action->next;
209 } 217 }
210 raw_spin_unlock_irqrestore(&desc->lock, flags); 218 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -262,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
262 if (desc->istate & IRQS_POLL_INPROGRESS) 270 if (desc->istate & IRQS_POLL_INPROGRESS)
263 return; 271 return;
264 272
265 if (unlikely(action_ret != IRQ_HANDLED)) { 273 /* we get here again via the threaded handler */
274 if (action_ret == IRQ_WAKE_THREAD)
275 return;
276
277 if (bad_action_ret(action_ret)) {
278 report_bad_irq(irq, desc, action_ret);
279 return;
280 }
281
282 if (unlikely(action_ret == IRQ_NONE)) {
266 /* 283 /*
267 * If we are seeing only the odd spurious IRQ caused by 284 * If we are seeing only the odd spurious IRQ caused by
268 * bus asynchronicity then don't eventually trigger an error, 285 * bus asynchronicity then don't eventually trigger an error,
@@ -274,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
274 else 291 else
275 desc->irqs_unhandled++; 292 desc->irqs_unhandled++;
276 desc->last_unhandled = jiffies; 293 desc->last_unhandled = jiffies;
277 if (unlikely(action_ret != IRQ_NONE))
278 report_bad_irq(irq, desc, action_ret);
279 } 294 }
280 295
281 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { 296 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 3b79bd938330..a8ce45097f3d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -2,43 +2,23 @@
2 * jump label support 2 * jump label support
3 * 3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> 4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
5 * 6 *
6 */ 7 */
7#include <linux/jump_label.h>
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
14#include <linux/sort.h> 13#include <linux/sort.h>
15#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/jump_label.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
23/* mutex to protect coming/going of the jump_label table */ 19/* mutex to protect coming/going of the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex); 20static DEFINE_MUTEX(jump_label_mutex);
25 21
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42void jump_label_lock(void) 22void jump_label_lock(void)
43{ 23{
44 mutex_lock(&jump_label_mutex); 24 mutex_lock(&jump_label_mutex);
@@ -49,6 +29,11 @@ void jump_label_unlock(void)
49 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
50} 30}
51 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
52static int jump_label_cmp(const void *a, const void *b) 37static int jump_label_cmp(const void *a, const void *b)
53{ 38{
54 const struct jump_entry *jea = a; 39 const struct jump_entry *jea = a;
@@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b)
64} 49}
65 50
66static void 51static void
67sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) 52jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
68{ 53{
69 unsigned long size; 54 unsigned long size;
70 55
@@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
73 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
74} 59}
75 60
76static struct jump_label_entry *get_jump_label_entry(jump_label_t key) 61static void jump_label_update(struct jump_label_key *key, int enable);
77{
78 struct hlist_head *head;
79 struct hlist_node *node;
80 struct jump_label_entry *e;
81 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
82
83 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
84 hlist_for_each_entry(e, node, head, hlist) {
85 if (key == e->key)
86 return e;
87 }
88 return NULL;
89}
90 62
91static struct jump_label_entry * 63void jump_label_inc(struct jump_label_key *key)
92add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
93{ 64{
94 struct hlist_head *head; 65 if (atomic_inc_not_zero(&key->enabled))
95 struct jump_label_entry *e; 66 return;
96 u32 hash;
97
98 e = get_jump_label_entry(key);
99 if (e)
100 return ERR_PTR(-EEXIST);
101
102 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
103 if (!e)
104 return ERR_PTR(-ENOMEM);
105
106 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
107 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
108 e->key = key;
109 e->table = table;
110 e->nr_entries = nr_entries;
111 INIT_HLIST_HEAD(&(e->modules));
112 hlist_add_head(&e->hlist, head);
113 return e;
114}
115 67
116static int 68 jump_label_lock();
117build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) 69 if (atomic_add_return(1, &key->enabled) == 1)
118{ 70 jump_label_update(key, JUMP_LABEL_ENABLE);
119 struct jump_entry *iter, *iter_begin; 71 jump_label_unlock();
120 struct jump_label_entry *entry;
121 int count;
122
123 sort_jump_label_entries(start, stop);
124 iter = start;
125 while (iter < stop) {
126 entry = get_jump_label_entry(iter->key);
127 if (!entry) {
128 iter_begin = iter;
129 count = 0;
130 while ((iter < stop) &&
131 (iter->key == iter_begin->key)) {
132 iter++;
133 count++;
134 }
135 entry = add_jump_label_entry(iter_begin->key,
136 count, iter_begin);
137 if (IS_ERR(entry))
138 return PTR_ERR(entry);
139 } else {
140 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
141 return -1;
142 }
143 }
144 return 0;
145} 72}
146 73
147/*** 74void jump_label_dec(struct jump_label_key *key)
148 * jump_label_update - update jump label text
149 * @key - key value associated with a a jump label
150 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
151 *
152 * Will enable/disable the jump for jump label @key, depending on the
153 * value of @type.
154 *
155 */
156
157void jump_label_update(unsigned long key, enum jump_label_type type)
158{ 75{
159 struct jump_entry *iter; 76 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
160 struct jump_label_entry *entry; 77 return;
161 struct hlist_node *module_node;
162 struct jump_label_module_entry *e_module;
163 int count;
164 78
165 jump_label_lock(); 79 jump_label_update(key, JUMP_LABEL_DISABLE);
166 entry = get_jump_label_entry((jump_label_t)key);
167 if (entry) {
168 count = entry->nr_entries;
169 iter = entry->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 /* eanble/disable jump labels in modules */
176 hlist_for_each_entry(e_module, module_node, &(entry->modules),
177 hlist) {
178 count = e_module->nr_entries;
179 iter = e_module->table;
180 while (count--) {
181 if (iter->key &&
182 kernel_text_address(iter->code))
183 arch_jump_label_transform(iter, type);
184 iter++;
185 }
186 }
187 }
188 jump_label_unlock(); 80 jump_label_unlock();
189} 81}
190 82
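
The rework above replaces the hash table with a reference count embedded in each jump_label_key: jump_label_inc() patches the jump sites to the enabled state on the 0 -> 1 transition, jump_label_dec() patches them back on the final decrement, and nested enables just bump the counter. A usage sketch with a hypothetical key (the read side would typically test the key with static_branch() from linux/jump_label.h):

	#include <linux/jump_label.h>

	static struct jump_label_key demo_key;	/* hypothetical key, starts disabled */

	static void demo_feature_enable(void)
	{
		jump_label_inc(&demo_key);	/* first enable patches the sites */
	}

	static void demo_feature_disable(void)
	{
		jump_label_dec(&demo_key);	/* last disable patches them back */
	}
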
@@ -197,77 +89,36 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end)
197 return 0; 89 return 0;
198} 90}
199 91
200#ifdef CONFIG_MODULES 92static int __jump_label_text_reserved(struct jump_entry *iter_start,
201 93 struct jump_entry *iter_stop, void *start, void *end)
202static int module_conflict(void *start, void *end)
203{
204 struct hlist_head *head;
205 struct hlist_node *node, *node_next, *module_node, *module_node_next;
206 struct jump_label_entry *e;
207 struct jump_label_module_entry *e_module;
208 struct jump_entry *iter;
209 int i, count;
210 int conflict = 0;
211
212 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
213 head = &jump_label_table[i];
214 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
215 hlist_for_each_entry_safe(e_module, module_node,
216 module_node_next,
217 &(e->modules), hlist) {
218 count = e_module->nr_entries;
219 iter = e_module->table;
220 while (count--) {
221 if (addr_conflict(iter, start, end)) {
222 conflict = 1;
223 goto out;
224 }
225 iter++;
226 }
227 }
228 }
229 }
230out:
231 return conflict;
232}
233
234#endif
235
236/***
237 * jump_label_text_reserved - check if addr range is reserved
238 * @start: start text addr
239 * @end: end text addr
240 *
241 * checks if the text addr located between @start and @end
242 * overlaps with any of the jump label patch addresses. Code
243 * that wants to modify kernel text should first verify that
244 * it does not overlap with any of the jump label addresses.
245 * Caller must hold jump_label_mutex.
246 *
247 * returns 1 if there is an overlap, 0 otherwise
248 */
249int jump_label_text_reserved(void *start, void *end)
250{ 94{
251 struct jump_entry *iter; 95 struct jump_entry *iter;
252 struct jump_entry *iter_start = __start___jump_table;
253 struct jump_entry *iter_stop = __start___jump_table;
254 int conflict = 0;
255 96
256 iter = iter_start; 97 iter = iter_start;
257 while (iter < iter_stop) { 98 while (iter < iter_stop) {
258 if (addr_conflict(iter, start, end)) { 99 if (addr_conflict(iter, start, end))
259 conflict = 1; 100 return 1;
260 goto out;
261 }
262 iter++; 101 iter++;
263 } 102 }
264 103
265 /* now check modules */ 104 return 0;
266#ifdef CONFIG_MODULES 105}
267 conflict = module_conflict(start, end); 106
268#endif 107static void __jump_label_update(struct jump_label_key *key,
269out: 108 struct jump_entry *entry,
270 return conflict; 109 struct jump_entry *stop, int enable)
110{
111 for (; (entry < stop) &&
112 (entry->key == (jump_label_t)(unsigned long)key);
113 entry++) {
114 /*
115 * entry->code set to 0 invalidates module init text sections
116 * kernel_text_address() verifies we are not in core kernel
117 * init code, see jump_label_invalidate_module_init().
118 */
119 if (entry->code && kernel_text_address(entry->code))
120 arch_jump_label_transform(entry, enable);
121 }
271} 122}
272 123
273/* 124/*
@@ -277,145 +128,181 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr)
277{ 128{
278} 129}
279 130
280static __init int init_jump_label(void) 131static __init int jump_label_init(void)
281{ 132{
282 int ret;
283 struct jump_entry *iter_start = __start___jump_table; 133 struct jump_entry *iter_start = __start___jump_table;
284 struct jump_entry *iter_stop = __stop___jump_table; 134 struct jump_entry *iter_stop = __stop___jump_table;
135 struct jump_label_key *key = NULL;
285 struct jump_entry *iter; 136 struct jump_entry *iter;
286 137
287 jump_label_lock(); 138 jump_label_lock();
288 ret = build_jump_label_hashtable(__start___jump_table, 139 jump_label_sort_entries(iter_start, iter_stop);
289 __stop___jump_table); 140
290 iter = iter_start; 141 for (iter = iter_start; iter < iter_stop; iter++) {
291 while (iter < iter_stop) {
292 arch_jump_label_text_poke_early(iter->code); 142 arch_jump_label_text_poke_early(iter->code);
293 iter++; 143 if (iter->key == (jump_label_t)(unsigned long)key)
144 continue;
145
146 key = (struct jump_label_key *)(unsigned long)iter->key;
147 atomic_set(&key->enabled, 0);
148 key->entries = iter;
149#ifdef CONFIG_MODULES
150 key->next = NULL;
151#endif
294 } 152 }
295 jump_label_unlock(); 153 jump_label_unlock();
296 return ret; 154
155 return 0;
297} 156}
298early_initcall(init_jump_label); 157early_initcall(jump_label_init);
299 158
300#ifdef CONFIG_MODULES 159#ifdef CONFIG_MODULES
301 160
302static struct jump_label_module_entry * 161struct jump_label_mod {
303add_jump_label_module_entry(struct jump_label_entry *entry, 162 struct jump_label_mod *next;
304 struct jump_entry *iter_begin, 163 struct jump_entry *entries;
305 int count, struct module *mod) 164 struct module *mod;
165};
166
167static int __jump_label_mod_text_reserved(void *start, void *end)
306{ 168{
307 struct jump_label_module_entry *e; 169 struct module *mod;
308 170
309 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); 171 mod = __module_text_address((unsigned long)start);
310 if (!e) 172 if (!mod)
311 return ERR_PTR(-ENOMEM); 173 return 0;
312 e->mod = mod; 174
313 e->nr_entries = count; 175 WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
314 e->table = iter_begin; 176
315 hlist_add_head(&e->hlist, &entry->modules); 177 return __jump_label_text_reserved(mod->jump_entries,
316 return e; 178 mod->jump_entries + mod->num_jump_entries,
179 start, end);
317} 180}
318 181
319static int add_jump_label_module(struct module *mod) 182static void __jump_label_mod_update(struct jump_label_key *key, int enable)
320{ 183{
321 struct jump_entry *iter, *iter_begin; 184 struct jump_label_mod *mod = key->next;
322 struct jump_label_entry *entry;
323 struct jump_label_module_entry *module_entry;
324 int count;
325 185
326 /* if the module doesn't have jump label entries, just return */ 186 while (mod) {
327 if (!mod->num_jump_entries) 187 struct module *m = mod->mod;
328 return 0;
329 188
330 sort_jump_label_entries(mod->jump_entries, 189 __jump_label_update(key, mod->entries,
331 mod->jump_entries + mod->num_jump_entries); 190 m->jump_entries + m->num_jump_entries,
332 iter = mod->jump_entries; 191 enable);
333 while (iter < mod->jump_entries + mod->num_jump_entries) { 192 mod = mod->next;
334 entry = get_jump_label_entry(iter->key);
335 iter_begin = iter;
336 count = 0;
337 while ((iter < mod->jump_entries + mod->num_jump_entries) &&
338 (iter->key == iter_begin->key)) {
339 iter++;
340 count++;
341 }
342 if (!entry) {
343 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
344 if (IS_ERR(entry))
345 return PTR_ERR(entry);
346 }
347 module_entry = add_jump_label_module_entry(entry, iter_begin,
348 count, mod);
349 if (IS_ERR(module_entry))
350 return PTR_ERR(module_entry);
351 } 193 }
352 return 0;
353} 194}
354 195
355static void remove_jump_label_module(struct module *mod) 196/***
197 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
198 * @mod: module to patch
199 *
200 * Allow for run-time selection of the optimal nops. Before the module
201 * loads patch these with arch_get_jump_label_nop(), which is specified by
202 * the arch specific jump label code.
203 */
204void jump_label_apply_nops(struct module *mod)
356{ 205{
357 struct hlist_head *head; 206 struct jump_entry *iter_start = mod->jump_entries;
358 struct hlist_node *node, *node_next, *module_node, *module_node_next; 207 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
359 struct jump_label_entry *e; 208 struct jump_entry *iter;
360 struct jump_label_module_entry *e_module;
361 int i;
362 209
363 /* if the module doesn't have jump label entries, just return */ 210 /* if the module doesn't have jump label entries, just return */
364 if (!mod->num_jump_entries) 211 if (iter_start == iter_stop)
365 return; 212 return;
366 213
367 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 214 for (iter = iter_start; iter < iter_stop; iter++)
368 head = &jump_label_table[i]; 215 arch_jump_label_text_poke_early(iter->code);
369 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 216}
370 hlist_for_each_entry_safe(e_module, module_node, 217
371 module_node_next, 218static int jump_label_add_module(struct module *mod)
372 &(e->modules), hlist) { 219{
373 if (e_module->mod == mod) { 220 struct jump_entry *iter_start = mod->jump_entries;
374 hlist_del(&e_module->hlist); 221 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
375 kfree(e_module); 222 struct jump_entry *iter;
376 } 223 struct jump_label_key *key = NULL;
377 } 224 struct jump_label_mod *jlm;
378 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { 225
379 hlist_del(&e->hlist); 226 /* if the module doesn't have jump label entries, just return */
380 kfree(e); 227 if (iter_start == iter_stop)
381 } 228 return 0;
229
230 jump_label_sort_entries(iter_start, iter_stop);
231
232 for (iter = iter_start; iter < iter_stop; iter++) {
233 if (iter->key == (jump_label_t)(unsigned long)key)
234 continue;
235
236 key = (struct jump_label_key *)(unsigned long)iter->key;
237
238 if (__module_address(iter->key) == mod) {
239 atomic_set(&key->enabled, 0);
240 key->entries = iter;
241 key->next = NULL;
242 continue;
382 } 243 }
244
245 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
246 if (!jlm)
247 return -ENOMEM;
248
249 jlm->mod = mod;
250 jlm->entries = iter;
251 jlm->next = key->next;
252 key->next = jlm;
253
254 if (jump_label_enabled(key))
255 __jump_label_update(key, iter, iter_stop,
256 JUMP_LABEL_ENABLE);
383 } 257 }
258
259 return 0;
384} 260}
385 261
386static void remove_jump_label_module_init(struct module *mod) 262static void jump_label_del_module(struct module *mod)
387{ 263{
388 struct hlist_head *head; 264 struct jump_entry *iter_start = mod->jump_entries;
389 struct hlist_node *node, *node_next, *module_node, *module_node_next; 265 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
390 struct jump_label_entry *e;
391 struct jump_label_module_entry *e_module;
392 struct jump_entry *iter; 266 struct jump_entry *iter;
393 int i, count; 267 struct jump_label_key *key = NULL;
268 struct jump_label_mod *jlm, **prev;
394 269
395 /* if the module doesn't have jump label entries, just return */ 270 for (iter = iter_start; iter < iter_stop; iter++) {
396 if (!mod->num_jump_entries) 271 if (iter->key == (jump_label_t)(unsigned long)key)
397 return; 272 continue;
273
274 key = (struct jump_label_key *)(unsigned long)iter->key;
275
276 if (__module_address(iter->key) == mod)
277 continue;
278
279 prev = &key->next;
280 jlm = key->next;
281
282 while (jlm && jlm->mod != mod) {
283 prev = &jlm->next;
284 jlm = jlm->next;
285 }
398 286
399 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 287 if (jlm) {
400 head = &jump_label_table[i]; 288 *prev = jlm->next;
401 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 289 kfree(jlm);
402 hlist_for_each_entry_safe(e_module, module_node,
403 module_node_next,
404 &(e->modules), hlist) {
405 if (e_module->mod != mod)
406 continue;
407 count = e_module->nr_entries;
408 iter = e_module->table;
409 while (count--) {
410 if (within_module_init(iter->code, mod))
411 iter->key = 0;
412 iter++;
413 }
414 }
415 } 290 }
416 } 291 }
417} 292}
418 293
294static void jump_label_invalidate_module_init(struct module *mod)
295{
296 struct jump_entry *iter_start = mod->jump_entries;
297 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
298 struct jump_entry *iter;
299
300 for (iter = iter_start; iter < iter_stop; iter++) {
301 if (within_module_init(iter->code, mod))
302 iter->code = 0;
303 }
304}
305
419static int 306static int
420jump_label_module_notify(struct notifier_block *self, unsigned long val, 307jump_label_module_notify(struct notifier_block *self, unsigned long val,
421 void *data) 308 void *data)
@@ -426,59 +313,81 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
426 switch (val) { 313 switch (val) {
427 case MODULE_STATE_COMING: 314 case MODULE_STATE_COMING:
428 jump_label_lock(); 315 jump_label_lock();
429 ret = add_jump_label_module(mod); 316 ret = jump_label_add_module(mod);
430 if (ret) 317 if (ret)
431 remove_jump_label_module(mod); 318 jump_label_del_module(mod);
432 jump_label_unlock(); 319 jump_label_unlock();
433 break; 320 break;
434 case MODULE_STATE_GOING: 321 case MODULE_STATE_GOING:
435 jump_label_lock(); 322 jump_label_lock();
436 remove_jump_label_module(mod); 323 jump_label_del_module(mod);
437 jump_label_unlock(); 324 jump_label_unlock();
438 break; 325 break;
439 case MODULE_STATE_LIVE: 326 case MODULE_STATE_LIVE:
440 jump_label_lock(); 327 jump_label_lock();
441 remove_jump_label_module_init(mod); 328 jump_label_invalidate_module_init(mod);
442 jump_label_unlock(); 329 jump_label_unlock();
443 break; 330 break;
444 } 331 }
445 return ret;
446}
447
448/***
449 * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop()
450 * @mod: module to patch
451 *
452 * Allow for run-time selection of the optimal nops. Before the module
453 * loads patch these with arch_get_jump_label_nop(), which is specified by
454 * the arch specific jump label code.
455 */
456void jump_label_apply_nops(struct module *mod)
457{
458 struct jump_entry *iter;
459
460 /* if the module doesn't have jump label entries, just return */
461 if (!mod->num_jump_entries)
462 return;
463 332
464 iter = mod->jump_entries; 333 return notifier_from_errno(ret);
465 while (iter < mod->jump_entries + mod->num_jump_entries) {
466 arch_jump_label_text_poke_early(iter->code);
467 iter++;
468 }
469} 334}
470 335
471struct notifier_block jump_label_module_nb = { 336struct notifier_block jump_label_module_nb = {
472 .notifier_call = jump_label_module_notify, 337 .notifier_call = jump_label_module_notify,
473 .priority = 0, 338 .priority = 1, /* higher than tracepoints */
474}; 339};
475 340
476static __init int init_jump_label_module(void) 341static __init int jump_label_init_module(void)
477{ 342{
478 return register_module_notifier(&jump_label_module_nb); 343 return register_module_notifier(&jump_label_module_nb);
479} 344}
480early_initcall(init_jump_label_module); 345early_initcall(jump_label_init_module);
481 346
482#endif /* CONFIG_MODULES */ 347#endif /* CONFIG_MODULES */
483 348
349/***
350 * jump_label_text_reserved - check if addr range is reserved
351 * @start: start text addr
352 * @end: end text addr
353 *
354 * checks if the text addr located between @start and @end
355 * overlaps with any of the jump label patch addresses. Code
356 * that wants to modify kernel text should first verify that
357 * it does not overlap with any of the jump label addresses.
358 * Caller must hold jump_label_mutex.
359 *
360 * returns 1 if there is an overlap, 0 otherwise
361 */
362int jump_label_text_reserved(void *start, void *end)
363{
364 int ret = __jump_label_text_reserved(__start___jump_table,
365 __stop___jump_table, start, end);
366
367 if (ret)
368 return ret;
369
370#ifdef CONFIG_MODULES
371 ret = __jump_label_mod_text_reserved(start, end);
372#endif
373 return ret;
374}
375
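As the kerneldoc above says, code that patches kernel text should verify the range first while holding jump_label_mutex. A hypothetical caller sketch; demo_can_patch() is illustrative, and jump_label_lock()/jump_label_unlock() provide the required mutex:

static int demo_can_patch(void *addr, size_t len)
{
	int reserved;

	jump_label_lock();
	reserved = jump_label_text_reserved(addr, addr + len);
	jump_label_unlock();

	return !reserved;	/* non-zero means the range is safe to patch */
}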
376static void jump_label_update(struct jump_label_key *key, int enable)
377{
378 struct jump_entry *entry = key->entries, *stop = __stop___jump_table;
379
380#ifdef CONFIG_MODULES
381 struct module *mod = __module_address((jump_label_t)key);
382
383 __jump_label_mod_update(key, enable);
384
385 if (mod)
386 stop = mod->jump_entries + mod->num_jump_entries;
387#endif
388 /* if there are no users, entry can be NULL */
389 if (entry)
390 __jump_label_update(key, entry, stop, enable);
391}
392
484#endif 393#endif
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87b77de03dd3..296fbc84d659 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1095,7 +1095,7 @@ size_t crash_get_memory_size(void)
1095 size_t size = 0; 1095 size_t size = 0;
1096 mutex_lock(&kexec_mutex); 1096 mutex_lock(&kexec_mutex);
1097 if (crashk_res.end != crashk_res.start) 1097 if (crashk_res.end != crashk_res.start)
1098 size = crashk_res.end - crashk_res.start + 1; 1098 size = resource_size(&crashk_res);
1099 mutex_unlock(&kexec_mutex); 1099 mutex_unlock(&kexec_mutex);
1100 return size; 1100 return size;
1101} 1101}
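The resource_size() conversion above is equivalent to the open-coded arithmetic it replaces; roughly, the helper amounts to the following inclusive-range calculation (sketch only, see include/linux/ioport.h for the real definition):

static inline resource_size_t demo_resource_size(const struct resource *res)
{
	return res->end - res->start + 1;	/* inclusive [start, end] range */
}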
@@ -1531,13 +1531,7 @@ int kernel_kexec(void)
1531 if (error) 1531 if (error)
1532 goto Enable_cpus; 1532 goto Enable_cpus;
1533 local_irq_disable(); 1533 local_irq_disable();
1534 /* Suspend system devices */ 1534 error = syscore_suspend();
1535 error = sysdev_suspend(PMSG_FREEZE);
1536 if (!error) {
1537 error = syscore_suspend();
1538 if (error)
1539 sysdev_resume();
1540 }
1541 if (error) 1535 if (error)
1542 goto Enable_irqs; 1536 goto Enable_irqs;
1543 } else 1537 } else
@@ -1553,7 +1547,6 @@ int kernel_kexec(void)
1553#ifdef CONFIG_KEXEC_JUMP 1547#ifdef CONFIG_KEXEC_JUMP
1554 if (kexec_image->preserve_context) { 1548 if (kexec_image->preserve_context) {
1555 syscore_resume(); 1549 syscore_resume();
1556 sysdev_resume();
1557 Enable_irqs: 1550 Enable_irqs:
1558 local_irq_enable(); 1551 local_irq_enable();
1559 Enable_cpus: 1552 Enable_cpus:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9cd0591c96a2..ddc7644c1305 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,7 @@
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/completion.h> 27#include <linux/completion.h>
28#include <linux/cred.h>
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/fdtable.h> 30#include <linux/fdtable.h>
30#include <linux/workqueue.h> 31#include <linux/workqueue.h>
@@ -43,6 +44,13 @@ extern int max_threads;
43 44
44static struct workqueue_struct *khelper_wq; 45static struct workqueue_struct *khelper_wq;
45 46
47#define CAP_BSET (void *)1
48#define CAP_PI (void *)2
49
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock);
53
46#ifdef CONFIG_MODULES 54#ifdef CONFIG_MODULES
47 55
48/* 56/*
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module);
132static int ____call_usermodehelper(void *data) 140static int ____call_usermodehelper(void *data)
133{ 141{
134 struct subprocess_info *sub_info = data; 142 struct subprocess_info *sub_info = data;
143 struct cred *new;
135 int retval; 144 int retval;
136 145
137 spin_lock_irq(&current->sighand->siglock); 146 spin_lock_irq(&current->sighand->siglock);
@@ -147,12 +156,27 @@ static int ____call_usermodehelper(void *data)
147 */ 156 */
148 set_user_nice(current, 0); 157 set_user_nice(current, 0);
149 158
159 retval = -ENOMEM;
160 new = prepare_kernel_cred(current);
161 if (!new)
162 goto fail;
163
164 spin_lock(&umh_sysctl_lock);
165 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
166 new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
167 new->cap_inheritable);
168 spin_unlock(&umh_sysctl_lock);
169
150 if (sub_info->init) { 170 if (sub_info->init) {
151 retval = sub_info->init(sub_info); 171 retval = sub_info->init(sub_info, new);
152 if (retval) 172 if (retval) {
173 abort_creds(new);
153 goto fail; 174 goto fail;
175 }
154 } 176 }
155 177
178 commit_creds(new);
179
156 retval = kernel_execve(sub_info->path, 180 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv, 181 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp); 182 (const char *const *)sub_info->envp);
@@ -245,13 +269,12 @@ static void __call_usermodehelper(struct work_struct *work)
245 } 269 }
246} 270}
247 271
248#ifdef CONFIG_PM_SLEEP
249/* 272/*
250 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 273 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
251 * (used for preventing user land processes from being created after the user 274 * (used for preventing user land processes from being created after the user
252 * land has been frozen during a system-wide hibernation or suspend operation). 275 * land has been frozen during a system-wide hibernation or suspend operation).
253 */ 276 */
254static int usermodehelper_disabled; 277static int usermodehelper_disabled = 1;
255 278
256/* Number of helpers running */ 279/* Number of helpers running */
257static atomic_t running_helpers = ATOMIC_INIT(0); 280static atomic_t running_helpers = ATOMIC_INIT(0);
@@ -301,6 +324,15 @@ void usermodehelper_enable(void)
301 usermodehelper_disabled = 0; 324 usermodehelper_disabled = 0;
302} 325}
303 326
327/**
328 * usermodehelper_is_disabled - check if new helpers are allowed to be started
329 */
330bool usermodehelper_is_disabled(void)
331{
332 return usermodehelper_disabled;
333}
334EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
335
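A hypothetical caller sketch for the new predicate: code that would otherwise spawn a helper while userspace is frozen can bail out early. The demo_* name and the UMH_WAIT_EXEC choice are illustrative:

static int demo_start_helper(char *path, char **argv, char **envp)
{
	if (usermodehelper_is_disabled())
		return -EBUSY;	/* userspace is frozen, don't even try */

	return call_usermodehelper(path, argv, envp, UMH_WAIT_EXEC);
}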
304static void helper_lock(void) 336static void helper_lock(void)
305{ 337{
306 atomic_inc(&running_helpers); 338 atomic_inc(&running_helpers);
@@ -312,12 +344,6 @@ static void helper_unlock(void)
312 if (atomic_dec_and_test(&running_helpers)) 344 if (atomic_dec_and_test(&running_helpers))
313 wake_up(&running_helpers_waitq); 345 wake_up(&running_helpers_waitq);
314} 346}
315#else /* CONFIG_PM_SLEEP */
316#define usermodehelper_disabled 0
317
318static inline void helper_lock(void) {}
319static inline void helper_unlock(void) {}
320#endif /* CONFIG_PM_SLEEP */
321 347
322/** 348/**
323 * call_usermodehelper_setup - prepare to call a usermode helper 349 * call_usermodehelper_setup - prepare to call a usermode helper
@@ -364,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
364 * context in which call_usermodehelper_exec is called. 390 * context in which call_usermodehelper_exec is called.
365 */ 391 */
366void call_usermodehelper_setfns(struct subprocess_info *info, 392void call_usermodehelper_setfns(struct subprocess_info *info,
367 int (*init)(struct subprocess_info *info), 393 int (*init)(struct subprocess_info *info, struct cred *new),
368 void (*cleanup)(struct subprocess_info *info), 394 void (*cleanup)(struct subprocess_info *info),
369 void *data) 395 void *data)
370{ 396{
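With the init() callback now also receiving the helper's prepared credentials, a caller can adjust them before the exec. A hypothetical sketch against the new signature; the demo_* names and the GFP/wait flags are illustrative:

static int demo_umh_init(struct subprocess_info *info, struct cred *new)
{
	/* adjust @new here (groups, keyrings, ...) if needed; 0 keeps it as is */
	return 0;
}

static int demo_run_helper(char *path, char **argv, char **envp)
{
	struct subprocess_info *info;

	info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	call_usermodehelper_setfns(info, demo_umh_init, NULL, NULL);
	return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
}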
@@ -418,6 +444,84 @@ unlock:
418} 444}
419EXPORT_SYMBOL(call_usermodehelper_exec); 445EXPORT_SYMBOL(call_usermodehelper_exec);
420 446
447static int proc_cap_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, loff_t *ppos)
449{
450 struct ctl_table t;
451 unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
452 kernel_cap_t new_cap;
453 int err, i;
454
455 if (write && (!capable(CAP_SETPCAP) ||
456 !capable(CAP_SYS_MODULE)))
457 return -EPERM;
458
459 /*
460 * convert from the global kernel_cap_t to the ulong array to print to
461 * userspace if this is a read.
462 */
463 spin_lock(&umh_sysctl_lock);
464 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
465 if (table->data == CAP_BSET)
466 cap_array[i] = usermodehelper_bset.cap[i];
467 else if (table->data == CAP_PI)
468 cap_array[i] = usermodehelper_inheritable.cap[i];
469 else
470 BUG();
471 }
472 spin_unlock(&umh_sysctl_lock);
473
474 t = *table;
475 t.data = &cap_array;
476
477 /*
478 * actually read or write an array of ulongs from userspace. Remember
479 * these are least significant 32 bits first
480 */
481 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
482 if (err < 0)
483 return err;
484
485 /*
486 * convert from the sysctl array of ulongs to the kernel_cap_t
487 * internal representation
488 */
489 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
490 new_cap.cap[i] = cap_array[i];
491
492 /*
493 * Drop everything not in the new_cap (but don't add things)
494 */
495 spin_lock(&umh_sysctl_lock);
496 if (write) {
497 if (table->data == CAP_BSET)
498 usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
499 if (table->data == CAP_PI)
500 usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
501 }
502 spin_unlock(&umh_sysctl_lock);
503
504 return 0;
505}
506
507struct ctl_table usermodehelper_table[] = {
508 {
509 .procname = "bset",
510 .data = CAP_BSET,
511 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
512 .mode = 0600,
513 .proc_handler = proc_cap_handler,
514 },
515 {
516 .procname = "inheritable",
517 .data = CAP_PI,
518 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
519 .mode = 0600,
520 .proc_handler = proc_cap_handler,
521 },
522 { }
523};
524
421void __init usermodehelper_init(void) 525void __init usermodehelper_init(void)
422{ 526{
423 khelper_wq = create_singlethread_workqueue("khelper"); 527 khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 77981813a1e7..b30fd54eb985 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
1255/* 1255/*
1256 * If we have a symbol_name argument, look it up and add the offset field 1256 * If we have a symbol_name argument, look it up and add the offset field
1257 * to it. This way, we can specify a relative address to a symbol. 1257 * to it. This way, we can specify a relative address to a symbol.
1258 * This returns encoded errors if it fails to look up the symbol or if the
1259 * combination of parameters is invalid.
1258 */ 1260 */
1259static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) 1261static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1260{ 1262{
1261 kprobe_opcode_t *addr = p->addr; 1263 kprobe_opcode_t *addr = p->addr;
1264
1265 if ((p->symbol_name && p->addr) ||
1266 (!p->symbol_name && !p->addr))
1267 goto invalid;
1268
1262 if (p->symbol_name) { 1269 if (p->symbol_name) {
1263 if (addr)
1264 return NULL;
1265 kprobe_lookup_name(p->symbol_name, addr); 1270 kprobe_lookup_name(p->symbol_name, addr);
1271 if (!addr)
1272 return ERR_PTR(-ENOENT);
1266 } 1273 }
1267 1274
1268 if (!addr) 1275 addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
1269 return NULL; 1276 if (addr)
1270 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 1277 return addr;
1278
1279invalid:
1280 return ERR_PTR(-EINVAL);
1271} 1281}
1272 1282
1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1283/* Check passed kprobe is valid and return kprobe in kprobe_table. */
@@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1311 kprobe_opcode_t *addr; 1321 kprobe_opcode_t *addr;
1312 1322
1313 addr = kprobe_addr(p); 1323 addr = kprobe_addr(p);
1314 if (!addr) 1324 if (IS_ERR(addr))
1315 return -EINVAL; 1325 return PTR_ERR(addr);
1316 p->addr = addr; 1326 p->addr = addr;
1317 1327
1318 ret = check_kprobe_rereg(p); 1328 ret = check_kprobe_rereg(p);
@@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1335 */ 1345 */
1336 probed_mod = __module_text_address((unsigned long) p->addr); 1346 probed_mod = __module_text_address((unsigned long) p->addr);
1337 if (probed_mod) { 1347 if (probed_mod) {
1348 /* Return -ENOENT if fail. */
1349 ret = -ENOENT;
1338 /* 1350 /*
1339 * We must hold a refcount of the probed module while updating 1351 * We must hold a refcount of the probed module while updating
1340 * its code to prohibit unexpected unloading. 1352 * its code to prohibit unexpected unloading.
@@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1351 module_put(probed_mod); 1363 module_put(probed_mod);
1352 goto fail_with_jump_label; 1364 goto fail_with_jump_label;
1353 } 1365 }
1366 /* ret will be updated by following code */
1354 } 1367 }
1355 preempt_enable(); 1368 preempt_enable();
1356 jump_label_unlock(); 1369 jump_label_unlock();
@@ -1399,7 +1412,7 @@ out:
1399fail_with_jump_label: 1412fail_with_jump_label:
1400 preempt_enable(); 1413 preempt_enable();
1401 jump_label_unlock(); 1414 jump_label_unlock();
1402 return -EINVAL; 1415 return ret;
1403} 1416}
1404EXPORT_SYMBOL_GPL(register_kprobe); 1417EXPORT_SYMBOL_GPL(register_kprobe);
1405 1418
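With the stricter kprobe_addr() above, exactly one of .symbol_name and .addr may be set, and failures come back as distinct codes (-EINVAL for a bad combination, -ENOENT for a failed lookup). A hypothetical registration sketch; the probed symbol and demo_* names are only examples:

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* observe only, let execution continue */
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_fork",	/* .addr deliberately left NULL */
	.pre_handler	= demo_pre,
};

static int __init demo_kprobe_init(void)
{
	/* would now fail with -EINVAL if both .symbol_name and .addr were set */
	return register_kprobe(&demo_kp);
}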
@@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1686 1699
1687 if (kretprobe_blacklist_size) { 1700 if (kretprobe_blacklist_size) {
1688 addr = kprobe_addr(&rp->kp); 1701 addr = kprobe_addr(&rp->kp);
1689 if (!addr) 1702 if (IS_ERR(addr))
1690 return -EINVAL; 1703 return PTR_ERR(addr);
1691 1704
1692 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 1705 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
1693 if (kretprobe_blacklist[i].addr == addr) 1706 if (kretprobe_blacklist[i].addr == addr)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0b624e791805..3b053c04dd86 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -16,6 +16,7 @@
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/capability.h>
19 20
20#define KERNEL_ATTR_RO(_name) \ 21#define KERNEL_ATTR_RO(_name) \
21static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 22static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo);
131 132
132#endif /* CONFIG_KEXEC */ 133#endif /* CONFIG_KEXEC */
133 134
135/* whether file capabilities are enabled */
136static ssize_t fscaps_show(struct kobject *kobj,
137 struct kobj_attribute *attr, char *buf)
138{
139 return sprintf(buf, "%d\n", file_caps_enabled);
140}
141KERNEL_ATTR_RO(fscaps);
142
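Assuming the attribute appears under the kernel kobject as /sys/kernel/fscaps (the path is inferred from kernel_kobj, not stated in this hunk), a small userspace sketch to read it:

#include <stdio.h>

int main(void)
{
	char buf[8] = "";
	FILE *f = fopen("/sys/kernel/fscaps", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("file capabilities enabled: %s", buf);
	fclose(f);
	return 0;
}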
134/* 143/*
135 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 144 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
136 */ 145 */
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj;
158EXPORT_SYMBOL_GPL(kernel_kobj); 167EXPORT_SYMBOL_GPL(kernel_kobj);
159 168
160static struct attribute * kernel_attrs[] = { 169static struct attribute * kernel_attrs[] = {
170 &fscaps_attr.attr,
161#if defined(CONFIG_HOTPLUG) 171#if defined(CONFIG_HOTPLUG)
162 &uevent_seqnum_attr.attr, 172 &uevent_seqnum_attr.attr,
163 &uevent_helper_attr.attr, 173 &uevent_helper_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3b34d2732bce..4ba7cccb4994 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
202 return; 202 return;
203 } 203 }
204 204
205 p->cpus_allowed = cpumask_of_cpu(cpu); 205 /* It's safe because the task is inactive. */
206 p->rt.nr_cpus_allowed = 1; 206 do_set_cpus_allowed(p, cpumask_of(cpu));
207 p->flags |= PF_THREAD_BOUND; 207 p->flags |= PF_THREAD_BOUND;
208} 208}
209EXPORT_SYMBOL(kthread_bind); 209EXPORT_SYMBOL(kthread_bind);
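For context, the pattern the new comment relies on: create the kthread, bind it while it is still inactive, then wake it. The demo_* names are illustrative:

static int demo_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	return 0;
}

static struct task_struct *demo_start_on(unsigned int cpu)
{
	struct task_struct *p;

	p = kthread_create(demo_thread_fn, NULL, "demo/%u", cpu);
	if (!IS_ERR(p)) {
		kthread_bind(p, cpu);	/* safe: the task has not run yet */
		wake_up_process(p);
	}
	return p;
}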
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 53a68956f131..8c24294e477f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
490 usage[i] = '\0'; 490 usage[i] = '\0';
491} 491}
492 492
493static int __print_lock_name(struct lock_class *class)
494{
495 char str[KSYM_NAME_LEN];
496 const char *name;
497
498 name = class->name;
499 if (!name)
500 name = __get_key_name(class->key, str);
501
502 return printk("%s", name);
503}
504
493static void print_lock_name(struct lock_class *class) 505static void print_lock_name(struct lock_class *class)
494{ 506{
495 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; 507 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
@@ -1053,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth)
1053 return 0; 1065 return 0;
1054} 1066}
1055 1067
1068static void
1069print_circular_lock_scenario(struct held_lock *src,
1070 struct held_lock *tgt,
1071 struct lock_list *prt)
1072{
1073 struct lock_class *source = hlock_class(src);
1074 struct lock_class *target = hlock_class(tgt);
1075 struct lock_class *parent = prt->class;
1076
1077 /*
1078 * A direct locking problem where unsafe_class lock is taken
1079 * directly by safe_class lock, then all we need to show
1080 * is the deadlock scenario, as it is obvious that the
1081 * unsafe lock is taken under the safe lock.
1082 *
1083 * But if there is a chain instead, where the safe lock takes
1084 * an intermediate lock (middle_class) where this lock is
1085 * not the same as the safe lock, then the lock chain is
1086 * used to describe the problem. Otherwise we would need
1087 * to show a different CPU case for each link in the chain
1088 * from the safe_class lock to the unsafe_class lock.
1089 */
1090 if (parent != source) {
1091 printk("Chain exists of:\n ");
1092 __print_lock_name(source);
1093 printk(" --> ");
1094 __print_lock_name(parent);
1095 printk(" --> ");
1096 __print_lock_name(target);
1097 printk("\n\n");
1098 }
1099
1100 printk(" Possible unsafe locking scenario:\n\n");
1101 printk(" CPU0 CPU1\n");
1102 printk(" ---- ----\n");
1103 printk(" lock(");
1104 __print_lock_name(target);
1105 printk(");\n");
1106 printk(" lock(");
1107 __print_lock_name(parent);
1108 printk(");\n");
1109 printk(" lock(");
1110 __print_lock_name(target);
1111 printk(");\n");
1112 printk(" lock(");
1113 __print_lock_name(source);
1114 printk(");\n");
1115 printk("\n *** DEADLOCK ***\n\n");
1116}
1117
1056/* 1118/*
1057 * When a circular dependency is detected, print the 1119 * When a circular dependency is detected, print the
1058 * header first: 1120 * header first:
@@ -1096,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1096{ 1158{
1097 struct task_struct *curr = current; 1159 struct task_struct *curr = current;
1098 struct lock_list *parent; 1160 struct lock_list *parent;
1161 struct lock_list *first_parent;
1099 int depth; 1162 int depth;
1100 1163
1101 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1164 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
@@ -1109,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1109 print_circular_bug_header(target, depth, check_src, check_tgt); 1172 print_circular_bug_header(target, depth, check_src, check_tgt);
1110 1173
1111 parent = get_lock_parent(target); 1174 parent = get_lock_parent(target);
1175 first_parent = parent;
1112 1176
1113 while (parent) { 1177 while (parent) {
1114 print_circular_bug_entry(parent, --depth); 1178 print_circular_bug_entry(parent, --depth);
@@ -1116,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this,
1116 } 1180 }
1117 1181
1118 printk("\nother info that might help us debug this:\n\n"); 1182 printk("\nother info that might help us debug this:\n\n");
1183 print_circular_lock_scenario(check_src, check_tgt,
1184 first_parent);
1185
1119 lockdep_print_held_locks(curr); 1186 lockdep_print_held_locks(curr);
1120 1187
1121 printk("\nstack backtrace:\n"); 1188 printk("\nstack backtrace:\n");
@@ -1314,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1314 printk("\n"); 1381 printk("\n");
1315 1382
1316 if (depth == 0 && (entry != root)) { 1383 if (depth == 0 && (entry != root)) {
1317 printk("lockdep:%s bad BFS generated tree\n", __func__); 1384 printk("lockdep:%s bad path found in chain graph\n", __func__);
1318 break; 1385 break;
1319 } 1386 }
1320 1387
@@ -1325,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1325 return; 1392 return;
1326} 1393}
1327 1394
1395static void
1396print_irq_lock_scenario(struct lock_list *safe_entry,
1397 struct lock_list *unsafe_entry,
1398 struct lock_class *prev_class,
1399 struct lock_class *next_class)
1400{
1401 struct lock_class *safe_class = safe_entry->class;
1402 struct lock_class *unsafe_class = unsafe_entry->class;
1403 struct lock_class *middle_class = prev_class;
1404
1405 if (middle_class == safe_class)
1406 middle_class = next_class;
1407
1408 /*
1409 * A direct locking problem where unsafe_class lock is taken
1410 * directly by safe_class lock, then all we need to show
1411 * is the deadlock scenario, as it is obvious that the
1412 * unsafe lock is taken under the safe lock.
1413 *
1414 * But if there is a chain instead, where the safe lock takes
1415 * an intermediate lock (middle_class) where this lock is
1416 * not the same as the safe lock, then the lock chain is
1417 * used to describe the problem. Otherwise we would need
1418 * to show a different CPU case for each link in the chain
1419 * from the safe_class lock to the unsafe_class lock.
1420 */
1421 if (middle_class != unsafe_class) {
1422 printk("Chain exists of:\n ");
1423 __print_lock_name(safe_class);
1424 printk(" --> ");
1425 __print_lock_name(middle_class);
1426 printk(" --> ");
1427 __print_lock_name(unsafe_class);
1428 printk("\n\n");
1429 }
1430
1431 printk(" Possible interrupt unsafe locking scenario:\n\n");
1432 printk(" CPU0 CPU1\n");
1433 printk(" ---- ----\n");
1434 printk(" lock(");
1435 __print_lock_name(unsafe_class);
1436 printk(");\n");
1437 printk(" local_irq_disable();\n");
1438 printk(" lock(");
1439 __print_lock_name(safe_class);
1440 printk(");\n");
1441 printk(" lock(");
1442 __print_lock_name(middle_class);
1443 printk(");\n");
1444 printk(" <Interrupt>\n");
1445 printk(" lock(");
1446 __print_lock_name(safe_class);
1447 printk(");\n");
1448 printk("\n *** DEADLOCK ***\n\n");
1449}
1450
1328static int 1451static int
1329print_bad_irq_dependency(struct task_struct *curr, 1452print_bad_irq_dependency(struct task_struct *curr,
1330 struct lock_list *prev_root, 1453 struct lock_list *prev_root,
@@ -1376,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr,
1376 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); 1499 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1377 1500
1378 printk("\nother info that might help us debug this:\n\n"); 1501 printk("\nother info that might help us debug this:\n\n");
1502 print_irq_lock_scenario(backwards_entry, forwards_entry,
1503 hlock_class(prev), hlock_class(next));
1504
1379 lockdep_print_held_locks(curr); 1505 lockdep_print_held_locks(curr);
1380 1506
1381 printk("\nthe dependencies between %s-irq-safe lock", irqclass); 1507 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
@@ -1539,6 +1665,26 @@ static inline void inc_chains(void)
1539 1665
1540#endif 1666#endif
1541 1667
1668static void
1669print_deadlock_scenario(struct held_lock *nxt,
1670 struct held_lock *prv)
1671{
1672 struct lock_class *next = hlock_class(nxt);
1673 struct lock_class *prev = hlock_class(prv);
1674
1675 printk(" Possible unsafe locking scenario:\n\n");
1676 printk(" CPU0\n");
1677 printk(" ----\n");
1678 printk(" lock(");
1679 __print_lock_name(prev);
1680 printk(");\n");
1681 printk(" lock(");
1682 __print_lock_name(next);
1683 printk(");\n");
1684 printk("\n *** DEADLOCK ***\n\n");
1685 printk(" May be due to missing lock nesting notation\n\n");
1686}
1687
1542static int 1688static int
1543print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 1689print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1544 struct held_lock *next) 1690 struct held_lock *next)
@@ -1557,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1557 print_lock(prev); 1703 print_lock(prev);
1558 1704
1559 printk("\nother info that might help us debug this:\n"); 1705 printk("\nother info that might help us debug this:\n");
1706 print_deadlock_scenario(next, prev);
1560 lockdep_print_held_locks(curr); 1707 lockdep_print_held_locks(curr);
1561 1708
1562 printk("\nstack backtrace:\n"); 1709 printk("\nstack backtrace:\n");
@@ -1826,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1826 struct list_head *hash_head = chainhashentry(chain_key); 1973 struct list_head *hash_head = chainhashentry(chain_key);
1827 struct lock_chain *chain; 1974 struct lock_chain *chain;
1828 struct held_lock *hlock_curr, *hlock_next; 1975 struct held_lock *hlock_curr, *hlock_next;
1829 int i, j, n, cn; 1976 int i, j;
1830 1977
1831 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1832 return 0; 1979 return 0;
@@ -1886,15 +2033,9 @@ cache_hit:
1886 } 2033 }
1887 i++; 2034 i++;
1888 chain->depth = curr->lockdep_depth + 1 - i; 2035 chain->depth = curr->lockdep_depth + 1 - i;
1889 cn = nr_chain_hlocks; 2036 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1890 while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { 2037 chain->base = nr_chain_hlocks;
1891 n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); 2038 nr_chain_hlocks += chain->depth;
1892 if (n == cn)
1893 break;
1894 cn = n;
1895 }
1896 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1897 chain->base = cn;
1898 for (j = 0; j < chain->depth - 1; j++, i++) { 2039 for (j = 0; j < chain->depth - 1; j++, i++) {
1899 int lock_id = curr->held_locks[i].class_idx - 1; 2040 int lock_id = curr->held_locks[i].class_idx - 1;
1900 chain_hlocks[chain->base + j] = lock_id; 2041 chain_hlocks[chain->base + j] = lock_id;
@@ -2011,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr)
2011#endif 2152#endif
2012} 2153}
2013 2154
2155static void
2156print_usage_bug_scenario(struct held_lock *lock)
2157{
2158 struct lock_class *class = hlock_class(lock);
2159
2160 printk(" Possible unsafe locking scenario:\n\n");
2161 printk(" CPU0\n");
2162 printk(" ----\n");
2163 printk(" lock(");
2164 __print_lock_name(class);
2165 printk(");\n");
2166 printk(" <Interrupt>\n");
2167 printk(" lock(");
2168 __print_lock_name(class);
2169 printk(");\n");
2170 printk("\n *** DEADLOCK ***\n\n");
2171}
2172
2014static int 2173static int
2015print_usage_bug(struct task_struct *curr, struct held_lock *this, 2174print_usage_bug(struct task_struct *curr, struct held_lock *this,
2016 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 2175 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
@@ -2039,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2039 2198
2040 print_irqtrace_events(curr); 2199 print_irqtrace_events(curr);
2041 printk("\nother info that might help us debug this:\n"); 2200 printk("\nother info that might help us debug this:\n");
2201 print_usage_bug_scenario(this);
2202
2042 lockdep_print_held_locks(curr); 2203 lockdep_print_held_locks(curr);
2043 2204
2044 printk("\nstack backtrace:\n"); 2205 printk("\nstack backtrace:\n");
@@ -2073,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2073 struct held_lock *this, int forwards, 2234 struct held_lock *this, int forwards,
2074 const char *irqclass) 2235 const char *irqclass)
2075{ 2236{
2237 struct lock_list *entry = other;
2238 struct lock_list *middle = NULL;
2239 int depth;
2240
2076 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2077 return 0; 2242 return 0;
2078 2243
@@ -2091,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr,
2091 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2256 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
2092 2257
2093 printk("\nother info that might help us debug this:\n"); 2258 printk("\nother info that might help us debug this:\n");
2259
2260 /* Find a middle lock (if one exists) */
2261 depth = get_lock_depth(other);
2262 do {
2263 if (depth == 0 && (entry != root)) {
2264 printk("lockdep:%s bad path found in chain graph\n", __func__);
2265 break;
2266 }
2267 middle = entry;
2268 entry = get_lock_parent(entry);
2269 depth--;
2270 } while (entry && entry != root && (depth >= 0));
2271 if (forwards)
2272 print_irq_lock_scenario(root, other,
2273 middle ? middle->class : root->class, other->class);
2274 else
2275 print_irq_lock_scenario(other, root,
2276 middle ? middle->class : other->class, root->class);
2277
2094 lockdep_print_held_locks(curr); 2278 lockdep_print_held_locks(curr);
2095 2279
2096 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); 2280 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
@@ -2284,6 +2468,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2284 2468
2285 BUG_ON(usage_bit >= LOCK_USAGE_STATES); 2469 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2286 2470
2471 if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys)
2472 continue;
2473
2287 if (!mark_lock(curr, hlock, usage_bit)) 2474 if (!mark_lock(curr, hlock, usage_bit))
2288 return 0; 2475 return 0;
2289 } 2476 }
@@ -2294,34 +2481,13 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2294/* 2481/*
2295 * Hardirqs will be enabled: 2482 * Hardirqs will be enabled:
2296 */ 2483 */
2297void trace_hardirqs_on_caller(unsigned long ip) 2484static void __trace_hardirqs_on_caller(unsigned long ip)
2298{ 2485{
2299 struct task_struct *curr = current; 2486 struct task_struct *curr = current;
2300 2487
2301 time_hardirqs_on(CALLER_ADDR0, ip);
2302
2303 if (unlikely(!debug_locks || current->lockdep_recursion))
2304 return;
2305
2306 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2307 return;
2308
2309 if (unlikely(curr->hardirqs_enabled)) {
2310 /*
2311 * Neither irq nor preemption are disabled here
2312 * so this is racy by nature but losing one hit
2313 * in a stat is not a big deal.
2314 */
2315 __debug_atomic_inc(redundant_hardirqs_on);
2316 return;
2317 }
2318 /* we'll do an OFF -> ON transition: */ 2488 /* we'll do an OFF -> ON transition: */
2319 curr->hardirqs_enabled = 1; 2489 curr->hardirqs_enabled = 1;
2320 2490
2321 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2322 return;
2323 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2324 return;
2325 /* 2491 /*
2326 * We are going to turn hardirqs on, so set the 2492 * We are going to turn hardirqs on, so set the
2327 * usage bit for all held locks: 2493 * usage bit for all held locks:
@@ -2341,6 +2507,37 @@ void trace_hardirqs_on_caller(unsigned long ip)
2341 curr->hardirq_enable_event = ++curr->irq_events; 2507 curr->hardirq_enable_event = ++curr->irq_events;
2342 debug_atomic_inc(hardirqs_on_events); 2508 debug_atomic_inc(hardirqs_on_events);
2343} 2509}
2510
2511void trace_hardirqs_on_caller(unsigned long ip)
2512{
2513 time_hardirqs_on(CALLER_ADDR0, ip);
2514
2515 if (unlikely(!debug_locks || current->lockdep_recursion))
2516 return;
2517
2518 if (unlikely(current->hardirqs_enabled)) {
2519 /*
2520 * Neither irq nor preemption are disabled here
2521 * so this is racy by nature but losing one hit
2522 * in a stat is not a big deal.
2523 */
2524 __debug_atomic_inc(redundant_hardirqs_on);
2525 return;
2526 }
2527
2528 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2529 return;
2530
2531 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2532 return;
2533
2534 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2535 return;
2536
2537 current->lockdep_recursion = 1;
2538 __trace_hardirqs_on_caller(ip);
2539 current->lockdep_recursion = 0;
2540}
2344EXPORT_SYMBOL(trace_hardirqs_on_caller); 2541EXPORT_SYMBOL(trace_hardirqs_on_caller);
2345 2542
2346void trace_hardirqs_on(void) 2543void trace_hardirqs_on(void)
@@ -2390,7 +2587,7 @@ void trace_softirqs_on(unsigned long ip)
2390{ 2587{
2391 struct task_struct *curr = current; 2588 struct task_struct *curr = current;
2392 2589
2393 if (unlikely(!debug_locks)) 2590 if (unlikely(!debug_locks || current->lockdep_recursion))
2394 return; 2591 return;
2395 2592
2396 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2593 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2401,6 +2598,7 @@ void trace_softirqs_on(unsigned long ip)
2401 return; 2598 return;
2402 } 2599 }
2403 2600
2601 current->lockdep_recursion = 1;
2404 /* 2602 /*
2405 * We'll do an OFF -> ON transition: 2603 * We'll do an OFF -> ON transition:
2406 */ 2604 */
@@ -2415,6 +2613,7 @@ void trace_softirqs_on(unsigned long ip)
2415 */ 2613 */
2416 if (curr->hardirqs_enabled) 2614 if (curr->hardirqs_enabled)
2417 mark_held_locks(curr, SOFTIRQ); 2615 mark_held_locks(curr, SOFTIRQ);
2616 current->lockdep_recursion = 0;
2418} 2617}
2419 2618
2420/* 2619/*
@@ -2424,7 +2623,7 @@ void trace_softirqs_off(unsigned long ip)
2424{ 2623{
2425 struct task_struct *curr = current; 2624 struct task_struct *curr = current;
2426 2625
2427 if (unlikely(!debug_locks)) 2626 if (unlikely(!debug_locks || current->lockdep_recursion))
2428 return; 2627 return;
2429 2628
2430 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2629 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2675,10 +2874,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2675void lockdep_init_map(struct lockdep_map *lock, const char *name, 2874void lockdep_init_map(struct lockdep_map *lock, const char *name,
2676 struct lock_class_key *key, int subclass) 2875 struct lock_class_key *key, int subclass)
2677{ 2876{
2678 int i; 2877 memset(lock, 0, sizeof(*lock));
2679
2680 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2681 lock->class_cache[i] = NULL;
2682 2878
2683#ifdef CONFIG_LOCK_STAT 2879#ifdef CONFIG_LOCK_STAT
2684 lock->cpu = raw_smp_processor_id(); 2880 lock->cpu = raw_smp_processor_id();
@@ -3242,7 +3438,7 @@ int lock_is_held(struct lockdep_map *lock)
3242 int ret = 0; 3438 int ret = 0;
3243 3439
3244 if (unlikely(current->lockdep_recursion)) 3440 if (unlikely(current->lockdep_recursion))
3245 return ret; 3441 return 1; /* avoid false negative lockdep_assert_held() */
3246 3442
3247 raw_local_irq_save(flags); 3443 raw_local_irq_save(flags);
3248 check_flags(flags); 3444 check_flags(flags);
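The changed return value matters for assertions built on lock_is_held(): while lockdep is recursing it now reports the lock as held instead of tripping them. A sketch, assuming the usual lockdep_assert_held() wrapper around lockdep_is_held()/lock_is_held(); the demo_* names are illustrative:

static DEFINE_MUTEX(demo_mutex);

static void demo_needs_mutex(void)
{
	/*
	 * Would warn spuriously if lock_is_held() returned 0 during
	 * lockdep's own recursion; with the change above it stays quiet.
	 */
	lockdep_assert_held(&demo_mutex);

	/* ... work that must run under demo_mutex ... */
}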
diff --git a/kernel/module.c b/kernel/module.c
index d5938a5c19c4..04379f92f843 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -57,6 +57,7 @@
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h> 59#include <linux/pfn.h>
60#include <linux/bsearch.h>
60 61
61#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
62#include <trace/events/module.h> 63#include <trace/events/module.h>
@@ -240,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr,
240 struct module *owner, 241 struct module *owner,
241 bool (*fn)(const struct symsearch *syms, 242 bool (*fn)(const struct symsearch *syms,
242 struct module *owner, 243 struct module *owner,
243 unsigned int symnum, void *data), 244 void *data),
244 void *data) 245 void *data)
245{ 246{
246 unsigned int i, j; 247 unsigned int j;
247 248
248 for (j = 0; j < arrsize; j++) { 249 for (j = 0; j < arrsize; j++) {
249 for (i = 0; i < arr[j].stop - arr[j].start; i++) 250 if (fn(&arr[j], owner, data))
250 if (fn(&arr[j], owner, i, data)) 251 return true;
251 return true;
252 } 252 }
253 253
254 return false; 254 return false;
255} 255}
256 256
257/* Returns true as soon as fn returns true, otherwise false. */ 257/* Returns true as soon as fn returns true, otherwise false. */
258bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, 258bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
259 unsigned int symnum, void *data), void *data) 259 struct module *owner,
260 void *data),
261 void *data)
260{ 262{
261 struct module *mod; 263 struct module *mod;
262 static const struct symsearch arr[] = { 264 static const struct symsearch arr[] = {
@@ -309,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
309 } 311 }
310 return false; 312 return false;
311} 313}
312EXPORT_SYMBOL_GPL(each_symbol); 314EXPORT_SYMBOL_GPL(each_symbol_section);
313 315
314struct find_symbol_arg { 316struct find_symbol_arg {
315 /* Input */ 317 /* Input */
@@ -323,15 +325,12 @@ struct find_symbol_arg {
323 const struct kernel_symbol *sym; 325 const struct kernel_symbol *sym;
324}; 326};
325 327
326static bool find_symbol_in_section(const struct symsearch *syms, 328static bool check_symbol(const struct symsearch *syms,
327 struct module *owner, 329 struct module *owner,
328 unsigned int symnum, void *data) 330 unsigned int symnum, void *data)
329{ 331{
330 struct find_symbol_arg *fsa = data; 332 struct find_symbol_arg *fsa = data;
331 333
332 if (strcmp(syms->start[symnum].name, fsa->name) != 0)
333 return false;
334
335 if (!fsa->gplok) { 334 if (!fsa->gplok) {
336 if (syms->licence == GPL_ONLY) 335 if (syms->licence == GPL_ONLY)
337 return false; 336 return false;
@@ -365,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms,
365 return true; 364 return true;
366} 365}
367 366
367static int cmp_name(const void *va, const void *vb)
368{
369 const char *a;
370 const struct kernel_symbol *b;
371 a = va; b = vb;
372 return strcmp(a, b->name);
373}
374
375static bool find_symbol_in_section(const struct symsearch *syms,
376 struct module *owner,
377 void *data)
378{
379 struct find_symbol_arg *fsa = data;
380 struct kernel_symbol *sym;
381
382 sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
383 sizeof(struct kernel_symbol), cmp_name);
384
385 if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
386 return true;
387
388 return false;
389}
390
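The lookup above relies on each exported-symbol table being sorted by name so that cmp_name() can drive bsearch(). A tiny self-contained illustration of the same pattern; the table below is hypothetical:

static const struct kernel_symbol demo_syms[] = {
	{ .value = 0x1000, .name = "demo_bar" },
	{ .value = 0x2000, .name = "demo_foo" },	/* kept sorted by name */
};

static const struct kernel_symbol *demo_lookup(const char *name)
{
	return bsearch(name, demo_syms, ARRAY_SIZE(demo_syms),
		       sizeof(demo_syms[0]), cmp_name);
}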
368/* Find a symbol and return it, along with, (optional) crc and 391/* Find a symbol and return it, along with, (optional) crc and
369 * (optional) module which owns it. Needs preempt disabled or module_mutex. */ 392 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
370const struct kernel_symbol *find_symbol(const char *name, 393const struct kernel_symbol *find_symbol(const char *name,
@@ -379,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name,
379 fsa.gplok = gplok; 402 fsa.gplok = gplok;
380 fsa.warn = warn; 403 fsa.warn = warn;
381 404
382 if (each_symbol(find_symbol_in_section, &fsa)) { 405 if (each_symbol_section(find_symbol_in_section, &fsa)) {
383 if (owner) 406 if (owner)
384 *owner = fsa.owner; 407 *owner = fsa.owner;
385 if (crc) 408 if (crc)
@@ -522,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
522 mod->field = kstrdup(s, GFP_KERNEL); \ 545 mod->field = kstrdup(s, GFP_KERNEL); \
523} \ 546} \
524static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ 547static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
525 struct module *mod, char *buffer) \ 548 struct module_kobject *mk, char *buffer) \
526{ \ 549{ \
527 return sprintf(buffer, "%s\n", mod->field); \ 550 return sprintf(buffer, "%s\n", mk->mod->field); \
528} \ 551} \
529static int modinfo_##field##_exists(struct module *mod) \ 552static int modinfo_##field##_exists(struct module *mod) \
530{ \ 553{ \
@@ -879,9 +902,9 @@ void symbol_put_addr(void *addr)
879EXPORT_SYMBOL_GPL(symbol_put_addr); 902EXPORT_SYMBOL_GPL(symbol_put_addr);
880 903
881static ssize_t show_refcnt(struct module_attribute *mattr, 904static ssize_t show_refcnt(struct module_attribute *mattr,
882 struct module *mod, char *buffer) 905 struct module_kobject *mk, char *buffer)
883{ 906{
884 return sprintf(buffer, "%u\n", module_refcount(mod)); 907 return sprintf(buffer, "%u\n", module_refcount(mk->mod));
885} 908}
886 909
887static struct module_attribute refcnt = { 910static struct module_attribute refcnt = {
@@ -929,11 +952,11 @@ static inline int module_unload_init(struct module *mod)
929#endif /* CONFIG_MODULE_UNLOAD */ 952#endif /* CONFIG_MODULE_UNLOAD */
930 953
931static ssize_t show_initstate(struct module_attribute *mattr, 954static ssize_t show_initstate(struct module_attribute *mattr,
932 struct module *mod, char *buffer) 955 struct module_kobject *mk, char *buffer)
933{ 956{
934 const char *state = "unknown"; 957 const char *state = "unknown";
935 958
936 switch (mod->state) { 959 switch (mk->mod->state) {
937 case MODULE_STATE_LIVE: 960 case MODULE_STATE_LIVE:
938 state = "live"; 961 state = "live";
939 break; 962 break;
@@ -952,10 +975,27 @@ static struct module_attribute initstate = {
952 .show = show_initstate, 975 .show = show_initstate,
953}; 976};
954 977
978static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk,
980 const char *buffer, size_t count)
981{
982 enum kobject_action action;
983
984 if (kobject_action_type(buffer, count, &action) == 0)
985 kobject_uevent(&mk->kobj, action);
986 return count;
987}
988
989struct module_attribute module_uevent = {
990 .attr = { .name = "uevent", .mode = 0200 },
991 .store = store_uevent,
992};
993
955static struct module_attribute *modinfo_attrs[] = { 994static struct module_attribute *modinfo_attrs[] = {
956 &modinfo_version, 995 &modinfo_version,
957 &modinfo_srcversion, 996 &modinfo_srcversion,
958 &initstate, 997 &initstate,
998 &module_uevent,
959#ifdef CONFIG_MODULE_UNLOAD 999#ifdef CONFIG_MODULE_UNLOAD
960 &refcnt, 1000 &refcnt,
961#endif 1001#endif
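Alongside the new uevent attribute, the show()/store() callbacks now take a struct module_kobject rather than a bare struct module. A hypothetical attribute written against the new signature; the demo names are illustrative:

static ssize_t demo_show(struct module_attribute *mattr,
			 struct module_kobject *mk, char *buffer)
{
	return sprintf(buffer, "%s\n", mk->mod->name);
}

static struct module_attribute demo_attr = {
	.attr = { .name = "demo", .mode = 0444 },
	.show = demo_show,
};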
@@ -1164,7 +1204,7 @@ struct module_sect_attrs
1164}; 1204};
1165 1205
1166static ssize_t module_sect_show(struct module_attribute *mattr, 1206static ssize_t module_sect_show(struct module_attribute *mattr,
1167 struct module *mod, char *buf) 1207 struct module_kobject *mk, char *buf)
1168{ 1208{
1169 struct module_sect_attr *sattr = 1209 struct module_sect_attr *sattr =
1170 container_of(mattr, struct module_sect_attr, mattr); 1210 container_of(mattr, struct module_sect_attr, mattr);
@@ -1607,27 +1647,28 @@ static void set_section_ro_nx(void *base,
1607 } 1647 }
1608} 1648}
1609 1649
1610/* Setting memory back to RW+NX before releasing it */ 1650static void unset_module_core_ro_nx(struct module *mod)
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{ 1651{
1613 unsigned long total_pages; 1652 set_page_attributes(mod->module_core + mod->core_text_size,
1614 1653 mod->module_core + mod->core_size,
1615 if (mod->module_core == module_region) { 1654 set_memory_x);
1616 /* Set core as NX+RW */ 1655 set_page_attributes(mod->module_core,
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); 1656 mod->module_core + mod->core_ro_size,
1618 set_memory_nx((unsigned long)mod->module_core, total_pages); 1657 set_memory_rw);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages); 1658}
1620 1659
1621 } else if (mod->module_init == module_region) { 1660static void unset_module_init_ro_nx(struct module *mod)
1622 /* Set init as NX+RW */ 1661{
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); 1662 set_page_attributes(mod->module_init + mod->init_text_size,
1624 set_memory_nx((unsigned long)mod->module_init, total_pages); 1663 mod->module_init + mod->init_size,
1625 set_memory_rw((unsigned long)mod->module_init, total_pages); 1664 set_memory_x);
1626 } 1665 set_page_attributes(mod->module_init,
1666 mod->module_init + mod->init_ro_size,
1667 set_memory_rw);
1627} 1668}
1628 1669
1629/* Iterate through all modules and set each module's text as RW */ 1670/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw() 1671void set_all_modules_text_rw(void)
1631{ 1672{
1632 struct module *mod; 1673 struct module *mod;
1633 1674
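
[Editor's note] unset_module_core_ro_nx()/unset_module_init_ro_nx() above undo the RO/NX protections over the text/ro/total sub-ranges of a module mapping before it is released. A rough userspace analogy, not the kernel implementation (which flips page attributes with set_memory_rw()/set_memory_x()), using mmap()/mprotect() over an invented core_text_size/core_ro_size/core_size split; the PROT_EXEC step may be refused on hardened systems:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t core_size      = 4 * page;  /* the whole "module core" */
	size_t core_text_size = 1 * page;  /* executable text         */
	size_t core_ro_size   = 2 * page;  /* text + rodata           */
	char *core;

	core = mmap(NULL, core_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (core == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Roughly what set_section_ro_nx() arranges: text R+X, rodata
	 * read-only, and the trailing data pages left RW (and NX). */
	if (mprotect(core, core_text_size, PROT_READ | PROT_EXEC) ||
	    mprotect(core + core_text_size, core_ro_size - core_text_size,
		     PROT_READ))
		perror("mprotect (protect)");

	/* Roughly what unset_module_core_ro_nx() arranges just before
	 * module_free(): the whole range back to ordinary RW. */
	if (mprotect(core, core_size, PROT_READ | PROT_WRITE))
		perror("mprotect (unprotect)");

	munmap(core, core_size);
	return 0;
}
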
@@ -1648,7 +1689,7 @@ void set_all_modules_text_rw()
1648} 1689}
1649 1690
1650/* Iterate through all modules and set each module's text as RO */ 1691/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro() 1692void set_all_modules_text_ro(void)
1652{ 1693{
1653 struct module *mod; 1694 struct module *mod;
1654 1695
@@ -1669,9 +1710,19 @@ void set_all_modules_text_ro()
1669} 1710}
1670#else 1711#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } 1712static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } 1713static void unset_module_core_ro_nx(struct module *mod) { }
1714static void unset_module_init_ro_nx(struct module *mod) { }
1673#endif 1715#endif
1674 1716
1717void __weak module_free(struct module *mod, void *module_region)
1718{
1719 vfree(module_region);
1720}
1721
1722void __weak module_arch_cleanup(struct module *mod)
1723{
1724}
1725
1675/* Free a module, remove from lists, etc. */ 1726/* Free a module, remove from lists, etc. */
1676static void free_module(struct module *mod) 1727static void free_module(struct module *mod)
1677{ 1728{
@@ -1696,7 +1747,7 @@ static void free_module(struct module *mod)
1696 destroy_params(mod->kp, mod->num_kp); 1747 destroy_params(mod->kp, mod->num_kp);
1697 1748
1698 /* This may be NULL, but that's OK */ 1749 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init); 1750 unset_module_init_ro_nx(mod);
1700 module_free(mod, mod->module_init); 1751 module_free(mod, mod->module_init);
1701 kfree(mod->args); 1752 kfree(mod->args);
1702 percpu_modfree(mod); 1753 percpu_modfree(mod);
@@ -1705,7 +1756,7 @@ static void free_module(struct module *mod)
1705 lockdep_free_key_range(mod->module_core, mod->core_size); 1756 lockdep_free_key_range(mod->module_core, mod->core_size);
1706 1757
1707 /* Finally, free the core (containing the module structure) */ 1758 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core); 1759 unset_module_core_ro_nx(mod);
1709 module_free(mod, mod->module_core); 1760 module_free(mod, mod->module_core);
1710 1761
1711#ifdef CONFIG_MPU 1762#ifdef CONFIG_MPU
@@ -1826,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1826 return ret; 1877 return ret;
1827} 1878}
1828 1879
1880int __weak apply_relocate(Elf_Shdr *sechdrs,
1881 const char *strtab,
1882 unsigned int symindex,
1883 unsigned int relsec,
1884 struct module *me)
1885{
1886 pr_err("module %s: REL relocation unsupported\n", me->name);
1887 return -ENOEXEC;
1888}
1889
1890int __weak apply_relocate_add(Elf_Shdr *sechdrs,
1891 const char *strtab,
1892 unsigned int symindex,
1893 unsigned int relsec,
1894 struct module *me)
1895{
1896 pr_err("module %s: RELA relocation unsupported\n", me->name);
1897 return -ENOEXEC;
1898}
1899
1829static int apply_relocations(struct module *mod, const struct load_info *info) 1900static int apply_relocations(struct module *mod, const struct load_info *info)
1830{ 1901{
1831 unsigned int i; 1902 unsigned int i;
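
[Editor's note] Several arch hooks in the hunks above (module_free(), module_arch_cleanup(), and the apply_relocate()/apply_relocate_add() stubs here) are now generic __weak defaults; an architecture overrides one simply by providing a strong definition of the same symbol. A self-contained sketch of that linker behaviour, with a deliberately simplified hook signature used only for the demo:

#include <stdio.h>

/* Weak generic fallback, analogous to the apply_relocate() stub above
 * (signature simplified for this demo). */
__attribute__((weak)) int apply_relocate(const char *modname)
{
	fprintf(stderr, "module %s: REL relocation unsupported\n", modname);
	return -1;	/* the kernel stub returns -ENOEXEC */
}

int main(void)
{
	/* With no strong definition linked in, the weak default runs.
	 * Linking another object file that defines a non-weak
	 * apply_relocate() would silently replace it, which is how an
	 * architecture overrides these generic hooks. */
	printf("default hook returned %d\n", apply_relocate("demo"));
	return 0;
}
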
@@ -2030,11 +2101,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
2030 const struct kernel_symbol *start, 2101 const struct kernel_symbol *start,
2031 const struct kernel_symbol *stop) 2102 const struct kernel_symbol *stop)
2032{ 2103{
2033 const struct kernel_symbol *ks = start; 2104 return bsearch(name, start, stop - start,
2034 for (; ks < stop; ks++) 2105 sizeof(struct kernel_symbol), cmp_name);
2035 if (strcmp(ks->name, name) == 0)
2036 return ks;
2037 return NULL;
2038} 2106}
2039 2107
2040static int is_exported(const char *name, unsigned long value, 2108static int is_exported(const char *name, unsigned long value,
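
[Editor's note] lookup_symbol() above replaces the linear scan with bsearch() over the name-sorted export table, using module.c's cmp_name() comparator (defined outside this hunk). A standalone userspace sketch of the same lookup over an invented four-entry table:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct kernel_symbol {
	unsigned long value;
	const char *name;
};

/* The table must already be sorted by name for bsearch() to be valid. */
static const struct kernel_symbol symtab[] = {
	{ 0x1000, "add_timer" },
	{ 0x2000, "kfree"     },
	{ 0x3000, "printk"    },
	{ 0x4000, "schedule"  },
};

/* "key vs. element" comparator, as bsearch() expects. */
static int cmp_name(const void *name, const void *sym)
{
	return strcmp(name, ((const struct kernel_symbol *)sym)->name);
}

static const struct kernel_symbol *lookup_symbol(const char *name)
{
	return bsearch(name, symtab, sizeof(symtab) / sizeof(symtab[0]),
		       sizeof(symtab[0]), cmp_name);
}

int main(void)
{
	const struct kernel_symbol *ks = lookup_symbol("printk");

	printf("%s -> %#lx\n", ks ? ks->name : "?", ks ? ks->value : 0UL);
	return 0;
}
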
@@ -2213,6 +2281,11 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2213 ddebug_remove_module(debug->modname); 2281 ddebug_remove_module(debug->modname);
2214} 2282}
2215 2283
2284void * __weak module_alloc(unsigned long size)
2285{
2286 return size == 0 ? NULL : vmalloc_exec(size);
2287}
2288
2216static void *module_alloc_update_bounds(unsigned long size) 2289static void *module_alloc_update_bounds(unsigned long size)
2217{ 2290{
2218 void *ret = module_alloc(size); 2291 void *ret = module_alloc(size);
@@ -2623,6 +2696,14 @@ static void flush_module_icache(const struct module *mod)
2623 set_fs(old_fs); 2696 set_fs(old_fs);
2624} 2697}
2625 2698
2699int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2700 Elf_Shdr *sechdrs,
2701 char *secstrings,
2702 struct module *mod)
2703{
2704 return 0;
2705}
2706
2626static struct module *layout_and_allocate(struct load_info *info) 2707static struct module *layout_and_allocate(struct load_info *info)
2627{ 2708{
2628 /* Module within temporary copy. */ 2709 /* Module within temporary copy. */
@@ -2694,6 +2775,13 @@ static void module_deallocate(struct module *mod, struct load_info *info)
2694 module_free(mod, mod->module_core); 2775 module_free(mod, mod->module_core);
2695} 2776}
2696 2777
2778int __weak module_finalize(const Elf_Ehdr *hdr,
2779 const Elf_Shdr *sechdrs,
2780 struct module *me)
2781{
2782 return 0;
2783}
2784
2697static int post_relocation(struct module *mod, const struct load_info *info) 2785static int post_relocation(struct module *mod, const struct load_info *info)
2698{ 2786{
2699 /* Sort exception table now relocations are done. */ 2787 /* Sort exception table now relocations are done. */
@@ -2790,7 +2878,7 @@ static struct module *load_module(void __user *umod,
2790 } 2878 }
2791 2879
2792 /* This has to be done once we're sure module name is unique. */ 2880 /* This has to be done once we're sure module name is unique. */
2793 if (!mod->taints) 2881 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2794 dynamic_debug_setup(info.debug, info.num_debug); 2882 dynamic_debug_setup(info.debug, info.num_debug);
2795 2883
2796 /* Find duplicate symbols */ 2884 /* Find duplicate symbols */
@@ -2827,7 +2915,7 @@ static struct module *load_module(void __user *umod,
2827 module_bug_cleanup(mod); 2915 module_bug_cleanup(mod);
2828 2916
2829 ddebug: 2917 ddebug:
2830 if (!mod->taints) 2918 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2831 dynamic_debug_remove(info.debug); 2919 dynamic_debug_remove(info.debug);
2832 unlock: 2920 unlock:
2833 mutex_unlock(&module_mutex); 2921 mutex_unlock(&module_mutex);
@@ -2931,10 +3019,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2931 mod->symtab = mod->core_symtab; 3019 mod->symtab = mod->core_symtab;
2932 mod->strtab = mod->core_strtab; 3020 mod->strtab = mod->core_strtab;
2933#endif 3021#endif
2934 unset_section_ro_nx(mod, mod->module_init); 3022 unset_module_init_ro_nx(mod);
2935 module_free(mod, mod->module_init); 3023 module_free(mod, mod->module_init);
2936 mod->module_init = NULL; 3024 mod->module_init = NULL;
2937 mod->init_size = 0; 3025 mod->init_size = 0;
3026 mod->init_ro_size = 0;
2938 mod->init_text_size = 0; 3027 mod->init_text_size = 0;
2939 mutex_unlock(&module_mutex); 3028 mutex_unlock(&module_mutex);
2940 3029
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 78 DEBUG_LOCKS_WARN_ON(lock->owner != current);
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 80 mutex_clear_owner(lock);
81} 81}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current_thread_info(); 32 lock->owner = current;
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index c4195fa98900..d607ed5dd441 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock);
131 */ 131 */
132static inline int __sched 132static inline int __sched
133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
134 unsigned long ip) 134 struct lockdep_map *nest_lock, unsigned long ip)
135{ 135{
136 struct task_struct *task = current; 136 struct task_struct *task = current;
137 struct mutex_waiter waiter; 137 struct mutex_waiter waiter;
138 unsigned long flags; 138 unsigned long flags;
139 139
140 preempt_disable(); 140 preempt_disable();
141 mutex_acquire(&lock->dep_map, subclass, 0, ip); 141 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
142 142
143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
144 /* 144 /*
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
160 */ 160 */
161 161
162 for (;;) { 162 for (;;) {
163 struct thread_info *owner; 163 struct task_struct *owner;
164
165 /*
166 * If we own the BKL, then don't spin. The owner of
167 * the mutex might be waiting on us to release the BKL.
168 */
169 if (unlikely(current->lock_depth >= 0))
170 break;
171 164
172 /* 165 /*
173 * If there's an owner, wait for it to either 166 * If there's an owner, wait for it to either
@@ -276,16 +269,25 @@ void __sched
276mutex_lock_nested(struct mutex *lock, unsigned int subclass) 269mutex_lock_nested(struct mutex *lock, unsigned int subclass)
277{ 270{
278 might_sleep(); 271 might_sleep();
279 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); 272 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
280} 273}
281 274
282EXPORT_SYMBOL_GPL(mutex_lock_nested); 275EXPORT_SYMBOL_GPL(mutex_lock_nested);
283 276
277void __sched
278_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
279{
280 might_sleep();
281 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
282}
283
284EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
285
284int __sched 286int __sched
285mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) 287mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
286{ 288{
287 might_sleep(); 289 might_sleep();
288 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); 290 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
289} 291}
290EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 292EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
291 293
@@ -294,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
294{ 296{
295 might_sleep(); 297 might_sleep();
296 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 298 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
297 subclass, _RET_IP_); 299 subclass, NULL, _RET_IP_);
298} 300}
299 301
300EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 302EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -400,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
400{ 402{
401 struct mutex *lock = container_of(lock_count, struct mutex, count); 403 struct mutex *lock = container_of(lock_count, struct mutex, count);
402 404
403 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); 405 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
404} 406}
405 407
406static noinline int __sched 408static noinline int __sched
@@ -408,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
408{ 410{
409 struct mutex *lock = container_of(lock_count, struct mutex, count); 411 struct mutex *lock = container_of(lock_count, struct mutex, count);
410 412
411 return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); 413 return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
412} 414}
413 415
414static noinline int __sched 416static noinline int __sched
@@ -416,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count)
416{ 418{
417 struct mutex *lock = container_of(lock_count, struct mutex, count); 419 struct mutex *lock = container_of(lock_count, struct mutex, count);
418 420
419 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); 421 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
420} 422}
421#endif 423#endif
422 424
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
19#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current_thread_info(); 22 lock->owner = current;
23} 23}
24 24
25static inline void mutex_clear_owner(struct mutex *lock) 25static inline void mutex_clear_owner(struct mutex *lock)
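
[Editor's note] The three mutex files above switch owner tracking from struct thread_info to struct task_struct, so debug_mutex_unlock() compares the recorded owner directly against current. A userspace analogy of that bookkeeping with pthreads (pthread_self() standing in for current); this is an illustration of the check, not the kernel's implementation. Build with cc -pthread:

#include <pthread.h>
#include <stdio.h>

struct dbg_mutex {
	pthread_mutex_t lock;
	pthread_t owner;	/* stands in for struct task_struct *owner */
	int owned;
};

static void dbg_mutex_lock(struct dbg_mutex *m)
{
	pthread_mutex_lock(&m->lock);
	m->owner = pthread_self();	/* mutex_set_owner() */
	m->owned = 1;
}

static void dbg_mutex_unlock(struct dbg_mutex *m)
{
	/* DEBUG_LOCKS_WARN_ON(lock->owner != current) analogue. */
	if (!m->owned || !pthread_equal(m->owner, pthread_self()))
		fprintf(stderr, "warning: unlock by non-owner thread\n");
	m->owned = 0;			/* mutex_clear_owner() */
	pthread_mutex_unlock(&m->lock);
}

int main(void)
{
	struct dbg_mutex m = { .lock = PTHREAD_MUTEX_INITIALIZER };

	dbg_mutex_lock(&m);
	dbg_mutex_unlock(&m);
	return 0;
}
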
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2488ba7eb568..8d7b435806c9 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -525,37 +525,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
525} 525}
526EXPORT_SYMBOL_GPL(srcu_init_notifier_head); 526EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
527 527
528/**
529 * register_reboot_notifier - Register function to be called at reboot time
530 * @nb: Info about notifier function to be called
531 *
532 * Registers a function with the list of functions
533 * to be called at reboot time.
534 *
535 * Currently always returns zero, as blocking_notifier_chain_register()
536 * always returns zero.
537 */
538int register_reboot_notifier(struct notifier_block *nb)
539{
540 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
541}
542EXPORT_SYMBOL(register_reboot_notifier);
543
544/**
545 * unregister_reboot_notifier - Unregister previously registered reboot notifier
546 * @nb: Hook to be unregistered
547 *
548 * Unregisters a previously registered reboot
549 * notifier function.
550 *
551 * Returns zero on success, or %-ENOENT on failure.
552 */
553int unregister_reboot_notifier(struct notifier_block *nb)
554{
555 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
556}
557EXPORT_SYMBOL(unregister_reboot_notifier);
558
559static ATOMIC_NOTIFIER_HEAD(die_chain); 528static ATOMIC_NOTIFIER_HEAD(die_chain);
560 529
561int notrace __kprobes notify_die(enum die_val val, const char *str, 530int notrace __kprobes notify_die(enum die_val val, const char *str,
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
deleted file mode 100644
index 2c98ad94ba0e..000000000000
--- a/kernel/ns_cgroup.c
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * ns_cgroup.c - namespace cgroup subsystem
3 *
4 * Copyright 2006, 2007 IBM Corp
5 */
6
7#include <linux/module.h>
8#include <linux/cgroup.h>
9#include <linux/fs.h>
10#include <linux/proc_fs.h>
11#include <linux/slab.h>
12#include <linux/nsproxy.h>
13
14struct ns_cgroup {
15 struct cgroup_subsys_state css;
16};
17
18struct cgroup_subsys ns_subsys;
19
20static inline struct ns_cgroup *cgroup_to_ns(
21 struct cgroup *cgroup)
22{
23 return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
24 struct ns_cgroup, css);
25}
26
27int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{
29 char name[PROC_NUMBUF];
30
31 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
32 return cgroup_clone(task, &ns_subsys, name);
33}
34
35/*
36 * Rules:
37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup
39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN
41 * b. your cgroup is an ancestor of task's destination cgroup
42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof)
44 */
45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct task_struct *task, bool threadgroup)
47{
48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN))
50 return -EPERM;
51
52 if (!cgroup_is_descendant(new_cgroup, current))
53 return -EPERM;
54 }
55
56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM;
58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
71 return 0;
72}
73
74/*
75 * Rules: you can only create a cgroup if
76 * 1. you are capable(CAP_SYS_ADMIN)
77 * 2. the target cgroup is a descendant of your own cgroup
78 */
79static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
80 struct cgroup *cgroup)
81{
82 struct ns_cgroup *ns_cgroup;
83
84 if (!capable(CAP_SYS_ADMIN))
85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
96
97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
98 if (!ns_cgroup)
99 return ERR_PTR(-ENOMEM);
100 return &ns_cgroup->css;
101}
102
103static void ns_destroy(struct cgroup_subsys *ss,
104 struct cgroup *cgroup)
105{
106 struct ns_cgroup *ns_cgroup;
107
108 ns_cgroup = cgroup_to_ns(cgroup);
109 kfree(ns_cgroup);
110}
111
112struct cgroup_subsys ns_subsys = {
113 .name = "ns",
114 .can_attach = ns_can_attach,
115 .create = ns_create,
116 .destroy = ns_destroy,
117 .subsys_id = ns_subsys_id,
118};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a05d191ffdd9..9aeab4b98c64 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h>
26#include <linux/file.h>
27#include <linux/syscalls.h>
25 28
26static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
27 30
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
198 goto out; 201 goto out;
199 } 202 }
200 203
201 err = ns_cgroup_clone(current, task_pid(current));
202 if (err)
203 put_nsproxy(*new_nsp);
204
205out: 204out:
206 return err; 205 return err;
207} 206}
@@ -233,10 +232,47 @@ void exit_task_namespaces(struct task_struct *p)
233 switch_task_namespaces(p, NULL); 232 switch_task_namespaces(p, NULL);
234} 233}
235 234
236static int __init nsproxy_cache_init(void) 235SYSCALL_DEFINE2(setns, int, fd, int, nstype)
236{
237 const struct proc_ns_operations *ops;
238 struct task_struct *tsk = current;
239 struct nsproxy *new_nsproxy;
240 struct proc_inode *ei;
241 struct file *file;
242 int err;
243
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd);
248 if (IS_ERR(file))
249 return PTR_ERR(file);
250
251 err = -EINVAL;
252 ei = PROC_I(file->f_dentry->d_inode);
253 ops = ei->ns_ops;
254 if (nstype && (ops->type != nstype))
255 goto out;
256
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
258 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy);
260 goto out;
261 }
262
263 err = ops->install(new_nsproxy, ei->ns);
264 if (err) {
265 free_nsproxy(new_nsproxy);
266 goto out;
267 }
268 switch_task_namespaces(tsk, new_nsproxy);
269out:
270 fput(file);
271 return err;
272}
273
274int __init nsproxy_cache_init(void)
237{ 275{
238 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
239 return 0; 277 return 0;
240} 278}
241
242module_init(nsproxy_cache_init);
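
[Editor's note] The nsproxy.c hunk above adds the setns(2) system call: userspace opens one of the /proc/<pid>/ns/* files and hands the descriptor to setns() to join that namespace. A minimal sketch; it assumes a glibc new enough to wrap setns() (otherwise use syscall()), CAP_SYS_ADMIN as enforced above, and the iproute2 "ip" tool for the final exec:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/ns/net", argv[1]);

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror(path);
		return 1;
	}

	/* nstype 0 means "any namespace type", matching the kernel's
	 * `if (nstype && (ops->type != nstype))` check above. */
	if (setns(fd, 0) < 0) {
		perror("setns");
		return 1;
	}
	close(fd);

	/* Everything from here on runs in the target's network namespace. */
	execlp("ip", "ip", "link", "show", (char *)NULL);
	perror("execlp");
	return 1;
}
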
diff --git a/kernel/panic.c b/kernel/panic.c
index 69231670eb95..d7bb6974efb5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -119,6 +119,8 @@ NORET_TYPE void panic(const char * fmt, ...)
119 } 119 }
120 mdelay(PANIC_TIMER_STEP); 120 mdelay(PANIC_TIMER_STEP);
121 } 121 }
122 }
123 if (panic_timeout != 0) {
122 /* 124 /*
123 * This will not be a clean reboot, with everything 125 * This will not be a clean reboot, with everything
124 * shutting down. But if there is a chance of 126 * shutting down. But if there is a chance of
diff --git a/kernel/params.c b/kernel/params.c
index 7ab388a48a2e..22df3e0d142a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -225,8 +225,8 @@ int parse_args(const char *name,
225 int ret; \ 225 int ret; \
226 \ 226 \
227 ret = strtolfn(val, 0, &l); \ 227 ret = strtolfn(val, 0, &l); \
228 if (ret == -EINVAL || ((type)l != l)) \ 228 if (ret < 0 || ((type)l != l)) \
229 return -EINVAL; \ 229 return ret < 0 ? ret : -EINVAL; \
230 *((type *)kp->arg) = l; \ 230 *((type *)kp->arg) = l; \
231 return 0; \ 231 return 0; \
232 } \ 232 } \
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp);
297int param_set_bool(const char *val, const struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
298{ 298{
299 bool v; 299 bool v;
300 int ret;
300 301
301 /* No equals means "set"... */ 302 /* No equals means "set"... */
302 if (!val) val = "1"; 303 if (!val) val = "1";
303 304
304 /* One of =[yYnN01] */ 305 /* One of =[yYnN01] */
305 switch (val[0]) { 306 ret = strtobool(val, &v);
306 case 'y': case 'Y': case '1': 307 if (ret)
307 v = true; 308 return ret;
308 break;
309 case 'n': case 'N': case '0':
310 v = false;
311 break;
312 default:
313 return -EINVAL;
314 }
315 309
316 if (kp->flags & KPARAM_ISBOOL) 310 if (kp->flags & KPARAM_ISBOOL)
317 *(bool *)kp->arg = v; 311 *(bool *)kp->arg = v;
@@ -517,7 +511,7 @@ struct module_param_attrs
517#define to_param_attr(n) container_of(n, struct param_attribute, mattr) 511#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
518 512
519static ssize_t param_attr_show(struct module_attribute *mattr, 513static ssize_t param_attr_show(struct module_attribute *mattr,
520 struct module *mod, char *buf) 514 struct module_kobject *mk, char *buf)
521{ 515{
522 int count; 516 int count;
523 struct param_attribute *attribute = to_param_attr(mattr); 517 struct param_attribute *attribute = to_param_attr(mattr);
@@ -537,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
537 531
538/* sysfs always hands a nul-terminated string in buf. We rely on that. */ 532/* sysfs always hands a nul-terminated string in buf. We rely on that. */
539static ssize_t param_attr_store(struct module_attribute *mattr, 533static ssize_t param_attr_store(struct module_attribute *mattr,
540 struct module *owner, 534 struct module_kobject *km,
541 const char *buf, size_t len) 535 const char *buf, size_t len)
542{ 536{
543 int err; 537 int err;
@@ -736,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
736 mk->kobj.kset = module_kset; 730 mk->kobj.kset = module_kset;
737 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, 731 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
738 "%s", name); 732 "%s", name);
733#ifdef CONFIG_MODULES
734 if (!err)
735 err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
736#endif
739 if (err) { 737 if (err) {
740 kobject_put(&mk->kobj); 738 kobject_put(&mk->kobj);
741 printk(KERN_ERR 739 printk(KERN_ERR
@@ -813,7 +811,7 @@ static void __init param_sysfs_builtin(void)
813} 811}
814 812
815ssize_t __modver_version_show(struct module_attribute *mattr, 813ssize_t __modver_version_show(struct module_attribute *mattr,
816 struct module *mod, char *buf) 814 struct module_kobject *mk, char *buf)
817{ 815{
818 struct module_version_attribute *vattr = 816 struct module_version_attribute *vattr =
819 container_of(mattr, struct module_version_attribute, mattr); 817 container_of(mattr, struct module_version_attribute, mattr);
@@ -821,15 +819,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
821 return sprintf(buf, "%s\n", vattr->version); 819 return sprintf(buf, "%s\n", vattr->version);
822} 820}
823 821
824extern struct module_version_attribute __start___modver[], __stop___modver[]; 822extern const struct module_version_attribute *__start___modver[];
823extern const struct module_version_attribute *__stop___modver[];
825 824
826static void __init version_sysfs_builtin(void) 825static void __init version_sysfs_builtin(void)
827{ 826{
828 const struct module_version_attribute *vattr; 827 const struct module_version_attribute **p;
829 struct module_kobject *mk; 828 struct module_kobject *mk;
830 int err; 829 int err;
831 830
832 for (vattr = __start___modver; vattr < __stop___modver; vattr++) { 831 for (p = __start___modver; p < __stop___modver; p++) {
832 const struct module_version_attribute *vattr = *p;
833
833 mk = locate_module_kobject(vattr->module_name); 834 mk = locate_module_kobject(vattr->module_name);
834 if (mk) { 835 if (mk) {
835 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); 836 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
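
[Editor's note] version_sysfs_builtin() above now walks a section of pointers to module_version_attribute rather than the structures themselves, which avoids the struct padding/alignment that can make pointer arithmetic over the raw section stride incorrectly. A userspace sketch of the same section-of-pointers pattern; it relies on GNU ld's automatic __start_<section>/__stop_<section> symbols and uses invented names throughout:

#include <stdio.h>

struct version_attr {
	const char *module_name;
	const char *version;
};

/* Each "module" contributes a pointer to its attribute; the pointers
 * all land in the custom section "modver_ptrs". */
#define MODVER(mod, ver)						\
	static const struct version_attr __attr_##mod = { #mod, ver };	\
	static const struct version_attr *__ptr_##mod			\
	__attribute__((used, section("modver_ptrs"))) = &__attr_##mod

MODVER(foo, "1.0");
MODVER(bar, "2.3");

/* GNU ld defines these for any section whose name is a C identifier. */
extern const struct version_attr *__start_modver_ptrs[];
extern const struct version_attr *__stop_modver_ptrs[];

int main(void)
{
	const struct version_attr **p;

	for (p = __start_modver_ptrs; p < __stop_modver_ptrs; p++)
		printf("%s: version %s\n", (*p)->module_name, (*p)->version);
	return 0;
}
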
@@ -855,7 +856,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
855 if (!attribute->show) 856 if (!attribute->show)
856 return -EIO; 857 return -EIO;
857 858
858 ret = attribute->show(attribute, mk->mod, buf); 859 ret = attribute->show(attribute, mk, buf);
859 860
860 return ret; 861 return ret;
861} 862}
@@ -874,7 +875,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
874 if (!attribute->store) 875 if (!attribute->store)
875 return -EIO; 876 return -EIO;
876 877
877 ret = attribute->store(attribute, mk->mod, buf, len); 878 ret = attribute->store(attribute, mk, buf, len);
878 879
879 return ret; 880 return ret;
880} 881}
diff --git a/kernel/pid.c b/kernel/pid.c
index 57a8346a270e..e432057f3b21 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -405,7 +405,6 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
405 if (pid) { 405 if (pid) {
406 struct hlist_node *first; 406 struct hlist_node *first;
407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), 407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
408 rcu_read_lock_held() ||
409 lockdep_tasklist_lock_is_held()); 408 lockdep_tasklist_lock_is_held());
410 if (first) 409 if (first)
411 result = hlist_entry(first, struct task_struct, pids[(type)].node); 410 result = hlist_entry(first, struct task_struct, pids[(type)].node);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 0da058bff8eb..37f05d0f0793 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -40,6 +40,7 @@
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/platform_device.h> 41#include <linux/platform_device.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/kernel.h>
43 44
44#include <linux/uaccess.h> 45#include <linux/uaccess.h>
45 46
@@ -53,11 +54,17 @@ enum pm_qos_type {
53 PM_QOS_MIN /* return the smallest value */ 54 PM_QOS_MIN /* return the smallest value */
54}; 55};
55 56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
 60 * types linux supports for 32 bit quantities
61 */
56struct pm_qos_object { 62struct pm_qos_object {
57 struct plist_head requests; 63 struct plist_head requests;
58 struct blocking_notifier_head *notifiers; 64 struct blocking_notifier_head *notifiers;
59 struct miscdevice pm_qos_power_miscdev; 65 struct miscdevice pm_qos_power_miscdev;
60 char *name; 66 char *name;
67 s32 target_value; /* Do not change to 64 bit */
61 s32 default_value; 68 s32 default_value;
62 enum pm_qos_type type; 69 enum pm_qos_type type;
63}; 70};
@@ -67,29 +74,32 @@ static DEFINE_SPINLOCK(pm_qos_lock);
67static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
68static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
69static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), 77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
71 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
72 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
73 .default_value = 2000 * USEC_PER_SEC, 80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
74 .type = PM_QOS_MIN, 82 .type = PM_QOS_MIN,
75}; 83};
76 84
77static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
78static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), 87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
80 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
81 .name = "network_latency", 89 .name = "network_latency",
82 .default_value = 2000 * USEC_PER_SEC, 90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
83 .type = PM_QOS_MIN 92 .type = PM_QOS_MIN
84}; 93};
85 94
86 95
87static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
88static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), 98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
90 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
91 .name = "network_throughput", 100 .name = "network_throughput",
92 .default_value = 0, 101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
93 .type = PM_QOS_MAX, 103 .type = PM_QOS_MAX,
94}; 104};
95 105
@@ -135,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
135 } 145 }
136} 146}
137 147
148static inline s32 pm_qos_read_value(struct pm_qos_object *o)
149{
150 return o->target_value;
151}
152
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
154{
155 o->target_value = value;
156}
157
138static void update_target(struct pm_qos_object *o, struct plist_node *node, 158static void update_target(struct pm_qos_object *o, struct plist_node *node,
139 int del, int value) 159 int del, int value)
140{ 160{
@@ -159,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
159 plist_add(node, &o->requests); 179 plist_add(node, &o->requests);
160 } 180 }
161 curr_value = pm_qos_get_value(o); 181 curr_value = pm_qos_get_value(o);
182 pm_qos_set_value(o, curr_value);
162 spin_unlock_irqrestore(&pm_qos_lock, flags); 183 spin_unlock_irqrestore(&pm_qos_lock, flags);
163 184
164 if (prev_value != curr_value) 185 if (prev_value != curr_value)
@@ -193,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor)
193 * pm_qos_request - returns current system wide qos expectation 214 * pm_qos_request - returns current system wide qos expectation
194 * @pm_qos_class: identification of which qos value is requested 215 * @pm_qos_class: identification of which qos value is requested
195 * 216 *
196 * This function returns the current target value in an atomic manner. 217 * This function returns the current target value.
197 */ 218 */
198int pm_qos_request(int pm_qos_class) 219int pm_qos_request(int pm_qos_class)
199{ 220{
200 unsigned long flags; 221 return pm_qos_read_value(pm_qos_array[pm_qos_class]);
201 int value;
202
203 spin_lock_irqsave(&pm_qos_lock, flags);
204 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
205 spin_unlock_irqrestore(&pm_qos_lock, flags);
206
207 return value;
208} 222}
209EXPORT_SYMBOL_GPL(pm_qos_request); 223EXPORT_SYMBOL_GPL(pm_qos_request);
210 224
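
[Editor's note] pm_qos_request() above becomes a lock-free read of the target_value that update_target() publishes while still holding pm_qos_lock; the kernel relies on naturally atomic 32-bit loads and stores. A userspace rendering of the same "aggregate under a lock, snapshot for lock-free readers" idea, spelled with C11 relaxed atomics so the unlocked read is well defined (build with cc -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t requests_lock = PTHREAD_MUTEX_INITIALIZER;
static int requests[8];			/* protected by requests_lock    */
static int nr_requests;
static atomic_int target_value;		/* lock-free snapshot for readers */

static void add_request(int value)
{
	int i, min;

	pthread_mutex_lock(&requests_lock);
	requests[nr_requests++] = value;
	min = requests[0];
	for (i = 1; i < nr_requests; i++)	/* PM_QOS_MIN aggregation */
		if (requests[i] < min)
			min = requests[i];
	/* pm_qos_set_value(): publish the new target while still locked. */
	atomic_store_explicit(&target_value, min, memory_order_relaxed);
	pthread_mutex_unlock(&requests_lock);
}

static int read_target(void)
{
	/* pm_qos_read_value()/pm_qos_request(): no lock taken. */
	return atomic_load_explicit(&target_value, memory_order_relaxed);
}

int main(void)
{
	add_request(200);
	add_request(50);
	printf("target = %d\n", read_target());
	return 0;
}
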
@@ -385,7 +399,7 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
385 s32 value; 399 s32 value;
386 unsigned long flags; 400 unsigned long flags;
387 struct pm_qos_object *o; 401 struct pm_qos_object *o;
388 struct pm_qos_request_list *pm_qos_req = filp->private_data;; 402 struct pm_qos_request_list *pm_qos_req = filp->private_data;
389 403
390 if (!pm_qos_req) 404 if (!pm_qos_req)
391 return -EINVAL; 405 return -EINVAL;
@@ -404,24 +418,36 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
404 size_t count, loff_t *f_pos) 418 size_t count, loff_t *f_pos)
405{ 419{
406 s32 value; 420 s32 value;
407 int x;
408 char ascii_value[11];
409 struct pm_qos_request_list *pm_qos_req; 421 struct pm_qos_request_list *pm_qos_req;
410 422
411 if (count == sizeof(s32)) { 423 if (count == sizeof(s32)) {
412 if (copy_from_user(&value, buf, sizeof(s32))) 424 if (copy_from_user(&value, buf, sizeof(s32)))
413 return -EFAULT; 425 return -EFAULT;
414 } else if (count == 11) { /* len('0x12345678/0') */ 426 } else if (count <= 11) { /* ASCII perhaps? */
415 if (copy_from_user(ascii_value, buf, 11)) 427 char ascii_value[11];
428 unsigned long int ulval;
429 int ret;
430
431 if (copy_from_user(ascii_value, buf, count))
416 return -EFAULT; 432 return -EFAULT;
417 if (strlen(ascii_value) != 10) 433
418 return -EINVAL; 434 if (count > 10) {
419 x = sscanf(ascii_value, "%x", &value); 435 if (ascii_value[10] == '\n')
420 if (x != 1) 436 ascii_value[10] = '\0';
437 else
438 return -EINVAL;
439 } else {
440 ascii_value[count] = '\0';
441 }
442 ret = strict_strtoul(ascii_value, 16, &ulval);
443 if (ret) {
444 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
421 return -EINVAL; 445 return -EINVAL;
422 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); 446 }
423 } else 447 value = (s32)lower_32_bits(ulval);
448 } else {
424 return -EINVAL; 449 return -EINVAL;
450 }
425 451
426 pm_qos_req = filp->private_data; 452 pm_qos_req = filp->private_data;
427 pm_qos_update_request(pm_qos_req, value); 453 pm_qos_update_request(pm_qos_req, value);
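
[Editor's note] For context on the interface the pm_qos_power_write() hunk above relaxes: a typical consumer opens the misc device, writes a value (a raw s32, or now an ASCII hex string of up to 11 bytes), and keeps the descriptor open for as long as the constraint should hold; closing the file drops the request. A minimal sketch using the raw form (assumes root and that /dev/cpu_dma_latency exists):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t max_latency_us = 10;	/* request <= 10us wakeup latency */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0) {
		perror("/dev/cpu_dma_latency");
		return 1;
	}
	if (write(fd, &max_latency_us, sizeof(max_latency_us)) < 0) {
		perror("write");
		return 1;
	}
	puts("holding cpu_dma_latency request; press Enter to release");
	getchar();			/* request is dropped when fd closes */
	close(fd);
	return 0;
}
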
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 0791b13df7bf..58f405b581e7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1514,7 +1514,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1514 return -EFAULT; 1514 return -EFAULT;
1515 1515
1516 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1517 restart_block->nanosleep.index = which_clock; 1517 restart_block->nanosleep.clockid = which_clock;
1518 restart_block->nanosleep.rmtp = rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp); 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1520 } 1520 }
@@ -1523,7 +1523,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1523 1523
1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1525{ 1525{
1526 clockid_t which_clock = restart_block->nanosleep.index; 1526 clockid_t which_clock = restart_block->nanosleep.clockid;
1527 struct timespec t; 1527 struct timespec t;
1528 struct itimerspec it; 1528 struct itimerspec it;
1529 int error; 1529 int error;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e5498d7405c3..4556182527f3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)
491 return tmr; 491 return tmr;
492} 492}
493 493
494static void k_itimer_rcu_free(struct rcu_head *head)
495{
496 struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
497
498 kmem_cache_free(posix_timers_cache, tmr);
499}
500
494#define IT_ID_SET 1 501#define IT_ID_SET 1
495#define IT_ID_NOT_SET 0 502#define IT_ID_NOT_SET 0
496static void release_posix_timer(struct k_itimer *tmr, int it_id_set) 503static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
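
[Editor's note] k_itimer_rcu_free() above recovers the timer from its embedded rcu_head with container_of(), so the RCU callback can free the right object after the grace period. A standalone illustration of that recovery, using a simplified container_of() (the kernel's version in <linux/kernel.h> also type-checks via typeof) and invented structure names:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head_demo {
	struct rcu_head_demo *next;
	void (*func)(struct rcu_head_demo *);
};

struct k_itimer_demo {
	int it_id;
	struct rcu_head_demo rcu;  /* the callback only sees this member */
};

static void timer_rcu_free(struct rcu_head_demo *head)
{
	struct k_itimer_demo *tmr =
		container_of(head, struct k_itimer_demo, rcu);

	printf("freeing timer id %d\n", tmr->it_id);
}

int main(void)
{
	struct k_itimer_demo t = { .it_id = 42 };

	/* In the kernel this pointer would arrive via call_rcu() after a
	 * grace period; here it is invoked directly. */
	timer_rcu_free(&t.rcu);
	return 0;
}
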
@@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
503 } 510 }
504 put_pid(tmr->it_pid); 511 put_pid(tmr->it_pid);
505 sigqueue_free(tmr->sigq); 512 sigqueue_free(tmr->sigq);
506 kmem_cache_free(posix_timers_cache, tmr); 513 call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
507} 514}
508 515
509static struct k_clock *clockid_to_kclock(const clockid_t id) 516static struct k_clock *clockid_to_kclock(const clockid_t id)
@@ -631,22 +638,18 @@ out:
631static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) 638static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
632{ 639{
633 struct k_itimer *timr; 640 struct k_itimer *timr;
634 /* 641
635 * Watch out here. We do a irqsave on the idr_lock and pass the 642 rcu_read_lock();
636 * flags part over to the timer lock. Must not let interrupts in
637 * while we are moving the lock.
638 */
639 spin_lock_irqsave(&idr_lock, *flags);
640 timr = idr_find(&posix_timers_id, (int)timer_id); 643 timr = idr_find(&posix_timers_id, (int)timer_id);
641 if (timr) { 644 if (timr) {
642 spin_lock(&timr->it_lock); 645 spin_lock_irqsave(&timr->it_lock, *flags);
643 if (timr->it_signal == current->signal) { 646 if (timr->it_signal == current->signal) {
644 spin_unlock(&idr_lock); 647 rcu_read_unlock();
645 return timr; 648 return timr;
646 } 649 }
647 spin_unlock(&timr->it_lock); 650 spin_unlock_irqrestore(&timr->it_lock, *flags);
648 } 651 }
649 spin_unlock_irqrestore(&idr_lock, *flags); 652 rcu_read_unlock();
650 653
651 return NULL; 654 return NULL;
652} 655}
@@ -1056,7 +1059,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1056 */ 1059 */
1057long clock_nanosleep_restart(struct restart_block *restart_block) 1060long clock_nanosleep_restart(struct restart_block *restart_block)
1058{ 1061{
1059 clockid_t which_clock = restart_block->nanosleep.index; 1062 clockid_t which_clock = restart_block->nanosleep.clockid;
1060 struct k_clock *kc = clockid_to_kclock(which_clock); 1063 struct k_clock *kc = clockid_to_kclock(which_clock);
1061 1064
1062 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) 1065 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6de9a8fc3417..b1914cb9095c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -125,12 +125,6 @@ config PM_DEBUG
125 code. This is helpful when debugging and reporting PM bugs, like 125 code. This is helpful when debugging and reporting PM bugs, like
126 suspend support. 126 suspend support.
127 127
128config PM_VERBOSE
129 bool "Verbose Power Management debugging"
130 depends on PM_DEBUG
131 ---help---
132 This option enables verbose messages from the Power Management code.
133
134config PM_ADVANCED_DEBUG 128config PM_ADVANCED_DEBUG
135 bool "Extra PM attributes in sysfs for low-level debugging/testing" 129 bool "Extra PM attributes in sysfs for low-level debugging/testing"
136 depends on PM_DEBUG 130 depends on PM_DEBUG
@@ -199,8 +193,8 @@ config APM_EMULATION
199 notification of APM "events" (e.g. battery status change). 193 notification of APM "events" (e.g. battery status change).
200 194
201 In order to use APM, you will need supporting software. For location 195 In order to use APM, you will need supporting software. For location
202 and more information, read <file:Documentation/power/pm.txt> and the 196 and more information, read <file:Documentation/power/apm-acpi.txt>
203 Battery Powered Linux mini-HOWTO, available from 197 and the Battery Powered Linux mini-HOWTO, available from
204 <http://www.tldp.org/docs.html#howto>. 198 <http://www.tldp.org/docs.html#howto>.
205 199
206 This driver does not spin down disk drives (see the hdparm(8) 200 This driver does not spin down disk drives (see the hdparm(8)
@@ -229,3 +223,11 @@ config PM_OPP
229 representing individual voltage domains and provides SOC 223 representing individual voltage domains and provides SOC
230 implementations a ready to use framework to manage OPPs. 224 implementations a ready to use framework to manage OPPs.
231 For more information, read <file:Documentation/power/opp.txt> 225 For more information, read <file:Documentation/power/opp.txt>
226
227config PM_CLK
228 def_bool y
229 depends on PM && HAVE_CLK
230
231config PM_GENERIC_DOMAINS
232 bool
233 depends on PM
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 50aae660174d..8f7b1db1ece1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -25,7 +25,6 @@
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
27#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
28#include <asm/suspend.h>
29 28
30#include "power.h" 29#include "power.h"
31 30
@@ -55,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN;
55static const struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
56 55
57/** 56/**
58 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - Set the global hibernate operations.
59 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: Hibernation operations to use in subsequent hibernation transitions.
60 */ 59 */
61
62void hibernation_set_ops(const struct platform_hibernation_ops *ops) 60void hibernation_set_ops(const struct platform_hibernation_ops *ops)
63{ 61{
64 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 62 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
@@ -115,10 +113,9 @@ static int hibernation_test(int level) { return 0; }
115#endif /* !CONFIG_PM_DEBUG */ 113#endif /* !CONFIG_PM_DEBUG */
116 114
117/** 115/**
118 * platform_begin - tell the platform driver that we're starting 116 * platform_begin - Call platform to start hibernation.
119 * hibernation 117 * @platform_mode: Whether or not to use the platform driver.
120 */ 118 */
121
122static int platform_begin(int platform_mode) 119static int platform_begin(int platform_mode)
123{ 120{
124 return (platform_mode && hibernation_ops) ? 121 return (platform_mode && hibernation_ops) ?
@@ -126,10 +123,9 @@ static int platform_begin(int platform_mode)
126} 123}
127 124
128/** 125/**
129 * platform_end - tell the platform driver that we've entered the 126 * platform_end - Call platform to finish transition to the working state.
130 * working state 127 * @platform_mode: Whether or not to use the platform driver.
131 */ 128 */
132
133static void platform_end(int platform_mode) 129static void platform_end(int platform_mode)
134{ 130{
135 if (platform_mode && hibernation_ops) 131 if (platform_mode && hibernation_ops)
@@ -137,8 +133,11 @@ static void platform_end(int platform_mode)
137} 133}
138 134
139/** 135/**
140 * platform_pre_snapshot - prepare the machine for hibernation using the 136 * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
141 * platform driver if so configured and return an error code if it fails 137 * @platform_mode: Whether or not to use the platform driver.
138 *
139 * Use the platform driver to prepare the system for creating a hibernate image,
140 * if so configured, and return an error code if that fails.
142 */ 141 */
143 142
144static int platform_pre_snapshot(int platform_mode) 143static int platform_pre_snapshot(int platform_mode)
@@ -148,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode)
148} 147}
149 148
150/** 149/**
151 * platform_leave - prepare the machine for switching to the normal mode 150 * platform_leave - Call platform to prepare a transition to the working state.
152 * of operation using the platform driver (called with interrupts disabled) 151 * @platform_mode: Whether or not to use the platform driver.
152 *
 153 * Use the platform driver to prepare the machine for switching to the
154 * normal mode of operation.
155 *
156 * This routine is called on one CPU with interrupts disabled.
153 */ 157 */
154
155static void platform_leave(int platform_mode) 158static void platform_leave(int platform_mode)
156{ 159{
157 if (platform_mode && hibernation_ops) 160 if (platform_mode && hibernation_ops)
@@ -159,10 +162,14 @@ static void platform_leave(int platform_mode)
159} 162}
160 163
161/** 164/**
162 * platform_finish - switch the machine to the normal mode of operation 165 * platform_finish - Call platform to switch the system to the working state.
163 * using the platform driver (must be called after platform_prepare()) 166 * @platform_mode: Whether or not to use the platform driver.
167 *
168 * Use the platform driver to switch the machine to the normal mode of
169 * operation.
170 *
171 * This routine must be called after platform_prepare().
164 */ 172 */
165
166static void platform_finish(int platform_mode) 173static void platform_finish(int platform_mode)
167{ 174{
168 if (platform_mode && hibernation_ops) 175 if (platform_mode && hibernation_ops)
@@ -170,11 +177,15 @@ static void platform_finish(int platform_mode)
170} 177}
171 178
172/** 179/**
173 * platform_pre_restore - prepare the platform for the restoration from a 180 * platform_pre_restore - Prepare for hibernate image restoration.
174 * hibernation image. If the restore fails after this function has been 181 * @platform_mode: Whether or not to use the platform driver.
175 * called, platform_restore_cleanup() must be called. 182 *
183 * Use the platform driver to prepare the system for resume from a hibernation
184 * image.
185 *
186 * If the restore fails after this function has been called,
187 * platform_restore_cleanup() must be called.
176 */ 188 */
177
178static int platform_pre_restore(int platform_mode) 189static int platform_pre_restore(int platform_mode)
179{ 190{
180 return (platform_mode && hibernation_ops) ? 191 return (platform_mode && hibernation_ops) ?
@@ -182,12 +193,16 @@ static int platform_pre_restore(int platform_mode)
182} 193}
183 194
184/** 195/**
185 * platform_restore_cleanup - switch the platform to the normal mode of 196 * platform_restore_cleanup - Switch to the working state after failing restore.
186 * operation after a failing restore. If platform_pre_restore() has been 197 * @platform_mode: Whether or not to use the platform driver.
187 * called before the failing restore, this function must be called too, 198 *
188 * regardless of the result of platform_pre_restore(). 199 * Use the platform driver to switch the system to the normal mode of operation
200 * after a failing restore.
201 *
202 * If platform_pre_restore() has been called before the failing restore, this
203 * function must be called too, regardless of the result of
204 * platform_pre_restore().
189 */ 205 */
190
191static void platform_restore_cleanup(int platform_mode) 206static void platform_restore_cleanup(int platform_mode)
192{ 207{
193 if (platform_mode && hibernation_ops) 208 if (platform_mode && hibernation_ops)
@@ -195,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode)
195} 210}
196 211
197/** 212/**
198 * platform_recover - recover the platform from a failure to suspend 213 * platform_recover - Recover from a failure to suspend devices.
199 * devices. 214 * @platform_mode: Whether or not to use the platform driver.
200 */ 215 */
201
202static void platform_recover(int platform_mode) 216static void platform_recover(int platform_mode)
203{ 217{
204 if (platform_mode && hibernation_ops && hibernation_ops->recover) 218 if (platform_mode && hibernation_ops && hibernation_ops->recover)
@@ -206,13 +220,12 @@ static void platform_recover(int platform_mode)
206} 220}
207 221
208/** 222/**
209 * swsusp_show_speed - print the time elapsed between two events. 223 * swsusp_show_speed - Print time elapsed between two events during hibernation.
210 * @start: Starting event. 224 * @start: Starting event.
211 * @stop: Final event. 225 * @stop: Final event.
212 * @nr_pages - number of pages processed between @start and @stop 226 * @nr_pages: Number of memory pages processed between @start and @stop.
213 * @msg - introductory message to print 227 * @msg: Additional diagnostic message to print.
214 */ 228 */
215
216void swsusp_show_speed(struct timeval *start, struct timeval *stop, 229void swsusp_show_speed(struct timeval *start, struct timeval *stop,
217 unsigned nr_pages, char *msg) 230 unsigned nr_pages, char *msg)
218{ 231{
@@ -235,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
235} 248}
236 249
237/** 250/**
238 * create_image - freeze devices that need to be frozen with interrupts 251 * create_image - Create a hibernation image.
239 * off, create the hibernation image and thaw those devices. Control 252 * @platform_mode: Whether or not to use the platform driver.
240 * reappears in this routine after a restore. 253 *
254 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
255 * and execute the drivers' .thaw_noirq() callbacks.
256 *
257 * Control reappears in this routine after the subsequent restore.
241 */ 258 */
242
243static int create_image(int platform_mode) 259static int create_image(int platform_mode)
244{ 260{
245 int error; 261 int error;
246 262
247 error = arch_prepare_suspend();
248 if (error)
249 return error;
250
251 /* At this point, dpm_suspend_start() has been called, but *not*
252 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
253 * Otherwise, drivers for some devices (e.g. interrupt controllers)
254 * become desynchronized with the actual state of the hardware
255 * at resume time, and evil weirdness ensues.
256 */
257 error = dpm_suspend_noirq(PMSG_FREEZE); 263 error = dpm_suspend_noirq(PMSG_FREEZE);
258 if (error) { 264 if (error) {
259 printk(KERN_ERR "PM: Some devices failed to power down, " 265 printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -272,12 +278,7 @@ static int create_image(int platform_mode)
272 278
273 local_irq_disable(); 279 local_irq_disable();
274 280
275 error = sysdev_suspend(PMSG_FREEZE); 281 error = syscore_suspend();
276 if (!error) {
277 error = syscore_suspend();
278 if (error)
279 sysdev_resume();
280 }
281 if (error) { 282 if (error) {
282 printk(KERN_ERR "PM: Some system devices failed to power down, " 283 printk(KERN_ERR "PM: Some system devices failed to power down, "
283 "aborting hibernation\n"); 284 "aborting hibernation\n");
@@ -302,10 +303,6 @@ static int create_image(int platform_mode)
302 303
303 Power_up: 304 Power_up:
304 syscore_resume(); 305 syscore_resume();
305 sysdev_resume();
306 /* NOTE: dpm_resume_noirq() is just a resume() for devices
307 * that suspended with irqs off ... no overall powerup.
308 */
309 306
310 Enable_irqs: 307 Enable_irqs:
311 local_irq_enable(); 308 local_irq_enable();
@@ -323,30 +320,32 @@ static int create_image(int platform_mode)
323} 320}
324 321
325/** 322/**
326 * hibernation_snapshot - quiesce devices and create the hibernation 323 * hibernation_snapshot - Quiesce devices and create a hibernation image.
327 * snapshot image. 324 * @platform_mode: If set, use platform driver to prepare for the transition.
328 * @platform_mode - if set, use the platform driver, if available, to
329 * prepare the platform firmware for the power transition.
330 * 325 *
331 * Must be called with pm_mutex held 326 * This routine must be called with pm_mutex held.
332 */ 327 */
333
334int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
335{ 329{
330 pm_message_t msg = PMSG_RECOVER;
336 int error; 331 int error;
337 332
338 error = platform_begin(platform_mode); 333 error = platform_begin(platform_mode);
339 if (error) 334 if (error)
340 goto Close; 335 goto Close;
341 336
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
342 /* Preallocate image memory before shutting down devices. */ 341 /* Preallocate image memory before shutting down devices. */
343 error = hibernate_preallocate_memory(); 342 error = hibernate_preallocate_memory();
344 if (error) 343 if (error)
345 goto Close; 344 goto Complete_devices;
346 345
347 suspend_console(); 346 suspend_console();
348 pm_restrict_gfp_mask(); 347 pm_restrict_gfp_mask();
349 error = dpm_suspend_start(PMSG_FREEZE); 348 error = dpm_suspend(PMSG_FREEZE);
350 if (error) 349 if (error)
351 goto Recover_platform; 350 goto Recover_platform;
352 351
@@ -364,13 +363,17 @@ int hibernation_snapshot(int platform_mode)
364 if (error || !in_suspend) 363 if (error || !in_suspend)
365 swsusp_free(); 364 swsusp_free();
366 365
367 dpm_resume_end(in_suspend ? 366 msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE;
368 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 367 dpm_resume(msg);
369 368
370 if (error || !in_suspend) 369 if (error || !in_suspend)
371 pm_restore_gfp_mask(); 370 pm_restore_gfp_mask();
372 371
373 resume_console(); 372 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg);
376
374 Close: 377 Close:
375 platform_end(platform_mode); 378 platform_end(platform_mode);
376 return error; 379 return error;
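
[Editor's note] The hibernation_snapshot() hunk above threads in a Complete_devices unwind label so dpm_complete() runs on every path once dpm_prepare() has been called. A toy sketch of that staged goto-unwind shape, with invented step names, only to show how each label pairs with the setup steps that precede it:

#include <stdio.h>

static int step(const char *name, int fail)
{
	printf("%s\n", name);
	return fail ? -1 : 0;
}

static int do_transition(int fail_at_suspend)
{
	int error;

	error = step("platform_begin", 0);
	if (error)
		goto Close;

	error = step("dpm_prepare", 0);
	if (error)
		goto Complete_devices;

	error = step("dpm_suspend", fail_at_suspend);
	if (error)
		goto Complete_devices;

	step("snapshot + dpm_resume", 0);

 Complete_devices:
	step("dpm_complete", 0);
 Close:
	step("platform_end", 0);
	return error;
}

int main(void)
{
	do_transition(1);	/* exercise the failure path */
	return 0;
}
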
@@ -381,13 +384,14 @@ int hibernation_snapshot(int platform_mode)
381} 384}
382 385
383/** 386/**
384 * resume_target_kernel - prepare devices that need to be suspended with 387 * resume_target_kernel - Restore system state from a hibernation image.
385 * interrupts off, restore the contents of highmem that have not been 388 * @platform_mode: Whether or not to use the platform driver.
386 * restored yet from the image and run the low level code that will restore 389 *
387 * the remaining contents of memory and switch to the just restored target 390 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
388 * kernel. 391 * highmem that have not been restored yet from the image and run the low-level
392 * code that will restore the remaining contents of memory and switch to the
393 * just restored target kernel.
389 */ 394 */
390
391static int resume_target_kernel(bool platform_mode) 395static int resume_target_kernel(bool platform_mode)
392{ 396{
393 int error; 397 int error;
@@ -409,40 +413,36 @@ static int resume_target_kernel(bool platform_mode)
409 413
410 local_irq_disable(); 414 local_irq_disable();
411 415
412 error = sysdev_suspend(PMSG_QUIESCE); 416 error = syscore_suspend();
413 if (!error) {
414 error = syscore_suspend();
415 if (error)
416 sysdev_resume();
417 }
418 if (error) 417 if (error)
419 goto Enable_irqs; 418 goto Enable_irqs;
420 419
421 /* We'll ignore saved state, but this gets preempt count (etc) right */
422 save_processor_state(); 420 save_processor_state();
423 error = restore_highmem(); 421 error = restore_highmem();
424 if (!error) { 422 if (!error) {
425 error = swsusp_arch_resume(); 423 error = swsusp_arch_resume();
426 /* 424 /*
427 * The code below is only ever reached in case of a failure. 425 * The code below is only ever reached in case of a failure.
428 * Otherwise execution continues at place where 426 * Otherwise, execution continues at the place where
429 * swsusp_arch_suspend() was called 427 * swsusp_arch_suspend() was called.
430 */ 428 */
431 BUG_ON(!error); 429 BUG_ON(!error);
432 /* This call to restore_highmem() undos the previous one */ 430 /*
431 * This call to restore_highmem() reverts the changes made by
432 * the previous one.
433 */
433 restore_highmem(); 434 restore_highmem();
434 } 435 }
435 /* 436 /*
436 * The only reason why swsusp_arch_resume() can fail is memory being 437 * The only reason why swsusp_arch_resume() can fail is memory being
437 * very tight, so we have to free it as soon as we can to avoid 438 * very tight, so we have to free it as soon as we can to avoid
438 * subsequent failures 439 * subsequent failures.
439 */ 440 */
440 swsusp_free(); 441 swsusp_free();
441 restore_processor_state(); 442 restore_processor_state();
442 touch_softlockup_watchdog(); 443 touch_softlockup_watchdog();
443 444
444 syscore_resume(); 445 syscore_resume();
445 sysdev_resume();
446 446
447 Enable_irqs: 447 Enable_irqs:
448 local_irq_enable(); 448 local_irq_enable();
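
The sysdev_suspend()/sysdev_resume() calls dropped above are gone because the work formerly done through sysdev PM callbacks is now handled by syscore_ops registered by the affected subsystems. A minimal sketch of that interface, with invented example_ names; struct syscore_ops and register_syscore_ops() come from <linux/syscore_ops.h>, and the callbacks run late, on one CPU, with interrupts disabled:

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int example_syscore_suspend(void)
{
        /* save a small amount of core state; must not sleep */
        return 0;
}

static void example_syscore_resume(void)
{
        /* restore the state saved in example_syscore_suspend() */
}

static struct syscore_ops example_syscore_ops = {
        .suspend        = example_syscore_suspend,
        .resume         = example_syscore_resume,
};

static int __init example_syscore_init(void)
{
        register_syscore_ops(&example_syscore_ops);
        return 0;
}
core_initcall(example_syscore_init);
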
@@ -459,14 +459,12 @@ static int resume_target_kernel(bool platform_mode)
459} 459}
460 460
461/** 461/**
462 * hibernation_restore - quiesce devices and restore the hibernation 462 * hibernation_restore - Quiesce devices and restore from a hibernation image.
463 * snapshot image. If successful, control returns in hibernation_snaphot() 463 * @platform_mode: If set, use platform driver to prepare for the transition.
464 * @platform_mode - if set, use the platform driver, if available, to
465 * prepare the platform firmware for the transition.
466 * 464 *
467 * Must be called with pm_mutex held 465 * This routine must be called with pm_mutex held. If it is successful, control
 466 * reappears in the restored target kernel in hibernation_snapshot().
468 */ 467 */
469
470int hibernation_restore(int platform_mode) 468int hibernation_restore(int platform_mode)
471{ 469{
472 int error; 470 int error;
@@ -486,10 +484,8 @@ int hibernation_restore(int platform_mode)
486} 484}
487 485
488/** 486/**
489 * hibernation_platform_enter - enter the hibernation state using the 487 * hibernation_platform_enter - Power off the system using the platform driver.
490 * platform driver (if available)
491 */ 488 */
492
493int hibernation_platform_enter(void) 489int hibernation_platform_enter(void)
494{ 490{
495 int error; 491 int error;
@@ -528,7 +524,6 @@ int hibernation_platform_enter(void)
528 goto Platform_finish; 524 goto Platform_finish;
529 525
530 local_irq_disable(); 526 local_irq_disable();
531 sysdev_suspend(PMSG_HIBERNATE);
532 syscore_suspend(); 527 syscore_suspend();
533 if (pm_wakeup_pending()) { 528 if (pm_wakeup_pending()) {
534 error = -EAGAIN; 529 error = -EAGAIN;
@@ -541,7 +536,6 @@ int hibernation_platform_enter(void)
541 536
542 Power_up: 537 Power_up:
543 syscore_resume(); 538 syscore_resume();
544 sysdev_resume();
545 local_irq_enable(); 539 local_irq_enable();
546 enable_nonboot_cpus(); 540 enable_nonboot_cpus();
547 541
@@ -562,12 +556,12 @@ int hibernation_platform_enter(void)
562} 556}
563 557
564/** 558/**
565 * power_down - Shut the machine down for hibernation. 559 * power_down - Shut the machine down for hibernation.
566 * 560 *
567 * Use the platform driver, if configured so; otherwise try 561 * Use the platform driver, if configured, to put the system into the sleep
568 * to power off or reboot. 562 * state corresponding to hibernation, or try to power it off or reboot,
563 * depending on the value of hibernation_mode.
569 */ 564 */
570
571static void power_down(void) 565static void power_down(void)
572{ 566{
573 switch (hibernation_mode) { 567 switch (hibernation_mode) {
@@ -604,9 +598,8 @@ static int prepare_processes(void)
604} 598}
605 599
606/** 600/**
607 * hibernate - The granpappy of the built-in hibernation management 601 * hibernate - Carry out system hibernation, including saving the image.
608 */ 602 */
609
610int hibernate(void) 603int hibernate(void)
611{ 604{
612 int error; 605 int error;
@@ -684,17 +677,20 @@ int hibernate(void)
684 677
685 678
686/** 679/**
687 * software_resume - Resume from a saved image. 680 * software_resume - Resume from a saved hibernation image.
681 *
682 * This routine is called as a late initcall, when all devices have been
683 * discovered and initialized already.
688 * 684 *
689 * Called as a late_initcall (so all devices are discovered and 685 * The image reading code is called to see if there is a hibernation image
690 * initialized), we call swsusp to see if we have a saved image or not. 686 * available for reading. If that is the case, devices are quiesced and the
 691 * If so, we quiesce devices, the restore the saved image. We will 687 * contents of memory are restored from the saved image.
692 * return above (in hibernate() ) if everything goes well.
693 * Otherwise, we fail gracefully and return to the normally
694 * scheduled program.
695 * 688 *
689 * If this is successful, control reappears in the restored target kernel in
 690 * hibernation_snapshot(), which returns to hibernate(). Otherwise, the routine
691 * attempts to recover gracefully and make the kernel return to the normal mode
692 * of operation.
696 */ 693 */
697
698static int software_resume(void) 694static int software_resume(void)
699{ 695{
700 int error; 696 int error;
@@ -824,21 +820,17 @@ static const char * const hibernation_modes[] = {
824 [HIBERNATION_TESTPROC] = "testproc", 820 [HIBERNATION_TESTPROC] = "testproc",
825}; 821};
826 822
827/** 823/*
828 * disk - Control hibernation mode 824 * /sys/power/disk - Control hibernation mode.
829 *
830 * Suspend-to-disk can be handled in several ways. We have a few options
831 * for putting the system to sleep - using the platform driver (e.g. ACPI
832 * or other hibernation_ops), powering off the system or rebooting the
833 * system (for testing) as well as the two test modes.
834 * 825 *
835 * The system can support 'platform', and that is known a priori (and 826 * Hibernation can be handled in several ways. There are a few different ways
836 * encoded by the presence of hibernation_ops). However, the user may 827 * to put the system into the sleep state: using the platform driver (e.g. ACPI
837 * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the 828 * or other hibernation_ops), powering it off or rebooting it (for testing
838 * test modes, 'test' or 'testproc'. 829 * mostly), or using one of the two available test modes.
839 * 830 *
840 * show() will display what the mode is currently set to. 831 * The sysfs file /sys/power/disk provides an interface for selecting the
841 * store() will accept one of 832 * hibernation mode to use. Reading from this file causes the available modes
833 * to be printed. There are 5 modes that can be supported:
842 * 834 *
843 * 'platform' 835 * 'platform'
844 * 'shutdown' 836 * 'shutdown'
@@ -846,8 +838,14 @@ static const char * const hibernation_modes[] = {
846 * 'test' 838 * 'test'
847 * 'testproc' 839 * 'testproc'
848 * 840 *
849 * It will only change to 'platform' if the system 841 * If a platform hibernation driver is in use, 'platform' will be supported
850 * supports it (as determined by having hibernation_ops). 842 * and will be used by default. Otherwise, 'shutdown' will be used by default.
843 * The selected option (i.e. the one corresponding to the current value of
 844 * hibernation_mode) is enclosed in square brackets.
845 *
846 * To select a given hibernation mode it is necessary to write the mode's
847 * string representation (as returned by reading from /sys/power/disk) back
848 * into /sys/power/disk.
851 */ 849 */
852 850
853static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, 851static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
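
The /sys/power/disk documentation above boils down to plain sysfs traffic from userspace. A minimal sketch, assuming /sys/power/disk exists and the caller has permission to write to it; note that selecting a mode does not by itself start hibernation:

#include <stdio.h>

int main(void)
{
        char modes[128];
        FILE *f = fopen("/sys/power/disk", "r");

        if (f && fgets(modes, sizeof(modes), f))
                printf("available modes: %s", modes); /* current one in [] */
        if (f)
                fclose(f);

        f = fopen("/sys/power/disk", "w");
        if (!f) {
                perror("/sys/power/disk");
                return 1;
        }
        fputs("shutdown\n", f);         /* one of the strings listed above */
        return fclose(f) ? 1 : 0;
}
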
@@ -880,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
880 return buf-start; 878 return buf-start;
881} 879}
882 880
883
884static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, 881static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
885 const char *buf, size_t n) 882 const char *buf, size_t n)
886{ 883{
@@ -982,10 +979,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att
982 979
983power_attr(image_size); 980power_attr(image_size);
984 981
982static ssize_t reserved_size_show(struct kobject *kobj,
983 struct kobj_attribute *attr, char *buf)
984{
985 return sprintf(buf, "%lu\n", reserved_size);
986}
987
988static ssize_t reserved_size_store(struct kobject *kobj,
989 struct kobj_attribute *attr,
990 const char *buf, size_t n)
991{
992 unsigned long size;
993
994 if (sscanf(buf, "%lu", &size) == 1) {
995 reserved_size = size;
996 return n;
997 }
998
999 return -EINVAL;
1000}
1001
1002power_attr(reserved_size);
1003
985static struct attribute * g[] = { 1004static struct attribute * g[] = {
986 &disk_attr.attr, 1005 &disk_attr.attr,
987 &resume_attr.attr, 1006 &resume_attr.attr,
988 &image_size_attr.attr, 1007 &image_size_attr.attr,
1008 &reserved_size_attr.attr,
989 NULL, 1009 NULL,
990}; 1010};
991 1011
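
The power_attr(reserved_size) line at the end of the hunk is what actually creates /sys/power/reserved_size from the show/store pair added above. A sketch of roughly what it expands to, based on the power_attr() macro in kernel/power/power.h (part of its definition is visible in the power.h hunk further down); the 0644 mode is an assumption taken from that macro's usual definition:

static struct kobj_attribute reserved_size_attr = {
        .attr = {
                .name = "reserved_size",
                .mode = 0644,                   /* assumed from power_attr() */
        },
        .show   = reserved_size_show,
        .store  = reserved_size_store,
};
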
diff --git a/kernel/power/main.c b/kernel/power/main.c
index de9aef8742f4..6c601f871964 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier);
37 37
38int pm_notifier_call_chain(unsigned long val) 38int pm_notifier_call_chain(unsigned long val)
39{ 39{
40 return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) 40 int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
41 == NOTIFY_BAD) ? -EINVAL : 0; 41
42 return notifier_to_errno(ret);
42} 43}
43 44
44/* If set, devices may be suspended and resumed asynchronously. */ 45/* If set, devices may be suspended and resumed asynchronously. */
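
With notifier_to_errno() in place, a PM notifier can veto a transition with a specific error code instead of everything collapsing to -EINVAL. A minimal sketch with invented example_ names; register_pm_notifier(), notifier_from_errno() and the PM_* events are the existing notifier API:

#include <linux/errno.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/types.h>

/* Hypothetical driver-specific check, stubbed out for the sketch. */
static bool example_device_busy(void)
{
        return false;
}

static int example_pm_notifier(struct notifier_block *nb,
                               unsigned long event, void *unused)
{
        if (event == PM_HIBERNATION_PREPARE && example_device_busy())
                return notifier_from_errno(-EBUSY); /* caller sees -EBUSY */
        return NOTIFY_OK;
}

static struct notifier_block example_pm_nb = {
        .notifier_call = example_pm_notifier,
};

/* register_pm_notifier(&example_pm_nb) from the driver's init path */
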
@@ -337,6 +338,7 @@ static int __init pm_init(void)
337 if (error) 338 if (error)
338 return error; 339 return error;
339 hibernate_image_size_init(); 340 hibernate_image_size_init();
341 hibernate_reserved_size_init();
340 power_kobj = kobject_create_and_add("power", NULL); 342 power_kobj = kobject_create_and_add("power", NULL);
341 if (!power_kobj) 343 if (!power_kobj)
342 return -ENOMEM; 344 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 03634be55f62..9a00a0a26280 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -15,6 +15,7 @@ struct swsusp_info {
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */ 17/* kernel/power/snapshot.c */
18extern void __init hibernate_reserved_size_init(void);
18extern void __init hibernate_image_size_init(void); 19extern void __init hibernate_image_size_init(void);
19 20
20#ifdef CONFIG_ARCH_HIBERNATION_HEADER 21#ifdef CONFIG_ARCH_HIBERNATION_HEADER
@@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void);
55 56
56#else /* !CONFIG_HIBERNATION */ 57#else /* !CONFIG_HIBERNATION */
57 58
59static inline void hibernate_reserved_size_init(void) {}
58static inline void hibernate_image_size_init(void) {} 60static inline void hibernate_image_size_init(void) {}
59#endif /* !CONFIG_HIBERNATION */ 61#endif /* !CONFIG_HIBERNATION */
60 62
@@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \
72 74
73/* Preferred image size in bytes (default 500 MB) */ 75/* Preferred image size in bytes (default 500 MB) */
74extern unsigned long image_size; 76extern unsigned long image_size;
77/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
78extern unsigned long reserved_size;
75extern int in_suspend; 79extern int in_suspend;
76extern dev_t swsusp_resume_device; 80extern dev_t swsusp_resume_device;
77extern sector_t swsusp_resume_block; 81extern sector_t swsusp_resume_block;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ca0aacc24874..06efa54f93d6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -41,16 +41,28 @@ static void swsusp_set_page_forbidden(struct page *);
41static void swsusp_unset_page_forbidden(struct page *); 41static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Number of bytes to reserve for memory allocations made by device drivers
45 * from their ->freeze() and ->freeze_noirq() callbacks so that they don't
46 * cause image creation to fail (tunable via /sys/power/reserved_size).
47 */
48unsigned long reserved_size;
49
50void __init hibernate_reserved_size_init(void)
51{
52 reserved_size = SPARE_PAGES * PAGE_SIZE;
53}
54
55/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 56 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, the image creating code will do its best to 57 * When it is set to N, swsusp will do its best to ensure the image
46 * ensure the image size will not exceed N bytes, but if that is 58 * size will not exceed N bytes, but if that is impossible, it will
47 * impossible, it will try to create the smallest image possible. 59 * try to create the smallest image possible.
48 */ 60 */
49unsigned long image_size; 61unsigned long image_size;
50 62
51void __init hibernate_image_size_init(void) 63void __init hibernate_image_size_init(void)
52{ 64{
53 image_size = (totalram_pages / 3) * PAGE_SIZE; 65 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
54} 66}
55 67
56/* List of PBEs needed for restoring the pages that were allocated before 68/* List of PBEs needed for restoring the pages that were allocated before
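
For a sense of scale: with 4 KiB pages on a 2 GiB machine (totalram_pages = 524288), the old default image_size of one third of RAM worked out to roughly 683 MiB, while the new two-fifths default is roughly 819 MiB. reserved_size starts at SPARE_PAGES * PAGE_SIZE, which the definition in kernel/power/power.h has historically put at about 1 MiB. These figures are illustrative arithmetic, not measurements from a particular system.
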
@@ -1199,7 +1211,11 @@ static void free_unnecessary_pages(void)
1199 to_free_highmem = alloc_highmem - save; 1211 to_free_highmem = alloc_highmem - save;
1200 } else { 1212 } else {
1201 to_free_highmem = 0; 1213 to_free_highmem = 0;
1202 to_free_normal -= save - alloc_highmem; 1214 save -= alloc_highmem;
1215 if (to_free_normal > save)
1216 to_free_normal -= save;
1217 else
1218 to_free_normal = 0;
1203 } 1219 }
1204 1220
1205 memory_bm_position_reset(&copy_bm); 1221 memory_bm_position_reset(&copy_bm);
@@ -1263,11 +1279,13 @@ static unsigned long minimum_image_size(unsigned long saveable)
1263 * frame in use. We also need a number of page frames to be free during 1279 * frame in use. We also need a number of page frames to be free during
1264 * hibernation for allocations made while saving the image and for device 1280 * hibernation for allocations made while saving the image and for device
1265 * drivers, in case they need to allocate memory from their hibernation 1281 * drivers, in case they need to allocate memory from their hibernation
1266 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, 1282 * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
1267 * respectively, both of which are rough estimates). To make this happen, we 1283 * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through
1268 * compute the total number of available page frames and allocate at least 1284 * /sys/power/reserved_size, respectively). To make this happen, we compute the
1285 * total number of available page frames and allocate at least
1269 * 1286 *
1270 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES 1287 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
1288 * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
1271 * 1289 *
1272 * of them, which corresponds to the maximum size of a hibernation image. 1290 * of them, which corresponds to the maximum size of a hibernation image.
1273 * 1291 *
@@ -1322,7 +1340,8 @@ int hibernate_preallocate_memory(void)
1322 count -= totalreserve_pages; 1340 count -= totalreserve_pages;
1323 1341
1324 /* Compute the maximum number of saveable pages to leave in memory. */ 1342 /* Compute the maximum number of saveable pages to leave in memory. */
1325 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1343 max_size = (count - (size + PAGES_FOR_IO)) / 2
1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
1326 /* Compute the desired number of image pages specified by image_size. */ 1345 /* Compute the desired number of image pages specified by image_size. */
1327 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1346 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1328 if (size > max_size) 1347 if (size > max_size)
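
The max_size expression above is easier to read with concrete numbers plugged in. A small userspace sketch of the same arithmetic; the page counts and the PAGES_FOR_IO value are assumptions chosen for the illustration, not values read from a real machine:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long page_size = 4096;
        unsigned long count = 500000;           /* assumed usable page frames */
        unsigned long meta = 1000;              /* assumed metadata pages */
        unsigned long pages_for_io = 1024;      /* assumed PAGES_FOR_IO */
        unsigned long reserved_size = 1024 * 1024;      /* 1 MiB reserve */
        unsigned long max_size;

        max_size = (count - (meta + pages_for_io)) / 2
                   - 2 * DIV_ROUND_UP(reserved_size, page_size);
        printf("max image size: %lu pages (~%lu MiB)\n",
               max_size, max_size * page_size >> 20);
        return 0;
}

With these inputs it prints 248476 pages, about 970 MiB.
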
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8935369d503a..b6b71ad2208f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -44,6 +44,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
44 suspend_ops = ops; 44 suspend_ops = ops;
45 mutex_unlock(&pm_mutex); 45 mutex_unlock(&pm_mutex);
46} 46}
47EXPORT_SYMBOL_GPL(suspend_set_ops);
47 48
48bool valid_state(suspend_state_t state) 49bool valid_state(suspend_state_t state)
49{ 50{
@@ -65,6 +66,7 @@ int suspend_valid_only_mem(suspend_state_t state)
65{ 66{
66 return state == PM_SUSPEND_MEM; 67 return state == PM_SUSPEND_MEM;
67} 68}
69EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
68 70
69static int suspend_test(int level) 71static int suspend_test(int level)
70{ 72{
@@ -126,12 +128,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
126} 128}
127 129
128/** 130/**
129 * suspend_enter - enter the desired system sleep state. 131 * suspend_enter - enter the desired system sleep state.
130 * @state: state to enter 132 * @state: State to enter
133 * @wakeup: Returns information that suspend should not be entered again.
131 * 134 *
132 * This function should be called after devices have been suspended. 135 * This function should be called after devices have been suspended.
133 */ 136 */
134static int suspend_enter(suspend_state_t state) 137static int suspend_enter(suspend_state_t state, bool *wakeup)
135{ 138{
136 int error; 139 int error;
137 140
@@ -163,19 +166,14 @@ static int suspend_enter(suspend_state_t state)
163 arch_suspend_disable_irqs(); 166 arch_suspend_disable_irqs();
164 BUG_ON(!irqs_disabled()); 167 BUG_ON(!irqs_disabled());
165 168
166 error = sysdev_suspend(PMSG_SUSPEND); 169 error = syscore_suspend();
167 if (!error) { 170 if (!error) {
168 error = syscore_suspend(); 171 *wakeup = pm_wakeup_pending();
169 if (error) 172 if (!(suspend_test(TEST_CORE) || *wakeup)) {
170 sysdev_resume();
171 }
172 if (!error) {
173 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
174 error = suspend_ops->enter(state); 173 error = suspend_ops->enter(state);
175 events_check_enabled = false; 174 events_check_enabled = false;
176 } 175 }
177 syscore_resume(); 176 syscore_resume();
178 sysdev_resume();
179 } 177 }
180 178
181 arch_suspend_enable_irqs(); 179 arch_suspend_enable_irqs();
@@ -205,6 +203,7 @@ static int suspend_enter(suspend_state_t state)
205int suspend_devices_and_enter(suspend_state_t state) 203int suspend_devices_and_enter(suspend_state_t state)
206{ 204{
207 int error; 205 int error;
206 bool wakeup = false;
208 207
209 if (!suspend_ops) 208 if (!suspend_ops)
210 return -ENOSYS; 209 return -ENOSYS;
@@ -216,7 +215,6 @@ int suspend_devices_and_enter(suspend_state_t state)
216 goto Close; 215 goto Close;
217 } 216 }
218 suspend_console(); 217 suspend_console();
219 pm_restrict_gfp_mask();
220 suspend_test_start(); 218 suspend_test_start();
221 error = dpm_suspend_start(PMSG_SUSPEND); 219 error = dpm_suspend_start(PMSG_SUSPEND);
222 if (error) { 220 if (error) {
@@ -227,13 +225,15 @@ int suspend_devices_and_enter(suspend_state_t state)
227 if (suspend_test(TEST_DEVICES)) 225 if (suspend_test(TEST_DEVICES))
228 goto Recover_platform; 226 goto Recover_platform;
229 227
230 suspend_enter(state); 228 do {
229 error = suspend_enter(state, &wakeup);
230 } while (!error && !wakeup
231 && suspend_ops->suspend_again && suspend_ops->suspend_again());
231 232
232 Resume_devices: 233 Resume_devices:
233 suspend_test_start(); 234 suspend_test_start();
234 dpm_resume_end(PMSG_RESUME); 235 dpm_resume_end(PMSG_RESUME);
235 suspend_test_finish("resume devices"); 236 suspend_test_finish("resume devices");
236 pm_restore_gfp_mask();
237 resume_console(); 237 resume_console();
238 Close: 238 Close:
239 if (suspend_ops->end) 239 if (suspend_ops->end)
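
The do/while loop above is what makes the new suspend_again() hook useful: a platform can service a periodic wakeup and drop straight back into suspend without resuming every device. A minimal sketch of a platform_suspend_ops using it, with invented example_ names; the ops structure, suspend_set_ops() and suspend_valid_only_mem() (both exported in this patch) are the real interface:

#include <linux/suspend.h>

/* Hypothetical check: did only the housekeeping timer wake us? */
static bool example_wakeup_was_housekeeping_only(void)
{
        return false;
}

static int example_enter(suspend_state_t state)
{
        /* put the SoC into its low-power state for @state */
        return 0;
}

static bool example_suspend_again(void)
{
        /* true means suspend_enter() is called again right away */
        return example_wakeup_was_housekeeping_only();
}

static const struct platform_suspend_ops example_suspend_ops = {
        .valid          = suspend_valid_only_mem,
        .enter          = example_enter,
        .suspend_again  = example_suspend_again,
};

/* suspend_set_ops(&example_suspend_ops) from the platform's init code */
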
@@ -294,7 +294,9 @@ int enter_state(suspend_state_t state)
294 goto Finish; 294 goto Finish;
295 295
296 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 296 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
297 pm_restrict_gfp_mask();
297 error = suspend_devices_and_enter(state); 298 error = suspend_devices_and_enter(state);
299 pm_restore_gfp_mask();
298 300
299 Finish: 301 Finish:
300 pr_debug("PM: Finishing wakeup.\n"); 302 pr_debug("PM: Finishing wakeup.\n");
diff --git a/kernel/power/user.c b/kernel/power/user.c
index c36c3b9e8a84..42ddbc6f0de6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
113 if (error) 113 if (error)
114 pm_notifier_call_chain(PM_POST_RESTORE); 114 pm_notifier_call_chain(PM_POST_RESTORE);
115 } 115 }
116 if (error) 116 if (error) {
117 free_basic_memory_bitmaps();
117 atomic_inc(&snapshot_device_available); 118 atomic_inc(&snapshot_device_available);
119 }
118 data->frozen = 0; 120 data->frozen = 0;
119 data->ready = 0; 121 data->ready = 0;
120 data->platform_support = 0; 122 data->platform_support = 0;
@@ -135,8 +137,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
135 free_basic_memory_bitmaps(); 137 free_basic_memory_bitmaps();
136 data = filp->private_data; 138 data = filp->private_data;
137 free_all_swap_pages(data->swap); 139 free_all_swap_pages(data->swap);
138 if (data->frozen) 140 if (data->frozen) {
141 pm_restore_gfp_mask();
139 thaw_processes(); 142 thaw_processes();
143 }
140 pm_notifier_call_chain(data->mode == O_RDONLY ? 144 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 145 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 146 atomic_inc(&snapshot_device_available);
@@ -379,6 +383,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
379 * PM_HIBERNATION_PREPARE 383 * PM_HIBERNATION_PREPARE
380 */ 384 */
381 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 385 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
386 data->ready = 0;
382 break; 387 break;
383 388
384 case SNAPSHOT_PLATFORM_SUPPORT: 389 case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/printk.c b/kernel/printk.c
index da8ca817eae3..836a2ae0ac31 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
31#include <linux/smp.h> 31#include <linux/smp.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
35#include <linux/kexec.h> 36#include <linux/kexec.h>
36#include <linux/kdb.h> 37#include <linux/kdb.h>
@@ -167,46 +168,74 @@ void log_buf_kexec_setup(void)
167} 168}
168#endif 169#endif
169 170
171/* requested log_buf_len from kernel cmdline */
172static unsigned long __initdata new_log_buf_len;
173
174/* save requested log_buf_len since it's too early to process it */
170static int __init log_buf_len_setup(char *str) 175static int __init log_buf_len_setup(char *str)
171{ 176{
172 unsigned size = memparse(str, &str); 177 unsigned size = memparse(str, &str);
173 unsigned long flags;
174 178
175 if (size) 179 if (size)
176 size = roundup_pow_of_two(size); 180 size = roundup_pow_of_two(size);
177 if (size > log_buf_len) { 181 if (size > log_buf_len)
178 unsigned start, dest_idx, offset; 182 new_log_buf_len = size;
179 char *new_log_buf;
180 183
181 new_log_buf = alloc_bootmem(size); 184 return 0;
182 if (!new_log_buf) { 185}
183 printk(KERN_WARNING "log_buf_len: allocation failed\n"); 186early_param("log_buf_len", log_buf_len_setup);
184 goto out;
185 }
186 187
187 spin_lock_irqsave(&logbuf_lock, flags); 188void __init setup_log_buf(int early)
188 log_buf_len = size; 189{
189 log_buf = new_log_buf; 190 unsigned long flags;
190 191 unsigned start, dest_idx, offset;
191 offset = start = min(con_start, log_start); 192 char *new_log_buf;
192 dest_idx = 0; 193 int free;
193 while (start != log_end) { 194
194 log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; 195 if (!new_log_buf_len)
195 start++; 196 return;
196 dest_idx++; 197
197 } 198 if (early) {
198 log_start -= offset; 199 unsigned long mem;
199 con_start -= offset;
200 log_end -= offset;
201 spin_unlock_irqrestore(&logbuf_lock, flags);
202 200
203 printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); 201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
202 if (mem == MEMBLOCK_ERROR)
203 return;
204 new_log_buf = __va(mem);
205 } else {
206 new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
204 } 207 }
205out:
206 return 1;
207}
208 208
209__setup("log_buf_len=", log_buf_len_setup); 209 if (unlikely(!new_log_buf)) {
210 pr_err("log_buf_len: %ld bytes not available\n",
211 new_log_buf_len);
212 return;
213 }
214
215 spin_lock_irqsave(&logbuf_lock, flags);
216 log_buf_len = new_log_buf_len;
217 log_buf = new_log_buf;
218 new_log_buf_len = 0;
219 free = __LOG_BUF_LEN - log_end;
220
221 offset = start = min(con_start, log_start);
222 dest_idx = 0;
223 while (start != log_end) {
224 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
225
226 log_buf[dest_idx] = __log_buf[log_idx_mask];
227 start++;
228 dest_idx++;
229 }
230 log_start -= offset;
231 con_start -= offset;
232 log_end -= offset;
233 spin_unlock_irqrestore(&logbuf_lock, flags);
234
235 pr_info("log_buf_len: %d\n", log_buf_len);
236 pr_info("early log buf free: %d(%d%%)\n",
237 free, (free * 100) / __LOG_BUF_LEN);
238}
210 239
211#ifdef CONFIG_BOOT_PRINTK_DELAY 240#ifdef CONFIG_BOOT_PRINTK_DELAY
212 241
@@ -289,8 +318,10 @@ static int check_syslog_permissions(int type, bool from_file)
289 return 0; 318 return 0;
290 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ 319 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
291 if (capable(CAP_SYS_ADMIN)) { 320 if (capable(CAP_SYS_ADMIN)) {
292 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " 321 printk_once(KERN_WARNING "%s (%d): "
293 "but no CAP_SYSLOG (deprecated).\n"); 322 "Attempt to access syslog with CAP_SYS_ADMIN "
323 "but no CAP_SYSLOG (deprecated).\n",
324 current->comm, task_pid_nr(current));
294 return 0; 325 return 0;
295 } 326 }
296 return -EPERM; 327 return -EPERM;
@@ -753,7 +784,7 @@ static inline int can_use_console(unsigned int cpu)
753static int console_trylock_for_printk(unsigned int cpu) 784static int console_trylock_for_printk(unsigned int cpu)
754 __releases(&logbuf_lock) 785 __releases(&logbuf_lock)
755{ 786{
756 int retval = 0; 787 int retval = 0, wake = 0;
757 788
758 if (console_trylock()) { 789 if (console_trylock()) {
759 retval = 1; 790 retval = 1;
@@ -766,12 +797,14 @@ static int console_trylock_for_printk(unsigned int cpu)
766 */ 797 */
767 if (!can_use_console(cpu)) { 798 if (!can_use_console(cpu)) {
768 console_locked = 0; 799 console_locked = 0;
769 up(&console_sem); 800 wake = 1;
770 retval = 0; 801 retval = 0;
771 } 802 }
772 } 803 }
773 printk_cpu = UINT_MAX; 804 printk_cpu = UINT_MAX;
774 spin_unlock(&logbuf_lock); 805 spin_unlock(&logbuf_lock);
806 if (wake)
807 up(&console_sem);
775 return retval; 808 return retval;
776} 809}
777static const char recursion_bug_msg [] = 810static const char recursion_bug_msg [] =
@@ -1213,7 +1246,7 @@ void console_unlock(void)
1213{ 1246{
1214 unsigned long flags; 1247 unsigned long flags;
1215 unsigned _con_start, _log_end; 1248 unsigned _con_start, _log_end;
1216 unsigned wake_klogd = 0; 1249 unsigned wake_klogd = 0, retry = 0;
1217 1250
1218 if (console_suspended) { 1251 if (console_suspended) {
1219 up(&console_sem); 1252 up(&console_sem);
@@ -1222,6 +1255,7 @@ void console_unlock(void)
1222 1255
1223 console_may_schedule = 0; 1256 console_may_schedule = 0;
1224 1257
1258again:
1225 for ( ; ; ) { 1259 for ( ; ; ) {
1226 spin_lock_irqsave(&logbuf_lock, flags); 1260 spin_lock_irqsave(&logbuf_lock, flags);
1227 wake_klogd |= log_start - log_end; 1261 wake_klogd |= log_start - log_end;
@@ -1242,8 +1276,23 @@ void console_unlock(void)
1242 if (unlikely(exclusive_console)) 1276 if (unlikely(exclusive_console))
1243 exclusive_console = NULL; 1277 exclusive_console = NULL;
1244 1278
1279 spin_unlock(&logbuf_lock);
1280
1245 up(&console_sem); 1281 up(&console_sem);
1282
1283 /*
1284 * Someone could have filled up the buffer again, so re-check if there's
1285 * something to flush. In case we cannot trylock the console_sem again,
1286 * there's a new owner and the console_unlock() from them will do the
1287 * flush, no worries.
1288 */
1289 spin_lock(&logbuf_lock);
1290 if (con_start != log_end)
1291 retry = 1;
1246 spin_unlock_irqrestore(&logbuf_lock, flags); 1292 spin_unlock_irqrestore(&logbuf_lock, flags);
1293 if (retry && console_trylock())
1294 goto again;
1295
1247 if (wake_klogd) 1296 if (wake_klogd)
1248 wake_up_klogd(); 1297 wake_up_klogd();
1249} 1298}
diff --git a/kernel/profile.c b/kernel/profile.c
index 66f841b7fbd3..961b389fe52f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -126,11 +126,9 @@ int __ref profile_init(void)
126 if (prof_buffer) 126 if (prof_buffer)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vzalloc(buffer_bytes);
130 if (prof_buffer) { 130 if (prof_buffer)
131 memset(prof_buffer, 0, buffer_bytes);
132 return 0; 131 return 0;
133 }
134 132
135 free_cpumask_var(prof_cpu_mask); 133 free_cpumask_var(prof_cpu_mask);
136 return -ENOMEM; 134 return -ENOMEM;
@@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void)
305 mutex_unlock(&profile_flip_mutex); 303 mutex_unlock(&profile_flip_mutex);
306} 304}
307 305
308void profile_hits(int type, void *__pc, unsigned int nr_hits) 306static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
309{ 307{
310 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 308 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
311 int i, j, cpu; 309 int i, j, cpu;
312 struct profile_hit *hits; 310 struct profile_hit *hits;
313 311
314 if (prof_on != type || !prof_buffer)
315 return;
316 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); 312 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
317 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 313 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
318 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 314 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
@@ -419,16 +415,20 @@ out_free:
419#define profile_discard_flip_buffers() do { } while (0) 415#define profile_discard_flip_buffers() do { } while (0)
420#define profile_cpu_callback NULL 416#define profile_cpu_callback NULL
421 417
422void profile_hits(int type, void *__pc, unsigned int nr_hits) 418static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
423{ 419{
424 unsigned long pc; 420 unsigned long pc;
425
426 if (prof_on != type || !prof_buffer)
427 return;
428 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 421 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
429 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 422 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
430} 423}
431#endif /* !CONFIG_SMP */ 424#endif /* !CONFIG_SMP */
425
426void profile_hits(int type, void *__pc, unsigned int nr_hits)
427{
428 if (prof_on != type || !prof_buffer)
429 return;
430 do_profile_hits(type, __pc, nr_hits);
431}
432EXPORT_SYMBOL_GPL(profile_hits); 432EXPORT_SYMBOL_GPL(profile_hits);
433 433
434void profile_tick(int type) 434void profile_tick(int type)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dc7ab65f3b36..9de3ecfd20f9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -23,8 +23,15 @@
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h> 25#include <linux/hw_breakpoint.h>
26#include <linux/cn_proc.h>
26 27
27 28
29static int ptrace_trapping_sleep_fn(void *flags)
30{
31 schedule();
32 return 0;
33}
34
28/* 35/*
29 * ptrace a task: make the debugger its new parent and 36 * ptrace a task: make the debugger its new parent and
30 * move it to the ptrace list. 37 * move it to the ptrace list.
@@ -38,35 +45,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
38 child->parent = new_parent; 45 child->parent = new_parent;
39} 46}
40 47
41/* 48/**
42 * Turn a tracing stop into a normal stop now, since with no tracer there 49 * __ptrace_unlink - unlink ptracee and restore its execution state
43 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a 50 * @child: ptracee to be unlinked
44 * signal sent that would resume the child, but didn't because it was in
45 * TASK_TRACED, resume it now.
46 * Requires that irqs be disabled.
47 */
48static void ptrace_untrace(struct task_struct *child)
49{
50 spin_lock(&child->sighand->siglock);
51 if (task_is_traced(child)) {
52 /*
53 * If the group stop is completed or in progress,
54 * this thread was already counted as stopped.
55 */
56 if (child->signal->flags & SIGNAL_STOP_STOPPED ||
57 child->signal->group_stop_count)
58 __set_task_state(child, TASK_STOPPED);
59 else
60 signal_wake_up(child, 1);
61 }
62 spin_unlock(&child->sighand->siglock);
63}
64
65/*
66 * unptrace a task: move it back to its original parent and
67 * remove it from the ptrace list.
68 * 51 *
69 * Must be called with the tasklist lock write-held. 52 * Remove @child from the ptrace list, move it back to the original parent,
53 * and restore the execution state so that it conforms to the group stop
54 * state.
55 *
56 * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
57 * exiting. For PTRACE_DETACH, unless the ptracee has been killed between
58 * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
59 * If the ptracer is exiting, the ptracee can be in any state.
60 *
61 * After detach, the ptracee should be in a state which conforms to the
62 * group stop. If the group is stopped or in the process of stopping, the
63 * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
64 * up from TASK_TRACED.
65 *
66 * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
67 * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
68 * to but in the opposite direction of what happens while attaching to a
69 * stopped task. However, in this direction, the intermediate RUNNING
70 * state is not hidden even from the current ptracer and if it immediately
71 * re-attaches and performs a WNOHANG wait(2), it may fail.
72 *
73 * CONTEXT:
74 * write_lock_irq(tasklist_lock)
70 */ 75 */
71void __ptrace_unlink(struct task_struct *child) 76void __ptrace_unlink(struct task_struct *child)
72{ 77{
@@ -76,14 +81,54 @@ void __ptrace_unlink(struct task_struct *child)
76 child->parent = child->real_parent; 81 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_entry); 82 list_del_init(&child->ptrace_entry);
78 83
79 if (task_is_traced(child)) 84 spin_lock(&child->sighand->siglock);
80 ptrace_untrace(child); 85
86 /*
87 * Clear all pending traps and TRAPPING. TRAPPING should be
88 * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
89 */
90 task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
91 task_clear_jobctl_trapping(child);
92
93 /*
94 * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
95 * @child isn't dead.
96 */
97 if (!(child->flags & PF_EXITING) &&
98 (child->signal->flags & SIGNAL_STOP_STOPPED ||
99 child->signal->group_stop_count))
100 child->jobctl |= JOBCTL_STOP_PENDING;
101
102 /*
103 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
104 * @child in the butt. Note that @resume should be used iff @child
105 * is in TASK_TRACED; otherwise, we might unduly disrupt
106 * TASK_KILLABLE sleeps.
107 */
108 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
109 signal_wake_up(child, task_is_traced(child));
110
111 spin_unlock(&child->sighand->siglock);
81} 112}
82 113
83/* 114/**
84 * Check that we have indeed attached to the thing.. 115 * ptrace_check_attach - check whether ptracee is ready for ptrace operation
116 * @child: ptracee to check for
117 * @ignore_state: don't check whether @child is currently %TASK_TRACED
118 *
119 * Check whether @child is being ptraced by %current and ready for further
120 * ptrace operations. If @ignore_state is %false, @child also should be in
121 * %TASK_TRACED state and on return the child is guaranteed to be traced
122 * and not executing. If @ignore_state is %true, @child can be in any
123 * state.
124 *
125 * CONTEXT:
126 * Grabs and releases tasklist_lock and @child->sighand->siglock.
127 *
128 * RETURNS:
129 * 0 on success, -ESRCH if %child is not ready.
85 */ 130 */
86int ptrace_check_attach(struct task_struct *child, int kill) 131int ptrace_check_attach(struct task_struct *child, bool ignore_state)
87{ 132{
88 int ret = -ESRCH; 133 int ret = -ESRCH;
89 134
@@ -96,21 +141,20 @@ int ptrace_check_attach(struct task_struct *child, int kill)
96 */ 141 */
97 read_lock(&tasklist_lock); 142 read_lock(&tasklist_lock);
98 if ((child->ptrace & PT_PTRACED) && child->parent == current) { 143 if ((child->ptrace & PT_PTRACED) && child->parent == current) {
99 ret = 0;
100 /* 144 /*
101 * child->sighand can't be NULL, release_task() 145 * child->sighand can't be NULL, release_task()
102 * does ptrace_unlink() before __exit_signal(). 146 * does ptrace_unlink() before __exit_signal().
103 */ 147 */
104 spin_lock_irq(&child->sighand->siglock); 148 spin_lock_irq(&child->sighand->siglock);
105 if (task_is_stopped(child)) 149 WARN_ON_ONCE(task_is_stopped(child));
106 child->state = TASK_TRACED; 150 if (ignore_state || (task_is_traced(child) &&
107 else if (!task_is_traced(child) && !kill) 151 !(child->jobctl & JOBCTL_LISTENING)))
108 ret = -ESRCH; 152 ret = 0;
109 spin_unlock_irq(&child->sighand->siglock); 153 spin_unlock_irq(&child->sighand->siglock);
110 } 154 }
111 read_unlock(&tasklist_lock); 155 read_unlock(&tasklist_lock);
112 156
113 if (!ret && !kill) 157 if (!ret && !ignore_state)
114 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; 158 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
115 159
116 /* All systems go.. */ 160 /* All systems go.. */
@@ -167,10 +211,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
167 return !err; 211 return !err;
168} 212}
169 213
170static int ptrace_attach(struct task_struct *task) 214static int ptrace_attach(struct task_struct *task, long request,
215 unsigned long flags)
171{ 216{
217 bool seize = (request == PTRACE_SEIZE);
172 int retval; 218 int retval;
173 219
220 /*
221 * SEIZE will enable new ptrace behaviors which will be implemented
222 * gradually. SEIZE_DEVEL is used to prevent applications
223 * expecting full SEIZE behaviors trapping on kernel commits which
224 * are still in the process of implementing them.
225 *
226 * Only test programs for new ptrace behaviors being implemented
227 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
228 *
229 * Once SEIZE behaviors are completely implemented, this flag and
230 * the following test will be removed.
231 */
232 retval = -EIO;
233 if (seize && !(flags & PTRACE_SEIZE_DEVEL))
234 goto out;
235
174 audit_ptrace(task); 236 audit_ptrace(task);
175 237
176 retval = -EPERM; 238 retval = -EPERM;
@@ -202,11 +264,41 @@ static int ptrace_attach(struct task_struct *task)
202 goto unlock_tasklist; 264 goto unlock_tasklist;
203 265
204 task->ptrace = PT_PTRACED; 266 task->ptrace = PT_PTRACED;
267 if (seize)
268 task->ptrace |= PT_SEIZED;
205 if (task_ns_capable(task, CAP_SYS_PTRACE)) 269 if (task_ns_capable(task, CAP_SYS_PTRACE))
206 task->ptrace |= PT_PTRACE_CAP; 270 task->ptrace |= PT_PTRACE_CAP;
207 271
208 __ptrace_link(task, current); 272 __ptrace_link(task, current);
209 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 273
274 /* SEIZE doesn't trap tracee on attach */
275 if (!seize)
276 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
277
278 spin_lock(&task->sighand->siglock);
279
280 /*
281 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
282 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
283 * will be cleared if the child completes the transition or any
284 * event which clears the group stop states happens. We'll wait
285 * for the transition to complete before returning from this
286 * function.
287 *
288 * This hides STOPPED -> RUNNING -> TRACED transition from the
289 * attaching thread but a different thread in the same group can
290 * still observe the transient RUNNING state. IOW, if another
291 * thread's WNOHANG wait(2) on the stopped tracee races against
292 * ATTACH, the wait(2) may fail due to the transient RUNNING.
293 *
294 * The following task_is_stopped() test is safe as both transitions
295 * in and out of STOPPED are protected by siglock.
296 */
297 if (task_is_stopped(task) &&
298 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
299 signal_wake_up(task, 1);
300
301 spin_unlock(&task->sighand->siglock);
210 302
211 retval = 0; 303 retval = 0;
212unlock_tasklist: 304unlock_tasklist:
@@ -214,6 +306,12 @@ unlock_tasklist:
214unlock_creds: 306unlock_creds:
215 mutex_unlock(&task->signal->cred_guard_mutex); 307 mutex_unlock(&task->signal->cred_guard_mutex);
216out: 308out:
309 if (!retval) {
310 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
311 ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
312 proc_ptrace_connector(task, PTRACE_ATTACH);
313 }
314
217 return retval; 315 return retval;
218} 316}
219 317
@@ -276,25 +374,27 @@ static int ignoring_children(struct sighand_struct *sigh)
276 */ 374 */
277static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) 375static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
278{ 376{
377 bool dead;
378
279 __ptrace_unlink(p); 379 __ptrace_unlink(p);
280 380
281 if (p->exit_state == EXIT_ZOMBIE) { 381 if (p->exit_state != EXIT_ZOMBIE)
282 if (!task_detached(p) && thread_group_empty(p)) { 382 return false;
283 if (!same_thread_group(p->real_parent, tracer)) 383
284 do_notify_parent(p, p->exit_signal); 384 dead = !thread_group_leader(p);
285 else if (ignoring_children(tracer->sighand)) { 385
286 __wake_up_parent(p, tracer); 386 if (!dead && thread_group_empty(p)) {
287 p->exit_signal = -1; 387 if (!same_thread_group(p->real_parent, tracer))
288 } 388 dead = do_notify_parent(p, p->exit_signal);
289 } 389 else if (ignoring_children(tracer->sighand)) {
290 if (task_detached(p)) { 390 __wake_up_parent(p, tracer);
291 /* Mark it as in the process of being reaped. */ 391 dead = true;
292 p->exit_state = EXIT_DEAD;
293 return true;
294 } 392 }
295 } 393 }
296 394 /* Mark it as in the process of being reaped. */
297 return false; 395 if (dead)
396 p->exit_state = EXIT_DEAD;
397 return dead;
298} 398}
299 399
300static int ptrace_detach(struct task_struct *child, unsigned int data) 400static int ptrace_detach(struct task_struct *child, unsigned int data)
@@ -316,11 +416,10 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
316 if (child->ptrace) { 416 if (child->ptrace) {
317 child->exit_code = data; 417 child->exit_code = data;
318 dead = __ptrace_detach(current, child); 418 dead = __ptrace_detach(current, child);
319 if (!child->exit_state)
320 wake_up_state(child, TASK_TRACED | TASK_STOPPED);
321 } 419 }
322 write_unlock_irq(&tasklist_lock); 420 write_unlock_irq(&tasklist_lock);
323 421
422 proc_ptrace_connector(child, PTRACE_DETACH);
324 if (unlikely(dead)) 423 if (unlikely(dead))
325 release_task(child); 424 release_task(child);
326 425
@@ -518,7 +617,7 @@ static int ptrace_resume(struct task_struct *child, long request,
518 } 617 }
519 618
520 child->exit_code = data; 619 child->exit_code = data;
521 wake_up_process(child); 620 wake_up_state(child, __TASK_TRACED);
522 621
523 return 0; 622 return 0;
524} 623}
@@ -567,10 +666,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
567int ptrace_request(struct task_struct *child, long request, 666int ptrace_request(struct task_struct *child, long request,
568 unsigned long addr, unsigned long data) 667 unsigned long addr, unsigned long data)
569{ 668{
669 bool seized = child->ptrace & PT_SEIZED;
570 int ret = -EIO; 670 int ret = -EIO;
571 siginfo_t siginfo; 671 siginfo_t siginfo, *si;
572 void __user *datavp = (void __user *) data; 672 void __user *datavp = (void __user *) data;
573 unsigned long __user *datalp = datavp; 673 unsigned long __user *datalp = datavp;
674 unsigned long flags;
574 675
575 switch (request) { 676 switch (request) {
576 case PTRACE_PEEKTEXT: 677 case PTRACE_PEEKTEXT:
@@ -603,6 +704,62 @@ int ptrace_request(struct task_struct *child, long request,
603 ret = ptrace_setsiginfo(child, &siginfo); 704 ret = ptrace_setsiginfo(child, &siginfo);
604 break; 705 break;
605 706
707 case PTRACE_INTERRUPT:
708 /*
709 * Stop tracee without any side-effect on signal or job
710 * control. At least one trap is guaranteed to happen
711 * after this request. If @child is already trapped, the
712 * current trap is not disturbed and another trap will
713 * happen after the current trap is ended with PTRACE_CONT.
714 *
715 * The actual trap might not be PTRACE_EVENT_STOP trap but
716 * the pending condition is cleared regardless.
717 */
718 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
719 break;
720
721 /*
722 * INTERRUPT doesn't disturb existing trap sans one
723 * exception. If ptracer issued LISTEN for the current
724 * STOP, this INTERRUPT should clear LISTEN and re-trap
725 * tracee into STOP.
726 */
727 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
728 signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
729
730 unlock_task_sighand(child, &flags);
731 ret = 0;
732 break;
733
734 case PTRACE_LISTEN:
735 /*
736 * Listen for events. Tracee must be in STOP. It's not
 737 * resumed per se but is not considered to be in TRACED by
738 * wait(2) or ptrace(2). If an async event (e.g. group
739 * stop state change) happens, tracee will enter STOP trap
740 * again. Alternatively, ptracer can issue INTERRUPT to
741 * finish listening and re-trap tracee into STOP.
742 */
743 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
744 break;
745
746 si = child->last_siginfo;
747 if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP))
748 break;
749
750 child->jobctl |= JOBCTL_LISTENING;
751
752 /*
753 * If NOTIFY is set, it means event happened between start
754 * of this trap and now. Trigger re-trap immediately.
755 */
756 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
757 signal_wake_up(child, true);
758
759 unlock_task_sighand(child, &flags);
760 ret = 0;
761 break;
762
606 case PTRACE_DETACH: /* detach a process that was attached. */ 763 case PTRACE_DETACH: /* detach a process that was attached. */
607 ret = ptrace_detach(child, data); 764 ret = ptrace_detach(child, data);
608 break; 765 break;
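
A userspace sketch of how the new requests fit together. The request numbers below are the ones used by this series and should be checked against the matching <linux/ptrace.h>; PTRACE_SEIZE_DEVEL is the opt-in flag the attach path above insists on while SEIZE behaviour is still being developed, and the target pid is taken from the command line:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef PTRACE_SEIZE
#define PTRACE_SEIZE            0x4206
#define PTRACE_INTERRUPT        0x4207
#define PTRACE_LISTEN           0x4208
#endif
#ifndef PTRACE_SEIZE_DEVEL
#define PTRACE_SEIZE_DEVEL      0x80000000
#endif

int main(int argc, char **argv)
{
        pid_t pid;
        int status;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        pid = (pid_t)atoi(argv[1]);

        /* Attach without trapping the tracee (contrast with PTRACE_ATTACH). */
        if (ptrace(PTRACE_SEIZE, pid, NULL,
                   (void *)(unsigned long)PTRACE_SEIZE_DEVEL) < 0) {
                perror("PTRACE_SEIZE");
                return 1;
        }

        /* Request a trap; the tracee stops with a PTRACE_EVENT_STOP trap. */
        ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
        if (waitpid(pid, &status, 0) < 0) {
                perror("waitpid");
                return 1;
        }

        /*
         * LISTEN leaves the tracee stopped for job-control purposes while
         * the tracer waits for more events; another PTRACE_INTERRUPT (or a
         * group-stop state change) traps it into STOP again.
         */
        ptrace(PTRACE_LISTEN, pid, NULL, NULL);
        return 0;
}
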
@@ -717,8 +874,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
717 goto out; 874 goto out;
718 } 875 }
719 876
720 if (request == PTRACE_ATTACH) { 877 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
721 ret = ptrace_attach(child); 878 ret = ptrace_attach(child, request, data);
722 /* 879 /*
723 * Some architectures need to do book-keeping after 880 * Some architectures need to do book-keeping after
724 * a ptrace attach. 881 * a ptrace attach.
@@ -728,7 +885,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
728 goto out_put_task_struct; 885 goto out_put_task_struct;
729 } 886 }
730 887
731 ret = ptrace_check_attach(child, request == PTRACE_KILL); 888 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
889 request == PTRACE_INTERRUPT);
732 if (ret < 0) 890 if (ret < 0)
733 goto out_put_task_struct; 891 goto out_put_task_struct;
734 892
@@ -859,8 +1017,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
859 goto out; 1017 goto out;
860 } 1018 }
861 1019
862 if (request == PTRACE_ATTACH) { 1020 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
863 ret = ptrace_attach(child); 1021 ret = ptrace_attach(child, request, data);
864 /* 1022 /*
865 * Some architectures need to do book-keeping after 1023 * Some architectures need to do book-keeping after
866 * a ptrace attach. 1024 * a ptrace attach.
@@ -870,7 +1028,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
870 goto out_put_task_struct; 1028 goto out_put_task_struct;
871 } 1029 }
872 1030
873 ret = ptrace_check_attach(child, request == PTRACE_KILL); 1031 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
1032 request == PTRACE_INTERRUPT);
874 if (!ret) 1033 if (!ret)
875 ret = compat_arch_ptrace(child, request, addr, data); 1034 ret = compat_arch_ptrace(child, request, addr, data);
876 1035
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f3240e987928..ddddb320be61 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -37,7 +37,7 @@
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/interrupt.h> 38#include <linux/interrupt.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <asm/atomic.h> 40#include <linux/atomic.h>
41#include <linux/bitops.h> 41#include <linux/bitops.h>
42#include <linux/percpu.h> 42#include <linux/percpu.h>
43#include <linux/notifier.h> 43#include <linux/notifier.h>
@@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
142 * Ensure that queued callbacks are all executed. 142 * Ensure that queued callbacks are all executed.
143 * If we detect that we are nested in a RCU read-side critical 143 * If we detect that we are nested in a RCU read-side critical
144 * section, we should simply fail, otherwise we would deadlock. 144 * section, we should simply fail, otherwise we would deadlock.
145 * In !PREEMPT configurations, there is no way to tell if we are
146 * in a RCU read-side critical section or not, so we never
147 * attempt any fixup and just print a warning.
145 */ 148 */
149#ifndef CONFIG_PREEMPT
150 WARN_ON_ONCE(1);
151 return 0;
152#endif
146 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 153 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
147 irqs_disabled()) { 154 irqs_disabled()) {
148 WARN_ON(1); 155 WARN_ON_ONCE(1);
149 return 0; 156 return 0;
150 } 157 }
151 rcu_barrier(); 158 rcu_barrier();
@@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
184 * Ensure that queued callbacks are all executed. 191 * Ensure that queued callbacks are all executed.
185 * If we detect that we are nested in a RCU read-side critical 192 * If we detect that we are nested in a RCU read-side critical
186 * section, we should simply fail, otherwise we would deadlock. 193 * section, we should simply fail, otherwise we would deadlock.
194 * In !PREEMPT configurations, there is no way to tell if we are
195 * in a RCU read-side critical section or not, so we never
196 * attempt any fixup and just print a warning.
187 */ 197 */
198#ifndef CONFIG_PREEMPT
199 WARN_ON_ONCE(1);
200 return 0;
201#endif
188 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 202 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
189 irqs_disabled()) { 203 irqs_disabled()) {
190 WARN_ON(1); 204 WARN_ON_ONCE(1);
191 return 0; 205 return 0;
192 } 206 }
193 rcu_barrier(); 207 rcu_barrier();
@@ -214,15 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 228 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 229 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 230 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether 231 * In !PREEMPT configurations, there is no way to tell if we are
218 * or not we are in an RCU read-side critical section 232 * in a RCU read-side critical section or not, so we never
219 * exists only in the preemptible RCU implementations 233 * attempt any fixup and just print a warning.
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
222 */ 234 */
235#ifndef CONFIG_PREEMPT
236 WARN_ON_ONCE(1);
237 return 0;
238#endif
223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 239 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
224 irqs_disabled()) { 240 irqs_disabled()) {
225 WARN_ON(1); 241 WARN_ON_ONCE(1);
226 return 0; 242 return 0;
227 } 243 }
228 rcu_barrier(); 244 rcu_barrier();
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0c343b9a46d5..7bbac7d0f5ab 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -35,15 +35,16 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/prefetch.h>
38 39
39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ 40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40static struct task_struct *rcu_kthread_task; 41static struct task_struct *rcu_kthread_task;
41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42static unsigned long have_rcu_kthread_work; 43static unsigned long have_rcu_kthread_work;
43static void invoke_rcu_kthread(void);
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void);
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg); 49static int rcu_kthread(void *arg);
49static void __call_rcu(struct rcu_head *head, 50static void __call_rcu(struct rcu_head *head,
@@ -79,36 +80,45 @@ void rcu_exit_nohz(void)
79#endif /* #ifdef CONFIG_NO_HZ */ 80#endif /* #ifdef CONFIG_NO_HZ */
80 81
81/* 82/*
82 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). 83 * Helper function for rcu_sched_qs() and rcu_bh_qs().
83 * Also disable irqs to avoid confusion due to interrupt handlers 84 * Also irqs are disabled to avoid confusion due to interrupt handlers
84 * invoking call_rcu(). 85 * invoking call_rcu().
85 */ 86 */
86static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 87static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
87{ 88{
88 unsigned long flags;
89
90 local_irq_save(flags);
91 if (rcp->rcucblist != NULL && 89 if (rcp->rcucblist != NULL &&
92 rcp->donetail != rcp->curtail) { 90 rcp->donetail != rcp->curtail) {
93 rcp->donetail = rcp->curtail; 91 rcp->donetail = rcp->curtail;
94 local_irq_restore(flags);
95 return 1; 92 return 1;
96 } 93 }
97 local_irq_restore(flags);
98 94
99 return 0; 95 return 0;
100} 96}
101 97
102/* 98/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
103 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
104 * are at it, given that any rcu quiescent state is also an rcu_bh 110 * are at it, given that any rcu quiescent state is also an rcu_bh
105 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 111 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
106 */ 112 */
107void rcu_sched_qs(int cpu) 113void rcu_sched_qs(int cpu)
108{ 114{
115 unsigned long flags;
116
117 local_irq_save(flags);
109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
110 rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 rcu_qsctr_help(&rcu_bh_ctrlblk))
111 invoke_rcu_kthread(); 120 invoke_rcu_kthread();
121 local_irq_restore(flags);
112} 122}
113 123
114/* 124/*
@@ -116,8 +126,12 @@ void rcu_sched_qs(int cpu)
116 */ 126 */
117void rcu_bh_qs(int cpu) 127void rcu_bh_qs(int cpu)
118{ 128{
129 unsigned long flags;
130
131 local_irq_save(flags);
119 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 132 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 133 invoke_rcu_kthread();
134 local_irq_restore(flags);
121} 135}
122 136
123/* 137/*
@@ -167,7 +181,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
167 prefetch(next); 181 prefetch(next);
168 debug_rcu_head_unqueue(list); 182 debug_rcu_head_unqueue(list);
169 local_bh_disable(); 183 local_bh_disable();
170 list->func(list); 184 __rcu_reclaim(list);
171 local_bh_enable(); 185 local_bh_enable();
172 list = next; 186 list = next;
173 RCU_TRACE(cb_count++); 187 RCU_TRACE(cb_count++);
@@ -208,20 +222,6 @@ static int rcu_kthread(void *arg)
208} 222}
209 223
210/* 224/*
211 * Wake up rcu_kthread() to process callbacks now eligible for invocation
212 * or to boost readers.
213 */
214static void invoke_rcu_kthread(void)
215{
216 unsigned long flags;
217
218 local_irq_save(flags);
219 have_rcu_kthread_work = 1;
220 wake_up(&rcu_kthread_wq);
221 local_irq_restore(flags);
222}
223
224/*
225 * Wait for a grace period to elapse. But it is illegal to invoke 225 * Wait for a grace period to elapse. But it is illegal to invoke
226 * synchronize_sched() from within an RCU read-side critical section. 226 * synchronize_sched() from within an RCU read-side critical section.
227 * Therefore, any legal call to synchronize_sched() is a quiescent 227 * Therefore, any legal call to synchronize_sched() is a quiescent
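
The rcutiny.c hunks above move callback handling out of softirq context and into a dedicated rcu_kthread(): rcu_sched_qs() and rcu_bh_qs() now do their own local_irq_save(), and when rcu_qsctr_help() reports newly ready callbacks they call invoke_rcu_kthread(), which just sets have_rcu_kthread_work and wakes the waitqueue rcu_kthread() sleeps on (the old helper disabled irqs itself, which is why it moves up in the file and loses its flags handling). A minimal sketch of that flag-plus-waitqueue kthread pattern, using only the stock kthread/wait APIs; the names (kick_worker, worker_fn and so on) are invented for the example, and module wiring plus rcutiny's careful irq-off rechecking of the flag are omitted:

#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/init.h>

static DECLARE_WAIT_QUEUE_HEAD(work_wq);
static int have_work;			/* set by producers, consumed by the kthread */
static struct task_struct *worker;

/* Producer side: safe to call with irqs disabled, as invoke_rcu_kthread() is. */
static void kick_worker(void)
{
	have_work = 1;
	wake_up(&work_wq);
}

/* Consumer side: sleep until kicked, then drain whatever became ready. */
static int worker_fn(void *unused)
{
	while (!kthread_should_stop()) {
		wait_event_interruptible(work_wq,
					 have_work || kthread_should_stop());
		if (kthread_should_stop())
			break;
		have_work = 0;
		/* ... invoke ready callbacks, do boosting, etc. ... */
	}
	return 0;
}

static int __init worker_start(void)
{
	worker = kthread_run(worker_fn, NULL, "toy_worker");
	return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}
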
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3cb8e362e883..f259c676195f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk {
100 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
101 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST 102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */ 103 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */ 104#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE 105#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods; 106 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST 107#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted; 108 unsigned long n_tasks_boosted;
109 /* Total number of tasks boosted. */
110 unsigned long n_exp_boosts; 110 unsigned long n_exp_boosts;
111 /* Number of tasks boosted for expedited GP. */
111 unsigned long n_normal_boosts; 112 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks; 113 /* Number of tasks boosted for normal GP. */
113 unsigned long n_normal_balk_gp_tasks; 114 unsigned long n_balk_blkd_tasks;
114 unsigned long n_normal_balk_boost_tasks; 115 /* Refused to boost: no blocked tasks. */
115 unsigned long n_normal_balk_boosted; 116 unsigned long n_balk_exp_gp_tasks;
116 unsigned long n_normal_balk_notyet; 117 /* Refused to boost: nothing blocking GP. */
117 unsigned long n_normal_balk_nos; 118 unsigned long n_balk_boost_tasks;
118 unsigned long n_exp_balk_blkd_tasks; 119 /* Refused to boost: already boosting. */
119 unsigned long n_exp_balk_nos; 120 unsigned long n_balk_notyet;
121 /* Refused to boost: not yet time. */
122 unsigned long n_balk_nos;
123 /* Refused to boost: not sure why, though. */
124 /* This can happen due to race conditions. */
120#endif /* #ifdef CONFIG_RCU_BOOST */ 125#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */ 126#endif /* #ifdef CONFIG_RCU_TRACE */
122}; 127};
@@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t)
201 206
202#ifdef CONFIG_RCU_BOOST 207#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void); 208static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */ 209#endif /* #ifdef CONFIG_RCU_BOOST */
206 210
207/* 211/*
@@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m)
219 "N."[!rcu_preempt_ctrlblk.gp_tasks], 223 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]); 224 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST 225#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=", 226 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]); 227 " ",
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) { 228 "B."[!rcu_preempt_ctrlblk.boost_tasks],
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted, 229 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts, 230 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts, 231 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff), 232 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); 233 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", 234 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
247 "normal balk", 235 " balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, 236 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, 237 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, 238 rcu_preempt_ctrlblk.n_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted, 239 rcu_preempt_ctrlblk.n_balk_notyet,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet, 240 rcu_preempt_ctrlblk.n_balk_nos);
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */ 241#endif /* #ifdef CONFIG_RCU_BOOST */
258} 242}
259 243
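
The consolidated seq_printf() above also leans on a compact idiom that is easy to misread: indexing a two-character string literal with a boolean, so that "B."[!p] yields 'B' when p is non-NULL and '.' when it is NULL. A standalone illustration (the variable names and values are made up, not the rcutiny trace fields):

#include <stdio.h>

int main(void)
{
	void *boost_tasks = NULL;
	int gp_blocked = 1;

	/* "B."[0] == 'B', "B."[1] == '.', and !x is always 0 or 1. */
	printf("ttb=%c N=%c\n",
	       "B."[!boost_tasks],	/* prints '.' because boost_tasks is NULL */
	       "N."[!gp_blocked]);	/* prints 'N' because gp_blocked is set   */
	return 0;
}
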
@@ -271,25 +255,59 @@ static int rcu_boost(void)
271{ 255{
272 unsigned long flags; 256 unsigned long flags;
273 struct rt_mutex mtx; 257 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t; 258 struct task_struct *t;
259 struct list_head *tb;
276 260
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL) 261 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
262 rcu_preempt_ctrlblk.exp_tasks == NULL)
278 return 0; /* Nothing to boost. */ 263 return 0; /* Nothing to boost. */
264
279 raw_local_irq_save(flags); 265 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++; 266
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, 267 /*
282 rcu_node_entry); 268 * Recheck with irqs disabled: all tasks in need of boosting
283 np = rcu_next_node_entry(t); 269 * might exit their RCU read-side critical sections on their own
270 * if we are preempted just before disabling irqs.
271 */
272 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
273 rcu_preempt_ctrlblk.exp_tasks == NULL) {
274 raw_local_irq_restore(flags);
275 return 0;
276 }
277
278 /*
279 * Preferentially boost tasks blocking expedited grace periods.
280 * This cannot starve the normal grace periods because a second
281 * expedited grace period must boost all blocked tasks, including
282 * those blocking the pre-existing normal grace period.
283 */
284 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
285 tb = rcu_preempt_ctrlblk.exp_tasks;
286 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
287 } else {
288 tb = rcu_preempt_ctrlblk.boost_tasks;
289 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
290 }
291 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
292
293 /*
294 * We boost task t by manufacturing an rt_mutex that appears to
295 * be held by task t. We leave a pointer to that rt_mutex where
296 * task t can find it, and task t will release the mutex when it
297 * exits its outermost RCU read-side critical section. Then
298 * simply acquiring this artificial rt_mutex will boost task
299 * t's priority. (Thanks to tglx for suggesting this approach!)
300 */
301 t = container_of(tb, struct task_struct, rcu_node_entry);
284 rt_mutex_init_proxy_locked(&mtx, t); 302 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx; 303 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; 304 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags); 305 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx); 306 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); 307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
290 rcu_preempt_ctrlblk.boosted_this_gp++; 308
291 rt_mutex_unlock(&mtx); 309 return rcu_preempt_ctrlblk.boost_tasks != NULL ||
292 return rcu_preempt_ctrlblk.boost_tasks != NULL; 310 rcu_preempt_ctrlblk.exp_tasks != NULL;
293} 311}
294 312
295/* 313/*
@@ -304,42 +322,25 @@ static int rcu_boost(void)
304 */ 322 */
305static int rcu_initiate_boost(void) 323static int rcu_initiate_boost(void)
306{ 324{
307 if (!rcu_preempt_blocked_readers_cgp()) { 325 if (!rcu_preempt_blocked_readers_cgp() &&
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); 326 rcu_preempt_ctrlblk.exp_tasks == NULL) {
327 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
309 return 0; 328 return 0;
310 } 329 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL && 330 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
312 rcu_preempt_ctrlblk.boost_tasks == NULL && 331 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 && 332 rcu_preempt_ctrlblk.boost_tasks == NULL &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { 333 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; 334 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread(); 337 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else 338 } else
319 RCU_TRACE(rcu_initiate_boost_trace()); 339 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1; 340 return 1;
321} 341}
322 342
323/* 343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
343 344
344/* 345/*
345 * Do priority-boost accounting for the start of a new grace period. 346 * Do priority-boost accounting for the start of a new grace period.
@@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void)
347static void rcu_preempt_boost_start_gp(void) 348static void rcu_preempt_boost_start_gp(void)
348{ 349{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; 350 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352} 351}
353 352
354#else /* #ifdef CONFIG_RCU_BOOST */ 353#else /* #ifdef CONFIG_RCU_BOOST */
@@ -372,13 +371,6 @@ static int rcu_initiate_boost(void)
372} 371}
373 372
374/* 373/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start. 374 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */ 375 */
384static void rcu_preempt_boost_start_gp(void) 376static void rcu_preempt_boost_start_gp(void)
@@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void)
418 if (!rcu_preempt_gp_in_progress()) 410 if (!rcu_preempt_gp_in_progress())
419 return; 411 return;
420 /* 412 /*
421 * Check up on boosting. If there are no readers blocking the 413 * Check up on boosting. If there are readers blocking the
422 * current grace period, leave. 414 * current grace period, leave.
423 */ 415 */
424 if (rcu_initiate_boost()) 416 if (rcu_initiate_boost())
@@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
578 empty = !rcu_preempt_blocked_readers_cgp(); 570 empty = !rcu_preempt_blocked_readers_cgp();
579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 571 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
580 np = rcu_next_node_entry(t); 572 np = rcu_next_node_entry(t);
581 list_del(&t->rcu_node_entry); 573 list_del_init(&t->rcu_node_entry);
582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 574 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
583 rcu_preempt_ctrlblk.gp_tasks = np; 575 rcu_preempt_ctrlblk.gp_tasks = np;
584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 576 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
@@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) 579 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np; 580 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */ 581#endif /* #ifdef CONFIG_RCU_BOOST */
590 INIT_LIST_HEAD(&t->rcu_node_entry);
591 582
592 /* 583 /*
593 * If this was the last task on the current list, and if 584 * If this was the last task on the current list, and if
@@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void)
812 rpcp->exp_tasks = rpcp->blkd_tasks.next; 803 rpcp->exp_tasks = rpcp->blkd_tasks.next;
813 if (rpcp->exp_tasks == &rpcp->blkd_tasks) 804 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
814 rpcp->exp_tasks = NULL; 805 rpcp->exp_tasks = NULL;
815 local_irq_restore(flags);
816 806
817 /* Wait for tail of ->blkd_tasks list to drain. */ 807 /* Wait for tail of ->blkd_tasks list to drain. */
818 if (rcu_preempted_readers_exp()) 808 if (!rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost(); 809 local_irq_restore(flags);
810 else {
811 rcu_initiate_boost();
812 local_irq_restore(flags);
820 wait_event(sync_rcu_preempt_exp_wq, 813 wait_event(sync_rcu_preempt_exp_wq,
821 !rcu_preempted_readers_exp()); 814 !rcu_preempted_readers_exp());
815 }
822 816
823 /* Clean up and exit. */ 817 /* Clean up and exit. */
824 barrier(); /* ensure expedited GP seen before counter increment. */ 818 barrier(); /* ensure expedited GP seen before counter increment. */
@@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void)
931 925
932static void rcu_initiate_boost_trace(void) 926static void rcu_initiate_boost_trace(void)
933{ 927{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL) 928 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; 929 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
930 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
931 rcu_preempt_ctrlblk.exp_tasks == NULL)
932 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL) 933 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; 934 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) 935 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++; 936 rcu_preempt_ctrlblk.n_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else 937 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++; 938 rcu_preempt_ctrlblk.n_balk_nos++;
952} 939}
953 940
954#endif /* #ifdef CONFIG_RCU_BOOST */ 941#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c224da41890c..98f51b13bb7e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -33,7 +33,7 @@
33#include <linux/rcupdate.h> 33#include <linux/rcupdate.h>
34#include <linux/interrupt.h> 34#include <linux/interrupt.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <asm/atomic.h> 36#include <linux/atomic.h>
37#include <linux/bitops.h> 37#include <linux/bitops.h>
38#include <linux/completion.h> 38#include <linux/completion.h>
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
@@ -131,7 +131,7 @@ struct rcu_torture {
131 131
132static LIST_HEAD(rcu_torture_freelist); 132static LIST_HEAD(rcu_torture_freelist);
133static struct rcu_torture __rcu *rcu_torture_current; 133static struct rcu_torture __rcu *rcu_torture_current;
134static long rcu_torture_current_version; 134static unsigned long rcu_torture_current_version;
135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
136static DEFINE_SPINLOCK(rcu_torture_lock); 136static DEFINE_SPINLOCK(rcu_torture_lock);
137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror;
146static atomic_t n_rcu_torture_error; 146static atomic_t n_rcu_torture_error;
147static long n_rcu_torture_boost_ktrerror; 147static long n_rcu_torture_boost_ktrerror;
148static long n_rcu_torture_boost_rterror; 148static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_allocerror;
150static long n_rcu_torture_boost_afferror;
151static long n_rcu_torture_boost_failure; 149static long n_rcu_torture_boost_failure;
152static long n_rcu_torture_boosts; 150static long n_rcu_torture_boosts;
153static long n_rcu_torture_timers; 151static long n_rcu_torture_timers;
@@ -163,11 +161,11 @@ static int stutter_pause_test;
163#endif 161#endif
164int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
165 163
166#ifdef CONFIG_RCU_BOOST 164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
167#define rcu_can_boost() 1 165#define rcu_can_boost() 1
168#else /* #ifdef CONFIG_RCU_BOOST */ 166#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169#define rcu_can_boost() 0 167#define rcu_can_boost() 0
170#endif /* #else #ifdef CONFIG_RCU_BOOST */ 168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
171 169
172static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
173DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -751,6 +749,7 @@ static int rcu_torture_boost(void *arg)
751 n_rcu_torture_boost_rterror++; 749 n_rcu_torture_boost_rterror++;
752 } 750 }
753 751
752 init_rcu_head_on_stack(&rbi.rcu);
754 /* Each pass through the following loop does one boost-test cycle. */ 753 /* Each pass through the following loop does one boost-test cycle. */
755 do { 754 do {
756 /* Wait for the next test interval. */ 755 /* Wait for the next test interval. */
@@ -810,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
810 809
811 /* Clean up and exit. */ 810 /* Clean up and exit. */
812 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 813 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 814 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 815 schedule_timeout_uninterruptible(1);
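
rcu_torture_boost() keeps its rcu_head on the kernel stack, and the two lines added here bracket its lifetime for CONFIG_DEBUG_OBJECTS_RCU_HEAD: init_rcu_head_on_stack() tells the debug-objects code that this rcu_head is a legitimate on-stack object, and destroy_rcu_head_on_stack() retires it before the stack frame can be reused. The same pairing applies to any on-stack rcu_head; a minimal sketch (struct stack_waiter and the helper name are invented, and the caller must not return before the callback has run):

#include <linux/rcupdate.h>
#include <linux/completion.h>
#include <linux/kernel.h>

struct stack_waiter {
	struct rcu_head rh;
	struct completion done;
};

static void stack_waiter_cb(struct rcu_head *rh)
{
	struct stack_waiter *w = container_of(rh, struct stack_waiter, rh);

	complete(&w->done);			/* a grace period has elapsed */
}

static void wait_one_grace_period(void)
{
	struct stack_waiter w;

	init_completion(&w.done);
	init_rcu_head_on_stack(&w.rh);		/* announce the on-stack rcu_head */
	call_rcu(&w.rh, stack_waiter_cb);
	wait_for_completion(&w.done);		/* callback has run, head is idle */
	destroy_rcu_head_on_stack(&w.rh);	/* retire it before returning     */
}
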
@@ -886,7 +886,7 @@ rcu_torture_writer(void *arg)
886 old_rp->rtort_pipe_count++; 886 old_rp->rtort_pipe_count++;
887 cur_ops->deferred_free(old_rp); 887 cur_ops->deferred_free(old_rp);
888 } 888 }
889 rcu_torture_current_version++; 889 rcutorture_record_progress(++rcu_torture_current_version);
890 oldbatch = cur_ops->completed(); 890 oldbatch = cur_ops->completed();
891 rcu_stutter_wait("rcu_torture_writer"); 891 rcu_stutter_wait("rcu_torture_writer");
892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -941,7 +941,6 @@ static void rcu_torture_timer(unsigned long unused)
941 idx = cur_ops->readlock(); 941 idx = cur_ops->readlock();
942 completed = cur_ops->completed(); 942 completed = cur_ops->completed();
943 p = rcu_dereference_check(rcu_torture_current, 943 p = rcu_dereference_check(rcu_torture_current,
944 rcu_read_lock_held() ||
945 rcu_read_lock_bh_held() || 944 rcu_read_lock_bh_held() ||
946 rcu_read_lock_sched_held() || 945 rcu_read_lock_sched_held() ||
947 srcu_read_lock_held(&srcu_ctl)); 946 srcu_read_lock_held(&srcu_ctl));
@@ -1002,7 +1001,6 @@ rcu_torture_reader(void *arg)
1002 idx = cur_ops->readlock(); 1001 idx = cur_ops->readlock();
1003 completed = cur_ops->completed(); 1002 completed = cur_ops->completed();
1004 p = rcu_dereference_check(rcu_torture_current, 1003 p = rcu_dereference_check(rcu_torture_current,
1005 rcu_read_lock_held() ||
1006 rcu_read_lock_bh_held() || 1004 rcu_read_lock_bh_held() ||
1007 rcu_read_lock_sched_held() || 1005 rcu_read_lock_sched_held() ||
1008 srcu_read_lock_held(&srcu_ctl)); 1006 srcu_read_lock_held(&srcu_ctl));
@@ -1066,8 +1064,8 @@ rcu_torture_printk(char *page)
1066 } 1064 }
1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1065 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1068 cnt += sprintf(&page[cnt], 1066 cnt += sprintf(&page[cnt],
1069 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1067 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1070 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " 1068 "rtmbe: %d rtbke: %ld rtbre: %ld "
1071 "rtbf: %ld rtb: %ld nt: %ld", 1069 "rtbf: %ld rtb: %ld nt: %ld",
1072 rcu_torture_current, 1070 rcu_torture_current,
1073 rcu_torture_current_version, 1071 rcu_torture_current_version,
@@ -1078,16 +1076,12 @@ rcu_torture_printk(char *page)
1078 atomic_read(&n_rcu_torture_mberror), 1076 atomic_read(&n_rcu_torture_mberror),
1079 n_rcu_torture_boost_ktrerror, 1077 n_rcu_torture_boost_ktrerror,
1080 n_rcu_torture_boost_rterror, 1078 n_rcu_torture_boost_rterror,
1081 n_rcu_torture_boost_allocerror,
1082 n_rcu_torture_boost_afferror,
1083 n_rcu_torture_boost_failure, 1079 n_rcu_torture_boost_failure,
1084 n_rcu_torture_boosts, 1080 n_rcu_torture_boosts,
1085 n_rcu_torture_timers); 1081 n_rcu_torture_timers);
1086 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1082 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1087 n_rcu_torture_boost_ktrerror != 0 || 1083 n_rcu_torture_boost_ktrerror != 0 ||
1088 n_rcu_torture_boost_rterror != 0 || 1084 n_rcu_torture_boost_rterror != 0 ||
1089 n_rcu_torture_boost_allocerror != 0 ||
1090 n_rcu_torture_boost_afferror != 0 ||
1091 n_rcu_torture_boost_failure != 0) 1085 n_rcu_torture_boost_failure != 0)
1092 cnt += sprintf(&page[cnt], " !!!"); 1086 cnt += sprintf(&page[cnt], " !!!");
1093 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1087 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
@@ -1331,6 +1325,7 @@ rcu_torture_cleanup(void)
1331 int i; 1325 int i;
1332 1326
1333 mutex_lock(&fullstop_mutex); 1327 mutex_lock(&fullstop_mutex);
1328 rcutorture_record_test_transition();
1334 if (fullstop == FULLSTOP_SHUTDOWN) { 1329 if (fullstop == FULLSTOP_SHUTDOWN) {
1335 printk(KERN_WARNING /* but going down anyway, so... */ 1330 printk(KERN_WARNING /* but going down anyway, so... */
1336 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1331 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
@@ -1486,8 +1481,6 @@ rcu_torture_init(void)
1486 atomic_set(&n_rcu_torture_error, 0); 1481 atomic_set(&n_rcu_torture_error, 0);
1487 n_rcu_torture_boost_ktrerror = 0; 1482 n_rcu_torture_boost_ktrerror = 0;
1488 n_rcu_torture_boost_rterror = 0; 1483 n_rcu_torture_boost_rterror = 0;
1489 n_rcu_torture_boost_allocerror = 0;
1490 n_rcu_torture_boost_afferror = 0;
1491 n_rcu_torture_boost_failure = 0; 1484 n_rcu_torture_boost_failure = 0;
1492 n_rcu_torture_boosts = 0; 1485 n_rcu_torture_boosts = 0;
1493 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1486 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1624,6 +1617,7 @@ rcu_torture_init(void)
1624 } 1617 }
1625 } 1618 }
1626 register_reboot_notifier(&rcutorture_shutdown_nb); 1619 register_reboot_notifier(&rcutorture_shutdown_nb);
1620 rcutorture_record_test_transition();
1627 mutex_unlock(&fullstop_mutex); 1621 mutex_unlock(&fullstop_mutex);
1628 return 0; 1622 return 0;
1629 1623
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8e..ba06207b1dd3 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -36,7 +36,7 @@
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <asm/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
@@ -47,6 +47,9 @@
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h> 49#include <linux/kernel_stat.h>
50#include <linux/wait.h>
51#include <linux/kthread.h>
52#include <linux/prefetch.h>
50 53
51#include "rcutree.h" 54#include "rcutree.h"
52 55
@@ -79,10 +82,67 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
79struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
80DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
81 84
85static struct rcu_state *rcu_state;
86
87/*
88 * The rcu_scheduler_active variable transitions from zero to one just
89 * before the first task is spawned. So when this variable is zero, RCU
90 * can assume that there is but one task, allowing RCU to (for example)
91 * optimized synchronize_sched() to a simple barrier(). When this variable
92 * is one, RCU must actually do all the hard work required to detect real
93 * grace periods. This variable is also used to suppress boot-time false
94 * positives from lockdep-RCU error checking.
95 */
82int rcu_scheduler_active __read_mostly; 96int rcu_scheduler_active __read_mostly;
83EXPORT_SYMBOL_GPL(rcu_scheduler_active); 97EXPORT_SYMBOL_GPL(rcu_scheduler_active);
84 98
85/* 99/*
100 * The rcu_scheduler_fully_active variable transitions from zero to one
101 * during the early_initcall() processing, which is after the scheduler
102 * is capable of creating new tasks. So RCU processing (for example,
103 * creating tasks for RCU priority boosting) must be delayed until after
104 * rcu_scheduler_fully_active transitions from zero to one. We also
105 * currently delay invocation of any RCU callbacks until after this point.
106 *
107 * It might later prove better for people registering RCU callbacks during
108 * early boot to take responsibility for these callbacks, but one step at
109 * a time.
110 */
111static int rcu_scheduler_fully_active __read_mostly;
112
113#ifdef CONFIG_RCU_BOOST
114
115/*
116 * Control variables for per-CPU and per-rcu_node kthreads. These
117 * handle all flavors of RCU.
118 */
119static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
120DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
121DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
122DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
123DEFINE_PER_CPU(char, rcu_cpu_has_work);
124
125#endif /* #ifdef CONFIG_RCU_BOOST */
126
127static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
128static void invoke_rcu_core(void);
129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
130
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
133/*
134 * Track the rcutorture test sequence number and the update version
135 * number within a given test. The rcutorture_testseq is incremented
136 * on every rcutorture module load and unload, so has an odd value
137 * when a test is running. The rcutorture_vernum is set to zero
138 * when rcutorture starts and is incremented on each rcutorture update.
139 * These variables enable correlating rcutorture output with the
140 * RCU tracing information.
141 */
142unsigned long rcutorture_testseq;
143unsigned long rcutorture_vernum;
144
145/*
86 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 146 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
87 * permit this function to be invoked without holding the root rcu_node 147 * permit this function to be invoked without holding the root rcu_node
88 * structure's ->lock, but of course results can be subject to change. 148 * structure's ->lock, but of course results can be subject to change.
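
This hunk also introduces the per-CPU bookkeeping the rest of the series builds on: one task pointer, status word, CPU binding and loop counter per CPU, declared with DEFINE_PER_CPU, while the new rcu_scheduler_fully_active flag keeps RCU from relying on kthreads before the scheduler is able to create them. For readers less familiar with the per-CPU variable API, a small sketch of the two access styles used here; the demo_* names are invented:

#include <linux/percpu.h>
#include <linux/sched.h>

/* One slot per CPU, in the style of the rcu_cpu_kthread_* variables above. */
static DEFINE_PER_CPU(struct task_struct *, demo_kthread_task);
static DEFINE_PER_CPU(unsigned int, demo_kthread_loops);

/* Remote access by CPU number, e.g. from setup or hotplug code. */
static void demo_note_task(int cpu, struct task_struct *t)
{
	per_cpu(demo_kthread_task, cpu) = t;
}

/* Local access from a kthread bound to this CPU, so the slot cannot change under us. */
static void demo_count_loop(void)
{
	__get_cpu_var(demo_kthread_loops)++;
}
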
@@ -124,11 +184,12 @@ void rcu_note_context_switch(int cpu)
124 rcu_sched_qs(cpu); 184 rcu_sched_qs(cpu);
125 rcu_preempt_note_context_switch(cpu); 185 rcu_preempt_note_context_switch(cpu);
126} 186}
187EXPORT_SYMBOL_GPL(rcu_note_context_switch);
127 188
128#ifdef CONFIG_NO_HZ 189#ifdef CONFIG_NO_HZ
129DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 190DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
130 .dynticks_nesting = 1, 191 .dynticks_nesting = 1,
131 .dynticks = 1, 192 .dynticks = ATOMIC_INIT(1),
132}; 193};
133#endif /* #ifdef CONFIG_NO_HZ */ 194#endif /* #ifdef CONFIG_NO_HZ */
134 195
@@ -140,10 +201,8 @@ module_param(blimit, int, 0);
140module_param(qhimark, int, 0); 201module_param(qhimark, int, 0);
141module_param(qlowmark, int, 0); 202module_param(qlowmark, int, 0);
142 203
143#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 204int rcu_cpu_stall_suppress __read_mostly;
144int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
145module_param(rcu_cpu_stall_suppress, int, 0644); 205module_param(rcu_cpu_stall_suppress, int, 0644);
146#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
147 206
148static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 207static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
149static int rcu_pending(int cpu); 208static int rcu_pending(int cpu);
@@ -176,6 +235,31 @@ void rcu_bh_force_quiescent_state(void)
176EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 235EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
177 236
178/* 237/*
238 * Record the number of times rcutorture tests have been initiated and
239 * terminated. This information allows the debugfs tracing stats to be
240 * correlated to the rcutorture messages, even when the rcutorture module
241 * is being repeatedly loaded and unloaded. In other words, we cannot
242 * store this state in rcutorture itself.
243 */
244void rcutorture_record_test_transition(void)
245{
246 rcutorture_testseq++;
247 rcutorture_vernum = 0;
248}
249EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
250
251/*
252 * Record the number of writer passes through the current rcutorture test.
253 * This is also used to correlate debugfs tracing stats with the rcutorture
254 * messages.
255 */
256void rcutorture_record_progress(unsigned long vernum)
257{
258 rcutorture_vernum++;
259}
260EXPORT_SYMBOL_GPL(rcutorture_record_progress);
261
262/*
179 * Force a quiescent state for RCU-sched. 263 * Force a quiescent state for RCU-sched.
180 */ 264 */
181void rcu_sched_force_quiescent_state(void) 265void rcu_sched_force_quiescent_state(void)
@@ -234,8 +318,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
234 return 1; 318 return 1;
235 } 319 }
236 320
237 /* If preemptable RCU, no point in sending reschedule IPI. */ 321 /* If preemptible RCU, no point in sending reschedule IPI. */
238 if (rdp->preemptable) 322 if (rdp->preemptible)
239 return 0; 323 return 0;
240 324
241 /* The CPU is online, so send it a reschedule IPI. */ 325 /* The CPU is online, so send it a reschedule IPI. */
@@ -264,13 +348,25 @@ void rcu_enter_nohz(void)
264 unsigned long flags; 348 unsigned long flags;
265 struct rcu_dynticks *rdtp; 349 struct rcu_dynticks *rdtp;
266 350
267 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
268 local_irq_save(flags); 351 local_irq_save(flags);
269 rdtp = &__get_cpu_var(rcu_dynticks); 352 rdtp = &__get_cpu_var(rcu_dynticks);
270 rdtp->dynticks++; 353 if (--rdtp->dynticks_nesting) {
271 rdtp->dynticks_nesting--; 354 local_irq_restore(flags);
272 WARN_ON_ONCE(rdtp->dynticks & 0x1); 355 return;
356 }
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
273 local_irq_restore(flags); 362 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
274} 370}
275 371
276/* 372/*
@@ -286,11 +382,16 @@ void rcu_exit_nohz(void)
286 382
287 local_irq_save(flags); 383 local_irq_save(flags);
288 rdtp = &__get_cpu_var(rcu_dynticks); 384 rdtp = &__get_cpu_var(rcu_dynticks);
289 rdtp->dynticks++; 385 if (rdtp->dynticks_nesting++) {
290 rdtp->dynticks_nesting++; 386 local_irq_restore(flags);
291 WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); 387 return;
388 }
389 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
390 atomic_inc(&rdtp->dynticks);
391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
392 smp_mb__after_atomic_inc(); /* See above. */
393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
292 local_irq_restore(flags); 394 local_irq_restore(flags);
293 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
294} 395}
295 396
296/** 397/**
@@ -304,11 +405,15 @@ void rcu_nmi_enter(void)
304{ 405{
305 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 406 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
306 407
307 if (rdtp->dynticks & 0x1) 408 if (rdtp->dynticks_nmi_nesting == 0 &&
409 (atomic_read(&rdtp->dynticks) & 0x1))
308 return; 410 return;
309 rdtp->dynticks_nmi++; 411 rdtp->dynticks_nmi_nesting++;
310 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); 412 smp_mb__before_atomic_inc(); /* Force delay from prior write. */
311 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 413 atomic_inc(&rdtp->dynticks);
414 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
415 smp_mb__after_atomic_inc(); /* See above. */
416 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
312} 417}
313 418
314/** 419/**
@@ -322,11 +427,14 @@ void rcu_nmi_exit(void)
322{ 427{
323 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 428 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
324 429
325 if (rdtp->dynticks & 0x1) 430 if (rdtp->dynticks_nmi_nesting == 0 ||
431 --rdtp->dynticks_nmi_nesting != 0)
326 return; 432 return;
327 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 433 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
328 rdtp->dynticks_nmi++; 434 smp_mb__before_atomic_inc(); /* See above. */
329 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); 435 atomic_inc(&rdtp->dynticks);
436 smp_mb__after_atomic_inc(); /* Force delay to next write. */
437 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
330} 438}
331 439
332/** 440/**
@@ -337,13 +445,7 @@ void rcu_nmi_exit(void)
337 */ 445 */
338void rcu_irq_enter(void) 446void rcu_irq_enter(void)
339{ 447{
340 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 448 rcu_exit_nohz();
341
342 if (rdtp->dynticks_nesting++)
343 return;
344 rdtp->dynticks++;
345 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
346 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
347} 449}
348 450
349/** 451/**
@@ -355,18 +457,7 @@ void rcu_irq_enter(void)
355 */ 457 */
356void rcu_irq_exit(void) 458void rcu_irq_exit(void)
357{ 459{
358 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 460 rcu_enter_nohz();
359
360 if (--rdtp->dynticks_nesting)
361 return;
362 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
363 rdtp->dynticks++;
364 WARN_ON_ONCE(rdtp->dynticks & 0x1);
365
366 /* If the interrupt queued a callback, get out of dyntick mode. */
367 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
368 __this_cpu_read(rcu_bh_data.nxtlist))
369 set_need_resched();
370} 461}
371 462
372#ifdef CONFIG_SMP 463#ifdef CONFIG_SMP
@@ -378,19 +469,8 @@ void rcu_irq_exit(void)
378 */ 469 */
379static int dyntick_save_progress_counter(struct rcu_data *rdp) 470static int dyntick_save_progress_counter(struct rcu_data *rdp)
380{ 471{
381 int ret; 472 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
382 int snap; 473 return 0;
383 int snap_nmi;
384
385 snap = rdp->dynticks->dynticks;
386 snap_nmi = rdp->dynticks->dynticks_nmi;
387 smp_mb(); /* Order sampling of snap with end of grace period. */
388 rdp->dynticks_snap = snap;
389 rdp->dynticks_nmi_snap = snap_nmi;
390 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
391 if (ret)
392 rdp->dynticks_fqs++;
393 return ret;
394} 474}
395 475
396/* 476/*
@@ -401,16 +481,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
401 */ 481 */
402static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
403{ 483{
404 long curr; 484 unsigned long curr;
405 long curr_nmi; 485 unsigned long snap;
406 long snap;
407 long snap_nmi;
408 486
409 curr = rdp->dynticks->dynticks; 487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
410 snap = rdp->dynticks_snap; 488 snap = (unsigned long)rdp->dynticks_snap;
411 curr_nmi = rdp->dynticks->dynticks_nmi;
412 snap_nmi = rdp->dynticks_nmi_snap;
413 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
414 489
415 /* 490 /*
416 * If the CPU passed through or entered a dynticks idle phase with 491 * If the CPU passed through or entered a dynticks idle phase with
@@ -420,8 +495,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
420 * read-side critical section that started before the beginning 495 * read-side critical section that started before the beginning
421 * of the current RCU grace period. 496 * of the current RCU grace period.
422 */ 497 */
423 if ((curr != snap || (curr & 0x1) == 0) && 498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
424 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
425 rdp->dynticks_fqs++; 499 rdp->dynticks_fqs++;
426 return 1; 500 return 1;
427 } 501 }
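
The two hunks above are the core of the dynticks rework: ->dynticks becomes a single atomic_t that is incremented on every transition into or out of dynticks-idle mode, so it is even while the CPU is idle and odd while it is not, and atomic_add_return(0, ...) serves as a fully ordered read of it. With that encoding, deciding whether a CPU has passed through a quiescent state since the snapshot reduces to the two-line test at the end of rcu_implicit_dynticks_qs(); restated on plain unsigned longs (the function name is invented):

#include <linux/rcupdate.h>	/* ULONG_CMP_GE() */

static int passed_quiescent_state(unsigned long snap, unsigned long curr)
{
	if ((curr & 0x1) == 0)			/* counter even: the CPU is in      */
		return 1;			/* dynticks-idle right now          */
	if (ULONG_CMP_GE(curr, snap + 2))	/* counter moved by at least two:   */
		return 1;			/* the CPU went through idle since  */
						/* the snapshot, even if busy now   */
	return 0;				/* no evidence of a quiescent state */
}
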
@@ -450,8 +524,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 524
451#endif /* #else #ifdef CONFIG_NO_HZ */ 525#endif /* #else #ifdef CONFIG_NO_HZ */
452 526
453#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
454
455int rcu_cpu_stall_suppress __read_mostly; 527int rcu_cpu_stall_suppress __read_mostly;
456 528
457static void record_gp_stall_check_time(struct rcu_state *rsp) 529static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -537,21 +609,24 @@ static void print_cpu_stall(struct rcu_state *rsp)
537 609
538static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 610static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 611{
540 long delta; 612 unsigned long j;
613 unsigned long js;
541 struct rcu_node *rnp; 614 struct rcu_node *rnp;
542 615
543 if (rcu_cpu_stall_suppress) 616 if (rcu_cpu_stall_suppress)
544 return; 617 return;
545 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); 618 j = ACCESS_ONCE(jiffies);
619 js = ACCESS_ONCE(rsp->jiffies_stall);
546 rnp = rdp->mynode; 620 rnp = rdp->mynode;
547 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { 621 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
548 622
549 /* We haven't checked in, so go dump stack. */ 623 /* We haven't checked in, so go dump stack. */
550 print_cpu_stall(rsp); 624 print_cpu_stall(rsp);
551 625
552 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { 626 } else if (rcu_gp_in_progress(rsp) &&
627 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
553 628
554 /* They had two time units to dump stack, so complain. */ 629 /* They had a few time units to dump stack, so complain. */
555 print_other_cpu_stall(rsp); 630 print_other_cpu_stall(rsp);
556 } 631 }
557} 632}
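
check_cpu_stall() switches from a signed "delta = jiffies - jiffies_stall" to unsigned snapshots compared with ULONG_CMP_GE(), which is the wraparound-safe way to ask "has this deadline passed?" on jiffies, in the same spirit as time_after_eq(). A sketch of that comparison in isolation; deadline_expired is an invented helper:

#include <linux/jiffies.h>
#include <linux/compiler.h>	/* ACCESS_ONCE()  */
#include <linux/rcupdate.h>	/* ULONG_CMP_GE() */

static int deadline_expired(unsigned long deadline)
{
	unsigned long j = ACCESS_ONCE(jiffies);	/* read jiffies exactly once */

	return ULONG_CMP_GE(j, deadline);	/* true even across a jiffies wrap */
}
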
@@ -587,26 +662,6 @@ static void __init check_cpu_stall_init(void)
587 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); 662 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
588} 663}
589 664
590#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
591
592static void record_gp_stall_check_time(struct rcu_state *rsp)
593{
594}
595
596static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
597{
598}
599
600void rcu_cpu_stall_reset(void)
601{
602}
603
604static void __init check_cpu_stall_init(void)
605{
606}
607
608#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
609
610/* 665/*
611 * Update CPU-local rcu_data state to record the newly noticed grace period. 666 * Update CPU-local rcu_data state to record the newly noticed grace period.
612 * This is used both when we started the grace period and when we notice 667 * This is used both when we started the grace period and when we notice
@@ -809,6 +864,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
809 rnp->completed = rsp->completed; 864 rnp->completed = rsp->completed;
810 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
811 rcu_start_gp_per_cpu(rsp, rnp, rdp); 866 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp);
812 raw_spin_unlock_irqrestore(&rnp->lock, flags); 868 raw_spin_unlock_irqrestore(&rnp->lock, flags);
813 return; 869 return;
814 } 870 }
@@ -844,6 +900,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
844 rnp->completed = rsp->completed; 900 rnp->completed = rsp->completed;
845 if (rnp == rdp->mynode) 901 if (rnp == rdp->mynode)
846 rcu_start_gp_per_cpu(rsp, rnp, rdp); 902 rcu_start_gp_per_cpu(rsp, rnp, rdp);
903 rcu_preempt_boost_start_gp(rnp);
847 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
848 } 905 }
849 906
@@ -864,7 +921,18 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
864static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 921static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
865 __releases(rcu_get_root(rsp)->lock) 922 __releases(rcu_get_root(rsp)->lock)
866{ 923{
924 unsigned long gp_duration;
925
867 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
927
928 /*
929 * Ensure that all grace-period and pre-grace-period activity
930 * is seen before the assignment to rsp->completed.
931 */
932 smp_mb(); /* See above block comment. */
933 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration;
868 rsp->completed = rsp->gpnum; 936 rsp->completed = rsp->gpnum;
869 rsp->signaled = RCU_GP_IDLE; 937 rsp->signaled = RCU_GP_IDLE;
870 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -894,7 +962,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
894 return; 962 return;
895 } 963 }
896 rnp->qsmask &= ~mask; 964 rnp->qsmask &= ~mask;
897 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
898 966
899 /* Other bits still set at this level, so done. */ 967 /* Other bits still set at this level, so done. */
900 raw_spin_unlock_irqrestore(&rnp->lock, flags); 968 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1037,6 +1105,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1037/* 1105/*
1038 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1106 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1039 * and move all callbacks from the outgoing CPU to the current one. 1107 * and move all callbacks from the outgoing CPU to the current one.
1108 * There can only be one CPU hotplug operation at a time, so no other
1109 * CPU can be attempting to update rcu_cpu_kthread_task.
1040 */ 1110 */
1041static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1111static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1042{ 1112{
@@ -1046,6 +1116,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1046 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1116 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1047 struct rcu_node *rnp; 1117 struct rcu_node *rnp;
1048 1118
1119 rcu_stop_cpu_kthread(cpu);
1120
1049 /* Exclude any attempts to start a new grace period. */ 1121 /* Exclude any attempts to start a new grace period. */
1050 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1122 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1051 1123
@@ -1082,6 +1154,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1082 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1154 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1083 if (need_report & RCU_OFL_TASKS_EXP_GP) 1155 if (need_report & RCU_OFL_TASKS_EXP_GP)
1084 rcu_report_exp_rnp(rsp, rnp); 1156 rcu_report_exp_rnp(rsp, rnp);
1157 rcu_node_kthread_setaffinity(rnp, -1);
1085} 1158}
1086 1159
1087/* 1160/*
@@ -1143,7 +1216,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1143 next = list->next; 1216 next = list->next;
1144 prefetch(next); 1217 prefetch(next);
1145 debug_rcu_head_unqueue(list); 1218 debug_rcu_head_unqueue(list);
1146 list->func(list); 1219 __rcu_reclaim(list);
1147 list = next; 1220 list = next;
1148 if (++count >= rdp->blimit) 1221 if (++count >= rdp->blimit)
1149 break; 1222 break;
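
rcu_do_batch() now hands each callback to __rcu_reclaim() instead of calling ->func directly (the rcutiny version earlier in this diff gets the same change), but the contract seen by callers of call_rcu() is unchanged: embed an rcu_head in the object, queue it with a callback, and the callback runs here once a grace period has elapsed. A minimal example of the caller side; struct demo_node and the helper names are invented:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/kernel.h>

struct demo_node {
	int key;
	struct rcu_head rcu;
};

static void demo_node_free(struct rcu_head *rcu)
{
	/* Invoked from rcu_do_batch() after a full grace period. */
	kfree(container_of(rcu, struct demo_node, rcu));
}

static void demo_node_retire(struct demo_node *p)
{
	/* __call_rcu() appends this to the current CPU's nxtlist. */
	call_rcu(&p->rcu, demo_node_free);
}
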
@@ -1179,7 +1252,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1179 1252
1180 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1253 /* Re-raise the RCU softirq if there are callbacks remaining. */
1181 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1254 if (cpu_has_callbacks_ready_to_invoke(rdp))
1182 raise_softirq(RCU_SOFTIRQ); 1255 invoke_rcu_core();
1183} 1256}
1184 1257
1185/* 1258/*
@@ -1225,7 +1298,7 @@ void rcu_check_callbacks(int cpu, int user)
1225 } 1298 }
1226 rcu_preempt_check_callbacks(cpu); 1299 rcu_preempt_check_callbacks(cpu);
1227 if (rcu_pending(cpu)) 1300 if (rcu_pending(cpu))
1228 raise_softirq(RCU_SOFTIRQ); 1301 invoke_rcu_core();
1229} 1302}
1230 1303
1231#ifdef CONFIG_SMP 1304#ifdef CONFIG_SMP
@@ -1233,6 +1306,8 @@ void rcu_check_callbacks(int cpu, int user)
1233/* 1306/*
1234 * Scan the leaf rcu_node structures, processing dyntick state for any that 1307 * Scan the leaf rcu_node structures, processing dyntick state for any that
1235 * have not yet encountered a quiescent state, using the function specified. 1308 * have not yet encountered a quiescent state, using the function specified.
1309 * Also initiate boosting for any threads blocked on the root rcu_node.
1310 *
1236 * The caller must have suppressed start of new grace periods. 1311 * The caller must have suppressed start of new grace periods.
1237 */ 1312 */
1238static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 1313static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
@@ -1251,7 +1326,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1251 return; 1326 return;
1252 } 1327 }
1253 if (rnp->qsmask == 0) { 1328 if (rnp->qsmask == 0) {
1254 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1329 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
1255 continue; 1330 continue;
1256 } 1331 }
1257 cpu = rnp->grplo; 1332 cpu = rnp->grplo;
@@ -1269,6 +1344,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1269 } 1344 }
1270 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1345 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1271 } 1346 }
1347 rnp = rcu_get_root(rsp);
1348 if (rnp->qsmask == 0) {
1349 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1351 }
1272} 1352}
1273 1353
1274/* 1354/*
@@ -1383,7 +1463,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1383 } 1463 }
1384 1464
1385 /* If there are callbacks ready, invoke them. */ 1465 /* If there are callbacks ready, invoke them. */
1386 rcu_do_batch(rsp, rdp); 1466 if (cpu_has_callbacks_ready_to_invoke(rdp))
1467 invoke_rcu_callbacks(rsp, rdp);
1387} 1468}
1388 1469
1389/* 1470/*
@@ -1391,29 +1472,37 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1391 */ 1472 */
1392static void rcu_process_callbacks(struct softirq_action *unused) 1473static void rcu_process_callbacks(struct softirq_action *unused)
1393{ 1474{
1394 /*
1395 * Memory references from any prior RCU read-side critical sections
1396 * executed by the interrupted code must be seen before any RCU
1397 * grace-period manipulations below.
1398 */
1399 smp_mb(); /* See above block comment. */
1400
1401 __rcu_process_callbacks(&rcu_sched_state, 1475 __rcu_process_callbacks(&rcu_sched_state,
1402 &__get_cpu_var(rcu_sched_data)); 1476 &__get_cpu_var(rcu_sched_data));
1403 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1404 rcu_preempt_process_callbacks(); 1478 rcu_preempt_process_callbacks();
1405 1479
1406 /*
1407 * Memory references from any later RCU read-side critical sections
1408 * executed by the interrupted code must be seen after any RCU
1409 * grace-period manipulations above.
1410 */
1411 smp_mb(); /* See above block comment. */
1412
1413 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ 1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1414 rcu_needs_cpu_flush(); 1481 rcu_needs_cpu_flush();
1415} 1482}
1416 1483
1484/*
1485 * Wake up the current CPU's kthread. This replaces raise_softirq()
1486 * in earlier versions of RCU. Note that because we are running on
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1488 * cannot disappear out from under us.
1489 */
1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1491{
1492 if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
1493 return;
1494 if (likely(!rsp->boost)) {
1495 rcu_do_batch(rsp, rdp);
1496 return;
1497 }
1498 invoke_rcu_callbacks_kthread();
1499}
1500
1501static void invoke_rcu_core(void)
1502{
1503 raise_softirq(RCU_SOFTIRQ);
1504}
1505
1417static void 1506static void
1418__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1507__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1419 struct rcu_state *rsp) 1508 struct rcu_state *rsp)
@@ -1439,6 +1528,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1439 /* Add the callback to our list. */ 1528 /* Add the callback to our list. */
1440 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1529 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++;
1532
1533 /* If interrupts were disabled, don't dive into RCU core. */
1534 if (irqs_disabled_flags(flags)) {
1535 local_irq_restore(flags);
1536 return;
1537 }
1442 1538
1443 /* 1539 /*
1444 * Force the grace period if too many callbacks or too long waiting. 1540 * Force the grace period if too many callbacks or too long waiting.
@@ -1447,7 +1543,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1447 * invoking force_quiescent_state() if the newly enqueued callback 1543 * invoking force_quiescent_state() if the newly enqueued callback
1448 * is the only one waiting for a grace period to complete. 1544 * is the only one waiting for a grace period to complete.
1449 */ 1545 */
1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1546 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1451 1547
1452 /* Are we ignoring a completed grace period? */ 1548 /* Are we ignoring a completed grace period? */
1453 rcu_process_gp_end(rsp, rdp); 1549 rcu_process_gp_end(rsp, rdp);
@@ -1583,7 +1679,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1583 * or RCU-bh, force a local reschedule. 1679 * or RCU-bh, force a local reschedule.
1584 */ 1680 */
1585 rdp->n_rp_qs_pending++; 1681 rdp->n_rp_qs_pending++;
1586 if (!rdp->preemptable && 1682 if (!rdp->preemptible &&
1587 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1588 jiffies)) 1684 jiffies))
1589 set_need_resched(); 1685 set_need_resched();
@@ -1760,7 +1856,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1760 * that this CPU cannot possibly have any RCU callbacks in flight yet. 1856 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1761 */ 1857 */
1762static void __cpuinit 1858static void __cpuinit
1763rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1859rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1764{ 1860{
1765 unsigned long flags; 1861 unsigned long flags;
1766 unsigned long mask; 1862 unsigned long mask;
@@ -1772,7 +1868,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1772 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1773 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1774 rdp->beenonline = 1; /* We have now been online. */ 1870 rdp->beenonline = 1; /* We have now been online. */
1775 rdp->preemptable = preemptable; 1871 rdp->preemptible = preemptible;
1776 rdp->qlen_last_fqs_check = 0; 1872 rdp->qlen_last_fqs_check = 0;
1777 rdp->n_force_qs_snap = rsp->n_force_qs; 1873 rdp->n_force_qs_snap = rsp->n_force_qs;
1778 rdp->blimit = blimit; 1874 rdp->blimit = blimit;
@@ -1806,7 +1902,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1806 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1902 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1807} 1903}
1808 1904
1809static void __cpuinit rcu_online_cpu(int cpu) 1905static void __cpuinit rcu_prepare_cpu(int cpu)
1810{ 1906{
1811 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 1907 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1812 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 1908 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
@@ -1820,11 +1916,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1820 unsigned long action, void *hcpu) 1916 unsigned long action, void *hcpu)
1821{ 1917{
1822 long cpu = (long)hcpu; 1918 long cpu = (long)hcpu;
1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1920 struct rcu_node *rnp = rdp->mynode;
1823 1921
1824 switch (action) { 1922 switch (action) {
1825 case CPU_UP_PREPARE: 1923 case CPU_UP_PREPARE:
1826 case CPU_UP_PREPARE_FROZEN: 1924 case CPU_UP_PREPARE_FROZEN:
1827 rcu_online_cpu(cpu); 1925 rcu_prepare_cpu(cpu);
1926 rcu_prepare_kthreads(cpu);
1927 break;
1928 case CPU_ONLINE:
1929 case CPU_DOWN_FAILED:
1930 rcu_node_kthread_setaffinity(rnp, -1);
1931 rcu_cpu_kthread_setrt(cpu, 1);
1932 break;
1933 case CPU_DOWN_PREPARE:
1934 rcu_node_kthread_setaffinity(rnp, cpu);
1935 rcu_cpu_kthread_setrt(cpu, 0);
1828 break; 1936 break;
1829 case CPU_DYING: 1937 case CPU_DYING:
1830 case CPU_DYING_FROZEN: 1938 case CPU_DYING_FROZEN:
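
rcu_cpu_notify() grows three new cases so that the per-CPU and per-node kthreads follow CPU hotplug: kthreads are created at CPU_UP_PREPARE time, bound and switched to real-time priority once the CPU is actually online (or a failed offline is rolled back), and unbound and de-prioritized when an offline begins. For reference, the general shape of such a notifier using only the standard hotplug notifier API; the demo_* names are invented and the case bodies are placeholders:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int __cpuinit demo_cpu_notify(struct notifier_block *self,
				     unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	pr_debug("hotplug action %lu for cpu %ld\n", action, cpu);
	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		/* allocate per-CPU state, spawn per-CPU kthreads */
		break;
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		/* CPU is (or remains) usable: bind kthreads, raise priority */
		break;
	case CPU_DOWN_PREPARE:
		/* CPU is going away: push kthreads elsewhere, drop priority */
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block demo_cpu_nb __cpuinitdata = {
	.notifier_call = demo_cpu_notify,
};

static int __init demo_hotplug_init(void)
{
	register_cpu_notifier(&demo_cpu_nb);
	return 0;
}
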
@@ -1943,10 +2051,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
1943 j / rsp->levelspread[i - 1]; 2051 j / rsp->levelspread[i - 1];
1944 } 2052 }
1945 rnp->level = i; 2053 rnp->level = i;
1946 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 2054 INIT_LIST_HEAD(&rnp->blkd_tasks);
1947 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1948 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1949 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1950 } 2055 }
1951 } 2056 }
1952 2057
@@ -1968,7 +2073,7 @@ void __init rcu_init(void)
1968 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2073 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1969 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2074 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1970 __rcu_init_preempt(); 2075 __rcu_init_preempt();
1971 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 2076 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1972 2077
1973 /* 2078 /*
1974 * We don't need protection against CPU-hotplug here because 2079 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index e8f057e44e3e..01b2ccda26fb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,13 +84,19 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track nesting level, sort of. */ 87 int dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks; /* Even value for dynticks-idle, else odd. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 int dynticks_nmi; /* Even value for either dynticks-idle or */ 89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
90 /* not in nmi handler, else odd. So this */
91 /* remains even for nmi from irq handler. */
92}; 90};
93 91
92/* RCU's kthread states for tracing. */
93#define RCU_KTHREAD_STOPPED 0
94#define RCU_KTHREAD_RUNNING 1
95#define RCU_KTHREAD_WAITING 2
96#define RCU_KTHREAD_OFFCPU 3
97#define RCU_KTHREAD_YIELDING 4
98#define RCU_KTHREAD_MAX 4
99
94/* 100/*
95 * Definition for node within the RCU grace-period-detection hierarchy. 101 * Definition for node within the RCU grace-period-detection hierarchy.
96 */ 102 */
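With dynticks now a single atomic_t, parity alone encodes the idle state: the counter is even while the CPU is in dynticks-idle and odd otherwise, and every idle entry or exit increments it. The sketch below models that protocol with C11 atomics; the names are hypothetical and the memory-ordering subtleties of the real implementation are omitted.

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int dynticks = 1;                 /* odd: not idle initially */

    static void idle_enter(void) { atomic_fetch_add(&dynticks, 1); } /* -> even */
    static void idle_exit(void)  { atomic_fetch_add(&dynticks, 1); } /* -> odd  */

    /* Snapshot taken by force_quiescent_state()-style sampling. */
    static int dynticks_snap(void) { return atomic_load(&dynticks); }

    /*
     * The sampled CPU counts as quiescent if it was idle at the snapshot
     * (even value) or has changed state since (counter moved).
     */
    static bool passed_quiescent_state(int snap)
    {
            return (snap & 0x1) == 0 || atomic_load(&dynticks) != snap;
    }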
@@ -109,10 +115,13 @@ struct rcu_node {
109 /* an rcu_data structure, otherwise, each */ 115 /* an rcu_data structure, otherwise, each */
110 /* bit corresponds to a child rcu_node */ 116 /* bit corresponds to a child rcu_node */
111 /* structure. */ 117 /* structure. */
112 unsigned long expmask; /* Groups that have ->blocked_tasks[] */ 118 unsigned long expmask; /* Groups that have ->blkd_tasks */
113 /* elements that need to drain to allow the */ 119 /* elements that need to drain to allow the */
114 /* current expedited grace period to */ 120 /* current expedited grace period to */
115 /* complete (only for TREE_PREEMPT_RCU). */ 121 /* complete (only for TREE_PREEMPT_RCU). */
122 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
123 /* Since this has meaning only for leaf */
124 /* rcu_node structures, 32 bits suffices. */
116 unsigned long qsmaskinit; 125 unsigned long qsmaskinit;
117 /* Per-GP initial value for qsmask & expmask. */ 126 /* Per-GP initial value for qsmask & expmask. */
118 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 127 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -122,11 +131,62 @@ struct rcu_node {
122 u8 grpnum; /* CPU/group number for next level up. */ 131 u8 grpnum; /* CPU/group number for next level up. */
123 u8 level; /* root is at level 0. */ 132 u8 level; /* root is at level 0. */
124 struct rcu_node *parent; 133 struct rcu_node *parent;
125 struct list_head blocked_tasks[4]; 134 struct list_head blkd_tasks;
126 /* Tasks blocked in RCU read-side critsect. */ 135 /* Tasks blocked in RCU read-side critical */
127 /* Grace period number (->gpnum) x blocked */ 136 /* section. Tasks are placed at the head */
128 /* by tasks on the (x & 0x1) element of the */ 137 /* of this list and age towards the tail. */
129 /* blocked_tasks[] array. */ 138 struct list_head *gp_tasks;
139 /* Pointer to the first task blocking the */
140 /* current grace period, or NULL if there */
141 /* is no such task. */
142 struct list_head *exp_tasks;
143 /* Pointer to the first task blocking the */
144 /* current expedited grace period, or NULL */
145 /* if there is no such task. If there */
146 /* is no current expedited grace period, */
147 /* then there cannot be any such task. */

148#ifdef CONFIG_RCU_BOOST
149 struct list_head *boost_tasks;
150 /* Pointer to first task that needs to be */
151 /* priority boosted, or NULL if no priority */
152 /* boosting is needed for this rcu_node */
153 /* structure. If there are no tasks */
154 /* queued on this rcu_node structure that */
155 /* are blocking the current grace period, */
156 /* there can be no such task. */
157 unsigned long boost_time;
158 /* When to start boosting (jiffies). */
159 struct task_struct *boost_kthread_task;
160 /* kthread that takes care of priority */
161 /* boosting for this rcu_node structure. */
162 unsigned int boost_kthread_status;
163 /* State of boost_kthread_task for tracing. */
164 unsigned long n_tasks_boosted;
165 /* Total number of tasks boosted. */
166 unsigned long n_exp_boosts;
167 /* Number of tasks boosted for expedited GP. */
168 unsigned long n_normal_boosts;
169 /* Number of tasks boosted for normal GP. */
170 unsigned long n_balk_blkd_tasks;
171 /* Refused to boost: no blocked tasks. */
172 unsigned long n_balk_exp_gp_tasks;
173 /* Refused to boost: nothing blocking GP. */
174 unsigned long n_balk_boost_tasks;
175 /* Refused to boost: already boosting. */
176 unsigned long n_balk_notblocked;
177 /* Refused to boost: RCU RS CS still running. */
178 unsigned long n_balk_notyet;
179 /* Refused to boost: not yet time. */
180 unsigned long n_balk_nos;
181 /* Refused to boost: not sure why, though. */
182 /* This can happen due to race conditions. */
183#endif /* #ifdef CONFIG_RCU_BOOST */
184 struct task_struct *node_kthread_task;
185 /* kthread that takes care of this rcu_node */
186 /* structure, for example, awakening the */
187 /* per-CPU kthreads as needed. */
188 unsigned int node_kthread_status;
189 /* State of node_kthread_task for tracing. */
130} ____cacheline_internodealigned_in_smp; 190} ____cacheline_internodealigned_in_smp;
131 191
132/* 192/*
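The four blocked_tasks[] lists are replaced by a single ->blkd_tasks list plus cursors into it: ->gp_tasks marks the first task blocking the current grace period, ->exp_tasks the first task blocking the current expedited grace period, and (with RCU_BOOST) ->boost_tasks the next boost candidate. New blockers are queued at the head, so every task from a cursor to the tail blocks the corresponding grace period. Below is a minimal user-space model of the list and of the enqueue rule used later by rcu_preempt_note_context_switch(); all types and names are hypothetical stand-ins for the kernel's.

    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    /* Insert @new immediately after @pos (kernel-style list_add()). */
    static void list_add(struct list_head *new, struct list_head *pos)
    {
            new->next = pos->next;
            new->prev = pos;
            pos->next->prev = new;
            pos->next = new;
    }

    struct node {
            struct list_head blkd_tasks;   /* head; tasks age toward the tail */
            struct list_head *gp_tasks;    /* first task blocking current GP  */
    };

    /*
     * Queue a task that just blocked in an RCU read-side critical section.
     * If its CPU still owes the current GP a quiescent state, the task must
     * land no later than *gp_tasks and becomes the new gp_tasks; otherwise
     * it blocks only the next GP and simply goes at the head.
     */
    static void enqueue_blocked(struct node *n, struct list_head *entry,
                                int blocks_current_gp)
    {
            if (blocks_current_gp && n->gp_tasks != NULL) {
                    list_add(entry, n->gp_tasks->prev);  /* just before gp_tasks */
                    n->gp_tasks = entry;
            } else {
                    list_add(entry, &n->blkd_tasks);     /* at the head */
                    if (blocks_current_gp)
                            n->gp_tasks = entry;
            }
    }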
@@ -175,7 +235,7 @@ struct rcu_data {
175 bool passed_quiesc; /* User-mode/idle loop etc. */ 235 bool passed_quiesc; /* User-mode/idle loop etc. */
176 bool qs_pending; /* Core waits for quiesc state. */ 236 bool qs_pending; /* Core waits for quiesc state. */
177 bool beenonline; /* CPU online at least once. */ 237 bool beenonline; /* CPU online at least once. */
178 bool preemptable; /* Preemptable RCU? */ 238 bool preemptible; /* Preemptible RCU? */
179 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 239 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
180 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 240 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
181 241
@@ -218,7 +278,6 @@ struct rcu_data {
218 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
219 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
220 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
221 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
222#endif /* #ifdef CONFIG_NO_HZ */ 281#endif /* #ifdef CONFIG_NO_HZ */
223 282
224 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -254,7 +313,6 @@ struct rcu_data {
254#endif /* #else #ifdef CONFIG_NO_HZ */ 313#endif /* #else #ifdef CONFIG_NO_HZ */
255 314
256#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 315#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
257#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
258 316
259#ifdef CONFIG_PROVE_RCU 317#ifdef CONFIG_PROVE_RCU
260#define RCU_STALL_DELAY_DELTA (5 * HZ) 318#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -272,13 +330,16 @@ struct rcu_data {
272 /* scheduling clock irq */ 330 /* scheduling clock irq */
273 /* before ratting on them. */ 331 /* before ratting on them. */
274 332
275#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE 333#define rcu_wait(cond) \
276#define RCU_CPU_STALL_SUPPRESS_INIT 0 334do { \
277#else 335 for (;;) { \
278#define RCU_CPU_STALL_SUPPRESS_INIT 1 336 set_current_state(TASK_INTERRUPTIBLE); \
279#endif 337 if (cond) \
280 338 break; \
281#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 339 schedule(); \
340 } \
341 __set_current_state(TASK_RUNNING); \
342} while (0)
282 343
283/* 344/*
284 * RCU global state, including node hierarchy. This hierarchy is 345 * RCU global state, including node hierarchy. This hierarchy is
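rcu_wait(cond) is the standard kthread wait loop: the task marks itself TASK_INTERRUPTIBLE before testing the condition, so a wakeup arriving between the test and schedule() simply leaves it runnable rather than being lost. The pthread sketch below follows the same publish-then-recheck discipline; it is only an analogue with hypothetical names, since user space has no set_current_state()/schedule().

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static bool have_work;

    static void wait_for_work(void)                 /* ~ rcu_wait(have_work) */
    {
            pthread_mutex_lock(&lock);
            while (!have_work)                      /* re-check under the lock */
                    pthread_cond_wait(&cond, &lock);
            pthread_mutex_unlock(&lock);
    }

    static void hand_off_work(void)                 /* ~ wake_up_process() */
    {
            pthread_mutex_lock(&lock);
            have_work = true;
            pthread_cond_signal(&cond);
            pthread_mutex_unlock(&lock);
    }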
@@ -308,6 +369,7 @@ struct rcu_state {
308 /* period because */ 369 /* period because */
309 /* force_quiescent_state() */ 370 /* force_quiescent_state() */
310 /* was running. */ 371 /* was running. */
372 u8 boost; /* Subject to priority boost. */
311 unsigned long gpnum; /* Current gp number. */ 373 unsigned long gpnum; /* Current gp number. */
312 unsigned long completed; /* # of last completed gp. */ 374 unsigned long completed; /* # of last completed gp. */
313 375
@@ -325,12 +387,12 @@ struct rcu_state {
325 /* due to lock unavailable. */ 387 /* due to lock unavailable. */
326 unsigned long n_force_qs_ngp; /* Number of calls leaving */ 388 unsigned long n_force_qs_ngp; /* Number of calls leaving */
327 /* due to no GP active. */ 389 /* due to no GP active. */
328#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
329 unsigned long gp_start; /* Time at which GP started, */ 390 unsigned long gp_start; /* Time at which GP started, */
330 /* but in jiffies. */ 391 /* but in jiffies. */
331 unsigned long jiffies_stall; /* Time at which to check */ 392 unsigned long jiffies_stall; /* Time at which to check */
332 /* for CPU stalls. */ 393 /* for CPU stalls. */
333#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 394 unsigned long gp_max; /* Maximum GP duration in */
395 /* jiffies. */
334 char *name; /* Name of structure. */ 396 char *name; /* Name of structure. */
335}; 397};
336 398
@@ -361,16 +423,15 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
361static void rcu_bootup_announce(void); 423static void rcu_bootup_announce(void);
362long rcu_batches_completed(void); 424long rcu_batches_completed(void);
363static void rcu_preempt_note_context_switch(int cpu); 425static void rcu_preempt_note_context_switch(int cpu);
364static int rcu_preempted_readers(struct rcu_node *rnp); 426static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
365#ifdef CONFIG_HOTPLUG_CPU 427#ifdef CONFIG_HOTPLUG_CPU
366static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 428static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
367 unsigned long flags); 429 unsigned long flags);
430static void rcu_stop_cpu_kthread(int cpu);
368#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 431#endif /* #ifdef CONFIG_HOTPLUG_CPU */
369#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
370static void rcu_print_detail_task_stall(struct rcu_state *rsp); 432static void rcu_print_detail_task_stall(struct rcu_state *rsp);
371static void rcu_print_task_stall(struct rcu_node *rnp); 433static void rcu_print_task_stall(struct rcu_node *rnp);
372static void rcu_preempt_stall_reset(void); 434static void rcu_preempt_stall_reset(void);
373#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
374static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
375#ifdef CONFIG_HOTPLUG_CPU 436#ifdef CONFIG_HOTPLUG_CPU
376static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 437static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
@@ -390,5 +451,20 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
390static void rcu_preempt_send_cbs_to_online(void); 451static void rcu_preempt_send_cbs_to_online(void);
391static void __init __rcu_init_preempt(void); 452static void __init __rcu_init_preempt(void);
392static void rcu_needs_cpu_flush(void); 453static void rcu_needs_cpu_flush(void);
454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
456static void invoke_rcu_callbacks_kthread(void);
457#ifdef CONFIG_RCU_BOOST
458static void rcu_preempt_do_callbacks(void);
459static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
460 cpumask_var_t cm);
461static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
462 struct rcu_node *rnp,
463 int rnp_index);
464static void invoke_rcu_node_kthread(struct rcu_node *rnp);
465static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
466#endif /* #ifdef CONFIG_RCU_BOOST */
467static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
468static void __cpuinit rcu_prepare_kthreads(int cpu);
393 469
394#endif /* #ifndef RCU_TREE_NONCORE */ 470#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a3638710dc67..8aafbb80b8b0 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void)
54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
56#endif 56#endif
57#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
58 printk(KERN_INFO
59 "\tRCU-based detection of stalled CPUs is disabled.\n");
60#endif
61#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
62 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 58 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
63#endif 59#endif
@@ -70,7 +66,9 @@ static void __init rcu_bootup_announce_oddness(void)
70 66
71struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
72DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state;
73 70
71static void rcu_read_unlock_special(struct task_struct *t);
74static int rcu_preempted_readers_exp(struct rcu_node *rnp); 72static int rcu_preempted_readers_exp(struct rcu_node *rnp);
75 73
76/* 74/*
@@ -78,7 +76,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
78 */ 76 */
79static void __init rcu_bootup_announce(void) 77static void __init rcu_bootup_announce(void)
80{ 78{
81 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); 79 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
82 rcu_bootup_announce_oddness(); 80 rcu_bootup_announce_oddness();
83} 81}
84 82
@@ -111,7 +109,7 @@ void rcu_force_quiescent_state(void)
111EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 109EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
112 110
113/* 111/*
114 * Record a preemptable-RCU quiescent state for the specified CPU. Note 112 * Record a preemptible-RCU quiescent state for the specified CPU. Note
115 * that this just means that the task currently running on the CPU is 113 * that this just means that the task currently running on the CPU is
116 * not in a quiescent state. There might be any number of tasks blocked 114 * not in a quiescent state. There might be any number of tasks blocked
117 * while in an RCU read-side critical section. 115 * while in an RCU read-side critical section.
@@ -134,12 +132,12 @@ static void rcu_preempt_qs(int cpu)
134 * We have entered the scheduler, and the current task might soon be 132 * We have entered the scheduler, and the current task might soon be
135 * context-switched away from. If this task is in an RCU read-side 133 * context-switched away from. If this task is in an RCU read-side
136 * critical section, we will no longer be able to rely on the CPU to 134 * critical section, we will no longer be able to rely on the CPU to
137 * record that fact, so we enqueue the task on the appropriate entry 135 * record that fact, so we enqueue the task on the blkd_tasks list.
138 * of the blocked_tasks[] array. The task will dequeue itself when 136 * The task will dequeue itself when it exits the outermost enclosing
139 * it exits the outermost enclosing RCU read-side critical section. 137 * RCU read-side critical section. Therefore, the current grace period
140 * Therefore, the current grace period cannot be permitted to complete 138 * cannot be permitted to complete until the blkd_tasks list entries
141 * until the blocked_tasks[] entry indexed by the low-order bit of 139 * predating the current grace period drain, in other words, until
142 * rnp->gpnum empties. 140 * rnp->gp_tasks becomes NULL.
143 * 141 *
144 * Caller must disable preemption. 142 * Caller must disable preemption.
145 */ 143 */
@@ -147,11 +145,10 @@ static void rcu_preempt_note_context_switch(int cpu)
147{ 145{
148 struct task_struct *t = current; 146 struct task_struct *t = current;
149 unsigned long flags; 147 unsigned long flags;
150 int phase;
151 struct rcu_data *rdp; 148 struct rcu_data *rdp;
152 struct rcu_node *rnp; 149 struct rcu_node *rnp;
153 150
154 if (t->rcu_read_lock_nesting && 151 if (t->rcu_read_lock_nesting > 0 &&
155 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 152 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
156 153
157 /* Possibly blocking in an RCU read-side critical section. */ 154 /* Possibly blocking in an RCU read-side critical section. */
@@ -169,16 +166,39 @@ static void rcu_preempt_note_context_switch(int cpu)
169 * (i.e., this CPU has not yet passed through a quiescent 166 * (i.e., this CPU has not yet passed through a quiescent
170 * state for the current grace period), then as long 167 * state for the current grace period), then as long
171 * as that task remains queued, the current grace period 168 * as that task remains queued, the current grace period
172 * cannot end. 169 * cannot end. Note that there is some uncertainty as
170 * to exactly when the current grace period started.
171 * We take a conservative approach, which can result
172 * in unnecessarily waiting on tasks that started very
173 * slightly after the current grace period began. C'est
174 * la vie!!!
173 * 175 *
174 * But first, note that the current CPU must still be 176 * But first, note that the current CPU must still be
175 * on line! 177 * on line!
176 */ 178 */
177 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 179 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
178 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 180 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
179 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 181 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
180 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 182 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
183 rnp->gp_tasks = &t->rcu_node_entry;
184#ifdef CONFIG_RCU_BOOST
185 if (rnp->boost_tasks != NULL)
186 rnp->boost_tasks = rnp->gp_tasks;
187#endif /* #ifdef CONFIG_RCU_BOOST */
188 } else {
189 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
190 if (rnp->qsmask & rdp->grpmask)
191 rnp->gp_tasks = &t->rcu_node_entry;
192 }
181 raw_spin_unlock_irqrestore(&rnp->lock, flags); 193 raw_spin_unlock_irqrestore(&rnp->lock, flags);
194 } else if (t->rcu_read_lock_nesting < 0 &&
195 t->rcu_read_unlock_special) {
196
197 /*
198 * Complete exit from RCU read-side critical section on
199 * behalf of preempted instance of __rcu_read_unlock().
200 */
201 rcu_read_unlock_special(t);
182 } 202 }
183 203
184 /* 204 /*
@@ -196,7 +216,7 @@ static void rcu_preempt_note_context_switch(int cpu)
196} 216}
197 217
198/* 218/*
199 * Tree-preemptable RCU implementation for rcu_read_lock(). 219 * Tree-preemptible RCU implementation for rcu_read_lock().
200 * Just increment ->rcu_read_lock_nesting, shared state will be updated 220 * Just increment ->rcu_read_lock_nesting, shared state will be updated
201 * if we block. 221 * if we block.
202 */ 222 */
@@ -212,12 +232,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
212 * for the specified rcu_node structure. If the caller needs a reliable 232 * for the specified rcu_node structure. If the caller needs a reliable
213 * answer, it must hold the rcu_node's ->lock. 233 * answer, it must hold the rcu_node's ->lock.
214 */ 234 */
215static int rcu_preempted_readers(struct rcu_node *rnp) 235static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
216{ 236{
217 int phase = rnp->gpnum & 0x1; 237 return rnp->gp_tasks != NULL;
218
219 return !list_empty(&rnp->blocked_tasks[phase]) ||
220 !list_empty(&rnp->blocked_tasks[phase + 2]);
221} 238}
222 239
223/* 240/*
@@ -233,7 +250,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
233 unsigned long mask; 250 unsigned long mask;
234 struct rcu_node *rnp_p; 251 struct rcu_node *rnp_p;
235 252
236 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 253 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
237 raw_spin_unlock_irqrestore(&rnp->lock, flags); 254 raw_spin_unlock_irqrestore(&rnp->lock, flags);
238 return; /* Still need more quiescent states! */ 255 return; /* Still need more quiescent states! */
239 } 256 }
@@ -257,15 +274,31 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
257} 274}
258 275
259/* 276/*
277 * Advance a ->blkd_tasks-list pointer to the next entry, returning
278 * NULL instead if at the end of the list.
279 */
280static struct list_head *rcu_next_node_entry(struct task_struct *t,
281 struct rcu_node *rnp)
282{
283 struct list_head *np;
284
285 np = t->rcu_node_entry.next;
286 if (np == &rnp->blkd_tasks)
287 np = NULL;
288 return np;
289}
290
291/*
260 * Handle special cases during rcu_read_unlock(), such as needing to 292 * Handle special cases during rcu_read_unlock(), such as needing to
261 * notify RCU core processing or task having blocked during the RCU 293 * notify RCU core processing or task having blocked during the RCU
262 * read-side critical section. 294 * read-side critical section.
263 */ 295 */
264static void rcu_read_unlock_special(struct task_struct *t) 296static noinline void rcu_read_unlock_special(struct task_struct *t)
265{ 297{
266 int empty; 298 int empty;
267 int empty_exp; 299 int empty_exp;
268 unsigned long flags; 300 unsigned long flags;
301 struct list_head *np;
269 struct rcu_node *rnp; 302 struct rcu_node *rnp;
270 int special; 303 int special;
271 304
@@ -285,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
285 } 318 }
286 319
287 /* Hardware IRQ handlers cannot block. */ 320 /* Hardware IRQ handlers cannot block. */
288 if (in_irq()) { 321 if (in_irq() || in_serving_softirq()) {
289 local_irq_restore(flags); 322 local_irq_restore(flags);
290 return; 323 return;
291 } 324 }
@@ -306,10 +339,24 @@ static void rcu_read_unlock_special(struct task_struct *t)
306 break; 339 break;
307 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 340 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
308 } 341 }
309 empty = !rcu_preempted_readers(rnp); 342 empty = !rcu_preempt_blocked_readers_cgp(rnp);
310 empty_exp = !rcu_preempted_readers_exp(rnp); 343 empty_exp = !rcu_preempted_readers_exp(rnp);
311 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 344 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
345 np = rcu_next_node_entry(t, rnp);
312 list_del_init(&t->rcu_node_entry); 346 list_del_init(&t->rcu_node_entry);
347 if (&t->rcu_node_entry == rnp->gp_tasks)
348 rnp->gp_tasks = np;
349 if (&t->rcu_node_entry == rnp->exp_tasks)
350 rnp->exp_tasks = np;
351#ifdef CONFIG_RCU_BOOST
352 if (&t->rcu_node_entry == rnp->boost_tasks)
353 rnp->boost_tasks = np;
354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
355 if (t->rcu_boosted) {
356 special |= RCU_READ_UNLOCK_BOOSTED;
357 t->rcu_boosted = 0;
358 }
359#endif /* #ifdef CONFIG_RCU_BOOST */
313 t->rcu_blocked_node = NULL; 360 t->rcu_blocked_node = NULL;
314 361
315 /* 362 /*
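On the removal side, rcu_read_unlock_special() first computes the departing entry's successor via rcu_next_node_entry() (mapping "end of list" to NULL), advances any cursor that pointed at the entry, and then unlinks it. A hedged sketch of that bookkeeping, using the same hypothetical list types as the earlier model:

    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    static void list_del(struct list_head *e)
    {
            e->prev->next = e->next;
            e->next->prev = e->prev;
            e->next = e->prev = e;          /* ~ list_del_init() */
    }

    /* ~ rcu_next_node_entry(): next entry, or NULL if @entry is the last one. */
    static struct list_head *next_or_null(struct list_head *entry,
                                          struct list_head *head)
    {
            return entry->next == head ? NULL : entry->next;
    }

    static void dequeue_blocked(struct list_head *blkd_tasks,
                                struct list_head **gp_tasks,
                                struct list_head **exp_tasks,
                                struct list_head *entry)
    {
            struct list_head *np = next_or_null(entry, blkd_tasks);

            if (*gp_tasks == entry)
                    *gp_tasks = np;         /* GP now waits on the next task */
            if (*exp_tasks == entry)
                    *exp_tasks = np;        /* likewise for the expedited GP */
            list_del(entry);
    }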
@@ -322,6 +369,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
322 else 369 else
323 rcu_report_unblock_qs_rnp(rnp, flags); 370 rcu_report_unblock_qs_rnp(rnp, flags);
324 371
372#ifdef CONFIG_RCU_BOOST
373 /* Unboost if we were boosted. */
374 if (special & RCU_READ_UNLOCK_BOOSTED) {
375 rt_mutex_unlock(t->rcu_boost_mutex);
376 t->rcu_boost_mutex = NULL;
377 }
378#endif /* #ifdef CONFIG_RCU_BOOST */
379
325 /* 380 /*
326 * If this was the last task on the expedited lists, 381 * If this was the last task on the expedited lists,
327 * then we need to report up the rcu_node hierarchy. 382 * then we need to report up the rcu_node hierarchy.
@@ -334,7 +389,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
334} 389}
335 390
336/* 391/*
337 * Tree-preemptable RCU implementation for rcu_read_unlock(). 392 * Tree-preemptible RCU implementation for rcu_read_unlock().
338 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost 393 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
339 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then 394 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
340 * invoke rcu_read_unlock_special() to clean up after a context switch 395 * invoke rcu_read_unlock_special() to clean up after a context switch
@@ -345,19 +400,26 @@ void __rcu_read_unlock(void)
345 struct task_struct *t = current; 400 struct task_struct *t = current;
346 401
347 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ 402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
348 --t->rcu_read_lock_nesting; 403 if (t->rcu_read_lock_nesting != 1)
349 barrier(); /* decrement before load of ->rcu_read_unlock_special */ 404 --t->rcu_read_lock_nesting;
350 if (t->rcu_read_lock_nesting == 0 && 405 else {
351 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 406 t->rcu_read_lock_nesting = INT_MIN;
352 rcu_read_unlock_special(t); 407 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
409 rcu_read_unlock_special(t);
410 barrier(); /* ->rcu_read_unlock_special load before assign */
411 t->rcu_read_lock_nesting = 0;
412 }
353#ifdef CONFIG_PROVE_LOCKING 413#ifdef CONFIG_PROVE_LOCKING
354 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); 414 {
415 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
416
417 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
418 }
355#endif /* #ifdef CONFIG_PROVE_LOCKING */ 419#endif /* #ifdef CONFIG_PROVE_LOCKING */
356} 420}
357EXPORT_SYMBOL_GPL(__rcu_read_unlock); 421EXPORT_SYMBOL_GPL(__rcu_read_unlock);
358 422
359#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
360
361#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 423#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
362 424
363/* 425/*
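__rcu_read_unlock() now parks ->rcu_read_lock_nesting at INT_MIN for the duration of the special-case processing. If the task is preempted in that window, rcu_preempt_note_context_switch() sees a negative nesting count with ->rcu_read_unlock_special still set and completes the cleanup on the preempted unlock's behalf instead of recursing into it. The sketch below models the sentinel idea with hypothetical types; the barriers and per-task fields of the real code are elided.

    #include <limits.h>
    #include <stdbool.h>

    struct reader {
            int  nesting;           /* ~ t->rcu_read_lock_nesting   */
            bool special;           /* ~ t->rcu_read_unlock_special */
    };

    static void unlock_special(struct reader *r)
    {
            /* Dequeue from the blocked-tasks list, report the QS, etc. */
            r->special = false;
    }

    static void read_unlock(struct reader *r)
    {
            if (r->nesting != 1) {
                    --r->nesting;                   /* not the outermost unlock */
            } else {
                    r->nesting = INT_MIN;           /* park: unlock in progress */
                    if (r->special)
                            unlock_special(r);
                    r->nesting = 0;                 /* fully outside the CS */
            }
    }

    /* Scheduler path: if it preempts read_unlock() after the sentinel was
     * set, finish the special-case work on the preempted unlock's behalf. */
    static void note_context_switch(struct reader *r)
    {
            if (r->nesting < 0 && r->special)
                    unlock_special(r);
    }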
@@ -367,18 +429,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
367static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 429static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
368{ 430{
369 unsigned long flags; 431 unsigned long flags;
370 struct list_head *lp;
371 int phase;
372 struct task_struct *t; 432 struct task_struct *t;
373 433
374 if (rcu_preempted_readers(rnp)) { 434 if (!rcu_preempt_blocked_readers_cgp(rnp))
375 raw_spin_lock_irqsave(&rnp->lock, flags); 435 return;
376 phase = rnp->gpnum & 0x1; 436 raw_spin_lock_irqsave(&rnp->lock, flags);
377 lp = &rnp->blocked_tasks[phase]; 437 t = list_entry(rnp->gp_tasks,
378 list_for_each_entry(t, lp, rcu_node_entry) 438 struct task_struct, rcu_node_entry);
379 sched_show_task(t); 439 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
380 raw_spin_unlock_irqrestore(&rnp->lock, flags); 440 sched_show_task(t);
381 } 441 raw_spin_unlock_irqrestore(&rnp->lock, flags);
382} 442}
383 443
384/* 444/*
@@ -408,16 +468,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
408 */ 468 */
409static void rcu_print_task_stall(struct rcu_node *rnp) 469static void rcu_print_task_stall(struct rcu_node *rnp)
410{ 470{
411 struct list_head *lp;
412 int phase;
413 struct task_struct *t; 471 struct task_struct *t;
414 472
415 if (rcu_preempted_readers(rnp)) { 473 if (!rcu_preempt_blocked_readers_cgp(rnp))
416 phase = rnp->gpnum & 0x1; 474 return;
417 lp = &rnp->blocked_tasks[phase]; 475 t = list_entry(rnp->gp_tasks,
418 list_for_each_entry(t, lp, rcu_node_entry) 476 struct task_struct, rcu_node_entry);
419 printk(" P%d", t->pid); 477 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
420 } 478 printk(" P%d", t->pid);
421} 479}
422 480
423/* 481/*
@@ -430,18 +488,21 @@ static void rcu_preempt_stall_reset(void)
430 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; 488 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
431} 489}
432 490
433#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
434
435/* 491/*
436 * Check that the list of blocked tasks for the newly completed grace 492 * Check that the list of blocked tasks for the newly completed grace
437 * period is in fact empty. It is a serious bug to complete a grace 493 * period is in fact empty. It is a serious bug to complete a grace
438 * period that still has RCU readers blocked! This function must be 494 * period that still has RCU readers blocked! This function must be
439 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock 495 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
440 * must be held by the caller. 496 * must be held by the caller.
497 *
498 * Also, if there are blocked tasks on the list, they automatically
499 * block the newly created grace period, so set up ->gp_tasks accordingly.
441 */ 500 */
442static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 501static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
443{ 502{
444 WARN_ON_ONCE(rcu_preempted_readers(rnp)); 503 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
504 if (!list_empty(&rnp->blkd_tasks))
505 rnp->gp_tasks = rnp->blkd_tasks.next;
445 WARN_ON_ONCE(rnp->qsmask); 506 WARN_ON_ONCE(rnp->qsmask);
446} 507}
447 508
@@ -465,50 +526,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
465 struct rcu_node *rnp, 526 struct rcu_node *rnp,
466 struct rcu_data *rdp) 527 struct rcu_data *rdp)
467{ 528{
468 int i;
469 struct list_head *lp; 529 struct list_head *lp;
470 struct list_head *lp_root; 530 struct list_head *lp_root;
471 int retval = 0; 531 int retval = 0;
472 struct rcu_node *rnp_root = rcu_get_root(rsp); 532 struct rcu_node *rnp_root = rcu_get_root(rsp);
473 struct task_struct *tp; 533 struct task_struct *t;
474 534
475 if (rnp == rnp_root) { 535 if (rnp == rnp_root) {
476 WARN_ONCE(1, "Last CPU thought to be offlined?"); 536 WARN_ONCE(1, "Last CPU thought to be offlined?");
477 return 0; /* Shouldn't happen: at least one CPU online. */ 537 return 0; /* Shouldn't happen: at least one CPU online. */
478 } 538 }
479 WARN_ON_ONCE(rnp != rdp->mynode && 539
480 (!list_empty(&rnp->blocked_tasks[0]) || 540 /* If we are on an internal node, complain bitterly. */
481 !list_empty(&rnp->blocked_tasks[1]) || 541 WARN_ON_ONCE(rnp != rdp->mynode);
482 !list_empty(&rnp->blocked_tasks[2]) ||
483 !list_empty(&rnp->blocked_tasks[3])));
484 542
485 /* 543 /*
486 * Move tasks up to root rcu_node. Rely on the fact that the 544 * Move tasks up to root rcu_node. Don't try to get fancy for
487 * root rcu_node can be at most one ahead of the rest of the 545 * this corner-case operation -- just put this node's tasks
488 * rcu_nodes in terms of gp_num value. This fact allows us to 546 * at the head of the root node's list, and update the root node's
489 * move the blocked_tasks[] array directly, element by element. 547 * ->gp_tasks and ->exp_tasks pointers to those of this node,
548 * if non-NULL. This might result in waiting for more tasks than
549 * absolutely necessary, but this is a good performance/complexity
550 * tradeoff.
490 */ 551 */
491 if (rcu_preempted_readers(rnp)) 552 if (rcu_preempt_blocked_readers_cgp(rnp))
492 retval |= RCU_OFL_TASKS_NORM_GP; 553 retval |= RCU_OFL_TASKS_NORM_GP;
493 if (rcu_preempted_readers_exp(rnp)) 554 if (rcu_preempted_readers_exp(rnp))
494 retval |= RCU_OFL_TASKS_EXP_GP; 555 retval |= RCU_OFL_TASKS_EXP_GP;
495 for (i = 0; i < 4; i++) { 556 lp = &rnp->blkd_tasks;
496 lp = &rnp->blocked_tasks[i]; 557 lp_root = &rnp_root->blkd_tasks;
497 lp_root = &rnp_root->blocked_tasks[i]; 558 while (!list_empty(lp)) {
498 while (!list_empty(lp)) { 559 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
499 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 560 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
500 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 561 list_del(&t->rcu_node_entry);
501 list_del(&tp->rcu_node_entry); 562 t->rcu_blocked_node = rnp_root;
502 tp->rcu_blocked_node = rnp_root; 563 list_add(&t->rcu_node_entry, lp_root);
503 list_add(&tp->rcu_node_entry, lp_root); 564 if (&t->rcu_node_entry == rnp->gp_tasks)
504 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 565 rnp_root->gp_tasks = rnp->gp_tasks;
505 } 566 if (&t->rcu_node_entry == rnp->exp_tasks)
567 rnp_root->exp_tasks = rnp->exp_tasks;
568#ifdef CONFIG_RCU_BOOST
569 if (&t->rcu_node_entry == rnp->boost_tasks)
570 rnp_root->boost_tasks = rnp->boost_tasks;
571#endif /* #ifdef CONFIG_RCU_BOOST */
572 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
506 } 573 }
574
575#ifdef CONFIG_RCU_BOOST
576 /* In case root is being boosted and leaf is not. */
577 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
578 if (rnp_root->boost_tasks != NULL &&
579 rnp_root->boost_tasks != rnp_root->gp_tasks)
580 rnp_root->boost_tasks = rnp_root->gp_tasks;
581 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
582#endif /* #ifdef CONFIG_RCU_BOOST */
583
584 rnp->gp_tasks = NULL;
585 rnp->exp_tasks = NULL;
507 return retval; 586 return retval;
508} 587}
509 588
510/* 589/*
511 * Do CPU-offline processing for preemptable RCU. 590 * Do CPU-offline processing for preemptible RCU.
512 */ 591 */
513static void rcu_preempt_offline_cpu(int cpu) 592static void rcu_preempt_offline_cpu(int cpu)
514{ 593{
@@ -532,12 +611,13 @@ static void rcu_preempt_check_callbacks(int cpu)
532 rcu_preempt_qs(cpu); 611 rcu_preempt_qs(cpu);
533 return; 612 return;
534 } 613 }
535 if (per_cpu(rcu_preempt_data, cpu).qs_pending) 614 if (t->rcu_read_lock_nesting > 0 &&
615 per_cpu(rcu_preempt_data, cpu).qs_pending)
536 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 616 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
537} 617}
538 618
539/* 619/*
540 * Process callbacks for preemptable RCU. 620 * Process callbacks for preemptible RCU.
541 */ 621 */
542static void rcu_preempt_process_callbacks(void) 622static void rcu_preempt_process_callbacks(void)
543{ 623{
@@ -545,8 +625,17 @@ static void rcu_preempt_process_callbacks(void)
545 &__get_cpu_var(rcu_preempt_data)); 625 &__get_cpu_var(rcu_preempt_data));
546} 626}
547 627
628#ifdef CONFIG_RCU_BOOST
629
630static void rcu_preempt_do_callbacks(void)
631{
632 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
633}
634
635#endif /* #ifdef CONFIG_RCU_BOOST */
636
548/* 637/*
549 * Queue a preemptable-RCU callback for invocation after a grace period. 638 * Queue a preemptible-RCU callback for invocation after a grace period.
550 */ 639 */
551void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 640void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
552{ 641{
@@ -594,8 +683,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
594 */ 683 */
595static int rcu_preempted_readers_exp(struct rcu_node *rnp) 684static int rcu_preempted_readers_exp(struct rcu_node *rnp)
596{ 685{
597 return !list_empty(&rnp->blocked_tasks[2]) || 686 return rnp->exp_tasks != NULL;
598 !list_empty(&rnp->blocked_tasks[3]);
599} 687}
600 688
601/* 689/*
@@ -630,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
630 718
631 raw_spin_lock_irqsave(&rnp->lock, flags); 719 raw_spin_lock_irqsave(&rnp->lock, flags);
632 for (;;) { 720 for (;;) {
633 if (!sync_rcu_preempt_exp_done(rnp)) 721 if (!sync_rcu_preempt_exp_done(rnp)) {
722 raw_spin_unlock_irqrestore(&rnp->lock, flags);
634 break; 723 break;
724 }
635 if (rnp->parent == NULL) { 725 if (rnp->parent == NULL) {
726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
636 wake_up(&sync_rcu_preempt_exp_wq); 727 wake_up(&sync_rcu_preempt_exp_wq);
637 break; 728 break;
638 } 729 }
@@ -642,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
642 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 733 raw_spin_lock(&rnp->lock); /* irqs already disabled */
643 rnp->expmask &= ~mask; 734 rnp->expmask &= ~mask;
644 } 735 }
645 raw_spin_unlock_irqrestore(&rnp->lock, flags);
646} 736}
647 737
648/* 738/*
@@ -655,13 +745,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
655static void 745static void
656sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 746sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
657{ 747{
658 int must_wait; 748 unsigned long flags;
749 int must_wait = 0;
659 750
660 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 751 raw_spin_lock_irqsave(&rnp->lock, flags);
661 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 752 if (list_empty(&rnp->blkd_tasks))
662 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
663 must_wait = rcu_preempted_readers_exp(rnp); 754 else {
664 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 755 rnp->exp_tasks = rnp->blkd_tasks.next;
756 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
757 must_wait = 1;
758 }
665 if (!must_wait) 759 if (!must_wait)
666 rcu_report_exp_rnp(rsp, rnp); 760 rcu_report_exp_rnp(rsp, rnp);
667} 761}
@@ -669,9 +763,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
669/* 763/*
670 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 764 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
671 * is to invoke synchronize_sched_expedited() to push all the tasks to 765 * is to invoke synchronize_sched_expedited() to push all the tasks to
672 * the ->blocked_tasks[] lists, move all entries from the first set of 766 * the ->blkd_tasks lists and wait for this list to drain.
673 * ->blocked_tasks[] lists to the second set, and finally wait for this
674 * second set to drain.
675 */ 767 */
676void synchronize_rcu_expedited(void) 768void synchronize_rcu_expedited(void)
677{ 769{
@@ -703,7 +795,7 @@ void synchronize_rcu_expedited(void)
703 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 795 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
704 goto unlock_mb_ret; /* Others did our work for us. */ 796 goto unlock_mb_ret; /* Others did our work for us. */
705 797
706 /* force all RCU readers onto blocked_tasks[]. */ 798 /* force all RCU readers onto ->blkd_tasks lists. */
707 synchronize_sched_expedited(); 799 synchronize_sched_expedited();
708 800
709 raw_spin_lock_irqsave(&rsp->onofflock, flags); 801 raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -715,7 +807,7 @@ void synchronize_rcu_expedited(void)
715 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 807 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
716 } 808 }
717 809
718 /* Snapshot current state of ->blocked_tasks[] lists. */ 810 /* Snapshot current state of ->blkd_tasks lists. */
719 rcu_for_each_leaf_node(rsp, rnp) 811 rcu_for_each_leaf_node(rsp, rnp)
720 sync_rcu_preempt_exp_init(rsp, rnp); 812 sync_rcu_preempt_exp_init(rsp, rnp);
721 if (NUM_RCU_NODES > 1) 813 if (NUM_RCU_NODES > 1)
@@ -723,7 +815,7 @@ void synchronize_rcu_expedited(void)
723 815
724 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 816 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
725 817
726 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 818 /* Wait for snapshotted ->blkd_tasks lists to drain. */
727 rnp = rcu_get_root(rsp); 819 rnp = rcu_get_root(rsp);
728 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
729 sync_rcu_preempt_exp_done(rnp)); 821 sync_rcu_preempt_exp_done(rnp));
@@ -739,7 +831,7 @@ mb_ret:
739EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 831EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
740 832
741/* 833/*
742 * Check to see if there is any immediate preemptable-RCU-related work 834 * Check to see if there is any immediate preemptible-RCU-related work
743 * to be done. 835 * to be done.
744 */ 836 */
745static int rcu_preempt_pending(int cpu) 837static int rcu_preempt_pending(int cpu)
@@ -749,7 +841,7 @@ static int rcu_preempt_pending(int cpu)
749} 841}
750 842
751/* 843/*
752 * Does preemptable RCU need the CPU to stay out of dynticks mode? 844 * Does preemptible RCU need the CPU to stay out of dynticks mode?
753 */ 845 */
754static int rcu_preempt_needs_cpu(int cpu) 846static int rcu_preempt_needs_cpu(int cpu)
755{ 847{
@@ -766,7 +858,7 @@ void rcu_barrier(void)
766EXPORT_SYMBOL_GPL(rcu_barrier); 858EXPORT_SYMBOL_GPL(rcu_barrier);
767 859
768/* 860/*
769 * Initialize preemptable RCU's per-CPU data. 861 * Initialize preemptible RCU's per-CPU data.
770 */ 862 */
771static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 863static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
772{ 864{
@@ -774,7 +866,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
774} 866}
775 867
776/* 868/*
777 * Move preemptable RCU's callbacks from dying CPU to other online CPU. 869 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
778 */ 870 */
779static void rcu_preempt_send_cbs_to_online(void) 871static void rcu_preempt_send_cbs_to_online(void)
780{ 872{
@@ -782,7 +874,7 @@ static void rcu_preempt_send_cbs_to_online(void)
782} 874}
783 875
784/* 876/*
785 * Initialize preemptable RCU's state structures. 877 * Initialize preemptible RCU's state structures.
786 */ 878 */
787static void __init __rcu_init_preempt(void) 879static void __init __rcu_init_preempt(void)
788{ 880{
@@ -790,7 +882,7 @@ static void __init __rcu_init_preempt(void)
790} 882}
791 883
792/* 884/*
793 * Check for a task exiting while in a preemptable-RCU read-side 885 * Check for a task exiting while in a preemptible-RCU read-side
794 * critical section, clean up if so. No need to issue warnings, 886 * critical section, clean up if so. No need to issue warnings,
795 * as debug_check_no_locks_held() already does this if lockdep 887 * as debug_check_no_locks_held() already does this if lockdep
796 * is enabled. 888 * is enabled.
@@ -802,11 +894,13 @@ void exit_rcu(void)
802 if (t->rcu_read_lock_nesting == 0) 894 if (t->rcu_read_lock_nesting == 0)
803 return; 895 return;
804 t->rcu_read_lock_nesting = 1; 896 t->rcu_read_lock_nesting = 1;
805 rcu_read_unlock(); 897 __rcu_read_unlock();
806} 898}
807 899
808#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 900#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
809 901
902static struct rcu_state *rcu_state = &rcu_sched_state;
903
810/* 904/*
811 * Tell them what RCU they are running. 905 * Tell them what RCU they are running.
812 */ 906 */
@@ -836,7 +930,7 @@ void rcu_force_quiescent_state(void)
836EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 930EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
837 931
838/* 932/*
839 * Because preemptable RCU does not exist, we never have to check for 933 * Because preemptible RCU does not exist, we never have to check for
840 * CPUs being in quiescent states. 934 * CPUs being in quiescent states.
841 */ 935 */
842static void rcu_preempt_note_context_switch(int cpu) 936static void rcu_preempt_note_context_switch(int cpu)
@@ -844,10 +938,10 @@ static void rcu_preempt_note_context_switch(int cpu)
844} 938}
845 939
846/* 940/*
847 * Because preemptable RCU does not exist, there are never any preempted 941 * Because preemptible RCU does not exist, there are never any preempted
848 * RCU readers. 942 * RCU readers.
849 */ 943 */
850static int rcu_preempted_readers(struct rcu_node *rnp) 944static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
851{ 945{
852 return 0; 946 return 0;
853} 947}
@@ -862,10 +956,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
862 956
863#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 957#endif /* #ifdef CONFIG_HOTPLUG_CPU */
864 958
865#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
866
867/* 959/*
868 * Because preemptable RCU does not exist, we never have to check for 960 * Because preemptible RCU does not exist, we never have to check for
869 * tasks blocked within RCU read-side critical sections. 961 * tasks blocked within RCU read-side critical sections.
870 */ 962 */
871static void rcu_print_detail_task_stall(struct rcu_state *rsp) 963static void rcu_print_detail_task_stall(struct rcu_state *rsp)
@@ -873,7 +965,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
873} 965}
874 966
875/* 967/*
876 * Because preemptable RCU does not exist, we never have to check for 968 * Because preemptible RCU does not exist, we never have to check for
877 * tasks blocked within RCU read-side critical sections. 969 * tasks blocked within RCU read-side critical sections.
878 */ 970 */
879static void rcu_print_task_stall(struct rcu_node *rnp) 971static void rcu_print_task_stall(struct rcu_node *rnp)
@@ -888,10 +980,8 @@ static void rcu_preempt_stall_reset(void)
888{ 980{
889} 981}
890 982
891#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
892
893/* 983/*
894 * Because there is no preemptable RCU, there can be no readers blocked, 984 * Because there is no preemptible RCU, there can be no readers blocked,
895 * so there is no need to check for blocked tasks. So check only for 985 * so there is no need to check for blocked tasks. So check only for
896 * bogus qsmask values. 986 * bogus qsmask values.
897 */ 987 */
@@ -903,7 +993,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
903#ifdef CONFIG_HOTPLUG_CPU 993#ifdef CONFIG_HOTPLUG_CPU
904 994
905/* 995/*
906 * Because preemptable RCU does not exist, it never needs to migrate 996 * Because preemptible RCU does not exist, it never needs to migrate
907 * tasks that were blocked within RCU read-side critical sections, and 997 * tasks that were blocked within RCU read-side critical sections, and
908 * such non-existent tasks cannot possibly have been blocking the current 998 * such non-existent tasks cannot possibly have been blocking the current
909 * grace period. 999 * grace period.
@@ -916,7 +1006,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
916} 1006}
917 1007
918/* 1008/*
919 * Because preemptable RCU does not exist, it never needs CPU-offline 1009 * Because preemptible RCU does not exist, it never needs CPU-offline
920 * processing. 1010 * processing.
921 */ 1011 */
922static void rcu_preempt_offline_cpu(int cpu) 1012static void rcu_preempt_offline_cpu(int cpu)
@@ -926,7 +1016,7 @@ static void rcu_preempt_offline_cpu(int cpu)
926#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1016#endif /* #ifdef CONFIG_HOTPLUG_CPU */
927 1017
928/* 1018/*
929 * Because preemptable RCU does not exist, it never has any callbacks 1019 * Because preemptible RCU does not exist, it never has any callbacks
930 * to check. 1020 * to check.
931 */ 1021 */
932static void rcu_preempt_check_callbacks(int cpu) 1022static void rcu_preempt_check_callbacks(int cpu)
@@ -934,7 +1024,7 @@ static void rcu_preempt_check_callbacks(int cpu)
934} 1024}
935 1025
936/* 1026/*
937 * Because preemptable RCU does not exist, it never has any callbacks 1027 * Because preemptible RCU does not exist, it never has any callbacks
938 * to process. 1028 * to process.
939 */ 1029 */
940static void rcu_preempt_process_callbacks(void) 1030static void rcu_preempt_process_callbacks(void)
@@ -943,7 +1033,7 @@ static void rcu_preempt_process_callbacks(void)
943 1033
944/* 1034/*
945 * Wait for an rcu-preempt grace period, but make it happen quickly. 1035 * Wait for an rcu-preempt grace period, but make it happen quickly.
946 * But because preemptable RCU does not exist, map to rcu-sched. 1036 * But because preemptible RCU does not exist, map to rcu-sched.
947 */ 1037 */
948void synchronize_rcu_expedited(void) 1038void synchronize_rcu_expedited(void)
949{ 1039{
@@ -954,7 +1044,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
954#ifdef CONFIG_HOTPLUG_CPU 1044#ifdef CONFIG_HOTPLUG_CPU
955 1045
956/* 1046/*
957 * Because preemptable RCU does not exist, there is never any need to 1047 * Because preemptible RCU does not exist, there is never any need to
958 * report on tasks preempted in RCU read-side critical sections during 1048 * report on tasks preempted in RCU read-side critical sections during
959 * expedited RCU grace periods. 1049 * expedited RCU grace periods.
960 */ 1050 */
@@ -966,7 +1056,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
966#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1056#endif /* #ifdef CONFIG_HOTPLUG_CPU */
967 1057
968/* 1058/*
969 * Because preemptable RCU does not exist, it never has any work to do. 1059 * Because preemptible RCU does not exist, it never has any work to do.
970 */ 1060 */
971static int rcu_preempt_pending(int cpu) 1061static int rcu_preempt_pending(int cpu)
972{ 1062{
@@ -974,7 +1064,7 @@ static int rcu_preempt_pending(int cpu)
974} 1064}
975 1065
976/* 1066/*
977 * Because preemptable RCU does not exist, it never needs any CPU. 1067 * Because preemptible RCU does not exist, it never needs any CPU.
978 */ 1068 */
979static int rcu_preempt_needs_cpu(int cpu) 1069static int rcu_preempt_needs_cpu(int cpu)
980{ 1070{
@@ -982,7 +1072,7 @@ static int rcu_preempt_needs_cpu(int cpu)
982} 1072}
983 1073
984/* 1074/*
985 * Because preemptable RCU does not exist, rcu_barrier() is just 1075 * Because preemptible RCU does not exist, rcu_barrier() is just
986 * another name for rcu_barrier_sched(). 1076 * another name for rcu_barrier_sched().
987 */ 1077 */
988void rcu_barrier(void) 1078void rcu_barrier(void)
@@ -992,7 +1082,7 @@ void rcu_barrier(void)
992EXPORT_SYMBOL_GPL(rcu_barrier); 1082EXPORT_SYMBOL_GPL(rcu_barrier);
993 1083
994/* 1084/*
995 * Because preemptable RCU does not exist, there is no per-CPU 1085 * Because preemptible RCU does not exist, there is no per-CPU
996 * data to initialize. 1086 * data to initialize.
997 */ 1087 */
998static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 1088static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
@@ -1000,14 +1090,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1000} 1090}
1001 1091
1002/* 1092/*
1003 * Because there is no preemptable RCU, there are no callbacks to move. 1093 * Because there is no preemptible RCU, there are no callbacks to move.
1004 */ 1094 */
1005static void rcu_preempt_send_cbs_to_online(void) 1095static void rcu_preempt_send_cbs_to_online(void)
1006{ 1096{
1007} 1097}
1008 1098
1009/* 1099/*
1010 * Because preemptable RCU does not exist, it need not be initialized. 1100 * Because preemptible RCU does not exist, it need not be initialized.
1011 */ 1101 */
1012static void __init __rcu_init_preempt(void) 1102static void __init __rcu_init_preempt(void)
1013{ 1103{
@@ -1015,6 +1105,665 @@ static void __init __rcu_init_preempt(void)
1015 1105
1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1106#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1017 1107
1108#ifdef CONFIG_RCU_BOOST
1109
1110#include "rtmutex_common.h"
1111
1112#ifdef CONFIG_RCU_TRACE
1113
1114static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1115{
1116 if (list_empty(&rnp->blkd_tasks))
1117 rnp->n_balk_blkd_tasks++;
1118 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1119 rnp->n_balk_exp_gp_tasks++;
1120 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1121 rnp->n_balk_boost_tasks++;
1122 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1123 rnp->n_balk_notblocked++;
1124 else if (rnp->gp_tasks != NULL &&
1125 ULONG_CMP_LT(jiffies, rnp->boost_time))
1126 rnp->n_balk_notyet++;
1127 else
1128 rnp->n_balk_nos++;
1129}
1130
1131#else /* #ifdef CONFIG_RCU_TRACE */
1132
1133static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1134{
1135}
1136
1137#endif /* #else #ifdef CONFIG_RCU_TRACE */
1138
1139/*
1140 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1141 * or ->boost_tasks, advancing the pointer to the next task in the
1142 * ->blkd_tasks list.
1143 *
1144 * Note that irqs must be enabled: boosting the task can block.
1145 * Returns 1 if there are more tasks needing to be boosted.
1146 */
1147static int rcu_boost(struct rcu_node *rnp)
1148{
1149 unsigned long flags;
1150 struct rt_mutex mtx;
1151 struct task_struct *t;
1152 struct list_head *tb;
1153
1154 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1155 return 0; /* Nothing left to boost. */
1156
1157 raw_spin_lock_irqsave(&rnp->lock, flags);
1158
1159 /*
1160 * Recheck under the lock: all tasks in need of boosting
1161 * might exit their RCU read-side critical sections on their own.
1162 */
1163 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1164 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1165 return 0;
1166 }
1167
1168 /*
1169 * Preferentially boost tasks blocking expedited grace periods.
1170 * This cannot starve the normal grace periods because a second
1171 * expedited grace period must boost all blocked tasks, including
1172 * those blocking the pre-existing normal grace period.
1173 */
1174 if (rnp->exp_tasks != NULL) {
1175 tb = rnp->exp_tasks;
1176 rnp->n_exp_boosts++;
1177 } else {
1178 tb = rnp->boost_tasks;
1179 rnp->n_normal_boosts++;
1180 }
1181 rnp->n_tasks_boosted++;
1182
1183 /*
1184 * We boost task t by manufacturing an rt_mutex that appears to
1185 * be held by task t. We leave a pointer to that rt_mutex where
1186 * task t can find it, and task t will release the mutex when it
1187 * exits its outermost RCU read-side critical section. Then
1188 * simply acquiring this artificial rt_mutex will boost task
1189 * t's priority. (Thanks to tglx for suggesting this approach!)
1190 *
1191 * Note that task t must acquire rnp->lock to remove itself from
1192 * the ->blkd_tasks list, which it will do from exit() if from
1193 * nowhere else. We therefore are guaranteed that task t will
1194 * stay around at least until we drop rnp->lock. Note that
1195 * rnp->lock also resolves races between our priority boosting
1196 * and task t's exiting its outermost RCU read-side critical
1197 * section.
1198 */
1199 t = container_of(tb, struct task_struct, rcu_node_entry);
1200 rt_mutex_init_proxy_locked(&mtx, t);
1201 t->rcu_boost_mutex = &mtx;
1202 t->rcu_boosted = 1;
1203 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1206
1207 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1208}
1209
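rcu_boost() leans on rt_mutex priority inheritance: it proxy-locks an on-stack rt_mutex on behalf of the blocked reader, then blocks on that mutex itself, so the reader inherits the booster kthread's priority until its outermost rcu_read_unlock() releases the mutex. User space has no proxy-locking API, but the inheritance effect itself can be seen with a POSIX PTHREAD_PRIO_INHERIT mutex, as in the hedged sketch below (thread bodies and names are illustrative only).

    #define _GNU_SOURCE
    #include <pthread.h>

    static pthread_mutex_t boost_mtx;

    static void boost_mtx_init(void)
    {
            pthread_mutexattr_t attr;

            pthread_mutexattr_init(&attr);
            pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
            pthread_mutex_init(&boost_mtx, &attr);
            pthread_mutexattr_destroy(&attr);
    }

    /* Low-priority "reader": holds the mutex for the duration of its critical
     * section, analogous to the task that still owes an rcu_read_unlock(). */
    static void *reader(void *arg)
    {
            pthread_mutex_lock(&boost_mtx);
            /* ... long read-side critical section ... */
            pthread_mutex_unlock(&boost_mtx);   /* "unboost" happens here */
            return NULL;
    }

    /* High-priority "booster": blocking here lends its priority to the holder,
     * much as rt_mutex_lock() in rcu_boost() lends the kthread's priority. */
    static void *booster(void *arg)
    {
            pthread_mutex_lock(&boost_mtx);
            pthread_mutex_unlock(&boost_mtx);
            return NULL;
    }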
1210/*
1211 * Timer handler to initiate waking up of boost kthreads that
1212 * have yielded the CPU due to excessive numbers of tasks to
1213 * boost. We wake up the per-rcu_node kthread, which in turn
1214 * will wake up the booster kthread.
1215 */
1216static void rcu_boost_kthread_timer(unsigned long arg)
1217{
1218 invoke_rcu_node_kthread((struct rcu_node *)arg);
1219}
1220
1221/*
1222 * Priority-boosting kthread. One per leaf rcu_node and one for the
1223 * root rcu_node.
1224 */
1225static int rcu_boost_kthread(void *arg)
1226{
1227 struct rcu_node *rnp = (struct rcu_node *)arg;
1228 int spincnt = 0;
1229 int more2boost;
1230
1231 for (;;) {
1232 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1233 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1234 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1235 more2boost = rcu_boost(rnp);
1236 if (more2boost)
1237 spincnt++;
1238 else
1239 spincnt = 0;
1240 if (spincnt > 10) {
1241 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1242 spincnt = 0;
1243 }
1244 }
1245 /* NOTREACHED */
1246 return 0;
1247}
1248
1249/*
1250 * Check to see if it is time to start boosting RCU readers that are
1251 * blocking the current grace period, and, if so, tell the per-rcu_node
1252 * kthread to start boosting them. If there is an expedited grace
1253 * period in progress, it is always time to boost.
1254 *
1255 * The caller must hold rnp->lock, which this function releases,
1256 * but irqs remain disabled. The ->boost_kthread_task is immortal,
1257 * so we don't need to worry about it going away.
1258 */
1259static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1260{
1261 struct task_struct *t;
1262
1263 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1264 rnp->n_balk_exp_gp_tasks++;
1265 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1266 return;
1267 }
1268 if (rnp->exp_tasks != NULL ||
1269 (rnp->gp_tasks != NULL &&
1270 rnp->boost_tasks == NULL &&
1271 rnp->qsmask == 0 &&
1272 ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1273 if (rnp->exp_tasks == NULL)
1274 rnp->boost_tasks = rnp->gp_tasks;
1275 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1276 t = rnp->boost_kthread_task;
1277 if (t != NULL)
1278 wake_up_process(t);
1279 } else {
1280 rcu_initiate_boost_trace(rnp);
1281 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1282 }
1283}
1284
1285/*
1286 * Wake up the per-CPU kthread to invoke RCU callbacks.
1287 */
1288static void invoke_rcu_callbacks_kthread(void)
1289{
1290 unsigned long flags;
1291
1292 local_irq_save(flags);
1293 __this_cpu_write(rcu_cpu_has_work, 1);
1294 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1295 local_irq_restore(flags);
1296 return;
1297 }
1298 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1299 local_irq_restore(flags);
1300}
1301
1302/*
1303 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1304 * held, so no one should be messing with the existence of the boost
1305 * kthread.
1306 */
1307static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1308 cpumask_var_t cm)
1309{
1310 struct task_struct *t;
1311
1312 t = rnp->boost_kthread_task;
1313 if (t != NULL)
1314 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1315}
1316
1317#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1318
1319/*
1320 * Do priority-boost accounting for the start of a new grace period.
1321 */
1322static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1323{
1324 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1325}
1326
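/*
 * [Editor's note: worked example, configuration values assumed.]
 * RCU_BOOST_DELAY_JIFFIES converts CONFIG_RCU_BOOST_DELAY (milliseconds)
 * into jiffies, rounding up.  With CONFIG_RCU_BOOST_DELAY=500 and HZ=250:
 *
 *	DIV_ROUND_UP(500 * 250, 1000) = DIV_ROUND_UP(125000, 1000) = 125
 *
 * so rcu_initiate_boost() will not start boosting until readers have
 * blocked the grace period for at least 125 jiffies (500 ms).
 */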
1327/*
1328 * Create an RCU-boost kthread for the specified node if one does not
1329 * already exist. We only create this kthread for preemptible RCU.
1330 * Returns zero if all is well, a negated errno otherwise.
1331 */
1332static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1333 struct rcu_node *rnp,
1334 int rnp_index)
1335{
1336 unsigned long flags;
1337 struct sched_param sp;
1338 struct task_struct *t;
1339
1340 if (&rcu_preempt_state != rsp)
1341 return 0;
1342 rsp->boost = 1;
1343 if (rnp->boost_kthread_task != NULL)
1344 return 0;
1345 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1346 "rcub%d", rnp_index);
1347 if (IS_ERR(t))
1348 return PTR_ERR(t);
1349 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rnp->boost_kthread_task = t;
1351 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1352 sp.sched_priority = RCU_KTHREAD_PRIO;
1353 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1354 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1355 return 0;
1356}
1357
1358#ifdef CONFIG_HOTPLUG_CPU
1359
1360/*
 1361 * Stop RCU's per-CPU kthread when its CPU goes offline.
1362 */
1363static void rcu_stop_cpu_kthread(int cpu)
1364{
1365 struct task_struct *t;
1366
1367 /* Stop the CPU's kthread. */
1368 t = per_cpu(rcu_cpu_kthread_task, cpu);
1369 if (t != NULL) {
1370 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1371 kthread_stop(t);
1372 }
1373}
1374
1375#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1376
1377static void rcu_kthread_do_work(void)
1378{
1379 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
1380 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1381 rcu_preempt_do_callbacks();
1382}
1383
1384/*
1385 * Wake up the specified per-rcu_node-structure kthread.
1386 * Because the per-rcu_node kthreads are immortal, we don't need
1387 * to do anything to keep them alive.
1388 */
1389static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1390{
1391 struct task_struct *t;
1392
1393 t = rnp->node_kthread_task;
1394 if (t != NULL)
1395 wake_up_process(t);
1396}
1397
1398/*
1399 * Set the specified CPU's kthread to run RT or not, as specified by
1400 * the to_rt argument. The CPU-hotplug locks are held, so the task
1401 * is not going away.
1402 */
1403static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1404{
1405 int policy;
1406 struct sched_param sp;
1407 struct task_struct *t;
1408
1409 t = per_cpu(rcu_cpu_kthread_task, cpu);
1410 if (t == NULL)
1411 return;
1412 if (to_rt) {
1413 policy = SCHED_FIFO;
1414 sp.sched_priority = RCU_KTHREAD_PRIO;
1415 } else {
1416 policy = SCHED_NORMAL;
1417 sp.sched_priority = 0;
1418 }
1419 sched_setscheduler_nocheck(t, policy, &sp);
1420}
1421
1422/*
 1423 * Timer handler to initiate waking up of per-CPU kthreads that
 1424 * have yielded the CPU due to excessive numbers of RCU callbacks.
 1425 * We wake up the per-rcu_node kthread, which in turn restores the
 1426 * yielding per-CPU kthread's real-time priority and re-flags its work.
1427 */
1428static void rcu_cpu_kthread_timer(unsigned long arg)
1429{
1430 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1431 struct rcu_node *rnp = rdp->mynode;
1432
1433 atomic_or(rdp->grpmask, &rnp->wakemask);
1434 invoke_rcu_node_kthread(rnp);
1435}
1436
1437/*
1438 * Drop to non-real-time priority and yield, but only after posting a
1439 * timer that will cause us to regain our real-time priority if we
1440 * remain preempted. Either way, we restore our real-time priority
1441 * before returning.
1442 */
1443static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1444{
1445 struct sched_param sp;
1446 struct timer_list yield_timer;
1447
1448 setup_timer_on_stack(&yield_timer, f, arg);
1449 mod_timer(&yield_timer, jiffies + 2);
1450 sp.sched_priority = 0;
1451 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452 set_user_nice(current, 19);
1453 schedule();
1454 sp.sched_priority = RCU_KTHREAD_PRIO;
1455 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456 del_timer(&yield_timer);
1457}
1458
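/*
 * [Editor's note: illustrative, HZ values assumed.]  The watchdog posted
 * above fires roughly two ticks after rcu_yield() is entered (about 8 ms at
 * HZ=250, or 2 ms at HZ=1000) if the yielding kthread has not been run
 * again by then.  Callers pass their own wakeup handler, exactly as
 * rcu_boost_kthread() above and rcu_cpu_kthread() below do:
 *
 *	rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
 *
 * If the kthread runs again before the timer expires, del_timer() simply
 * cancels the pending wakeup on the way out.
 */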
1459/*
1460 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1461 * This can happen while the corresponding CPU is either coming online
1462 * or going offline. We cannot wait until the CPU is fully online
1463 * before starting the kthread, because the various notifier functions
1464 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1465 * the corresponding CPU is online.
1466 *
1467 * Return 1 if the kthread needs to stop, 0 otherwise.
1468 *
1469 * Caller must disable bh. This function can momentarily enable it.
1470 */
1471static int rcu_cpu_kthread_should_stop(int cpu)
1472{
1473 while (cpu_is_offline(cpu) ||
1474 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1475 smp_processor_id() != cpu) {
1476 if (kthread_should_stop())
1477 return 1;
1478 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1479 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1480 local_bh_enable();
1481 schedule_timeout_uninterruptible(1);
1482 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1483 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1484 local_bh_disable();
1485 }
1486 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1487 return 0;
1488}
1489
1490/*
1491 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1492 * earlier RCU softirq.
1493 */
1494static int rcu_cpu_kthread(void *arg)
1495{
1496 int cpu = (int)(long)arg;
1497 unsigned long flags;
1498 int spincnt = 0;
1499 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1500 char work;
1501 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1502
1503 for (;;) {
1504 *statusp = RCU_KTHREAD_WAITING;
1505 rcu_wait(*workp != 0 || kthread_should_stop());
1506 local_bh_disable();
1507 if (rcu_cpu_kthread_should_stop(cpu)) {
1508 local_bh_enable();
1509 break;
1510 }
1511 *statusp = RCU_KTHREAD_RUNNING;
1512 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1513 local_irq_save(flags);
1514 work = *workp;
1515 *workp = 0;
1516 local_irq_restore(flags);
1517 if (work)
1518 rcu_kthread_do_work();
1519 local_bh_enable();
1520 if (*workp != 0)
1521 spincnt++;
1522 else
1523 spincnt = 0;
1524 if (spincnt > 10) {
1525 *statusp = RCU_KTHREAD_YIELDING;
1526 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1527 spincnt = 0;
1528 }
1529 }
1530 *statusp = RCU_KTHREAD_STOPPED;
1531 return 0;
1532}
1533
1534/*
1535 * Spawn a per-CPU kthread, setting up affinity and priority.
1536 * Because the CPU hotplug lock is held, no other CPU will be attempting
1537 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1538 * attempting to access it during boot, but the locking in kthread_bind()
1539 * will enforce sufficient ordering.
1540 *
1541 * Please note that we cannot simply refuse to wake up the per-CPU
1542 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1543 * which can result in softlockup complaints if the task ends up being
1544 * idle for more than a couple of minutes.
1545 *
1546 * However, please note also that we cannot bind the per-CPU kthread to its
1547 * CPU until that CPU is fully online. We also cannot wait until the
1548 * CPU is fully online before we create its per-CPU kthread, as this would
1549 * deadlock the system when CPU notifiers tried waiting for grace
1550 * periods. So we bind the per-CPU kthread to its CPU only if the CPU
1551 * is online. If its CPU is not yet fully online, then the code in
1552 * rcu_cpu_kthread() will wait until it is fully online, and then do
1553 * the binding.
1554 */
1555static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1556{
1557 struct sched_param sp;
1558 struct task_struct *t;
1559
1560 if (!rcu_scheduler_fully_active ||
1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562 return 0;
1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1564 if (IS_ERR(t))
1565 return PTR_ERR(t);
1566 if (cpu_online(cpu))
1567 kthread_bind(t, cpu);
1568 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1569 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1570 sp.sched_priority = RCU_KTHREAD_PRIO;
1571 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1572 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1573 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1574 return 0;
1575}
1576
1577/*
1578 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1579 * kthreads when needed. We ignore requests to wake up kthreads
1580 * for offline CPUs, which is OK because force_quiescent_state()
1581 * takes care of this case.
1582 */
1583static int rcu_node_kthread(void *arg)
1584{
1585 int cpu;
1586 unsigned long flags;
1587 unsigned long mask;
1588 struct rcu_node *rnp = (struct rcu_node *)arg;
1589 struct sched_param sp;
1590 struct task_struct *t;
1591
1592 for (;;) {
1593 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1594 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1595 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1596 raw_spin_lock_irqsave(&rnp->lock, flags);
1597 mask = atomic_xchg(&rnp->wakemask, 0);
1598 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1599 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1600 if ((mask & 0x1) == 0)
1601 continue;
1602 preempt_disable();
1603 t = per_cpu(rcu_cpu_kthread_task, cpu);
1604 if (!cpu_online(cpu) || t == NULL) {
1605 preempt_enable();
1606 continue;
1607 }
1608 per_cpu(rcu_cpu_has_work, cpu) = 1;
1609 sp.sched_priority = RCU_KTHREAD_PRIO;
1610 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1611 preempt_enable();
1612 }
1613 }
1614 /* NOTREACHED */
1615 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1616 return 0;
1617}
1618
1619/*
1620 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1621 * served by the rcu_node in question. The CPU hotplug lock is still
1622 * held, so the value of rnp->qsmaskinit will be stable.
1623 *
 1624 * We don't include outgoingcpu in the affinity set; callers pass -1 if
 1625 * there is no outgoing CPU. If there are no CPUs left in the affinity set,
1626 * this function allows the kthread to execute on any CPU.
1627 */
1628static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1629{
1630 cpumask_var_t cm;
1631 int cpu;
1632 unsigned long mask = rnp->qsmaskinit;
1633
1634 if (rnp->node_kthread_task == NULL)
1635 return;
1636 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1637 return;
1638 cpumask_clear(cm);
1639 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1640 if ((mask & 0x1) && cpu != outgoingcpu)
1641 cpumask_set_cpu(cpu, cm);
1642 if (cpumask_weight(cm) == 0) {
1643 cpumask_setall(cm);
1644 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1645 cpumask_clear_cpu(cpu, cm);
1646 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1647 }
1648 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1649 rcu_boost_kthread_setaffinity(rnp, cm);
1650 free_cpumask_var(cm);
1651}
1652
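/*
 * [Editor's note: worked example, values hypothetical.]  Suppose rnp covers
 * CPUs 0-3 (grplo=0, grphi=3), qsmaskinit=0xb (CPUs 0, 1 and 3 have been
 * online) and CPU 1 is going away: the loop above builds cm = { 0, 3 }, and
 * both the node kthread and the boost kthread are restricted to those CPUs.
 * Only if the mask would come up empty does the fallback widen it to every
 * CPU outside this node's range, so the kthreads always have somewhere to
 * run.
 */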
1653/*
1654 * Spawn a per-rcu_node kthread, setting priority and affinity.
 1655 * Called either during boot, before CPU online/offline can happen,
 1656 * or at runtime with the main CPU-hotplug locks held, so only
1657 * one of these can be executing at a time.
1658 */
1659static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1660 struct rcu_node *rnp)
1661{
1662 unsigned long flags;
1663 int rnp_index = rnp - &rsp->node[0];
1664 struct sched_param sp;
1665 struct task_struct *t;
1666
1667 if (!rcu_scheduler_fully_active ||
1668 rnp->qsmaskinit == 0)
1669 return 0;
1670 if (rnp->node_kthread_task == NULL) {
1671 t = kthread_create(rcu_node_kthread, (void *)rnp,
1672 "rcun%d", rnp_index);
1673 if (IS_ERR(t))
1674 return PTR_ERR(t);
1675 raw_spin_lock_irqsave(&rnp->lock, flags);
1676 rnp->node_kthread_task = t;
1677 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1678 sp.sched_priority = 99;
1679 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1680 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1681 }
1682 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1683}
1684
1685/*
1686 * Spawn all kthreads -- called as soon as the scheduler is running.
1687 */
1688static int __init rcu_spawn_kthreads(void)
1689{
1690 int cpu;
1691 struct rcu_node *rnp;
1692
1693 rcu_scheduler_fully_active = 1;
1694 for_each_possible_cpu(cpu) {
1695 per_cpu(rcu_cpu_has_work, cpu) = 0;
1696 if (cpu_online(cpu))
1697 (void)rcu_spawn_one_cpu_kthread(cpu);
1698 }
1699 rnp = rcu_get_root(rcu_state);
1700 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1701 if (NUM_RCU_NODES > 1) {
1702 rcu_for_each_leaf_node(rcu_state, rnp)
1703 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1704 }
1705 return 0;
1706}
1707early_initcall(rcu_spawn_kthreads);
1708
1709static void __cpuinit rcu_prepare_kthreads(int cpu)
1710{
1711 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1712 struct rcu_node *rnp = rdp->mynode;
1713
1714 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1715 if (rcu_scheduler_fully_active) {
1716 (void)rcu_spawn_one_cpu_kthread(cpu);
1717 if (rnp->node_kthread_task == NULL)
1718 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1719 }
1720}
1721
1722#else /* #ifdef CONFIG_RCU_BOOST */
1723
1724static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1725{
1726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1727}
1728
1729static void invoke_rcu_callbacks_kthread(void)
1730{
1731 WARN_ON_ONCE(1);
1732}
1733
1734static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1735{
1736}
1737
1738#ifdef CONFIG_HOTPLUG_CPU
1739
1740static void rcu_stop_cpu_kthread(int cpu)
1741{
1742}
1743
1744#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1745
1746static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1747{
1748}
1749
1750static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1751{
1752}
1753
1754static int __init rcu_scheduler_really_started(void)
1755{
1756 rcu_scheduler_fully_active = 1;
1757 return 0;
1758}
1759early_initcall(rcu_scheduler_really_started);
1760
1761static void __cpuinit rcu_prepare_kthreads(int cpu)
1762{
1763}
1764
1765#endif /* #else #ifdef CONFIG_RCU_BOOST */
1766
1018#ifndef CONFIG_SMP 1767#ifndef CONFIG_SMP
1019 1768
1020void synchronize_sched_expedited(void) 1769void synchronize_sched_expedited(void)
@@ -1187,14 +1936,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1187 * 1936 *
1188 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1937 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1189 * disabled, we do one pass of force_quiescent_state(), then do a 1938 * disabled, we do one pass of force_quiescent_state(), then do a
1190 * raise_softirq() to cause rcu_process_callbacks() to be invoked later. 1939 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1191 * The per-cpu rcu_dyntick_drain variable controls the sequencing. 1940 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1192 */ 1941 */
1193int rcu_needs_cpu(int cpu) 1942int rcu_needs_cpu(int cpu)
1194{ 1943{
1195 int c = 0; 1944 int c = 0;
1196 int snap; 1945 int snap;
1197 int snap_nmi;
1198 int thatcpu; 1946 int thatcpu;
1199 1947
1200 /* Check for being in the holdoff period. */ 1948 /* Check for being in the holdoff period. */
@@ -1205,10 +1953,10 @@ int rcu_needs_cpu(int cpu)
1205 for_each_online_cpu(thatcpu) { 1953 for_each_online_cpu(thatcpu) {
1206 if (thatcpu == cpu) 1954 if (thatcpu == cpu)
1207 continue; 1955 continue;
1208 snap = per_cpu(rcu_dynticks, thatcpu).dynticks; 1956 snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1209 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; 1957 thatcpu).dynticks);
1210 smp_mb(); /* Order sampling of snap with end of grace period. */ 1958 smp_mb(); /* Order sampling of snap with end of grace period. */
1211 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { 1959 if ((snap & 0x1) != 0) {
1212 per_cpu(rcu_dyntick_drain, cpu) = 0; 1960 per_cpu(rcu_dyntick_drain, cpu) = 0;
1213 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1961 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1214 return rcu_needs_cpu_quick_check(cpu); 1962 return rcu_needs_cpu_quick_check(cpu);
@@ -1239,7 +1987,7 @@ int rcu_needs_cpu(int cpu)
1239 1987
1240 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1988 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1241 if (c) 1989 if (c)
1242 raise_softirq(RCU_SOFTIRQ); 1990 invoke_rcu_core();
1243 return c; 1991 return c;
1244} 1992}
1245 1993
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c8e97853b970..3b0c0986afc0 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -31,7 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/interrupt.h> 32#include <linux/interrupt.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <asm/atomic.h> 34#include <linux/atomic.h>
35#include <linux/bitops.h> 35#include <linux/bitops.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/completion.h> 37#include <linux/completion.h>
@@ -46,6 +46,22 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#ifdef CONFIG_RCU_BOOST
50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
56static char convert_kthread_status(unsigned int kthread_status)
57{
58 if (kthread_status > RCU_KTHREAD_MAX)
59 return '?';
60 return "SRWOY"[kthread_status];
61}
62
63#endif /* #ifdef CONFIG_RCU_BOOST */
64
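/*
 * [Editor's note: illustrative, not part of the patch.]  The character
 * returned here appears in the kt= fields of the rcudata and rcuboost
 * debugfs output: 'S'topped, 'R'unning, 'W'aiting, 'O'ffcpu or 'Y'ielding,
 * with '?' for an out-of-range status.  A CPU whose kthread is blocked in
 * rcu_wait(), for instance, reports 'W'.
 */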
49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 65static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 66{
51 if (!rdp->beenonline) 67 if (!rdp->beenonline)
@@ -57,14 +73,31 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
57 rdp->passed_quiesc, rdp->passed_quiesc_completed, 73 rdp->passed_quiesc, rdp->passed_quiesc_completed,
58 rdp->qs_pending); 74 rdp->qs_pending);
59#ifdef CONFIG_NO_HZ 75#ifdef CONFIG_NO_HZ
60 seq_printf(m, " dt=%d/%d dn=%d df=%lu", 76 seq_printf(m, " dt=%d/%d/%d df=%lu",
61 rdp->dynticks->dynticks, 77 atomic_read(&rdp->dynticks->dynticks),
62 rdp->dynticks->dynticks_nesting, 78 rdp->dynticks->dynticks_nesting,
63 rdp->dynticks->dynticks_nmi, 79 rdp->dynticks->dynticks_nmi_nesting,
64 rdp->dynticks_fqs); 80 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 81#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 82 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); 83 seq_printf(m, " ql=%ld qs=%c%c%c%c",
84 rdp->qlen,
85 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
86 rdp->nxttail[RCU_NEXT_TAIL]],
87 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
88 rdp->nxttail[RCU_NEXT_READY_TAIL]],
89 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
90 rdp->nxttail[RCU_WAIT_TAIL]],
91 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
92#ifdef CONFIG_RCU_BOOST
93 seq_printf(m, " kt=%d/%c/%d ktl=%x",
94 per_cpu(rcu_cpu_has_work, rdp->cpu),
95 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
96 rdp->cpu)),
97 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
98 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
99#endif /* #ifdef CONFIG_RCU_BOOST */
100 seq_printf(m, " b=%ld", rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 101 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 102 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
70} 103}
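/*
 * [Editor's note: illustrative, not part of the patch.]  Each position in
 * the new qs= field stands for one segment of the CPU's callback list:
 * 'N'ext, next-'R'eady, 'W'aiting and 'D'one, printed as '.' when that
 * segment is empty.  A (hypothetical) "qs=...D" therefore means that only
 * callbacks whose grace period has already completed are still queued.
 */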
@@ -115,13 +148,27 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
115 rdp->qs_pending); 148 rdp->qs_pending);
116#ifdef CONFIG_NO_HZ 149#ifdef CONFIG_NO_HZ
117 seq_printf(m, ",%d,%d,%d,%lu", 150 seq_printf(m, ",%d,%d,%d,%lu",
118 rdp->dynticks->dynticks, 151 atomic_read(&rdp->dynticks->dynticks),
119 rdp->dynticks->dynticks_nesting, 152 rdp->dynticks->dynticks_nesting,
120 rdp->dynticks->dynticks_nmi, 153 rdp->dynticks->dynticks_nmi_nesting,
121 rdp->dynticks_fqs); 154 rdp->dynticks_fqs);
122#endif /* #ifdef CONFIG_NO_HZ */ 155#endif /* #ifdef CONFIG_NO_HZ */
123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 156 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); 157 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
158 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
159 rdp->nxttail[RCU_NEXT_TAIL]],
160 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
161 rdp->nxttail[RCU_NEXT_READY_TAIL]],
162 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
163 rdp->nxttail[RCU_WAIT_TAIL]],
164 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
165#ifdef CONFIG_RCU_BOOST
166 seq_printf(m, ",%d,\"%c\"",
167 per_cpu(rcu_cpu_has_work, rdp->cpu),
168 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
169 rdp->cpu)));
170#endif /* #ifdef CONFIG_RCU_BOOST */
171 seq_printf(m, ",%ld", rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n", 172 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 173 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
127} 174}
@@ -130,9 +177,13 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
130{ 177{
131 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
132#ifdef CONFIG_NO_HZ 179#ifdef CONFIG_NO_HZ
133 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
134#endif /* #ifdef CONFIG_NO_HZ */ 181#endif /* #ifdef CONFIG_NO_HZ */
135 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); 182 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
183#ifdef CONFIG_RCU_BOOST
184 seq_puts(m, "\"kt\",\"ktl\"");
185#endif /* #ifdef CONFIG_RCU_BOOST */
186 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
136#ifdef CONFIG_TREE_PREEMPT_RCU 187#ifdef CONFIG_TREE_PREEMPT_RCU
137 seq_puts(m, "\"rcu_preempt:\"\n"); 188 seq_puts(m, "\"rcu_preempt:\"\n");
138 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 189 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -157,11 +208,76 @@ static const struct file_operations rcudata_csv_fops = {
157 .release = single_release, 208 .release = single_release,
158}; 209};
159 210
211#ifdef CONFIG_RCU_BOOST
212
213static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
214{
215 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
216 "j=%04x bt=%04x\n",
217 rnp->grplo, rnp->grphi,
218 "T."[list_empty(&rnp->blkd_tasks)],
219 "N."[!rnp->gp_tasks],
220 "E."[!rnp->exp_tasks],
221 "B."[!rnp->boost_tasks],
222 convert_kthread_status(rnp->boost_kthread_status),
223 rnp->n_tasks_boosted, rnp->n_exp_boosts,
224 rnp->n_normal_boosts,
225 (int)(jiffies & 0xffff),
226 (int)(rnp->boost_time & 0xffff));
227 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
228 " balk",
229 rnp->n_balk_blkd_tasks,
230 rnp->n_balk_exp_gp_tasks,
231 rnp->n_balk_boost_tasks,
232 rnp->n_balk_notblocked,
233 rnp->n_balk_notyet,
234 rnp->n_balk_nos);
235}
236
237static int show_rcu_node_boost(struct seq_file *m, void *unused)
238{
239 struct rcu_node *rnp;
240
241 rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
242 print_one_rcu_node_boost(m, rnp);
243 return 0;
244}
245
246static int rcu_node_boost_open(struct inode *inode, struct file *file)
247{
248 return single_open(file, show_rcu_node_boost, NULL);
249}
250
251static const struct file_operations rcu_node_boost_fops = {
252 .owner = THIS_MODULE,
253 .open = rcu_node_boost_open,
254 .read = seq_read,
255 .llseek = seq_lseek,
256 .release = single_release,
257};
258
259/*
260 * Create the rcuboost debugfs entry. Standard error return.
261 */
262static int rcu_boost_trace_create_file(struct dentry *rcudir)
263{
264 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
265 &rcu_node_boost_fops);
266}
267
268#else /* #ifdef CONFIG_RCU_BOOST */
269
270static int rcu_boost_trace_create_file(struct dentry *rcudir)
271{
272 return 0; /* There cannot be an error if we didn't create it! */
273}
274
275#endif /* #else #ifdef CONFIG_RCU_BOOST */
276
160static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 277static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
161{ 278{
162 unsigned long gpnum; 279 unsigned long gpnum;
163 int level = 0; 280 int level = 0;
164 int phase;
165 struct rcu_node *rnp; 281 struct rcu_node *rnp;
166 282
167 gpnum = rsp->gpnum; 283 gpnum = rsp->gpnum;
@@ -178,13 +294,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
178 seq_puts(m, "\n"); 294 seq_puts(m, "\n");
179 level = rnp->level; 295 level = rnp->level;
180 } 296 }
181 phase = gpnum & 0x1; 297 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
182 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
183 rnp->qsmask, rnp->qsmaskinit, 298 rnp->qsmask, rnp->qsmaskinit,
184 "T."[list_empty(&rnp->blocked_tasks[phase])], 299 ".G"[rnp->gp_tasks != NULL],
185 "E."[list_empty(&rnp->blocked_tasks[phase + 2])], 300 ".E"[rnp->exp_tasks != NULL],
186 "T."[list_empty(&rnp->blocked_tasks[!phase])], 301 ".T"[!list_empty(&rnp->blkd_tasks)],
187 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
188 rnp->grplo, rnp->grphi, rnp->grpnum); 302 rnp->grplo, rnp->grphi, rnp->grpnum);
189 } 303 }
190 seq_puts(m, "\n"); 304 seq_puts(m, "\n");
@@ -216,16 +330,35 @@ static const struct file_operations rcuhier_fops = {
216 .release = single_release, 330 .release = single_release,
217}; 331};
218 332
333static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
334{
335 unsigned long flags;
336 unsigned long completed;
337 unsigned long gpnum;
338 unsigned long gpage;
339 unsigned long gpmax;
340 struct rcu_node *rnp = &rsp->node[0];
341
342 raw_spin_lock_irqsave(&rnp->lock, flags);
343 completed = rsp->completed;
344 gpnum = rsp->gpnum;
345 if (rsp->completed == rsp->gpnum)
346 gpage = 0;
347 else
348 gpage = jiffies - rsp->gp_start;
349 gpmax = rsp->gp_max;
350 raw_spin_unlock_irqrestore(&rnp->lock, flags);
351 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
352 rsp->name, completed, gpnum, gpage, gpmax);
353}
354
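/*
 * [Editor's note: illustrative output, values hypothetical.]  Each RCU
 * flavor now also reports how long the current grace period has been
 * running and the longest grace period observed, both in jiffies, e.g.:
 *
 *	rcu_sched: completed=2750 gpnum=2751 age=3 max=42
 */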
219static int show_rcugp(struct seq_file *m, void *unused) 355static int show_rcugp(struct seq_file *m, void *unused)
220{ 356{
221#ifdef CONFIG_TREE_PREEMPT_RCU 357#ifdef CONFIG_TREE_PREEMPT_RCU
222 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", 358 show_one_rcugp(m, &rcu_preempt_state);
223 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
224#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 359#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
225 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", 360 show_one_rcugp(m, &rcu_sched_state);
226 rcu_sched_state.completed, rcu_sched_state.gpnum); 361 show_one_rcugp(m, &rcu_bh_state);
227 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
228 rcu_bh_state.completed, rcu_bh_state.gpnum);
229 return 0; 362 return 0;
230} 363}
231 364
@@ -298,6 +431,29 @@ static const struct file_operations rcu_pending_fops = {
298 .release = single_release, 431 .release = single_release,
299}; 432};
300 433
434static int show_rcutorture(struct seq_file *m, void *unused)
435{
436 seq_printf(m, "rcutorture test sequence: %lu %s\n",
437 rcutorture_testseq >> 1,
438 (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
439 seq_printf(m, "rcutorture update version number: %lu\n",
440 rcutorture_vernum);
441 return 0;
442}
443
444static int rcutorture_open(struct inode *inode, struct file *file)
445{
446 return single_open(file, show_rcutorture, NULL);
447}
448
449static const struct file_operations rcutorture_fops = {
450 .owner = THIS_MODULE,
451 .open = rcutorture_open,
452 .read = seq_read,
453 .llseek = seq_lseek,
454 .release = single_release,
455};
456
301static struct dentry *rcudir; 457static struct dentry *rcudir;
302 458
303static int __init rcutree_trace_init(void) 459static int __init rcutree_trace_init(void)
@@ -318,6 +474,9 @@ static int __init rcutree_trace_init(void)
318 if (!retval) 474 if (!retval)
319 goto free_out; 475 goto free_out;
320 476
477 if (rcu_boost_trace_create_file(rcudir))
478 goto free_out;
479
321 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 480 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
322 if (!retval) 481 if (!retval)
323 goto free_out; 482 goto free_out;
@@ -331,6 +490,11 @@ static int __init rcutree_trace_init(void)
331 NULL, &rcu_pending_fops); 490 NULL, &rcu_pending_fops);
332 if (!retval) 491 if (!retval)
333 goto free_out; 492 goto free_out;
493
494 retval = debugfs_create_file("rcutorture", 0444, rcudir,
495 NULL, &rcutorture_fops);
496 if (!retval)
497 goto free_out;
334 return 0; 498 return 0;
335free_out: 499free_out:
336 debugfs_remove_recursive(rcudir); 500 debugfs_remove_recursive(rcudir);
diff --git a/kernel/resource.c b/kernel/resource.c
index 798e2fae2a06..3b3cedc52592 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,6 +38,14 @@ struct resource iomem_resource = {
38}; 38};
39EXPORT_SYMBOL(iomem_resource); 39EXPORT_SYMBOL(iomem_resource);
40 40
41/* constraints to be met while allocating resources */
42struct resource_constraint {
43 resource_size_t min, max, align;
44 resource_size_t (*alignf)(void *, const struct resource *,
45 resource_size_t, resource_size_t);
46 void *alignf_data;
47};
48
41static DEFINE_RWLOCK(resource_lock); 49static DEFINE_RWLOCK(resource_lock);
42 50
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 51static void *r_next(struct seq_file *m, void *v, loff_t *pos)
@@ -384,16 +392,13 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
384} 392}
385 393
386/* 394/*
387 * Find empty slot in the resource tree given range and alignment. 395 * Find empty slot in the resource tree with the given range and
396 * alignment constraints
388 */ 397 */
389static int find_resource(struct resource *root, struct resource *new, 398static int __find_resource(struct resource *root, struct resource *old,
390 resource_size_t size, resource_size_t min, 399 struct resource *new,
391 resource_size_t max, resource_size_t align, 400 resource_size_t size,
392 resource_size_t (*alignf)(void *, 401 struct resource_constraint *constraint)
393 const struct resource *,
394 resource_size_t,
395 resource_size_t),
396 void *alignf_data)
397{ 402{
398 struct resource *this = root->child; 403 struct resource *this = root->child;
399 struct resource tmp = *new, avail, alloc; 404 struct resource tmp = *new, avail, alloc;
@@ -404,25 +409,26 @@ static int find_resource(struct resource *root, struct resource *new,
404 * Skip past an allocated resource that starts at 0, since the assignment 409 * Skip past an allocated resource that starts at 0, since the assignment
405 * of this->start - 1 to tmp->end below would cause an underflow. 410 * of this->start - 1 to tmp->end below would cause an underflow.
406 */ 411 */
407 if (this && this->start == 0) { 412 if (this && this->start == root->start) {
408 tmp.start = this->end + 1; 413 tmp.start = (this == old) ? old->start : this->end + 1;
409 this = this->sibling; 414 this = this->sibling;
410 } 415 }
411 for(;;) { 416 for(;;) {
412 if (this) 417 if (this)
413 tmp.end = this->start - 1; 418 tmp.end = (this == old) ? this->end : this->start - 1;
414 else 419 else
415 tmp.end = root->end; 420 tmp.end = root->end;
416 421
417 resource_clip(&tmp, min, max); 422 resource_clip(&tmp, constraint->min, constraint->max);
418 arch_remove_reservations(&tmp); 423 arch_remove_reservations(&tmp);
419 424
420 /* Check for overflow after ALIGN() */ 425 /* Check for overflow after ALIGN() */
421 avail = *new; 426 avail = *new;
422 avail.start = ALIGN(tmp.start, align); 427 avail.start = ALIGN(tmp.start, constraint->align);
423 avail.end = tmp.end; 428 avail.end = tmp.end;
424 if (avail.start >= tmp.start) { 429 if (avail.start >= tmp.start) {
425 alloc.start = alignf(alignf_data, &avail, size, align); 430 alloc.start = constraint->alignf(constraint->alignf_data, &avail,
431 size, constraint->align);
426 alloc.end = alloc.start + size - 1; 432 alloc.end = alloc.start + size - 1;
427 if (resource_contains(&avail, &alloc)) { 433 if (resource_contains(&avail, &alloc)) {
428 new->start = alloc.start; 434 new->start = alloc.start;
@@ -432,14 +438,75 @@ static int find_resource(struct resource *root, struct resource *new,
432 } 438 }
433 if (!this) 439 if (!this)
434 break; 440 break;
435 tmp.start = this->end + 1; 441 if (this != old)
442 tmp.start = this->end + 1;
436 this = this->sibling; 443 this = this->sibling;
437 } 444 }
438 return -EBUSY; 445 return -EBUSY;
439} 446}
440 447
448/*
449 * Find empty slot in the resource tree given range and alignment.
450 */
451static int find_resource(struct resource *root, struct resource *new,
452 resource_size_t size,
453 struct resource_constraint *constraint)
454{
455 return __find_resource(root, NULL, new, size, constraint);
456}
457
441/** 458/**
442 * allocate_resource - allocate empty slot in the resource tree given range & alignment 459 * reallocate_resource - allocate a slot in the resource tree given range & alignment.
460 * The resource will be relocated if the new size cannot be reallocated in the
461 * current location.
462 *
463 * @root: root resource descriptor
464 * @old: resource descriptor desired by caller
465 * @newsize: new size of the resource descriptor
466 * @constraint: the size and alignment constraints to be met.
467 */
468int reallocate_resource(struct resource *root, struct resource *old,
469 resource_size_t newsize,
470 struct resource_constraint *constraint)
471{
472 int err=0;
473 struct resource new = *old;
474 struct resource *conflict;
475
476 write_lock(&resource_lock);
477
478 if ((err = __find_resource(root, old, &new, newsize, constraint)))
479 goto out;
480
481 if (resource_contains(&new, old)) {
482 old->start = new.start;
483 old->end = new.end;
484 goto out;
485 }
486
487 if (old->child) {
488 err = -EBUSY;
489 goto out;
490 }
491
492 if (resource_contains(old, &new)) {
493 old->start = new.start;
494 old->end = new.end;
495 } else {
496 __release_resource(old);
497 *old = new;
498 conflict = __request_resource(root, old);
499 BUG_ON(conflict);
500 }
501out:
502 write_unlock(&resource_lock);
503 return err;
504}
505
506
507/**
508 * allocate_resource - allocate empty slot in the resource tree given range & alignment.
509 * The resource will be reallocated with a new size if it was already allocated
443 * @root: root resource descriptor 510 * @root: root resource descriptor
444 * @new: resource descriptor desired by caller 511 * @new: resource descriptor desired by caller
445 * @size: requested resource region size 512 * @size: requested resource region size
@@ -459,12 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new,
459 void *alignf_data) 526 void *alignf_data)
460{ 527{
461 int err; 528 int err;
529 struct resource_constraint constraint;
462 530
463 if (!alignf) 531 if (!alignf)
464 alignf = simple_align_resource; 532 alignf = simple_align_resource;
465 533
534 constraint.min = min;
535 constraint.max = max;
536 constraint.align = align;
537 constraint.alignf = alignf;
538 constraint.alignf_data = alignf_data;
539
540 if ( new->parent ) {
541 /* resource is already allocated, try reallocating with
542 the new constraints */
543 return reallocate_resource(root, new, size, &constraint);
544 }
545
466 write_lock(&resource_lock); 546 write_lock(&resource_lock);
467 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 547 err = find_resource(root, new, size, &constraint);
468 if (err >= 0 && __request_resource(root, new)) 548 if (err >= 0 && __request_resource(root, new))
469 err = -EBUSY; 549 err = -EBUSY;
470 write_unlock(&resource_lock); 550 write_unlock(&resource_lock);
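/*
 * [Editor's sketch: illustrative use, not part of the patch; dev_res,
 * new_size, min, max and align are hypothetical caller variables.]  With
 * this change a caller can grow a region it already owns by calling
 * allocate_resource() again on the same descriptor; the reallocate path
 * keeps the region in place when the larger size still fits there and
 * relocates it otherwise, leaving dev_res untouched on failure:
 *
 *	err = allocate_resource(&iomem_resource, &dev_res, new_size,
 *				min, max, align, NULL, NULL);
 *	if (err)
 *		return err;
 */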
@@ -473,6 +553,27 @@ int allocate_resource(struct resource *root, struct resource *new,
473 553
474EXPORT_SYMBOL(allocate_resource); 554EXPORT_SYMBOL(allocate_resource);
475 555
556/**
557 * lookup_resource - find an existing resource by a resource start address
558 * @root: root resource descriptor
559 * @start: resource start address
560 *
561 * Returns a pointer to the resource if found, NULL otherwise
562 */
563struct resource *lookup_resource(struct resource *root, resource_size_t start)
564{
565 struct resource *res;
566
567 read_lock(&resource_lock);
568 for (res = root->child; res; res = res->sibling) {
569 if (res->start == start)
570 break;
571 }
572 read_unlock(&resource_lock);
573
574 return res;
575}
576
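/*
 * [Editor's sketch: illustrative use, not part of the patch; the address is
 * hypothetical.]  lookup_resource() matches only a direct child of @root,
 * and only by its exact start address:
 *
 *	struct resource *res;
 *
 *	res = lookup_resource(&iomem_resource, 0xfed40000);
 *	if (res)
 *		pr_info("found %pR\n", res);
 */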
476/* 577/*
477 * Insert a resource into the resource tree. If successful, return NULL, 578 * Insert a resource into the resource tree. If successful, return NULL,
478 * otherwise return the conflicting resource (compare to __request_resource()) 579 * otherwise return the conflicting resource (compare to __request_resource())
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index ab449117aaf2..255e1662acdb 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
890{ 890{
891 lock->owner = NULL; 891 lock->owner = NULL;
892 raw_spin_lock_init(&lock->wait_lock); 892 raw_spin_lock_init(&lock->wait_lock);
893 plist_head_init_raw(&lock->wait_list, &lock->wait_lock); 893 plist_head_init(&lock->wait_list);
894 894
895 debug_rt_mutex_init(lock, name); 895 debug_rt_mutex_init(lock, name);
896} 896}
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index cae050b05f5e..9f48f3d82e9b 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -11,7 +11,7 @@
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h> 13#include <asm/system.h>
14#include <asm/atomic.h> 14#include <linux/atomic.h>
15 15
16/* 16/*
17 * lock for reading 17 * lock for reading
@@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
117 117
118EXPORT_SYMBOL(down_read_nested); 118EXPORT_SYMBOL(down_read_nested);
119 119
120void down_read_non_owner(struct rw_semaphore *sem)
121{
122 might_sleep();
123
124 __down_read(sem);
125}
126
127EXPORT_SYMBOL(down_read_non_owner);
128
129void down_write_nested(struct rw_semaphore *sem, int subclass) 120void down_write_nested(struct rw_semaphore *sem, int subclass)
130{ 121{
131 might_sleep(); 122 might_sleep();
@@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
136 127
137EXPORT_SYMBOL(down_write_nested); 128EXPORT_SYMBOL(down_write_nested);
138 129
139void up_read_non_owner(struct rw_semaphore *sem)
140{
141 __up_read(sem);
142}
143
144EXPORT_SYMBOL(up_read_non_owner);
145
146#endif 130#endif
147 131
148 132
diff --git a/kernel/sched.c b/kernel/sched.c
index 312f8b95c2d4..ccacdbdecf45 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
75#include <asm/tlb.h> 75#include <asm/tlb.h>
76#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
77#include <asm/mutex.h> 77#include <asm/mutex.h>
78#ifdef CONFIG_PARAVIRT
79#include <asm/paravirt.h>
80#endif
78 81
79#include "sched_cpupri.h" 82#include "sched_cpupri.h"
80#include "workqueue_sched.h" 83#include "workqueue_sched.h"
@@ -124,7 +127,7 @@
124 127
125static inline int rt_policy(int policy) 128static inline int rt_policy(int policy)
126{ 129{
127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 130 if (policy == SCHED_FIFO || policy == SCHED_RR)
128 return 1; 131 return 1;
129 return 0; 132 return 0;
130} 133}
@@ -231,7 +234,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
231#endif 234#endif
232 235
233/* 236/*
234 * sched_domains_mutex serializes calls to arch_init_sched_domains, 237 * sched_domains_mutex serializes calls to init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 238 * detach_destroy_domains and partition_sched_domains.
236 */ 239 */
237static DEFINE_MUTEX(sched_domains_mutex); 240static DEFINE_MUTEX(sched_domains_mutex);
@@ -292,7 +295,7 @@ static DEFINE_SPINLOCK(task_group_lock);
292 * (The default weight is 1024 - so there's no practical 295 * (The default weight is 1024 - so there's no practical
293 * limitation from this.) 296 * limitation from this.)
294 */ 297 */
295#define MIN_SHARES 2 298#define MIN_SHARES (1UL << 1)
296#define MAX_SHARES (1UL << 18) 299#define MAX_SHARES (1UL << 18)
297 300
298static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 301static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
@@ -312,6 +315,9 @@ struct cfs_rq {
312 315
313 u64 exec_clock; 316 u64 exec_clock;
314 u64 min_vruntime; 317 u64 min_vruntime;
318#ifndef CONFIG_64BIT
319 u64 min_vruntime_copy;
320#endif
315 321
316 struct rb_root tasks_timeline; 322 struct rb_root tasks_timeline;
317 struct rb_node *rb_leftmost; 323 struct rb_node *rb_leftmost;
@@ -325,7 +331,9 @@ struct cfs_rq {
325 */ 331 */
326 struct sched_entity *curr, *next, *last, *skip; 332 struct sched_entity *curr, *next, *last, *skip;
327 333
334#ifdef CONFIG_SCHED_DEBUG
328 unsigned int nr_spread_over; 335 unsigned int nr_spread_over;
336#endif
329 337
330#ifdef CONFIG_FAIR_GROUP_SCHED 338#ifdef CONFIG_FAIR_GROUP_SCHED
331 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 339 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -417,6 +425,8 @@ struct rt_rq {
417 */ 425 */
418struct root_domain { 426struct root_domain {
419 atomic_t refcount; 427 atomic_t refcount;
428 atomic_t rto_count;
429 struct rcu_head rcu;
420 cpumask_var_t span; 430 cpumask_var_t span;
421 cpumask_var_t online; 431 cpumask_var_t online;
422 432
@@ -425,7 +435,6 @@ struct root_domain {
425 * one runnable RT task. 435 * one runnable RT task.
426 */ 436 */
427 cpumask_var_t rto_mask; 437 cpumask_var_t rto_mask;
428 atomic_t rto_count;
429 struct cpupri cpupri; 438 struct cpupri cpupri;
430}; 439};
431 440
@@ -460,7 +469,7 @@ struct rq {
460 u64 nohz_stamp; 469 u64 nohz_stamp;
461 unsigned char nohz_balance_kick; 470 unsigned char nohz_balance_kick;
462#endif 471#endif
463 unsigned int skip_clock_update; 472 int skip_clock_update;
464 473
465 /* capture load from *all* tasks on this cpu: */ 474 /* capture load from *all* tasks on this cpu: */
466 struct load_weight load; 475 struct load_weight load;
@@ -522,6 +531,12 @@ struct rq {
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING 531#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time; 532 u64 prev_irq_time;
524#endif 533#endif
534#ifdef CONFIG_PARAVIRT
535 u64 prev_steal_time;
536#endif
537#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
538 u64 prev_steal_time_rq;
539#endif
525 540
526 /* calc_load related fields */ 541 /* calc_load related fields */
527 unsigned long calc_load_update; 542 unsigned long calc_load_update;
@@ -553,6 +568,10 @@ struct rq {
553 unsigned int ttwu_count; 568 unsigned int ttwu_count;
554 unsigned int ttwu_local; 569 unsigned int ttwu_local;
555#endif 570#endif
571
572#ifdef CONFIG_SMP
573 struct task_struct *wake_list;
574#endif
556}; 575};
557 576
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 577static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -571,7 +590,6 @@ static inline int cpu_of(struct rq *rq)
571 590
572#define rcu_dereference_check_sched_domain(p) \ 591#define rcu_dereference_check_sched_domain(p) \
573 rcu_dereference_check((p), \ 592 rcu_dereference_check((p), \
574 rcu_read_lock_sched_held() || \
575 lockdep_is_held(&sched_domains_mutex)) 593 lockdep_is_held(&sched_domains_mutex))
576 594
577/* 595/*
@@ -595,10 +613,10 @@ static inline int cpu_of(struct rq *rq)
595/* 613/*
596 * Return the group to which this tasks belongs. 614 * Return the group to which this tasks belongs.
597 * 615 *
598 * We use task_subsys_state_check() and extend the RCU verification 616 * We use task_subsys_state_check() and extend the RCU verification with
599 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 617 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
600 * holds that lock for each task it moves into the cgroup. Therefore 618 * task it moves into the cgroup. Therefore by holding either of those locks,
601 * by holding that lock, we pin the task to the current cgroup. 619 * we pin the task to the current cgroup.
602 */ 620 */
603static inline struct task_group *task_group(struct task_struct *p) 621static inline struct task_group *task_group(struct task_struct *p)
604{ 622{
@@ -606,6 +624,7 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct cgroup_subsys_state *css; 624 struct cgroup_subsys_state *css;
607 625
608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 626 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
627 lockdep_is_held(&p->pi_lock) ||
609 lockdep_is_held(&task_rq(p)->lock)); 628 lockdep_is_held(&task_rq(p)->lock));
610 tg = container_of(css, struct task_group, css); 629 tg = container_of(css, struct task_group, css);
611 630
@@ -642,7 +661,7 @@ static void update_rq_clock(struct rq *rq)
642{ 661{
643 s64 delta; 662 s64 delta;
644 663
645 if (rq->skip_clock_update) 664 if (rq->skip_clock_update > 0)
646 return; 665 return;
647 666
648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 667 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -838,18 +857,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
838 return rq->curr == p; 857 return rq->curr == p;
839} 858}
840 859
841#ifndef __ARCH_WANT_UNLOCKED_CTXSW
842static inline int task_running(struct rq *rq, struct task_struct *p) 860static inline int task_running(struct rq *rq, struct task_struct *p)
843{ 861{
862#ifdef CONFIG_SMP
863 return p->on_cpu;
864#else
844 return task_current(rq, p); 865 return task_current(rq, p);
866#endif
845} 867}
846 868
869#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 870static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
848{ 871{
872#ifdef CONFIG_SMP
873 /*
874 * We can optimise this out completely for !SMP, because the
875 * SMP rebalancing from interrupt is the only thing that cares
876 * here.
877 */
878 next->on_cpu = 1;
879#endif
849} 880}
850 881
851static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 882static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
852{ 883{
884#ifdef CONFIG_SMP
885 /*
886 * After ->on_cpu is cleared, the task can be moved to a different CPU.
887 * We must ensure this doesn't happen until the switch is completely
888 * finished.
889 */
890 smp_wmb();
891 prev->on_cpu = 0;
892#endif
853#ifdef CONFIG_DEBUG_SPINLOCK 893#ifdef CONFIG_DEBUG_SPINLOCK
854 /* this is a valid case when another task releases the spinlock */ 894 /* this is a valid case when another task releases the spinlock */
855 rq->lock.owner = current; 895 rq->lock.owner = current;
@@ -865,15 +905,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
865} 905}
866 906
867#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 907#else /* __ARCH_WANT_UNLOCKED_CTXSW */
868static inline int task_running(struct rq *rq, struct task_struct *p)
869{
870#ifdef CONFIG_SMP
871 return p->oncpu;
872#else
873 return task_current(rq, p);
874#endif
875}
876
877static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 908static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
878{ 909{
879#ifdef CONFIG_SMP 910#ifdef CONFIG_SMP
@@ -882,7 +913,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
882 * SMP rebalancing from interrupt is the only thing that cares 913 * SMP rebalancing from interrupt is the only thing that cares
883 * here. 914 * here.
884 */ 915 */
885 next->oncpu = 1; 916 next->on_cpu = 1;
886#endif 917#endif
887#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 918#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
888 raw_spin_unlock_irq(&rq->lock); 919 raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +926,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
895{ 926{
896#ifdef CONFIG_SMP 927#ifdef CONFIG_SMP
897 /* 928 /*
898 * After ->oncpu is cleared, the task can be moved to a different CPU. 929 * After ->on_cpu is cleared, the task can be moved to a different CPU.
899 * We must ensure this doesn't happen until the switch is completely 930 * We must ensure this doesn't happen until the switch is completely
900 * finished. 931 * finished.
901 */ 932 */
902 smp_wmb(); 933 smp_wmb();
903 prev->oncpu = 0; 934 prev->on_cpu = 0;
904#endif 935#endif
905#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 936#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
906 local_irq_enable(); 937 local_irq_enable();
@@ -909,23 +940,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
909#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 940#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
910 941
911/* 942/*
912 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 943 * __task_rq_lock - lock the rq @p resides on.
913 * against ttwu().
914 */
915static inline int task_is_waking(struct task_struct *p)
916{
917 return unlikely(p->state == TASK_WAKING);
918}
919
920/*
921 * __task_rq_lock - lock the runqueue a given task resides on.
922 * Must be called interrupts disabled.
923 */ 944 */
924static inline struct rq *__task_rq_lock(struct task_struct *p) 945static inline struct rq *__task_rq_lock(struct task_struct *p)
925 __acquires(rq->lock) 946 __acquires(rq->lock)
926{ 947{
927 struct rq *rq; 948 struct rq *rq;
928 949
950 lockdep_assert_held(&p->pi_lock);
951
929 for (;;) { 952 for (;;) {
930 rq = task_rq(p); 953 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 954 raw_spin_lock(&rq->lock);
@@ -936,22 +959,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936} 959}
937 960
938/* 961/*
939 * task_rq_lock - lock the runqueue a given task resides on and disable 962 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
940 * interrupts. Note the ordering: we can safely lookup the task_rq without
941 * explicitly disabling preemption.
942 */ 963 */
943static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 964static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
965 __acquires(p->pi_lock)
944 __acquires(rq->lock) 966 __acquires(rq->lock)
945{ 967{
946 struct rq *rq; 968 struct rq *rq;
947 969
948 for (;;) { 970 for (;;) {
949 local_irq_save(*flags); 971 raw_spin_lock_irqsave(&p->pi_lock, *flags);
950 rq = task_rq(p); 972 rq = task_rq(p);
951 raw_spin_lock(&rq->lock); 973 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 974 if (likely(rq == task_rq(p)))
953 return rq; 975 return rq;
954 raw_spin_unlock_irqrestore(&rq->lock, *flags); 976 raw_spin_unlock(&rq->lock);
977 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
955 } 978 }
956} 979}
957 980
@@ -961,10 +984,13 @@ static void __task_rq_unlock(struct rq *rq)
961 raw_spin_unlock(&rq->lock); 984 raw_spin_unlock(&rq->lock);
962} 985}
963 986
964static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 987static inline void
988task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
965 __releases(rq->lock) 989 __releases(rq->lock)
990 __releases(p->pi_lock)
966{ 991{
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 992 raw_spin_unlock(&rq->lock);
993 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
968} 994}
969 995
970/* 996/*
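/*
 * [Editor's sketch: illustrative use, not part of the patch.]  With this
 * change the unlock side also names the task, and p->pi_lock is taken
 * first and released last, which is what pins p to its runqueue for the
 * duration of the critical section:
 *
 *	unsigned long flags;
 *	struct rq *rq;
 *
 *	rq = task_rq_lock(p, &flags);	takes p->pi_lock, then rq->lock
 *	...operate on p's runqueue...
 *	task_rq_unlock(rq, p, &flags);	drops rq->lock, then p->pi_lock
 */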
@@ -1193,11 +1219,17 @@ int get_nohz_timer_target(void)
1193 int i; 1219 int i;
1194 struct sched_domain *sd; 1220 struct sched_domain *sd;
1195 1221
1222 rcu_read_lock();
1196 for_each_domain(cpu, sd) { 1223 for_each_domain(cpu, sd) {
1197 for_each_cpu(i, sched_domain_span(sd)) 1224 for_each_cpu(i, sched_domain_span(sd)) {
1198 if (!idle_cpu(i)) 1225 if (!idle_cpu(i)) {
1199 return i; 1226 cpu = i;
1227 goto unlock;
1228 }
1229 }
1200 } 1230 }
1231unlock:
1232 rcu_read_unlock();
1201 return cpu; 1233 return cpu;
1202} 1234}
1203/* 1235/*
@@ -1307,15 +1339,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1307{ 1339{
1308 u64 tmp; 1340 u64 tmp;
1309 1341
1342 /*
1343 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1344 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1345 * 2^SCHED_LOAD_RESOLUTION.
1346 */
1347 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1348 tmp = (u64)delta_exec * scale_load_down(weight);
1349 else
1350 tmp = (u64)delta_exec;
1351
1310 if (!lw->inv_weight) { 1352 if (!lw->inv_weight) {
1311 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1353 unsigned long w = scale_load_down(lw->weight);
1354
1355 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1312 lw->inv_weight = 1; 1356 lw->inv_weight = 1;
1357 else if (unlikely(!w))
1358 lw->inv_weight = WMULT_CONST;
1313 else 1359 else
1314 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1360 lw->inv_weight = WMULT_CONST / w;
1315 / (lw->weight+1);
1316 } 1361 }
1317 1362
1318 tmp = (u64)delta_exec * weight;
1319 /* 1363 /*
1320 * Check whether we'd overflow the 64-bit multiplication: 1364 * Check whether we'd overflow the 64-bit multiplication:
1321 */ 1365 */
@@ -1532,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1532 return rq->avg_load_per_task; 1576 return rq->avg_load_per_task;
1533} 1577}
1534 1578
1535#ifdef CONFIG_FAIR_GROUP_SCHED
1536
1537/*
1538 * Compute the cpu's hierarchical load factor for each task group.
1539 * This needs to be done in a top-down fashion because the load of a child
1540 * group is a fraction of its parents load.
1541 */
1542static int tg_load_down(struct task_group *tg, void *data)
1543{
1544 unsigned long load;
1545 long cpu = (long)data;
1546
1547 if (!tg->parent) {
1548 load = cpu_rq(cpu)->load.weight;
1549 } else {
1550 load = tg->parent->cfs_rq[cpu]->h_load;
1551 load *= tg->se[cpu]->load.weight;
1552 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1553 }
1554
1555 tg->cfs_rq[cpu]->h_load = load;
1556
1557 return 0;
1558}
1559
1560static void update_h_load(long cpu)
1561{
1562 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1563}
1564
1565#endif
1566
1567#ifdef CONFIG_PREEMPT 1579#ifdef CONFIG_PREEMPT
1568 1580
1569static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1581static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1755,17 +1767,20 @@ static void dec_nr_running(struct rq *rq)
1755 1767
1756static void set_load_weight(struct task_struct *p) 1768static void set_load_weight(struct task_struct *p)
1757{ 1769{
1770 int prio = p->static_prio - MAX_RT_PRIO;
1771 struct load_weight *load = &p->se.load;
1772
1758 /* 1773 /*
1759 * SCHED_IDLE tasks get minimal weight: 1774 * SCHED_IDLE tasks get minimal weight:
1760 */ 1775 */
1761 if (p->policy == SCHED_IDLE) { 1776 if (p->policy == SCHED_IDLE) {
1762 p->se.load.weight = WEIGHT_IDLEPRIO; 1777 load->weight = scale_load(WEIGHT_IDLEPRIO);
1763 p->se.load.inv_weight = WMULT_IDLEPRIO; 1778 load->inv_weight = WMULT_IDLEPRIO;
1764 return; 1779 return;
1765 } 1780 }
1766 1781
1767 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1782 load->weight = scale_load(prio_to_weight[prio]);
1768 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1783 load->inv_weight = prio_to_wmult[prio];
1769} 1784}
1770 1785
1771static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1786static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
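
set_load_weight() now indexes the weight tables once with prio = static_prio - MAX_RT_PRIO and passes the table value through scale_load(), which shifts the familiar nice-0 weight of 1024 up by SCHED_LOAD_RESOLUTION bits for extra fixed-point precision on 64-bit builds. A toy sketch of that scaling, approximating the table with the usual "each nice level is about 1.25x" rule; the resolution value and helper names are assumptions, not the exact kernel constants:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define SCHED_LOAD_RESOLUTION 10                       /* assumed extra bits */
#define scale_load(w)      ((uint64_t)(w) << SCHED_LOAD_RESOLUTION)
#define scale_load_down(w) ((uint64_t)(w) >> SCHED_LOAD_RESOLUTION)

/* Approximate nice->weight: 1024 at nice 0, ~1.25x per step towards -20. */
static uint64_t nice_to_weight(int nice)
{
    double w = 1024.0;

    while (nice < 0) { w *= 1.25; nice++; }
    while (nice > 0) { w /= 1.25; nice--; }
    return (uint64_t)(w + 0.5);
}

int main(void)
{
    for (int nice = -2; nice <= 2; nice++) {
        uint64_t w = nice_to_weight(nice);

        printf("nice %+d: weight %4" PRIu64 "  scaled %8" PRIu64 "\n",
               nice, w, scale_load(w));
    }
    return 0;
}
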
@@ -1773,7 +1788,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1773 update_rq_clock(rq); 1788 update_rq_clock(rq);
1774 sched_info_queued(p); 1789 sched_info_queued(p);
1775 p->sched_class->enqueue_task(rq, p, flags); 1790 p->sched_class->enqueue_task(rq, p, flags);
1776 p->se.on_rq = 1;
1777} 1791}
1778 1792
1779static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1793static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1795,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1781 update_rq_clock(rq); 1795 update_rq_clock(rq);
1782 sched_info_dequeued(p); 1796 sched_info_dequeued(p);
1783 p->sched_class->dequeue_task(rq, p, flags); 1797 p->sched_class->dequeue_task(rq, p, flags);
1784 p->se.on_rq = 0;
1785} 1798}
1786 1799
1787/* 1800/*
@@ -1916,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr)
1916} 1929}
1917EXPORT_SYMBOL_GPL(account_system_vtime); 1930EXPORT_SYMBOL_GPL(account_system_vtime);
1918 1931
1919static void update_rq_clock_task(struct rq *rq, s64 delta) 1932#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1933
1934#ifdef CONFIG_PARAVIRT
1935static inline u64 steal_ticks(u64 steal)
1920{ 1936{
1921 s64 irq_delta; 1937 if (unlikely(steal > NSEC_PER_SEC))
1938 return div_u64(steal, TICK_NSEC);
1939
1940 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
1941}
1942#endif
1922 1943
1944static void update_rq_clock_task(struct rq *rq, s64 delta)
1945{
1946/*
1947 * In theory, the compile should just see 0 here, and optimize out the call
1948 * to sched_rt_avg_update. But I don't trust it...
1949 */
1950#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1951 s64 steal = 0, irq_delta = 0;
1952#endif
1953#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1923 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 1954 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1924 1955
1925 /* 1956 /*
@@ -1942,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1942 1973
1943 rq->prev_irq_time += irq_delta; 1974 rq->prev_irq_time += irq_delta;
1944 delta -= irq_delta; 1975 delta -= irq_delta;
1976#endif
1977#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1978 if (static_branch((&paravirt_steal_rq_enabled))) {
1979 u64 st;
1980
1981 steal = paravirt_steal_clock(cpu_of(rq));
1982 steal -= rq->prev_steal_time_rq;
1983
1984 if (unlikely(steal > delta))
1985 steal = delta;
1986
1987 st = steal_ticks(steal);
1988 steal = st * TICK_NSEC;
1989
1990 rq->prev_steal_time_rq += steal;
1991
1992 delta -= steal;
1993 }
1994#endif
1995
1945 rq->clock_task += delta; 1996 rq->clock_task += delta;
1946 1997
1947 if (irq_delta && sched_feat(NONIRQ_POWER)) 1998#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1948 sched_rt_avg_update(rq, irq_delta); 1999 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
2000 sched_rt_avg_update(rq, irq_delta + steal);
2001#endif
1949} 2002}
1950 2003
2004#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1951static int irqtime_account_hi_update(void) 2005static int irqtime_account_hi_update(void)
1952{ 2006{
1953 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2007 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
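
steal_ticks() in the hunk above converts an accumulated steal-time delta in nanoseconds into whole ticks: for the common small case it relies on __iter_div_u64_rem(), which divides by repeated subtraction (cheap when the quotient is tiny), and only falls back to a real 64-bit division once more than a second has piled up. A userspace sketch of that split, with a simplified reimplementation of the iterative divide and a TICK_NSEC value that assumes HZ=1000:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define NSEC_PER_SEC 1000000000ULL
#define TICK_NSEC    1000000ULL          /* assumes HZ = 1000 */

/* Repeated-subtraction division: fine when the quotient is small. */
static uint64_t iter_div_u64_rem(uint64_t dividend, uint64_t divisor, uint64_t *rem)
{
    uint64_t q = 0;

    while (dividend >= divisor) {
        dividend -= divisor;
        q++;
    }
    *rem = dividend;
    return q;
}

static uint64_t steal_ticks(uint64_t steal)
{
    uint64_t rem;

    if (steal > NSEC_PER_SEC)            /* big backlog: one real division */
        return steal / TICK_NSEC;

    return iter_div_u64_rem(steal, TICK_NSEC, &rem);
}

int main(void)
{
    printf("%" PRIu64 " ticks\n", steal_ticks(3500000ULL));   /* prints 3 */
    return 0;
}
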
@@ -1982,12 +2036,7 @@ static int irqtime_account_si_update(void)
1982 2036
1983#define sched_clock_irqtime (0) 2037#define sched_clock_irqtime (0)
1984 2038
1985static void update_rq_clock_task(struct rq *rq, s64 delta) 2039#endif
1986{
1987 rq->clock_task += delta;
1988}
1989
1990#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1991 2040
1992#include "sched_idletask.c" 2041#include "sched_idletask.c"
1993#include "sched_fair.c" 2042#include "sched_fair.c"
@@ -2116,7 +2165,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2116 * A queue event has occurred, and we're going to schedule. In 2165 * A queue event has occurred, and we're going to schedule. In
2117 * this case, we can save a useless back to back clock update. 2166 * this case, we can save a useless back to back clock update.
2118 */ 2167 */
2119 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2168 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2120 rq->skip_clock_update = 1; 2169 rq->skip_clock_update = 1;
2121} 2170}
2122 2171
@@ -2162,13 +2211,28 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2162 */ 2211 */
2163 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2212 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2164 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2213 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2214
2215#ifdef CONFIG_LOCKDEP
2216 /*
2217 * The caller should hold either p->pi_lock or rq->lock, when changing
2218 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2219 *
2220 * sched_move_task() holds both and thus holding either pins the cgroup,
2221 * see set_task_rq().
2222 *
2223 * Furthermore, all task_rq users should acquire both locks, see
2224 * task_rq_lock().
2225 */
2226 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2227 lockdep_is_held(&task_rq(p)->lock)));
2228#endif
2165#endif 2229#endif
2166 2230
2167 trace_sched_migrate_task(p, new_cpu); 2231 trace_sched_migrate_task(p, new_cpu);
2168 2232
2169 if (task_cpu(p) != new_cpu) { 2233 if (task_cpu(p) != new_cpu) {
2170 p->se.nr_migrations++; 2234 p->se.nr_migrations++;
2171 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2235 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
2172 } 2236 }
2173 2237
2174 __set_task_cpu(p, new_cpu); 2238 __set_task_cpu(p, new_cpu);
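
The lockdep check added to set_task_cpu() encodes the new locking rule directly in code: whoever changes a task's CPU must hold either p->pi_lock (wakeup path) or the runqueue lock (runnable path). Outside the kernel the same "assert the caller holds one of these locks" idea can be sketched with owner-tracking mutexes; everything below is illustrative scaffolding, not the lockdep API:

#include <pthread.h>
#include <stdbool.h>
#include <assert.h>
#include <stdio.h>

struct dbg_mutex {
    pthread_mutex_t m;
    pthread_t owner;
    bool held;
};

static void dbg_lock(struct dbg_mutex *l)
{
    pthread_mutex_lock(&l->m);
    l->owner = pthread_self();
    l->held = true;
}

static void dbg_unlock(struct dbg_mutex *l)
{
    l->held = false;
    pthread_mutex_unlock(&l->m);
}

static bool dbg_is_held(struct dbg_mutex *l)
{
    return l->held && pthread_equal(l->owner, pthread_self());
}

struct task {
    struct dbg_mutex pi_lock;
    struct dbg_mutex rq_lock;   /* stands in for task_rq(p)->lock */
    int cpu;
};

/* Mirrors the WARN_ON_ONCE(): require pi_lock or rq_lock before migrating. */
static void move_task_to_cpu(struct task *p, int new_cpu)
{
    assert(dbg_is_held(&p->pi_lock) || dbg_is_held(&p->rq_lock));
    p->cpu = new_cpu;
}

int main(void)
{
    struct task t = { .cpu = 0 };

    pthread_mutex_init(&t.pi_lock.m, NULL);
    pthread_mutex_init(&t.rq_lock.m, NULL);

    dbg_lock(&t.pi_lock);
    move_task_to_cpu(&t, 1);        /* ok: pi_lock held */
    dbg_unlock(&t.pi_lock);
    printf("task now on cpu %d\n", t.cpu);
    return 0;
}
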
@@ -2182,19 +2246,6 @@ struct migration_arg {
2182static int migration_cpu_stop(void *data); 2246static int migration_cpu_stop(void *data);
2183 2247
2184/* 2248/*
2185 * The task's runqueue lock must be held.
2186 * Returns true if you have to wait for migration thread.
2187 */
2188static bool migrate_task(struct task_struct *p, struct rq *rq)
2189{
2190 /*
2191 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task.
2193 */
2194 return p->se.on_rq || task_running(rq, p);
2195}
2196
2197/*
2198 * wait_task_inactive - wait for a thread to unschedule. 2249 * wait_task_inactive - wait for a thread to unschedule.
2199 * 2250 *
2200 * If @match_state is nonzero, it's the @p->state value just checked and 2251 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2251,11 +2302,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2251 rq = task_rq_lock(p, &flags); 2302 rq = task_rq_lock(p, &flags);
2252 trace_sched_wait_task(p); 2303 trace_sched_wait_task(p);
2253 running = task_running(rq, p); 2304 running = task_running(rq, p);
2254 on_rq = p->se.on_rq; 2305 on_rq = p->on_rq;
2255 ncsw = 0; 2306 ncsw = 0;
2256 if (!match_state || p->state == match_state) 2307 if (!match_state || p->state == match_state)
2257 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2308 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2258 task_rq_unlock(rq, &flags); 2309 task_rq_unlock(rq, p, &flags);
2259 2310
2260 /* 2311 /*
2261 * If it changed from the expected state, bail out now. 2312 * If it changed from the expected state, bail out now.
@@ -2330,7 +2381,7 @@ EXPORT_SYMBOL_GPL(kick_process);
2330 2381
2331#ifdef CONFIG_SMP 2382#ifdef CONFIG_SMP
2332/* 2383/*
2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2384 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2334 */ 2385 */
2335static int select_fallback_rq(int cpu, struct task_struct *p) 2386static int select_fallback_rq(int cpu, struct task_struct *p)
2336{ 2387{
@@ -2363,12 +2414,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2363} 2414}
2364 2415
2365/* 2416/*
2366 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2417 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2367 */ 2418 */
2368static inline 2419static inline
2369int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2420int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2370{ 2421{
2371 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2422 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2372 2423
2373 /* 2424 /*
2374 * In order not to call set_task_cpu() on a blocking task we need 2425 * In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2445,63 @@ static void update_avg(u64 *avg, u64 sample)
2394} 2445}
2395#endif 2446#endif
2396 2447
2397static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2448static void
2398 bool is_sync, bool is_migrate, bool is_local, 2449ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2399 unsigned long en_flags)
2400{ 2450{
2401 schedstat_inc(p, se.statistics.nr_wakeups); 2451#ifdef CONFIG_SCHEDSTATS
2402 if (is_sync) 2452 struct rq *rq = this_rq();
2403 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2453
2404 if (is_migrate) 2454#ifdef CONFIG_SMP
2405 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2455 int this_cpu = smp_processor_id();
2406 if (is_local) 2456
2457 if (cpu == this_cpu) {
2458 schedstat_inc(rq, ttwu_local);
2407 schedstat_inc(p, se.statistics.nr_wakeups_local); 2459 schedstat_inc(p, se.statistics.nr_wakeups_local);
2408 else 2460 } else {
2461 struct sched_domain *sd;
2462
2409 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2463 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2464 rcu_read_lock();
2465 for_each_domain(this_cpu, sd) {
2466 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2467 schedstat_inc(sd, ttwu_wake_remote);
2468 break;
2469 }
2470 }
2471 rcu_read_unlock();
2472 }
2473
2474 if (wake_flags & WF_MIGRATED)
2475 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2410 2476
2477#endif /* CONFIG_SMP */
2478
2479 schedstat_inc(rq, ttwu_count);
2480 schedstat_inc(p, se.statistics.nr_wakeups);
2481
2482 if (wake_flags & WF_SYNC)
2483 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2484
2485#endif /* CONFIG_SCHEDSTATS */
2486}
2487
2488static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2489{
2411 activate_task(rq, p, en_flags); 2490 activate_task(rq, p, en_flags);
2491 p->on_rq = 1;
2492
2493 /* if a worker is waking up, notify workqueue */
2494 if (p->flags & PF_WQ_WORKER)
2495 wq_worker_waking_up(p, cpu_of(rq));
2412} 2496}
2413 2497
2414static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2498/*
2415 int wake_flags, bool success) 2499 * Mark the task runnable and perform wakeup-preemption.
2500 */
2501static void
2502ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2416{ 2503{
2417 trace_sched_wakeup(p, success); 2504 trace_sched_wakeup(p, true);
2418 check_preempt_curr(rq, p, wake_flags); 2505 check_preempt_curr(rq, p, wake_flags);
2419 2506
2420 p->state = TASK_RUNNING; 2507 p->state = TASK_RUNNING;
@@ -2422,7 +2509,7 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2422 if (p->sched_class->task_woken) 2509 if (p->sched_class->task_woken)
2423 p->sched_class->task_woken(rq, p); 2510 p->sched_class->task_woken(rq, p);
2424 2511
2425 if (unlikely(rq->idle_stamp)) { 2512 if (rq->idle_stamp) {
2426 u64 delta = rq->clock - rq->idle_stamp; 2513 u64 delta = rq->clock - rq->idle_stamp;
2427 u64 max = 2*sysctl_sched_migration_cost; 2514 u64 max = 2*sysctl_sched_migration_cost;
2428 2515
@@ -2433,9 +2520,151 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2433 rq->idle_stamp = 0; 2520 rq->idle_stamp = 0;
2434 } 2521 }
2435#endif 2522#endif
2436 /* if a worker is waking up, notify workqueue */ 2523}
2437 if ((p->flags & PF_WQ_WORKER) && success) 2524
2438 wq_worker_waking_up(p, cpu_of(rq)); 2525static void
2526ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2527{
2528#ifdef CONFIG_SMP
2529 if (p->sched_contributes_to_load)
2530 rq->nr_uninterruptible--;
2531#endif
2532
2533 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2534 ttwu_do_wakeup(rq, p, wake_flags);
2535}
2536
2537/*
 2538 * Called in case the task @p isn't fully descheduled from its runqueue;
 2539 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2540 * since all we need to do is flip p->state to TASK_RUNNING, since
2541 * the task is still ->on_rq.
2542 */
2543static int ttwu_remote(struct task_struct *p, int wake_flags)
2544{
2545 struct rq *rq;
2546 int ret = 0;
2547
2548 rq = __task_rq_lock(p);
2549 if (p->on_rq) {
2550 ttwu_do_wakeup(rq, p, wake_flags);
2551 ret = 1;
2552 }
2553 __task_rq_unlock(rq);
2554
2555 return ret;
2556}
2557
2558#ifdef CONFIG_SMP
2559static void sched_ttwu_do_pending(struct task_struct *list)
2560{
2561 struct rq *rq = this_rq();
2562
2563 raw_spin_lock(&rq->lock);
2564
2565 while (list) {
2566 struct task_struct *p = list;
2567 list = list->wake_entry;
2568 ttwu_do_activate(rq, p, 0);
2569 }
2570
2571 raw_spin_unlock(&rq->lock);
2572}
2573
2574#ifdef CONFIG_HOTPLUG_CPU
2575
2576static void sched_ttwu_pending(void)
2577{
2578 struct rq *rq = this_rq();
2579 struct task_struct *list = xchg(&rq->wake_list, NULL);
2580
2581 if (!list)
2582 return;
2583
2584 sched_ttwu_do_pending(list);
2585}
2586
2587#endif /* CONFIG_HOTPLUG_CPU */
2588
2589void scheduler_ipi(void)
2590{
2591 struct rq *rq = this_rq();
2592 struct task_struct *list = xchg(&rq->wake_list, NULL);
2593
2594 if (!list)
2595 return;
2596
2597 /*
2598 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2599 * traditionally all their work was done from the interrupt return
2600 * path. Now that we actually do some work, we need to make sure
2601 * we do call them.
2602 *
2603 * Some archs already do call them, luckily irq_enter/exit nest
2604 * properly.
2605 *
2606 * Arguably we should visit all archs and update all handlers,
2607 * however a fair share of IPIs are still resched only so this would
2608 * somewhat pessimize the simple resched case.
2609 */
2610 irq_enter();
2611 sched_ttwu_do_pending(list);
2612 irq_exit();
2613}
2614
2615static void ttwu_queue_remote(struct task_struct *p, int cpu)
2616{
2617 struct rq *rq = cpu_rq(cpu);
2618 struct task_struct *next = rq->wake_list;
2619
2620 for (;;) {
2621 struct task_struct *old = next;
2622
2623 p->wake_entry = next;
2624 next = cmpxchg(&rq->wake_list, old, p);
2625 if (next == old)
2626 break;
2627 }
2628
2629 if (!next)
2630 smp_send_reschedule(cpu);
2631}
2632
2633#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2634static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2635{
2636 struct rq *rq;
2637 int ret = 0;
2638
2639 rq = __task_rq_lock(p);
2640 if (p->on_cpu) {
2641 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2642 ttwu_do_wakeup(rq, p, wake_flags);
2643 ret = 1;
2644 }
2645 __task_rq_unlock(rq);
2646
2647 return ret;
2648
2649}
2650#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2651#endif /* CONFIG_SMP */
2652
2653static void ttwu_queue(struct task_struct *p, int cpu)
2654{
2655 struct rq *rq = cpu_rq(cpu);
2656
2657#if defined(CONFIG_SMP)
2658 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2659 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2660 ttwu_queue_remote(p, cpu);
2661 return;
2662 }
2663#endif
2664
2665 raw_spin_lock(&rq->lock);
2666 ttwu_do_activate(rq, p, 0);
2667 raw_spin_unlock(&rq->lock);
2439} 2668}
2440 2669
2441/** 2670/**
@@ -2453,92 +2682,66 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2453 * Returns %true if @p was woken up, %false if it was already running 2682 * Returns %true if @p was woken up, %false if it was already running
2454 * or @state didn't match @p's state. 2683 * or @state didn't match @p's state.
2455 */ 2684 */
2456static int try_to_wake_up(struct task_struct *p, unsigned int state, 2685static int
2457 int wake_flags) 2686try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2458{ 2687{
2459 int cpu, orig_cpu, this_cpu, success = 0;
2460 unsigned long flags; 2688 unsigned long flags;
2461 unsigned long en_flags = ENQUEUE_WAKEUP; 2689 int cpu, success = 0;
2462 struct rq *rq;
2463
2464 this_cpu = get_cpu();
2465 2690
2466 smp_wmb(); 2691 smp_wmb();
2467 rq = task_rq_lock(p, &flags); 2692 raw_spin_lock_irqsave(&p->pi_lock, flags);
2468 if (!(p->state & state)) 2693 if (!(p->state & state))
2469 goto out; 2694 goto out;
2470 2695
2471 if (p->se.on_rq) 2696 success = 1; /* we're going to change ->state */
2472 goto out_running;
2473
2474 cpu = task_cpu(p); 2697 cpu = task_cpu(p);
2475 orig_cpu = cpu;
2476 2698
2477#ifdef CONFIG_SMP 2699 if (p->on_rq && ttwu_remote(p, wake_flags))
2478 if (unlikely(task_running(rq, p))) 2700 goto stat;
2479 goto out_activate;
2480 2701
2702#ifdef CONFIG_SMP
2481 /* 2703 /*
2482 * In order to handle concurrent wakeups and release the rq->lock 2704 * If the owning (remote) cpu is still in the middle of schedule() with
2483 * we put the task in TASK_WAKING state. 2705 * this task as prev, wait until its done referencing the task.
2484 *
2485 * First fix up the nr_uninterruptible count:
2486 */ 2706 */
2487 if (task_contributes_to_load(p)) { 2707 while (p->on_cpu) {
2488 if (likely(cpu_online(orig_cpu))) 2708#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2489 rq->nr_uninterruptible--; 2709 /*
2490 else 2710 * In case the architecture enables interrupts in
2491 this_rq()->nr_uninterruptible--; 2711 * context_switch(), we cannot busy wait, since that
2712 * would lead to deadlocks when an interrupt hits and
2713 * tries to wake up @prev. So bail and do a complete
2714 * remote wakeup.
2715 */
2716 if (ttwu_activate_remote(p, wake_flags))
2717 goto stat;
2718#else
2719 cpu_relax();
2720#endif
2492 } 2721 }
2722 /*
2723 * Pairs with the smp_wmb() in finish_lock_switch().
2724 */
2725 smp_rmb();
2726
2727 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2493 p->state = TASK_WAKING; 2728 p->state = TASK_WAKING;
2494 2729
2495 if (p->sched_class->task_waking) { 2730 if (p->sched_class->task_waking)
2496 p->sched_class->task_waking(rq, p); 2731 p->sched_class->task_waking(p);
2497 en_flags |= ENQUEUE_WAKING;
2498 }
2499 2732
2500 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); 2733 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2501 if (cpu != orig_cpu) 2734 if (task_cpu(p) != cpu) {
2735 wake_flags |= WF_MIGRATED;
2502 set_task_cpu(p, cpu); 2736 set_task_cpu(p, cpu);
2503 __task_rq_unlock(rq);
2504
2505 rq = cpu_rq(cpu);
2506 raw_spin_lock(&rq->lock);
2507
2508 /*
2509 * We migrated the task without holding either rq->lock, however
2510 * since the task is not on the task list itself, nobody else
2511 * will try and migrate the task, hence the rq should match the
2512 * cpu we just moved it to.
2513 */
2514 WARN_ON(task_cpu(p) != cpu);
2515 WARN_ON(p->state != TASK_WAKING);
2516
2517#ifdef CONFIG_SCHEDSTATS
2518 schedstat_inc(rq, ttwu_count);
2519 if (cpu == this_cpu)
2520 schedstat_inc(rq, ttwu_local);
2521 else {
2522 struct sched_domain *sd;
2523 for_each_domain(this_cpu, sd) {
2524 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2525 schedstat_inc(sd, ttwu_wake_remote);
2526 break;
2527 }
2528 }
2529 } 2737 }
2530#endif /* CONFIG_SCHEDSTATS */
2531
2532out_activate:
2533#endif /* CONFIG_SMP */ 2738#endif /* CONFIG_SMP */
2534 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2739
2535 cpu == this_cpu, en_flags); 2740 ttwu_queue(p, cpu);
2536 success = 1; 2741stat:
2537out_running: 2742 ttwu_stat(p, cpu, wake_flags);
2538 ttwu_post_activation(p, rq, wake_flags, success);
2539out: 2743out:
2540 task_rq_unlock(rq, &flags); 2744 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2541 put_cpu();
2542 2745
2543 return success; 2746 return success;
2544} 2747}
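
The rewritten try_to_wake_up() no longer takes the runqueue lock up front: it holds only p->pi_lock, spins while p->on_cpu until the previous CPU has finished switching the task out, and relies on the smp_rmb()/smp_wmb() pairing so that everything the scheduler wrote about the task before clearing ->on_cpu is visible to the waker. The ordering half of that can be sketched with C11 acquire/release atomics; the flag and field below are toys, not the scheduler's actual state:

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int task_state;                 /* data written by the "scheduler" side */
static atomic_int on_cpu = 1;          /* 1 while the task is being switched out */

static void *context_switch_side(void *arg)
{
    (void)arg;
    task_state = 42;                   /* all updates happen before the flag */
    atomic_store_explicit(&on_cpu, 0, memory_order_release);  /* like smp_wmb()+store */
    return NULL;
}

static void waker_side(void)
{
    /* try_to_wake_up(): wait until the previous CPU is done with the task. */
    while (atomic_load_explicit(&on_cpu, memory_order_acquire))
        ;                              /* cpu_relax() in the kernel */

    printf("saw task_state = %d\n", task_state);   /* guaranteed to see 42 */
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, context_switch_side, NULL);
    waker_side();
    pthread_join(t, NULL);
    return 0;
}
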
@@ -2547,31 +2750,34 @@ out:
2547 * try_to_wake_up_local - try to wake up a local task with rq lock held 2750 * try_to_wake_up_local - try to wake up a local task with rq lock held
2548 * @p: the thread to be awakened 2751 * @p: the thread to be awakened
2549 * 2752 *
2550 * Put @p on the run-queue if it's not already there. The caller must 2753 * Put @p on the run-queue if it's not already there. The caller must
2551 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2754 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2552 * the current task. this_rq() stays locked over invocation. 2755 * the current task.
2553 */ 2756 */
2554static void try_to_wake_up_local(struct task_struct *p) 2757static void try_to_wake_up_local(struct task_struct *p)
2555{ 2758{
2556 struct rq *rq = task_rq(p); 2759 struct rq *rq = task_rq(p);
2557 bool success = false;
2558 2760
2559 BUG_ON(rq != this_rq()); 2761 BUG_ON(rq != this_rq());
2560 BUG_ON(p == current); 2762 BUG_ON(p == current);
2561 lockdep_assert_held(&rq->lock); 2763 lockdep_assert_held(&rq->lock);
2562 2764
2765 if (!raw_spin_trylock(&p->pi_lock)) {
2766 raw_spin_unlock(&rq->lock);
2767 raw_spin_lock(&p->pi_lock);
2768 raw_spin_lock(&rq->lock);
2769 }
2770
2563 if (!(p->state & TASK_NORMAL)) 2771 if (!(p->state & TASK_NORMAL))
2564 return; 2772 goto out;
2565 2773
2566 if (!p->se.on_rq) { 2774 if (!p->on_rq)
2567 if (likely(!task_running(rq, p))) { 2775 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2568 schedstat_inc(rq, ttwu_count); 2776
2569 schedstat_inc(rq, ttwu_local); 2777 ttwu_do_wakeup(rq, p, 0);
2570 } 2778 ttwu_stat(p, smp_processor_id(), 0);
2571 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2779out:
2572 success = true; 2780 raw_spin_unlock(&p->pi_lock);
2573 }
2574 ttwu_post_activation(p, rq, 0, success);
2575} 2781}
2576 2782
2577/** 2783/**
@@ -2604,19 +2810,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2604 */ 2810 */
2605static void __sched_fork(struct task_struct *p) 2811static void __sched_fork(struct task_struct *p)
2606{ 2812{
2813 p->on_rq = 0;
2814
2815 p->se.on_rq = 0;
2607 p->se.exec_start = 0; 2816 p->se.exec_start = 0;
2608 p->se.sum_exec_runtime = 0; 2817 p->se.sum_exec_runtime = 0;
2609 p->se.prev_sum_exec_runtime = 0; 2818 p->se.prev_sum_exec_runtime = 0;
2610 p->se.nr_migrations = 0; 2819 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0; 2820 p->se.vruntime = 0;
2821 INIT_LIST_HEAD(&p->se.group_node);
2612 2822
2613#ifdef CONFIG_SCHEDSTATS 2823#ifdef CONFIG_SCHEDSTATS
2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2824 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2615#endif 2825#endif
2616 2826
2617 INIT_LIST_HEAD(&p->rt.run_list); 2827 INIT_LIST_HEAD(&p->rt.run_list);
2618 p->se.on_rq = 0;
2619 INIT_LIST_HEAD(&p->se.group_node);
2620 2828
2621#ifdef CONFIG_PREEMPT_NOTIFIERS 2829#ifdef CONFIG_PREEMPT_NOTIFIERS
2622 INIT_HLIST_HEAD(&p->preempt_notifiers); 2830 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2626,8 +2834,9 @@ static void __sched_fork(struct task_struct *p)
2626/* 2834/*
2627 * fork()/clone()-time setup: 2835 * fork()/clone()-time setup:
2628 */ 2836 */
2629void sched_fork(struct task_struct *p, int clone_flags) 2837void sched_fork(struct task_struct *p)
2630{ 2838{
2839 unsigned long flags;
2631 int cpu = get_cpu(); 2840 int cpu = get_cpu();
2632 2841
2633 __sched_fork(p); 2842 __sched_fork(p);
@@ -2678,18 +2887,18 @@ void sched_fork(struct task_struct *p, int clone_flags)
2678 * 2887 *
2679 * Silence PROVE_RCU. 2888 * Silence PROVE_RCU.
2680 */ 2889 */
2681 rcu_read_lock(); 2890 raw_spin_lock_irqsave(&p->pi_lock, flags);
2682 set_task_cpu(p, cpu); 2891 set_task_cpu(p, cpu);
2683 rcu_read_unlock(); 2892 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2684 2893
2685#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2894#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2686 if (likely(sched_info_on())) 2895 if (likely(sched_info_on()))
2687 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2896 memset(&p->sched_info, 0, sizeof(p->sched_info));
2688#endif 2897#endif
2689#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2898#if defined(CONFIG_SMP)
2690 p->oncpu = 0; 2899 p->on_cpu = 0;
2691#endif 2900#endif
2692#ifdef CONFIG_PREEMPT 2901#ifdef CONFIG_PREEMPT_COUNT
2693 /* Want to start with kernel preemption disabled. */ 2902 /* Want to start with kernel preemption disabled. */
2694 task_thread_info(p)->preempt_count = 1; 2903 task_thread_info(p)->preempt_count = 1;
2695#endif 2904#endif
@@ -2707,41 +2916,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2707 * that must be done for every newly created context, then puts the task 2916 * that must be done for every newly created context, then puts the task
2708 * on the runqueue and wakes it. 2917 * on the runqueue and wakes it.
2709 */ 2918 */
2710void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2919void wake_up_new_task(struct task_struct *p)
2711{ 2920{
2712 unsigned long flags; 2921 unsigned long flags;
2713 struct rq *rq; 2922 struct rq *rq;
2714 int cpu __maybe_unused = get_cpu();
2715 2923
2924 raw_spin_lock_irqsave(&p->pi_lock, flags);
2716#ifdef CONFIG_SMP 2925#ifdef CONFIG_SMP
2717 rq = task_rq_lock(p, &flags);
2718 p->state = TASK_WAKING;
2719
2720 /* 2926 /*
2721 * Fork balancing, do it here and not earlier because: 2927 * Fork balancing, do it here and not earlier because:
2722 * - cpus_allowed can change in the fork path 2928 * - cpus_allowed can change in the fork path
2723 * - any previously selected cpu might disappear through hotplug 2929 * - any previously selected cpu might disappear through hotplug
2724 *
2725 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2726 * without people poking at ->cpus_allowed.
2727 */ 2930 */
2728 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2931 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2729 set_task_cpu(p, cpu);
2730
2731 p->state = TASK_RUNNING;
2732 task_rq_unlock(rq, &flags);
2733#endif 2932#endif
2734 2933
2735 rq = task_rq_lock(p, &flags); 2934 rq = __task_rq_lock(p);
2736 activate_task(rq, p, 0); 2935 activate_task(rq, p, 0);
2737 trace_sched_wakeup_new(p, 1); 2936 p->on_rq = 1;
2937 trace_sched_wakeup_new(p, true);
2738 check_preempt_curr(rq, p, WF_FORK); 2938 check_preempt_curr(rq, p, WF_FORK);
2739#ifdef CONFIG_SMP 2939#ifdef CONFIG_SMP
2740 if (p->sched_class->task_woken) 2940 if (p->sched_class->task_woken)
2741 p->sched_class->task_woken(rq, p); 2941 p->sched_class->task_woken(rq, p);
2742#endif 2942#endif
2743 task_rq_unlock(rq, &flags); 2943 task_rq_unlock(rq, p, &flags);
2744 put_cpu();
2745} 2944}
2746 2945
2747#ifdef CONFIG_PREEMPT_NOTIFIERS 2946#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3450,27 +3649,22 @@ void sched_exec(void)
3450{ 3649{
3451 struct task_struct *p = current; 3650 struct task_struct *p = current;
3452 unsigned long flags; 3651 unsigned long flags;
3453 struct rq *rq;
3454 int dest_cpu; 3652 int dest_cpu;
3455 3653
3456 rq = task_rq_lock(p, &flags); 3654 raw_spin_lock_irqsave(&p->pi_lock, flags);
3457 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3655 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3458 if (dest_cpu == smp_processor_id()) 3656 if (dest_cpu == smp_processor_id())
3459 goto unlock; 3657 goto unlock;
3460 3658
3461 /* 3659 if (likely(cpu_active(dest_cpu))) {
3462 * select_task_rq() can race against ->cpus_allowed
3463 */
3464 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3465 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3466 struct migration_arg arg = { p, dest_cpu }; 3660 struct migration_arg arg = { p, dest_cpu };
3467 3661
3468 task_rq_unlock(rq, &flags); 3662 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3469 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3663 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3470 return; 3664 return;
3471 } 3665 }
3472unlock: 3666unlock:
3473 task_rq_unlock(rq, &flags); 3667 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3474} 3668}
3475 3669
3476#endif 3670#endif
@@ -3507,7 +3701,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3507 3701
3508 rq = task_rq_lock(p, &flags); 3702 rq = task_rq_lock(p, &flags);
3509 ns = do_task_delta_exec(p, rq); 3703 ns = do_task_delta_exec(p, rq);
3510 task_rq_unlock(rq, &flags); 3704 task_rq_unlock(rq, p, &flags);
3511 3705
3512 return ns; 3706 return ns;
3513} 3707}
@@ -3525,7 +3719,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3525 3719
3526 rq = task_rq_lock(p, &flags); 3720 rq = task_rq_lock(p, &flags);
3527 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3721 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3528 task_rq_unlock(rq, &flags); 3722 task_rq_unlock(rq, p, &flags);
3529 3723
3530 return ns; 3724 return ns;
3531} 3725}
@@ -3549,7 +3743,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3549 rq = task_rq_lock(p, &flags); 3743 rq = task_rq_lock(p, &flags);
3550 thread_group_cputime(p, &totals); 3744 thread_group_cputime(p, &totals);
3551 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3745 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3552 task_rq_unlock(rq, &flags); 3746 task_rq_unlock(rq, p, &flags);
3553 3747
3554 return ns; 3748 return ns;
3555} 3749}
@@ -3695,6 +3889,25 @@ void account_idle_time(cputime_t cputime)
3695 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3889 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3696} 3890}
3697 3891
3892static __always_inline bool steal_account_process_tick(void)
3893{
3894#ifdef CONFIG_PARAVIRT
3895 if (static_branch(&paravirt_steal_enabled)) {
3896 u64 steal, st = 0;
3897
3898 steal = paravirt_steal_clock(smp_processor_id());
3899 steal -= this_rq()->prev_steal_time;
3900
3901 st = steal_ticks(steal);
3902 this_rq()->prev_steal_time += st * TICK_NSEC;
3903
3904 account_steal_time(st);
3905 return st;
3906 }
3907#endif
3908 return false;
3909}
3910
3698#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3911#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3699 3912
3700#ifdef CONFIG_IRQ_TIME_ACCOUNTING 3913#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3726,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3726 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 3939 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3727 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3940 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3728 3941
3942 if (steal_account_process_tick())
3943 return;
3944
3729 if (irqtime_account_hi_update()) { 3945 if (irqtime_account_hi_update()) {
3730 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3946 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3731 } else if (irqtime_account_si_update()) { 3947 } else if (irqtime_account_si_update()) {
@@ -3779,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
3779 return; 3995 return;
3780 } 3996 }
3781 3997
3998 if (steal_account_process_tick())
3999 return;
4000
3782 if (user_tick) 4001 if (user_tick)
3783 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 4002 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3784 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 4003 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3903,9 +4122,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3903/* 4122/*
3904 * This function gets called by the timer code, with HZ frequency. 4123 * This function gets called by the timer code, with HZ frequency.
3905 * We call it with interrupts disabled. 4124 * We call it with interrupts disabled.
3906 *
3907 * It also gets called by the fork code, when changing the parent's
3908 * timeslices.
3909 */ 4125 */
3910void scheduler_tick(void) 4126void scheduler_tick(void)
3911{ 4127{
@@ -4025,17 +4241,11 @@ static inline void schedule_debug(struct task_struct *prev)
4025 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4241 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4026 4242
4027 schedstat_inc(this_rq(), sched_count); 4243 schedstat_inc(this_rq(), sched_count);
4028#ifdef CONFIG_SCHEDSTATS
4029 if (unlikely(prev->lock_depth >= 0)) {
4030 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4031 schedstat_inc(prev, sched_info.bkl_count);
4032 }
4033#endif
4034} 4244}
4035 4245
4036static void put_prev_task(struct rq *rq, struct task_struct *prev) 4246static void put_prev_task(struct rq *rq, struct task_struct *prev)
4037{ 4247{
4038 if (prev->se.on_rq) 4248 if (prev->on_rq || rq->skip_clock_update < 0)
4039 update_rq_clock(rq); 4249 update_rq_clock(rq);
4040 prev->sched_class->put_prev_task(rq, prev); 4250 prev->sched_class->put_prev_task(rq, prev);
4041} 4251}
@@ -4097,11 +4307,13 @@ need_resched:
4097 if (unlikely(signal_pending_state(prev->state, prev))) { 4307 if (unlikely(signal_pending_state(prev->state, prev))) {
4098 prev->state = TASK_RUNNING; 4308 prev->state = TASK_RUNNING;
4099 } else { 4309 } else {
4310 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4311 prev->on_rq = 0;
4312
4100 /* 4313 /*
4101 * If a worker is going to sleep, notify and 4314 * If a worker went to sleep, notify and ask workqueue
4102 * ask workqueue whether it wants to wake up a 4315 * whether it wants to wake up a task to maintain
4103 * task to maintain concurrency. If so, wake 4316 * concurrency.
4104 * up the task.
4105 */ 4317 */
4106 if (prev->flags & PF_WQ_WORKER) { 4318 if (prev->flags & PF_WQ_WORKER) {
4107 struct task_struct *to_wakeup; 4319 struct task_struct *to_wakeup;
@@ -4110,11 +4322,10 @@ need_resched:
4110 if (to_wakeup) 4322 if (to_wakeup)
4111 try_to_wake_up_local(to_wakeup); 4323 try_to_wake_up_local(to_wakeup);
4112 } 4324 }
4113 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4114 4325
4115 /* 4326 /*
4116 * If we are going to sleep and we have plugged IO queued, make 4327 * If we are going to sleep and we have plugged IO
4117 * sure to submit it to avoid deadlocks. 4328 * queued, make sure to submit it to avoid deadlocks.
4118 */ 4329 */
4119 if (blk_needs_flush_plug(prev)) { 4330 if (blk_needs_flush_plug(prev)) {
4120 raw_spin_unlock(&rq->lock); 4331 raw_spin_unlock(&rq->lock);
@@ -4161,71 +4372,47 @@ need_resched:
4161EXPORT_SYMBOL(schedule); 4372EXPORT_SYMBOL(schedule);
4162 4373
4163#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4374#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4164/*
4165 * Look out! "owner" is an entirely speculative pointer
4166 * access and not reliable.
4167 */
4168int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4169{
4170 unsigned int cpu;
4171 struct rq *rq;
4172 4375
4173 if (!sched_feat(OWNER_SPIN)) 4376static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4174 return 0; 4377{
4378 if (lock->owner != owner)
4379 return false;
4175 4380
4176#ifdef CONFIG_DEBUG_PAGEALLOC
4177 /* 4381 /*
4178 * Need to access the cpu field knowing that 4382 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4179 * DEBUG_PAGEALLOC could have unmapped it if 4383 * lock->owner still matches owner, if that fails, owner might
4180 * the mutex owner just released it and exited. 4384 * point to free()d memory, if it still matches, the rcu_read_lock()
4385 * ensures the memory stays valid.
4181 */ 4386 */
4182 if (probe_kernel_address(&owner->cpu, cpu)) 4387 barrier();
4183 return 0;
4184#else
4185 cpu = owner->cpu;
4186#endif
4187 4388
4188 /* 4389 return owner->on_cpu;
4189 * Even if the access succeeded (likely case), 4390}
4190 * the cpu field may no longer be valid.
4191 */
4192 if (cpu >= nr_cpumask_bits)
4193 return 0;
4194 4391
4195 /* 4392/*
4196 * We need to validate that we can do a 4393 * Look out! "owner" is an entirely speculative pointer
4197 * get_cpu() and that we have the percpu area. 4394 * access and not reliable.
4198 */ 4395 */
4199 if (!cpu_online(cpu)) 4396int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4397{
4398 if (!sched_feat(OWNER_SPIN))
4200 return 0; 4399 return 0;
4201 4400
4202 rq = cpu_rq(cpu); 4401 rcu_read_lock();
4203 4402 while (owner_running(lock, owner)) {
4204 for (;;) { 4403 if (need_resched())
4205 /*
4206 * Owner changed, break to re-assess state.
4207 */
4208 if (lock->owner != owner) {
4209 /*
4210 * If the lock has switched to a different owner,
4211 * we likely have heavy contention. Return 0 to quit
4212 * optimistic spinning and not contend further:
4213 */
4214 if (lock->owner)
4215 return 0;
4216 break; 4404 break;
4217 }
4218
4219 /*
4220 * Is that owner really running on that cpu?
4221 */
4222 if (task_thread_info(rq->curr) != owner || need_resched())
4223 return 0;
4224 4405
4225 arch_mutex_cpu_relax(); 4406 arch_mutex_cpu_relax();
4226 } 4407 }
4408 rcu_read_unlock();
4227 4409
4228 return 1; 4410 /*
4411 * We break out the loop above on need_resched() and when the
4412 * owner changed, which is a sign for heavy contention. Return
4413 * success only when lock->owner is NULL.
4414 */
4415 return lock->owner == NULL;
4229} 4416}
4230#endif 4417#endif
4231 4418
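
The rewritten mutex_spin_on_owner() spins only while two things hold: lock->owner is still the owner it started with, and that owner is currently on a CPU; the barrier() plus rcu_read_lock() make the ->on_cpu dereference safe against the owner exiting. A userspace caricature of that adaptive-spin loop with C11 atomics; the fields and the spin budget standing in for need_resched() are made up, and real RCU is out of scope:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct task {
    atomic_bool on_cpu;          /* "is this thread currently running?" */
};

struct mutex {
    _Atomic(struct task *) owner;
};

static bool owner_running(struct mutex *lock, struct task *owner)
{
    /* Re-check the owner before looking at on_cpu; if it changed,
     * spinning on the old task's flag would be meaningless. */
    if (atomic_load(&lock->owner) != owner)
        return false;

    return atomic_load(&owner->on_cpu);
}

/* Spin while the same owner keeps running; report whether the lock looks free. */
static bool spin_on_owner(struct mutex *lock, struct task *owner, int budget)
{
    while (owner_running(lock, owner)) {
        if (--budget <= 0)       /* stand-in for need_resched() */
            break;
    }
    /* Keep spinning on the mutex itself only if nobody owns it now. */
    return atomic_load(&lock->owner) == NULL;
}

int main(void)
{
    struct task t;
    struct mutex m;

    atomic_init(&t.on_cpu, false);
    atomic_init(&m.owner, &t);
    printf("lock looks free: %d\n", spin_on_owner(&m, &t, 1000));

    atomic_store(&m.owner, NULL);
    printf("lock looks free: %d\n", spin_on_owner(&m, &t, 1000));
    return 0;
}
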
@@ -4684,19 +4871,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4684 */ 4871 */
4685void rt_mutex_setprio(struct task_struct *p, int prio) 4872void rt_mutex_setprio(struct task_struct *p, int prio)
4686{ 4873{
4687 unsigned long flags;
4688 int oldprio, on_rq, running; 4874 int oldprio, on_rq, running;
4689 struct rq *rq; 4875 struct rq *rq;
4690 const struct sched_class *prev_class; 4876 const struct sched_class *prev_class;
4691 4877
4692 BUG_ON(prio < 0 || prio > MAX_PRIO); 4878 BUG_ON(prio < 0 || prio > MAX_PRIO);
4693 4879
4694 rq = task_rq_lock(p, &flags); 4880 rq = __task_rq_lock(p);
4695 4881
4696 trace_sched_pi_setprio(p, prio); 4882 trace_sched_pi_setprio(p, prio);
4697 oldprio = p->prio; 4883 oldprio = p->prio;
4698 prev_class = p->sched_class; 4884 prev_class = p->sched_class;
4699 on_rq = p->se.on_rq; 4885 on_rq = p->on_rq;
4700 running = task_current(rq, p); 4886 running = task_current(rq, p);
4701 if (on_rq) 4887 if (on_rq)
4702 dequeue_task(rq, p, 0); 4888 dequeue_task(rq, p, 0);
@@ -4716,7 +4902,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4902 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4717 4903
4718 check_class_changed(rq, p, prev_class, oldprio); 4904 check_class_changed(rq, p, prev_class, oldprio);
4719 task_rq_unlock(rq, &flags); 4905 __task_rq_unlock(rq);
4720} 4906}
4721 4907
4722#endif 4908#endif
@@ -4744,7 +4930,7 @@ void set_user_nice(struct task_struct *p, long nice)
4744 p->static_prio = NICE_TO_PRIO(nice); 4930 p->static_prio = NICE_TO_PRIO(nice);
4745 goto out_unlock; 4931 goto out_unlock;
4746 } 4932 }
4747 on_rq = p->se.on_rq; 4933 on_rq = p->on_rq;
4748 if (on_rq) 4934 if (on_rq)
4749 dequeue_task(rq, p, 0); 4935 dequeue_task(rq, p, 0);
4750 4936
@@ -4764,7 +4950,7 @@ void set_user_nice(struct task_struct *p, long nice)
4764 resched_task(rq->curr); 4950 resched_task(rq->curr);
4765 } 4951 }
4766out_unlock: 4952out_unlock:
4767 task_rq_unlock(rq, &flags); 4953 task_rq_unlock(rq, p, &flags);
4768} 4954}
4769EXPORT_SYMBOL(set_user_nice); 4955EXPORT_SYMBOL(set_user_nice);
4770 4956
@@ -4878,8 +5064,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4878static void 5064static void
4879__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 5065__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4880{ 5066{
4881 BUG_ON(p->se.on_rq);
4882
4883 p->policy = policy; 5067 p->policy = policy;
4884 p->rt_priority = prio; 5068 p->rt_priority = prio;
4885 p->normal_prio = normal_prio(p); 5069 p->normal_prio = normal_prio(p);
@@ -4994,20 +5178,17 @@ recheck:
4994 /* 5178 /*
4995 * make sure no PI-waiters arrive (or leave) while we are 5179 * make sure no PI-waiters arrive (or leave) while we are
4996 * changing the priority of the task: 5180 * changing the priority of the task:
4997 */ 5181 *
4998 raw_spin_lock_irqsave(&p->pi_lock, flags);
4999 /*
5000 * To be able to change p->policy safely, the appropriate 5182 * To be able to change p->policy safely, the appropriate
5001 * runqueue lock must be held. 5183 * runqueue lock must be held.
5002 */ 5184 */
5003 rq = __task_rq_lock(p); 5185 rq = task_rq_lock(p, &flags);
5004 5186
5005 /* 5187 /*
 5006 * Changing the policy of the stop threads is a very bad idea 5188
5007 */ 5189 */
5008 if (p == rq->stop) { 5190 if (p == rq->stop) {
5009 __task_rq_unlock(rq); 5191 task_rq_unlock(rq, p, &flags);
5010 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5011 return -EINVAL; 5192 return -EINVAL;
5012 } 5193 }
5013 5194
@@ -5031,8 +5212,7 @@ recheck:
5031 if (rt_bandwidth_enabled() && rt_policy(policy) && 5212 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5032 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5213 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5033 !task_group_is_autogroup(task_group(p))) { 5214 !task_group_is_autogroup(task_group(p))) {
5034 __task_rq_unlock(rq); 5215 task_rq_unlock(rq, p, &flags);
5035 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5036 return -EPERM; 5216 return -EPERM;
5037 } 5217 }
5038 } 5218 }
@@ -5041,11 +5221,10 @@ recheck:
5041 /* recheck policy now with rq lock held */ 5221 /* recheck policy now with rq lock held */
5042 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5222 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5043 policy = oldpolicy = -1; 5223 policy = oldpolicy = -1;
5044 __task_rq_unlock(rq); 5224 task_rq_unlock(rq, p, &flags);
5045 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5046 goto recheck; 5225 goto recheck;
5047 } 5226 }
5048 on_rq = p->se.on_rq; 5227 on_rq = p->on_rq;
5049 running = task_current(rq, p); 5228 running = task_current(rq, p);
5050 if (on_rq) 5229 if (on_rq)
5051 deactivate_task(rq, p, 0); 5230 deactivate_task(rq, p, 0);
@@ -5064,8 +5243,7 @@ recheck:
5064 activate_task(rq, p, 0); 5243 activate_task(rq, p, 0);
5065 5244
5066 check_class_changed(rq, p, prev_class, oldprio); 5245 check_class_changed(rq, p, prev_class, oldprio);
5067 __task_rq_unlock(rq); 5246 task_rq_unlock(rq, p, &flags);
5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5069 5247
5070 rt_mutex_adjust_pi(p); 5248 rt_mutex_adjust_pi(p);
5071 5249
@@ -5316,7 +5494,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5316{ 5494{
5317 struct task_struct *p; 5495 struct task_struct *p;
5318 unsigned long flags; 5496 unsigned long flags;
5319 struct rq *rq;
5320 int retval; 5497 int retval;
5321 5498
5322 get_online_cpus(); 5499 get_online_cpus();
@@ -5331,9 +5508,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5331 if (retval) 5508 if (retval)
5332 goto out_unlock; 5509 goto out_unlock;
5333 5510
5334 rq = task_rq_lock(p, &flags); 5511 raw_spin_lock_irqsave(&p->pi_lock, flags);
5335 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5512 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5336 task_rq_unlock(rq, &flags); 5513 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5337 5514
5338out_unlock: 5515out_unlock:
5339 rcu_read_unlock(); 5516 rcu_read_unlock();
@@ -5658,7 +5835,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5658 5835
5659 rq = task_rq_lock(p, &flags); 5836 rq = task_rq_lock(p, &flags);
5660 time_slice = p->sched_class->get_rr_interval(rq, p); 5837 time_slice = p->sched_class->get_rr_interval(rq, p);
5661 task_rq_unlock(rq, &flags); 5838 task_rq_unlock(rq, p, &flags);
5662 5839
5663 rcu_read_unlock(); 5840 rcu_read_unlock();
5664 jiffies_to_timespec(time_slice, &t); 5841 jiffies_to_timespec(time_slice, &t);
@@ -5760,7 +5937,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5760 idle->state = TASK_RUNNING; 5937 idle->state = TASK_RUNNING;
5761 idle->se.exec_start = sched_clock(); 5938 idle->se.exec_start = sched_clock();
5762 5939
5763 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5940 do_set_cpus_allowed(idle, cpumask_of(cpu));
5764 /* 5941 /*
5765 * We're having a chicken and egg problem, even though we are 5942 * We're having a chicken and egg problem, even though we are
5766 * holding rq->lock, the cpu isn't yet set to this cpu so the 5943 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5776,17 +5953,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5776 rcu_read_unlock(); 5953 rcu_read_unlock();
5777 5954
5778 rq->curr = rq->idle = idle; 5955 rq->curr = rq->idle = idle;
5779#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5956#if defined(CONFIG_SMP)
5780 idle->oncpu = 1; 5957 idle->on_cpu = 1;
5781#endif 5958#endif
5782 raw_spin_unlock_irqrestore(&rq->lock, flags); 5959 raw_spin_unlock_irqrestore(&rq->lock, flags);
5783 5960
5784 /* Set the preempt count _outside_ the spinlocks! */ 5961 /* Set the preempt count _outside_ the spinlocks! */
5785#if defined(CONFIG_PREEMPT)
5786 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5787#else
5788 task_thread_info(idle)->preempt_count = 0; 5962 task_thread_info(idle)->preempt_count = 0;
5789#endif 5963
5790 /* 5964 /*
5791 * The idle tasks have their own, simple scheduling class: 5965 * The idle tasks have their own, simple scheduling class:
5792 */ 5966 */
@@ -5851,6 +6025,16 @@ static inline void sched_init_granularity(void)
5851} 6025}
5852 6026
5853#ifdef CONFIG_SMP 6027#ifdef CONFIG_SMP
6028void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6029{
6030 if (p->sched_class && p->sched_class->set_cpus_allowed)
6031 p->sched_class->set_cpus_allowed(p, new_mask);
6032 else {
6033 cpumask_copy(&p->cpus_allowed, new_mask);
6034 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6035 }
6036}
6037
5854/* 6038/*
5855 * This is how migration works: 6039 * This is how migration works:
5856 * 6040 *
@@ -5881,52 +6065,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5881 unsigned int dest_cpu; 6065 unsigned int dest_cpu;
5882 int ret = 0; 6066 int ret = 0;
5883 6067
5884 /*
5885 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5886 * drop the rq->lock and still rely on ->cpus_allowed.
5887 */
5888again:
5889 while (task_is_waking(p))
5890 cpu_relax();
5891 rq = task_rq_lock(p, &flags); 6068 rq = task_rq_lock(p, &flags);
5892 if (task_is_waking(p)) { 6069
5893 task_rq_unlock(rq, &flags); 6070 if (cpumask_equal(&p->cpus_allowed, new_mask))
5894 goto again; 6071 goto out;
5895 }
5896 6072
5897 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 6073 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5898 ret = -EINVAL; 6074 ret = -EINVAL;
5899 goto out; 6075 goto out;
5900 } 6076 }
5901 6077
5902 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6078 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5903 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5904 ret = -EINVAL; 6079 ret = -EINVAL;
5905 goto out; 6080 goto out;
5906 } 6081 }
5907 6082
5908 if (p->sched_class->set_cpus_allowed) 6083 do_set_cpus_allowed(p, new_mask);
5909 p->sched_class->set_cpus_allowed(p, new_mask);
5910 else {
5911 cpumask_copy(&p->cpus_allowed, new_mask);
5912 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5913 }
5914 6084
5915 /* Can the task run on the task's current CPU? If so, we're done */ 6085 /* Can the task run on the task's current CPU? If so, we're done */
5916 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6086 if (cpumask_test_cpu(task_cpu(p), new_mask))
5917 goto out; 6087 goto out;
5918 6088
5919 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 6089 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5920 if (migrate_task(p, rq)) { 6090 if (p->on_rq) {
5921 struct migration_arg arg = { p, dest_cpu }; 6091 struct migration_arg arg = { p, dest_cpu };
5922 /* Need help from migration thread: drop lock and wait. */ 6092 /* Need help from migration thread: drop lock and wait. */
5923 task_rq_unlock(rq, &flags); 6093 task_rq_unlock(rq, p, &flags);
5924 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 6094 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5925 tlb_migrate_finish(p->mm); 6095 tlb_migrate_finish(p->mm);
5926 return 0; 6096 return 0;
5927 } 6097 }
5928out: 6098out:
5929 task_rq_unlock(rq, &flags); 6099 task_rq_unlock(rq, p, &flags);
5930 6100
5931 return ret; 6101 return ret;
5932} 6102}
@@ -5954,6 +6124,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5954 rq_src = cpu_rq(src_cpu); 6124 rq_src = cpu_rq(src_cpu);
5955 rq_dest = cpu_rq(dest_cpu); 6125 rq_dest = cpu_rq(dest_cpu);
5956 6126
6127 raw_spin_lock(&p->pi_lock);
5957 double_rq_lock(rq_src, rq_dest); 6128 double_rq_lock(rq_src, rq_dest);
5958 /* Already moved. */ 6129 /* Already moved. */
5959 if (task_cpu(p) != src_cpu) 6130 if (task_cpu(p) != src_cpu)
@@ -5966,7 +6137,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5966 * If we're not on a rq, the next wake-up will ensure we're 6137 * If we're not on a rq, the next wake-up will ensure we're
5967 * placed properly. 6138 * placed properly.
5968 */ 6139 */
5969 if (p->se.on_rq) { 6140 if (p->on_rq) {
5970 deactivate_task(rq_src, p, 0); 6141 deactivate_task(rq_src, p, 0);
5971 set_task_cpu(p, dest_cpu); 6142 set_task_cpu(p, dest_cpu);
5972 activate_task(rq_dest, p, 0); 6143 activate_task(rq_dest, p, 0);
@@ -5976,6 +6147,7 @@ done:
5976 ret = 1; 6147 ret = 1;
5977fail: 6148fail:
5978 double_rq_unlock(rq_src, rq_dest); 6149 double_rq_unlock(rq_src, rq_dest);
6150 raw_spin_unlock(&p->pi_lock);
5979 return ret; 6151 return ret;
5980} 6152}
5981 6153
@@ -6316,6 +6488,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6316 6488
6317#ifdef CONFIG_HOTPLUG_CPU 6489#ifdef CONFIG_HOTPLUG_CPU
6318 case CPU_DYING: 6490 case CPU_DYING:
6491 sched_ttwu_pending();
6319 /* Update our root-domain */ 6492 /* Update our root-domain */
6320 raw_spin_lock_irqsave(&rq->lock, flags); 6493 raw_spin_lock_irqsave(&rq->lock, flags);
6321 if (rq->rd) { 6494 if (rq->rd) {
@@ -6394,6 +6567,8 @@ early_initcall(migration_init);
6394 6567
6395#ifdef CONFIG_SMP 6568#ifdef CONFIG_SMP
6396 6569
6570static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6571
6397#ifdef CONFIG_SCHED_DEBUG 6572#ifdef CONFIG_SCHED_DEBUG
6398 6573
6399static __read_mostly int sched_domain_debug_enabled; 6574static __read_mostly int sched_domain_debug_enabled;
@@ -6444,7 +6619,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6444 break; 6619 break;
6445 } 6620 }
6446 6621
6447 if (!group->cpu_power) { 6622 if (!group->sgp->power) {
6448 printk(KERN_CONT "\n"); 6623 printk(KERN_CONT "\n");
6449 printk(KERN_ERR "ERROR: domain->cpu_power not " 6624 printk(KERN_ERR "ERROR: domain->cpu_power not "
6450 "set\n"); 6625 "set\n");
@@ -6468,9 +6643,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6468 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6643 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6469 6644
6470 printk(KERN_CONT " %s", str); 6645 printk(KERN_CONT " %s", str);
6471 if (group->cpu_power != SCHED_LOAD_SCALE) { 6646 if (group->sgp->power != SCHED_POWER_SCALE) {
6472 printk(KERN_CONT " (cpu_power = %d)", 6647 printk(KERN_CONT " (cpu_power = %d)",
6473 group->cpu_power); 6648 group->sgp->power);
6474 } 6649 }
6475 6650
6476 group = group->next; 6651 group = group->next;
@@ -6489,7 +6664,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6489 6664
6490static void sched_domain_debug(struct sched_domain *sd, int cpu) 6665static void sched_domain_debug(struct sched_domain *sd, int cpu)
6491{ 6666{
6492 cpumask_var_t groupmask;
6493 int level = 0; 6667 int level = 0;
6494 6668
6495 if (!sched_domain_debug_enabled) 6669 if (!sched_domain_debug_enabled)
@@ -6502,20 +6676,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6502 6676
6503 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6677 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6504 6678
6505 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6506 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6507 return;
6508 }
6509
6510 for (;;) { 6679 for (;;) {
6511 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6680 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6512 break; 6681 break;
6513 level++; 6682 level++;
6514 sd = sd->parent; 6683 sd = sd->parent;
6515 if (!sd) 6684 if (!sd)
6516 break; 6685 break;
6517 } 6686 }
6518 free_cpumask_var(groupmask);
6519} 6687}
6520#else /* !CONFIG_SCHED_DEBUG */ 6688#else /* !CONFIG_SCHED_DEBUG */
6521# define sched_domain_debug(sd, cpu) do { } while (0) 6689# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6572,12 +6740,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6572 return 1; 6740 return 1;
6573} 6741}
6574 6742
6575static void free_rootdomain(struct root_domain *rd) 6743static void free_rootdomain(struct rcu_head *rcu)
6576{ 6744{
6577 synchronize_sched(); 6745 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6578 6746
6579 cpupri_cleanup(&rd->cpupri); 6747 cpupri_cleanup(&rd->cpupri);
6580
6581 free_cpumask_var(rd->rto_mask); 6748 free_cpumask_var(rd->rto_mask);
6582 free_cpumask_var(rd->online); 6749 free_cpumask_var(rd->online);
6583 free_cpumask_var(rd->span); 6750 free_cpumask_var(rd->span);
@@ -6618,7 +6785,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6618 raw_spin_unlock_irqrestore(&rq->lock, flags); 6785 raw_spin_unlock_irqrestore(&rq->lock, flags);
6619 6786
6620 if (old_rd) 6787 if (old_rd)
6621 free_rootdomain(old_rd); 6788 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6622} 6789}
6623 6790
6624static int init_rootdomain(struct root_domain *rd) 6791static int init_rootdomain(struct root_domain *rd)
@@ -6669,6 +6836,53 @@ static struct root_domain *alloc_rootdomain(void)
6669 return rd; 6836 return rd;
6670} 6837}
6671 6838
6839static void free_sched_groups(struct sched_group *sg, int free_sgp)
6840{
6841 struct sched_group *tmp, *first;
6842
6843 if (!sg)
6844 return;
6845
6846 first = sg;
6847 do {
6848 tmp = sg->next;
6849
6850 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
6851 kfree(sg->sgp);
6852
6853 kfree(sg);
6854 sg = tmp;
6855 } while (sg != first);
6856}
6857
6858static void free_sched_domain(struct rcu_head *rcu)
6859{
6860 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6861
6862 /*
6863 * If its an overlapping domain it has private groups, iterate and
6864 * nuke them all.
6865 */
6866 if (sd->flags & SD_OVERLAP) {
6867 free_sched_groups(sd->groups, 1);
6868 } else if (atomic_dec_and_test(&sd->groups->ref)) {
6869 kfree(sd->groups->sgp);
6870 kfree(sd->groups);
6871 }
6872 kfree(sd);
6873}
6874
6875static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6876{
6877 call_rcu(&sd->rcu, free_sched_domain);
6878}
6879
6880static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6881{
6882 for (; sd; sd = sd->parent)
6883 destroy_sched_domain(sd, cpu);
6884}
6885
6672/* 6886/*
6673 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6887 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6674 * hold the hotplug lock. 6888 * hold the hotplug lock.
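
The new sched_group teardown above combines two lifetime techniques: the shared power structure is reference counted (atomic_dec_and_test() on sg->sgp->ref decides who does the kfree), and the domain itself is only freed after an RCU grace period via call_rcu(). The refcount half can be sketched in C11; the grace-period half is only indicated by a comment, since plain userspace has no call_rcu():

#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>

struct group_power {
    atomic_int ref;              /* how many groups share this power struct */
    unsigned int power;
};

struct sched_group {
    struct sched_group *next;    /* circular list, as in the kernel */
    struct group_power *sgp;
};

/* Drop one reference; the last dropper frees the shared object. */
static void put_group_power(struct group_power *sgp)
{
    if (atomic_fetch_sub(&sgp->ref, 1) == 1)
        free(sgp);
}

static void free_sched_groups(struct sched_group *first)
{
    struct sched_group *sg = first;

    if (!sg)
        return;
    do {
        struct sched_group *tmp = sg->next;

        put_group_power(sg->sgp);   /* freed only when the last ref drops */
        free(sg);                   /* the kernel defers this behind RCU */
        sg = tmp;
    } while (sg != first);
}

int main(void)
{
    struct group_power *sgp = malloc(sizeof(*sgp));
    struct sched_group *a = malloc(sizeof(*a));
    struct sched_group *b = malloc(sizeof(*b));

    atomic_init(&sgp->ref, 2);      /* both groups share one power struct */
    sgp->power = 1024;
    a->sgp = b->sgp = sgp;
    a->next = b;
    b->next = a;                    /* close the circular list */

    free_sched_groups(a);
    puts("freed");
    return 0;
}
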
@@ -6679,9 +6893,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6679 struct rq *rq = cpu_rq(cpu); 6893 struct rq *rq = cpu_rq(cpu);
6680 struct sched_domain *tmp; 6894 struct sched_domain *tmp;
6681 6895
6682 for (tmp = sd; tmp; tmp = tmp->parent)
6683 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6684
6685 /* Remove the sched domains which do not contribute to scheduling. */ 6896 /* Remove the sched domains which do not contribute to scheduling. */
6686 for (tmp = sd; tmp; ) { 6897 for (tmp = sd; tmp; ) {
6687 struct sched_domain *parent = tmp->parent; 6898 struct sched_domain *parent = tmp->parent;
@@ -6692,12 +6903,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6692 tmp->parent = parent->parent; 6903 tmp->parent = parent->parent;
6693 if (parent->parent) 6904 if (parent->parent)
6694 parent->parent->child = tmp; 6905 parent->parent->child = tmp;
6906 destroy_sched_domain(parent, cpu);
6695 } else 6907 } else
6696 tmp = tmp->parent; 6908 tmp = tmp->parent;
6697 } 6909 }
6698 6910
6699 if (sd && sd_degenerate(sd)) { 6911 if (sd && sd_degenerate(sd)) {
6912 tmp = sd;
6700 sd = sd->parent; 6913 sd = sd->parent;
6914 destroy_sched_domain(tmp, cpu);
6701 if (sd) 6915 if (sd)
6702 sd->child = NULL; 6916 sd->child = NULL;
6703 } 6917 }
@@ -6705,7 +6919,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6705 sched_domain_debug(sd, cpu); 6919 sched_domain_debug(sd, cpu);
6706 6920
6707 rq_attach_root(rq, rd); 6921 rq_attach_root(rq, rd);
6922 tmp = rq->sd;
6708 rcu_assign_pointer(rq->sd, sd); 6923 rcu_assign_pointer(rq->sd, sd);
6924 destroy_sched_domains(tmp, cpu);
6709} 6925}
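The cpu_attach_domain() changes above pin down the new lifetime rules: degenerate parents are handed to destroy_sched_domain() as they are unlinked, the old rq->sd is saved, the replacement is published with rcu_assign_pointer(), and only then is the old hierarchy queued for destruction. Readers that walk rq->sd under rcu_read_lock() therefore see either the complete old tree or the complete new one. A C11 sketch of that publish-then-retire ordering (a model, not the kernel API; release ordering plays the role of rcu_assign_pointer()):

#include <stdatomic.h>
#include <stddef.h>

struct domain { struct domain *parent; };

static _Atomic(struct domain *) rq_sd;      /* models the RCU-protected rq->sd */

/* Publish the new tree first, hand the old one back for deferred freeing. */
static struct domain *attach_domain(struct domain *new_sd)
{
        struct domain *old = atomic_load_explicit(&rq_sd, memory_order_relaxed);

        /* the new tree must be fully initialized before readers can see it */
        atomic_store_explicit(&rq_sd, new_sd, memory_order_release);

        /* the kernel queues 'old' through destroy_sched_domains()/call_rcu();
         * here it is returned so the caller can retire it after a grace period */
        return old;
}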
6710 6926
6711/* cpus with isolated domains */ 6927/* cpus with isolated domains */
@@ -6721,56 +6937,6 @@ static int __init isolated_cpu_setup(char *str)
6721 6937
6722__setup("isolcpus=", isolated_cpu_setup); 6938__setup("isolcpus=", isolated_cpu_setup);
6723 6939
6724/*
6725 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6726 * to a function which identifies what group(along with sched group) a CPU
6727 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6728 * (due to the fact that we keep track of groups covered with a struct cpumask).
6729 *
6730 * init_sched_build_groups will build a circular linked list of the groups
6731 * covered by the given span, and will set each group's ->cpumask correctly,
6732 * and ->cpu_power to 0.
6733 */
6734static void
6735init_sched_build_groups(const struct cpumask *span,
6736 const struct cpumask *cpu_map,
6737 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6738 struct sched_group **sg,
6739 struct cpumask *tmpmask),
6740 struct cpumask *covered, struct cpumask *tmpmask)
6741{
6742 struct sched_group *first = NULL, *last = NULL;
6743 int i;
6744
6745 cpumask_clear(covered);
6746
6747 for_each_cpu(i, span) {
6748 struct sched_group *sg;
6749 int group = group_fn(i, cpu_map, &sg, tmpmask);
6750 int j;
6751
6752 if (cpumask_test_cpu(i, covered))
6753 continue;
6754
6755 cpumask_clear(sched_group_cpus(sg));
6756 sg->cpu_power = 0;
6757
6758 for_each_cpu(j, span) {
6759 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6760 continue;
6761
6762 cpumask_set_cpu(j, covered);
6763 cpumask_set_cpu(j, sched_group_cpus(sg));
6764 }
6765 if (!first)
6766 first = sg;
6767 if (last)
6768 last->next = sg;
6769 last = sg;
6770 }
6771 last->next = first;
6772}
6773
6774#define SD_NODES_PER_DOMAIN 16 6940#define SD_NODES_PER_DOMAIN 16
6775 6941
6776#ifdef CONFIG_NUMA 6942#ifdef CONFIG_NUMA
@@ -6787,7 +6953,7 @@ init_sched_build_groups(const struct cpumask *span,
6787 */ 6953 */
6788static int find_next_best_node(int node, nodemask_t *used_nodes) 6954static int find_next_best_node(int node, nodemask_t *used_nodes)
6789{ 6955{
6790 int i, n, val, min_val, best_node = 0; 6956 int i, n, val, min_val, best_node = -1;
6791 6957
6792 min_val = INT_MAX; 6958 min_val = INT_MAX;
6793 6959
@@ -6811,7 +6977,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6811 } 6977 }
6812 } 6978 }
6813 6979
6814 node_set(best_node, *used_nodes); 6980 if (best_node != -1)
6981 node_set(best_node, *used_nodes);
6815 return best_node; 6982 return best_node;
6816} 6983}
6817 6984
@@ -6837,315 +7004,197 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6837 7004
6838 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7005 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6839 int next_node = find_next_best_node(node, &used_nodes); 7006 int next_node = find_next_best_node(node, &used_nodes);
6840 7007 if (next_node < 0)
7008 break;
6841 cpumask_or(span, span, cpumask_of_node(next_node)); 7009 cpumask_or(span, span, cpumask_of_node(next_node));
6842 } 7010 }
6843} 7011}
7012
7013static const struct cpumask *cpu_node_mask(int cpu)
7014{
7015 lockdep_assert_held(&sched_domains_mutex);
7016
7017 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
7018
7019 return sched_domains_tmpmask;
7020}
7021
7022static const struct cpumask *cpu_allnodes_mask(int cpu)
7023{
7024 return cpu_possible_mask;
7025}
6844#endif /* CONFIG_NUMA */ 7026#endif /* CONFIG_NUMA */
6845 7027
6846int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7028static const struct cpumask *cpu_cpu_mask(int cpu)
7029{
7030 return cpumask_of_node(cpu_to_node(cpu));
7031}
6847 7032
6848/* 7033int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6849 * The cpus mask in sched_group and sched_domain hangs off the end.
6850 *
6851 * ( See the the comments in include/linux/sched.h:struct sched_group
6852 * and struct sched_domain. )
6853 */
6854struct static_sched_group {
6855 struct sched_group sg;
6856 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6857};
6858 7034
6859struct static_sched_domain { 7035struct sd_data {
6860 struct sched_domain sd; 7036 struct sched_domain **__percpu sd;
6861 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 7037 struct sched_group **__percpu sg;
7038 struct sched_group_power **__percpu sgp;
6862}; 7039};
6863 7040
6864struct s_data { 7041struct s_data {
6865#ifdef CONFIG_NUMA 7042 struct sched_domain ** __percpu sd;
6866 int sd_allnodes;
6867 cpumask_var_t domainspan;
6868 cpumask_var_t covered;
6869 cpumask_var_t notcovered;
6870#endif
6871 cpumask_var_t nodemask;
6872 cpumask_var_t this_sibling_map;
6873 cpumask_var_t this_core_map;
6874 cpumask_var_t this_book_map;
6875 cpumask_var_t send_covered;
6876 cpumask_var_t tmpmask;
6877 struct sched_group **sched_group_nodes;
6878 struct root_domain *rd; 7043 struct root_domain *rd;
6879}; 7044};
6880 7045
6881enum s_alloc { 7046enum s_alloc {
6882 sa_sched_groups = 0,
6883 sa_rootdomain, 7047 sa_rootdomain,
6884 sa_tmpmask, 7048 sa_sd,
6885 sa_send_covered, 7049 sa_sd_storage,
6886 sa_this_book_map,
6887 sa_this_core_map,
6888 sa_this_sibling_map,
6889 sa_nodemask,
6890 sa_sched_group_nodes,
6891#ifdef CONFIG_NUMA
6892 sa_notcovered,
6893 sa_covered,
6894 sa_domainspan,
6895#endif
6896 sa_none, 7050 sa_none,
6897}; 7051};
6898 7052
6899/* 7053struct sched_domain_topology_level;
6900 * SMT sched-domains:
6901 */
6902#ifdef CONFIG_SCHED_SMT
6903static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6904static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6905 7054
6906static int 7055typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6907cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 7056typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6908 struct sched_group **sg, struct cpumask *unused)
6909{
6910 if (sg)
6911 *sg = &per_cpu(sched_groups, cpu).sg;
6912 return cpu;
6913}
6914#endif /* CONFIG_SCHED_SMT */
6915 7057
6916/* 7058#define SDTL_OVERLAP 0x01
6917 * multi-core sched-domains: 7059
6918 */ 7060struct sched_domain_topology_level {
6919#ifdef CONFIG_SCHED_MC 7061 sched_domain_init_f init;
6920static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 7062 sched_domain_mask_f mask;
6921static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 7063 int flags;
7064 struct sd_data data;
7065};
6922 7066
6923static int 7067static int
6924cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 7068build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6925 struct sched_group **sg, struct cpumask *mask)
6926{ 7069{
6927 int group; 7070 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6928#ifdef CONFIG_SCHED_SMT 7071 const struct cpumask *span = sched_domain_span(sd);
6929 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 7072 struct cpumask *covered = sched_domains_tmpmask;
6930 group = cpumask_first(mask); 7073 struct sd_data *sdd = sd->private;
6931#else 7074 struct sched_domain *child;
6932 group = cpu; 7075 int i;
6933#endif
6934 if (sg)
6935 *sg = &per_cpu(sched_group_core, group).sg;
6936 return group;
6937}
6938#endif /* CONFIG_SCHED_MC */
6939 7076
6940/* 7077 cpumask_clear(covered);
6941 * book sched-domains:
6942 */
6943#ifdef CONFIG_SCHED_BOOK
6944static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6945static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6946 7078
6947static int 7079 for_each_cpu(i, span) {
6948cpu_to_book_group(int cpu, const struct cpumask *cpu_map, 7080 struct cpumask *sg_span;
6949 struct sched_group **sg, struct cpumask *mask)
6950{
6951 int group = cpu;
6952#ifdef CONFIG_SCHED_MC
6953 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6954 group = cpumask_first(mask);
6955#elif defined(CONFIG_SCHED_SMT)
6956 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6957 group = cpumask_first(mask);
6958#endif
6959 if (sg)
6960 *sg = &per_cpu(sched_group_book, group).sg;
6961 return group;
6962}
6963#endif /* CONFIG_SCHED_BOOK */
6964 7081
6965static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 7082 if (cpumask_test_cpu(i, covered))
6966static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 7083 continue;
6967 7084
6968static int 7085 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6969cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 7086 GFP_KERNEL, cpu_to_node(i));
6970 struct sched_group **sg, struct cpumask *mask)
6971{
6972 int group;
6973#ifdef CONFIG_SCHED_BOOK
6974 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6975 group = cpumask_first(mask);
6976#elif defined(CONFIG_SCHED_MC)
6977 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6978 group = cpumask_first(mask);
6979#elif defined(CONFIG_SCHED_SMT)
6980 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6981 group = cpumask_first(mask);
6982#else
6983 group = cpu;
6984#endif
6985 if (sg)
6986 *sg = &per_cpu(sched_group_phys, group).sg;
6987 return group;
6988}
6989 7087
6990#ifdef CONFIG_NUMA 7088 if (!sg)
6991/* 7089 goto fail;
6992 * The init_sched_build_groups can't handle what we want to do with node
6993 * groups, so roll our own. Now each node has its own list of groups which
6994 * gets dynamically allocated.
6995 */
6996static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
6997static struct sched_group ***sched_group_nodes_bycpu;
6998 7090
6999static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 7091 sg_span = sched_group_cpus(sg);
7000static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7001 7092
7002static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 7093 child = *per_cpu_ptr(sdd->sd, i);
7003 struct sched_group **sg, 7094 if (child->child) {
7004 struct cpumask *nodemask) 7095 child = child->child;
7005{ 7096 cpumask_copy(sg_span, sched_domain_span(child));
7006 int group; 7097 } else
7098 cpumask_set_cpu(i, sg_span);
7007 7099
7008 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 7100 cpumask_or(covered, covered, sg_span);
7009 group = cpumask_first(nodemask);
7010 7101
7011 if (sg) 7102 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
7012 *sg = &per_cpu(sched_group_allnodes, group).sg; 7103 atomic_inc(&sg->sgp->ref);
7013 return group;
7014}
7015 7104
7016static void init_numa_sched_groups_power(struct sched_group *group_head) 7105 if (cpumask_test_cpu(cpu, sg_span))
7017{ 7106 groups = sg;
7018 struct sched_group *sg = group_head;
7019 int j;
7020 7107
7021 if (!sg) 7108 if (!first)
7022 return; 7109 first = sg;
7023 do { 7110 if (last)
7024 for_each_cpu(j, sched_group_cpus(sg)) { 7111 last->next = sg;
7025 struct sched_domain *sd; 7112 last = sg;
7113 last->next = first;
7114 }
7115 sd->groups = groups;
7026 7116
7027 sd = &per_cpu(phys_domains, j).sd; 7117 return 0;
7028 if (j != group_first_cpu(sd->groups)) {
7029 /*
7030 * Only add "power" once for each
7031 * physical package.
7032 */
7033 continue;
7034 }
7035 7118
7036 sg->cpu_power += sd->groups->cpu_power; 7119fail:
7037 } 7120 free_sched_groups(first, 0);
7038 sg = sg->next; 7121
7039 } while (sg != group_head); 7122 return -ENOMEM;
7040} 7123}
7041 7124
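build_overlap_sched_groups() above, like build_sched_groups() further down in this hunk, follows one shape: walk the span, skip CPUs that are already covered, allocate a group for each newly met owner, fold its member CPUs into a covered mask, and keep the group list circular at every step (last->next = first is re-established inside the loop). A user-space model of that loop, with a bitmask standing in for cpumask and owner_of[] standing in for get_group() or the child-domain span:

#include <stdlib.h>

#define NCPUS 8

struct group { struct group *next; unsigned mask; };

static struct group *build_groups(unsigned span, const int owner_of[NCPUS])
{
        struct group *first = NULL, *last = NULL, *sg;
        unsigned covered = 0;

        for (int i = 0; i < NCPUS; i++) {
                if (!(span & (1u << i)) || (covered & (1u << i)))
                        continue;

                sg = calloc(1, sizeof(*sg));
                if (!sg)
                        return NULL;            /* a real version would unwind */

                /* every CPU in the span with the same owner joins this group */
                for (int j = 0; j < NCPUS; j++)
                        if ((span & (1u << j)) && owner_of[j] == owner_of[i])
                                sg->mask |= 1u << j;
                covered |= sg->mask;

                if (!first)
                        first = sg;
                if (last)
                        last->next = sg;
                last = sg;
                last->next = first;             /* list stays circular throughout */
        }
        return first;
}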
7042static int build_numa_sched_groups(struct s_data *d, 7125static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
7043 const struct cpumask *cpu_map, int num)
7044{ 7126{
7045 struct sched_domain *sd; 7127 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
7046 struct sched_group *sg, *prev; 7128 struct sched_domain *child = sd->child;
7047 int n, j;
7048 7129
7049 cpumask_clear(d->covered); 7130 if (child)
7050 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 7131 cpu = cpumask_first(sched_domain_span(child));
7051 if (cpumask_empty(d->nodemask)) { 7132
7052 d->sched_group_nodes[num] = NULL; 7133 if (sg) {
7053 goto out; 7134 *sg = *per_cpu_ptr(sdd->sg, cpu);
7135 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7136 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
7054 } 7137 }
7055 7138
7056 sched_domain_node_span(num, d->domainspan); 7139 return cpu;
7057 cpumask_and(d->domainspan, d->domainspan, cpu_map); 7140}
7058 7141
7059 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7142/*
7060 GFP_KERNEL, num); 7143 * build_sched_groups will build a circular linked list of the groups
7061 if (!sg) { 7144 * covered by the given span, and will set each group's ->cpumask correctly,
7062 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 7145 * and ->cpu_power to 0.
7063 num); 7146 *
7064 return -ENOMEM; 7147 * Assumes the sched_domain tree is fully constructed
7065 } 7148 */
7066 d->sched_group_nodes[num] = sg; 7149static int
7150build_sched_groups(struct sched_domain *sd, int cpu)
7151{
7152 struct sched_group *first = NULL, *last = NULL;
7153 struct sd_data *sdd = sd->private;
7154 const struct cpumask *span = sched_domain_span(sd);
7155 struct cpumask *covered;
7156 int i;
7067 7157
7068 for_each_cpu(j, d->nodemask) { 7158 get_group(cpu, sdd, &sd->groups);
7069 sd = &per_cpu(node_domains, j).sd; 7159 atomic_inc(&sd->groups->ref);
7070 sd->groups = sg;
7071 }
7072 7160
7073 sg->cpu_power = 0; 7161 if (cpu != cpumask_first(sched_domain_span(sd)))
7074 cpumask_copy(sched_group_cpus(sg), d->nodemask); 7162 return 0;
7075 sg->next = sg;
7076 cpumask_or(d->covered, d->covered, d->nodemask);
7077 7163
7078 prev = sg; 7164 lockdep_assert_held(&sched_domains_mutex);
7079 for (j = 0; j < nr_node_ids; j++) { 7165 covered = sched_domains_tmpmask;
7080 n = (num + j) % nr_node_ids;
7081 cpumask_complement(d->notcovered, d->covered);
7082 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
7083 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
7084 if (cpumask_empty(d->tmpmask))
7085 break;
7086 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
7087 if (cpumask_empty(d->tmpmask))
7088 continue;
7089 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7090 GFP_KERNEL, num);
7091 if (!sg) {
7092 printk(KERN_WARNING
7093 "Can not alloc domain group for node %d\n", j);
7094 return -ENOMEM;
7095 }
7096 sg->cpu_power = 0;
7097 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
7098 sg->next = prev->next;
7099 cpumask_or(d->covered, d->covered, d->tmpmask);
7100 prev->next = sg;
7101 prev = sg;
7102 }
7103out:
7104 return 0;
7105}
7106#endif /* CONFIG_NUMA */
7107 7166
7108#ifdef CONFIG_NUMA 7167 cpumask_clear(covered);
7109/* Free memory allocated for various sched_group structures */
7110static void free_sched_groups(const struct cpumask *cpu_map,
7111 struct cpumask *nodemask)
7112{
7113 int cpu, i;
7114 7168
7115 for_each_cpu(cpu, cpu_map) { 7169 for_each_cpu(i, span) {
7116 struct sched_group **sched_group_nodes 7170 struct sched_group *sg;
7117 = sched_group_nodes_bycpu[cpu]; 7171 int group = get_group(i, sdd, &sg);
7172 int j;
7118 7173
7119 if (!sched_group_nodes) 7174 if (cpumask_test_cpu(i, covered))
7120 continue; 7175 continue;
7121 7176
7122 for (i = 0; i < nr_node_ids; i++) { 7177 cpumask_clear(sched_group_cpus(sg));
7123 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7178 sg->sgp->power = 0;
7124 7179
7125 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 7180 for_each_cpu(j, span) {
7126 if (cpumask_empty(nodemask)) 7181 if (get_group(j, sdd, NULL) != group)
7127 continue; 7182 continue;
7128 7183
7129 if (sg == NULL) 7184 cpumask_set_cpu(j, covered);
7130 continue; 7185 cpumask_set_cpu(j, sched_group_cpus(sg));
7131 sg = sg->next;
7132next_sg:
7133 oldsg = sg;
7134 sg = sg->next;
7135 kfree(oldsg);
7136 if (oldsg != sched_group_nodes[i])
7137 goto next_sg;
7138 } 7186 }
7139 kfree(sched_group_nodes); 7187
7140 sched_group_nodes_bycpu[cpu] = NULL; 7188 if (!first)
7189 first = sg;
7190 if (last)
7191 last->next = sg;
7192 last = sg;
7141 } 7193 }
7194 last->next = first;
7195
7196 return 0;
7142} 7197}
7143#else /* !CONFIG_NUMA */
7144static void free_sched_groups(const struct cpumask *cpu_map,
7145 struct cpumask *nodemask)
7146{
7147}
7148#endif /* CONFIG_NUMA */
7149 7198
7150/* 7199/*
7151 * Initialize sched groups cpu_power. 7200 * Initialize sched groups cpu_power.
@@ -7159,48 +7208,19 @@ static void free_sched_groups(const struct cpumask *cpu_map,
7159 */ 7208 */
7160static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7209static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7161{ 7210{
7162 struct sched_domain *child; 7211 struct sched_group *sg = sd->groups;
7163 struct sched_group *group;
7164 long power;
7165 int weight;
7166 7212
7167 WARN_ON(!sd || !sd->groups); 7213 WARN_ON(!sd || !sg);
7168
7169 if (cpu != group_first_cpu(sd->groups))
7170 return;
7171 7214
7172 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7215 do {
7173 7216 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7174 child = sd->child; 7217 sg = sg->next;
7175 7218 } while (sg != sd->groups);
7176 sd->groups->cpu_power = 0;
7177 7219
7178 if (!child) { 7220 if (cpu != group_first_cpu(sg))
7179 power = SCHED_LOAD_SCALE;
7180 weight = cpumask_weight(sched_domain_span(sd));
7181 /*
7182 * SMT siblings share the power of a single core.
7183 * Usually multiple threads get a better yield out of
7184 * that one core than a single thread would have,
7185 * reflect that in sd->smt_gain.
7186 */
7187 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
7188 power *= sd->smt_gain;
7189 power /= weight;
7190 power >>= SCHED_LOAD_SHIFT;
7191 }
7192 sd->groups->cpu_power += power;
7193 return; 7221 return;
7194 }
7195 7222
7196 /* 7223 update_group_power(sd, cpu);
7197 * Add cpu_power of each child group to this groups cpu_power.
7198 */
7199 group = child->groups;
7200 do {
7201 sd->groups->cpu_power += group->cpu_power;
7202 group = group->next;
7203 } while (group != child->groups);
7204} 7224}
7205 7225
7206/* 7226/*
@@ -7214,15 +7234,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7214# define SD_INIT_NAME(sd, type) do { } while (0) 7234# define SD_INIT_NAME(sd, type) do { } while (0)
7215#endif 7235#endif
7216 7236
7217#define SD_INIT(sd, type) sd_init_##type(sd) 7237#define SD_INIT_FUNC(type) \
7218 7238static noinline struct sched_domain * \
7219#define SD_INIT_FUNC(type) \ 7239sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7220static noinline void sd_init_##type(struct sched_domain *sd) \ 7240{ \
7221{ \ 7241 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7222 memset(sd, 0, sizeof(*sd)); \ 7242 *sd = SD_##type##_INIT; \
7223 *sd = SD_##type##_INIT; \ 7243 SD_INIT_NAME(sd, type); \
7224 sd->level = SD_LV_##type; \ 7244 sd->private = &tl->data; \
7225 SD_INIT_NAME(sd, type); \ 7245 return sd; \
7226} 7246}
7227 7247
7228SD_INIT_FUNC(CPU) 7248SD_INIT_FUNC(CPU)
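SD_INIT_FUNC() now stamps out one initializer per topology level: each generated sd_init_*() looks up the pre-allocated per-cpu sched_domain from the level's sd_data, copies in the level's SD_*_INIT template and points sd->private back at that sd_data. A small user-space model of the same token-pasting pattern; the struct layout and flag values here are invented for illustration:

#include <stdio.h>
#include <string.h>

struct domain { char name[16]; int flags; };

#define LEVEL_INIT_FUNC(type, init_flags)                       \
static struct domain *init_##type(struct domain *slot)         \
{                                                               \
        slot->flags = (init_flags);                             \
        strncpy(slot->name, #type, sizeof(slot->name) - 1);     \
        return slot;                                            \
}

LEVEL_INIT_FUNC(SIBLING, 0x1)   /* models SD_INIT_FUNC(SIBLING) */
LEVEL_INIT_FUNC(CPU,     0x2)   /* models SD_INIT_FUNC(CPU)     */

int main(void)
{
        struct domain smt = { "", 0 }, cpu = { "", 0 };

        printf("%s flags=%#x\n", init_SIBLING(&smt)->name, smt.flags);
        printf("%s flags=%#x\n", init_CPU(&cpu)->name, cpu.flags);
        return 0;
}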
@@ -7241,13 +7261,14 @@ SD_INIT_FUNC(CPU)
7241#endif 7261#endif
7242 7262
7243static int default_relax_domain_level = -1; 7263static int default_relax_domain_level = -1;
7264int sched_domain_level_max;
7244 7265
7245static int __init setup_relax_domain_level(char *str) 7266static int __init setup_relax_domain_level(char *str)
7246{ 7267{
7247 unsigned long val; 7268 unsigned long val;
7248 7269
7249 val = simple_strtoul(str, NULL, 0); 7270 val = simple_strtoul(str, NULL, 0);
7250 if (val < SD_LV_MAX) 7271 if (val < sched_domain_level_max)
7251 default_relax_domain_level = val; 7272 default_relax_domain_level = val;
7252 7273
7253 return 1; 7274 return 1;
@@ -7275,37 +7296,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7275 } 7296 }
7276} 7297}
7277 7298
7299static void __sdt_free(const struct cpumask *cpu_map);
7300static int __sdt_alloc(const struct cpumask *cpu_map);
7301
7278static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7302static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7279 const struct cpumask *cpu_map) 7303 const struct cpumask *cpu_map)
7280{ 7304{
7281 switch (what) { 7305 switch (what) {
7282 case sa_sched_groups:
7283 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7284 d->sched_group_nodes = NULL;
7285 case sa_rootdomain: 7306 case sa_rootdomain:
7286 free_rootdomain(d->rd); /* fall through */ 7307 if (!atomic_read(&d->rd->refcount))
7287 case sa_tmpmask: 7308 free_rootdomain(&d->rd->rcu); /* fall through */
7288 free_cpumask_var(d->tmpmask); /* fall through */ 7309 case sa_sd:
7289 case sa_send_covered: 7310 free_percpu(d->sd); /* fall through */
7290 free_cpumask_var(d->send_covered); /* fall through */ 7311 case sa_sd_storage:
7291 case sa_this_book_map: 7312 __sdt_free(cpu_map); /* fall through */
7292 free_cpumask_var(d->this_book_map); /* fall through */
7293 case sa_this_core_map:
7294 free_cpumask_var(d->this_core_map); /* fall through */
7295 case sa_this_sibling_map:
7296 free_cpumask_var(d->this_sibling_map); /* fall through */
7297 case sa_nodemask:
7298 free_cpumask_var(d->nodemask); /* fall through */
7299 case sa_sched_group_nodes:
7300#ifdef CONFIG_NUMA
7301 kfree(d->sched_group_nodes); /* fall through */
7302 case sa_notcovered:
7303 free_cpumask_var(d->notcovered); /* fall through */
7304 case sa_covered:
7305 free_cpumask_var(d->covered); /* fall through */
7306 case sa_domainspan:
7307 free_cpumask_var(d->domainspan); /* fall through */
7308#endif
7309 case sa_none: 7313 case sa_none:
7310 break; 7314 break;
7311 } 7315 }
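__free_domain_allocs() keeps its staged-unwind shape, only with far fewer stages: the switch is entered at the last stage that was reached, and every case falls through, so each label undoes exactly one allocation step plus everything below it. A minimal model of the idiom; the stage names and members are illustrative:

#include <stdlib.h>

enum alloc_stage { st_none, st_storage, st_sd, st_rootdomain };

struct build_state { void *storage, *sd, *rd; };

static void unwind(struct build_state *s, enum alloc_stage reached)
{
        switch (reached) {
        case st_rootdomain:
                free(s->rd);            /* fall through */
        case st_sd:
                free(s->sd);            /* fall through */
        case st_storage:
                free(s->storage);       /* fall through */
        case st_none:
                break;
        }
}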
@@ -7314,308 +7318,233 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7314static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7318static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7315 const struct cpumask *cpu_map) 7319 const struct cpumask *cpu_map)
7316{ 7320{
7317#ifdef CONFIG_NUMA 7321 memset(d, 0, sizeof(*d));
7318 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7322
7319 return sa_none; 7323 if (__sdt_alloc(cpu_map))
7320 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7324 return sa_sd_storage;
7321 return sa_domainspan; 7325 d->sd = alloc_percpu(struct sched_domain *);
7322 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7326 if (!d->sd)
7323 return sa_covered; 7327 return sa_sd_storage;
7324 /* Allocate the per-node list of sched groups */
7325 d->sched_group_nodes = kcalloc(nr_node_ids,
7326 sizeof(struct sched_group *), GFP_KERNEL);
7327 if (!d->sched_group_nodes) {
7328 printk(KERN_WARNING "Can not alloc sched group node list\n");
7329 return sa_notcovered;
7330 }
7331 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7332#endif
7333 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7334 return sa_sched_group_nodes;
7335 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7336 return sa_nodemask;
7337 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7338 return sa_this_sibling_map;
7339 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7340 return sa_this_core_map;
7341 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7342 return sa_this_book_map;
7343 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7344 return sa_send_covered;
7345 d->rd = alloc_rootdomain(); 7328 d->rd = alloc_rootdomain();
7346 if (!d->rd) { 7329 if (!d->rd)
7347 printk(KERN_WARNING "Cannot alloc root domain\n"); 7330 return sa_sd;
7348 return sa_tmpmask;
7349 }
7350 return sa_rootdomain; 7331 return sa_rootdomain;
7351} 7332}
7352 7333
7353static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7334/*
7354 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7335 * NULL the sd_data elements we've used to build the sched_domain and
7336 * sched_group structure so that the subsequent __free_domain_allocs()
7337 * will not free the data we're using.
7338 */
7339static void claim_allocations(int cpu, struct sched_domain *sd)
7355{ 7340{
7356 struct sched_domain *sd = NULL; 7341 struct sd_data *sdd = sd->private;
7357#ifdef CONFIG_NUMA
7358 struct sched_domain *parent;
7359
7360 d->sd_allnodes = 0;
7361 if (cpumask_weight(cpu_map) >
7362 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7363 sd = &per_cpu(allnodes_domains, i).sd;
7364 SD_INIT(sd, ALLNODES);
7365 set_domain_attribute(sd, attr);
7366 cpumask_copy(sched_domain_span(sd), cpu_map);
7367 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7368 d->sd_allnodes = 1;
7369 }
7370 parent = sd;
7371
7372 sd = &per_cpu(node_domains, i).sd;
7373 SD_INIT(sd, NODE);
7374 set_domain_attribute(sd, attr);
7375 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7376 sd->parent = parent;
7377 if (parent)
7378 parent->child = sd;
7379 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7380#endif
7381 return sd;
7382}
7383 7342
7384static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7343 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7385 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7344 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7386 struct sched_domain *parent, int i)
7387{
7388 struct sched_domain *sd;
7389 sd = &per_cpu(phys_domains, i).sd;
7390 SD_INIT(sd, CPU);
7391 set_domain_attribute(sd, attr);
7392 cpumask_copy(sched_domain_span(sd), d->nodemask);
7393 sd->parent = parent;
7394 if (parent)
7395 parent->child = sd;
7396 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7397 return sd;
7398}
7399 7345
7400static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7346 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
7401 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7347 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7402 struct sched_domain *parent, int i)
7403{
7404 struct sched_domain *sd = parent;
7405#ifdef CONFIG_SCHED_BOOK
7406 sd = &per_cpu(book_domains, i).sd;
7407 SD_INIT(sd, BOOK);
7408 set_domain_attribute(sd, attr);
7409 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7410 sd->parent = parent;
7411 parent->child = sd;
7412 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7413#endif
7414 return sd;
7415}
7416 7348
7417static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7349 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
7418 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7350 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
7419 struct sched_domain *parent, int i)
7420{
7421 struct sched_domain *sd = parent;
7422#ifdef CONFIG_SCHED_MC
7423 sd = &per_cpu(core_domains, i).sd;
7424 SD_INIT(sd, MC);
7425 set_domain_attribute(sd, attr);
7426 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7427 sd->parent = parent;
7428 parent->child = sd;
7429 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7430#endif
7431 return sd;
7432} 7351}
7433 7352
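claim_allocations() above is the other half of the pre-allocation scheme: once a per-cpu sched_domain, sched_group or sched_group_power has been linked into the live tree (for groups and group-power, a non-zero refcount shows a holder), its slot in sd_data is NULLed so the later __sdt_free() sweep only releases the leftovers. In user-space terms, roughly (the slot layout is illustrative):

#include <stdlib.h>

#define NSLOTS 4

/* The builder claims a slot by taking ownership and clearing it. */
static void *claim(void *slots[NSLOTS], int i)
{
        void *obj = slots[i];

        slots[i] = NULL;                /* now owned by the live tree */
        return obj;
}

/* Generic teardown frees whatever was never claimed. */
static void teardown(void *slots[NSLOTS])
{
        for (int i = 0; i < NSLOTS; i++)
                free(slots[i]);         /* free(NULL) is a no-op */
}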
7434static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7435 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7436 struct sched_domain *parent, int i)
7437{
7438 struct sched_domain *sd = parent;
7439#ifdef CONFIG_SCHED_SMT 7353#ifdef CONFIG_SCHED_SMT
7440 sd = &per_cpu(cpu_domains, i).sd; 7354static const struct cpumask *cpu_smt_mask(int cpu)
7441 SD_INIT(sd, SIBLING); 7355{
7442 set_domain_attribute(sd, attr); 7356 return topology_thread_cpumask(cpu);
7443 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7444 sd->parent = parent;
7445 parent->child = sd;
7446 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7447#endif
7448 return sd;
7449} 7357}
7358#endif
7450 7359
7451static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7360/*
7452 const struct cpumask *cpu_map, int cpu) 7361 * Topology list, bottom-up.
7453{ 7362 */
7454 switch (l) { 7363static struct sched_domain_topology_level default_topology[] = {
7455#ifdef CONFIG_SCHED_SMT 7364#ifdef CONFIG_SCHED_SMT
7456 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7365 { sd_init_SIBLING, cpu_smt_mask, },
7457 cpumask_and(d->this_sibling_map, cpu_map,
7458 topology_thread_cpumask(cpu));
7459 if (cpu == cpumask_first(d->this_sibling_map))
7460 init_sched_build_groups(d->this_sibling_map, cpu_map,
7461 &cpu_to_cpu_group,
7462 d->send_covered, d->tmpmask);
7463 break;
7464#endif 7366#endif
7465#ifdef CONFIG_SCHED_MC 7367#ifdef CONFIG_SCHED_MC
7466 case SD_LV_MC: /* set up multi-core groups */ 7368 { sd_init_MC, cpu_coregroup_mask, },
7467 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7468 if (cpu == cpumask_first(d->this_core_map))
7469 init_sched_build_groups(d->this_core_map, cpu_map,
7470 &cpu_to_core_group,
7471 d->send_covered, d->tmpmask);
7472 break;
7473#endif 7369#endif
7474#ifdef CONFIG_SCHED_BOOK 7370#ifdef CONFIG_SCHED_BOOK
7475 case SD_LV_BOOK: /* set up book groups */ 7371 { sd_init_BOOK, cpu_book_mask, },
7476 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7477 if (cpu == cpumask_first(d->this_book_map))
7478 init_sched_build_groups(d->this_book_map, cpu_map,
7479 &cpu_to_book_group,
7480 d->send_covered, d->tmpmask);
7481 break;
7482#endif 7372#endif
7483 case SD_LV_CPU: /* set up physical groups */ 7373 { sd_init_CPU, cpu_cpu_mask, },
7484 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7485 if (!cpumask_empty(d->nodemask))
7486 init_sched_build_groups(d->nodemask, cpu_map,
7487 &cpu_to_phys_group,
7488 d->send_covered, d->tmpmask);
7489 break;
7490#ifdef CONFIG_NUMA 7374#ifdef CONFIG_NUMA
7491 case SD_LV_ALLNODES: 7375 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
7492 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7376 { sd_init_ALLNODES, cpu_allnodes_mask, },
7493 d->send_covered, d->tmpmask);
7494 break;
7495#endif 7377#endif
7496 default: 7378 { NULL, },
7497 break; 7379};
7380
7381static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7382
7383static int __sdt_alloc(const struct cpumask *cpu_map)
7384{
7385 struct sched_domain_topology_level *tl;
7386 int j;
7387
7388 for (tl = sched_domain_topology; tl->init; tl++) {
7389 struct sd_data *sdd = &tl->data;
7390
7391 sdd->sd = alloc_percpu(struct sched_domain *);
7392 if (!sdd->sd)
7393 return -ENOMEM;
7394
7395 sdd->sg = alloc_percpu(struct sched_group *);
7396 if (!sdd->sg)
7397 return -ENOMEM;
7398
7399 sdd->sgp = alloc_percpu(struct sched_group_power *);
7400 if (!sdd->sgp)
7401 return -ENOMEM;
7402
7403 for_each_cpu(j, cpu_map) {
7404 struct sched_domain *sd;
7405 struct sched_group *sg;
7406 struct sched_group_power *sgp;
7407
7408 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7409 GFP_KERNEL, cpu_to_node(j));
7410 if (!sd)
7411 return -ENOMEM;
7412
7413 *per_cpu_ptr(sdd->sd, j) = sd;
7414
7415 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7416 GFP_KERNEL, cpu_to_node(j));
7417 if (!sg)
7418 return -ENOMEM;
7419
7420 *per_cpu_ptr(sdd->sg, j) = sg;
7421
7422 sgp = kzalloc_node(sizeof(struct sched_group_power),
7423 GFP_KERNEL, cpu_to_node(j));
7424 if (!sgp)
7425 return -ENOMEM;
7426
7427 *per_cpu_ptr(sdd->sgp, j) = sgp;
7428 }
7429 }
7430
7431 return 0;
7432}
7433
7434static void __sdt_free(const struct cpumask *cpu_map)
7435{
7436 struct sched_domain_topology_level *tl;
7437 int j;
7438
7439 for (tl = sched_domain_topology; tl->init; tl++) {
7440 struct sd_data *sdd = &tl->data;
7441
7442 for_each_cpu(j, cpu_map) {
7443 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7444 if (sd && (sd->flags & SD_OVERLAP))
7445 free_sched_groups(sd->groups, 0);
7446 kfree(*per_cpu_ptr(sdd->sg, j));
7447 kfree(*per_cpu_ptr(sdd->sgp, j));
7448 }
7449 free_percpu(sdd->sd);
7450 free_percpu(sdd->sg);
7451 free_percpu(sdd->sgp);
7498 } 7452 }
7499} 7453}
7500 7454
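__sdt_alloc() and __sdt_free() pre-allocate one sched_domain, sched_group and sched_group_power per (topology level, CPU) pair, so the per-CPU build loop never allocates and teardown can sweep the same grid no matter how far construction got. A condensed user-space model with a fixed grid and a single slot kind instead of three:

#include <stdlib.h>

#define NLEVELS 3
#define NCPUS   8

static void *slot[NLEVELS][NCPUS];

static int storage_alloc(size_t size)
{
        for (int l = 0; l < NLEVELS; l++)
                for (int c = 0; c < NCPUS; c++)
                        if (!(slot[l][c] = calloc(1, size)))
                                return -1;      /* caller unwinds via storage_free() */
        return 0;
}

static void storage_free(void)
{
        for (int l = 0; l < NLEVELS; l++)
                for (int c = 0; c < NCPUS; c++) {
                        free(slot[l][c]);       /* safe after a partial alloc */
                        slot[l][c] = NULL;
                }
}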
7455struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7456 struct s_data *d, const struct cpumask *cpu_map,
7457 struct sched_domain_attr *attr, struct sched_domain *child,
7458 int cpu)
7459{
7460 struct sched_domain *sd = tl->init(tl, cpu);
7461 if (!sd)
7462 return child;
7463
7464 set_domain_attribute(sd, attr);
7465 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7466 if (child) {
7467 sd->level = child->level + 1;
7468 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7469 child->parent = sd;
7470 }
7471 sd->child = child;
7472
7473 return sd;
7474}
7475
7501/* 7476/*
7502 * Build sched domains for a given set of cpus and attach the sched domains 7477 * Build sched domains for a given set of cpus and attach the sched domains
7503 * to the individual cpus 7478 * to the individual cpus
7504 */ 7479 */
7505static int __build_sched_domains(const struct cpumask *cpu_map, 7480static int build_sched_domains(const struct cpumask *cpu_map,
7506 struct sched_domain_attr *attr) 7481 struct sched_domain_attr *attr)
7507{ 7482{
7508 enum s_alloc alloc_state = sa_none; 7483 enum s_alloc alloc_state = sa_none;
7509 struct s_data d;
7510 struct sched_domain *sd; 7484 struct sched_domain *sd;
7511 int i; 7485 struct s_data d;
7512#ifdef CONFIG_NUMA 7486 int i, ret = -ENOMEM;
7513 d.sd_allnodes = 0;
7514#endif
7515 7487
7516 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7488 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7517 if (alloc_state != sa_rootdomain) 7489 if (alloc_state != sa_rootdomain)
7518 goto error; 7490 goto error;
7519 alloc_state = sa_sched_groups;
7520 7491
7521 /* 7492 /* Set up domains for cpus specified by the cpu_map. */
7522 * Set up domains for cpus specified by the cpu_map.
7523 */
7524 for_each_cpu(i, cpu_map) { 7493 for_each_cpu(i, cpu_map) {
7525 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7494 struct sched_domain_topology_level *tl;
7526 cpu_map); 7495
7496 sd = NULL;
7497 for (tl = sched_domain_topology; tl->init; tl++) {
7498 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7499 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7500 sd->flags |= SD_OVERLAP;
7501 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7502 break;
7503 }
7527 7504
7528 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7505 while (sd->child)
7529 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7506 sd = sd->child;
7530 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7531 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7532 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7533 }
7534 7507
7535 for_each_cpu(i, cpu_map) { 7508 *per_cpu_ptr(d.sd, i) = sd;
7536 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7537 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7538 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7539 } 7509 }
7540 7510
7541 /* Set up physical groups */ 7511 /* Build the groups for the domains */
7542 for (i = 0; i < nr_node_ids; i++)
7543 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7544
7545#ifdef CONFIG_NUMA
7546 /* Set up node groups */
7547 if (d.sd_allnodes)
7548 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7549
7550 for (i = 0; i < nr_node_ids; i++)
7551 if (build_numa_sched_groups(&d, cpu_map, i))
7552 goto error;
7553#endif
7554
7555 /* Calculate CPU power for physical packages and nodes */
7556#ifdef CONFIG_SCHED_SMT
7557 for_each_cpu(i, cpu_map) { 7512 for_each_cpu(i, cpu_map) {
7558 sd = &per_cpu(cpu_domains, i).sd; 7513 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7559 init_sched_groups_power(i, sd); 7514 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7560 } 7515 if (sd->flags & SD_OVERLAP) {
7561#endif 7516 if (build_overlap_sched_groups(sd, i))
7562#ifdef CONFIG_SCHED_MC 7517 goto error;
7563 for_each_cpu(i, cpu_map) { 7518 } else {
7564 sd = &per_cpu(core_domains, i).sd; 7519 if (build_sched_groups(sd, i))
7565 init_sched_groups_power(i, sd); 7520 goto error;
7566 } 7521 }
7567#endif 7522 }
7568#ifdef CONFIG_SCHED_BOOK
7569 for_each_cpu(i, cpu_map) {
7570 sd = &per_cpu(book_domains, i).sd;
7571 init_sched_groups_power(i, sd);
7572 }
7573#endif
7574
7575 for_each_cpu(i, cpu_map) {
7576 sd = &per_cpu(phys_domains, i).sd;
7577 init_sched_groups_power(i, sd);
7578 } 7523 }
7579 7524
7580#ifdef CONFIG_NUMA 7525 /* Calculate CPU power for physical packages and nodes */
7581 for (i = 0; i < nr_node_ids; i++) 7526 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7582 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7527 if (!cpumask_test_cpu(i, cpu_map))
7583 7528 continue;
7584 if (d.sd_allnodes) {
7585 struct sched_group *sg;
7586 7529
7587 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7530 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7588 d.tmpmask); 7531 claim_allocations(i, sd);
7589 init_numa_sched_groups_power(sg); 7532 init_sched_groups_power(i, sd);
7533 }
7590 } 7534 }
7591#endif
7592 7535
7593 /* Attach the domains */ 7536 /* Attach the domains */
7537 rcu_read_lock();
7594 for_each_cpu(i, cpu_map) { 7538 for_each_cpu(i, cpu_map) {
7595#ifdef CONFIG_SCHED_SMT 7539 sd = *per_cpu_ptr(d.sd, i);
7596 sd = &per_cpu(cpu_domains, i).sd;
7597#elif defined(CONFIG_SCHED_MC)
7598 sd = &per_cpu(core_domains, i).sd;
7599#elif defined(CONFIG_SCHED_BOOK)
7600 sd = &per_cpu(book_domains, i).sd;
7601#else
7602 sd = &per_cpu(phys_domains, i).sd;
7603#endif
7604 cpu_attach_domain(sd, d.rd, i); 7540 cpu_attach_domain(sd, d.rd, i);
7605 } 7541 }
7542 rcu_read_unlock();
7606 7543
7607 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7544 ret = 0;
7608 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7609 return 0;
7610
7611error: 7545error:
7612 __free_domain_allocs(&d, alloc_state, cpu_map); 7546 __free_domain_allocs(&d, alloc_state, cpu_map);
7613 return -ENOMEM; 7547 return ret;
7614}
7615
7616static int build_sched_domains(const struct cpumask *cpu_map)
7617{
7618 return __build_sched_domains(cpu_map, NULL);
7619} 7548}
7620 7549
7621static cpumask_var_t *doms_cur; /* current sched domains */ 7550static cpumask_var_t *doms_cur; /* current sched domains */
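The rewritten build_sched_domains() also funnels success and failure through a single exit: ret starts out as -ENOMEM, is set to 0 only once every CPU has its domain attached, and the error label releases the per-build temporaries either way. A tiny model of that shape; the helpers and failure knob are invented:

#include <stdlib.h>

static void *scratch;                   /* models the struct s_data temporaries */

static int build(int fail_step)
{
        int ret = -1;                   /* -ENOMEM in the kernel */

        scratch = malloc(64);
        if (!scratch || fail_step == 1)
                goto error;
        if (fail_step == 2)             /* e.g. group construction failed */
                goto error;

        ret = 0;                        /* success still runs the cleanup below */
error:
        free(scratch);                  /* per-build scratch is always released */
        scratch = NULL;
        return ret;
}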
@@ -7670,7 +7599,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7670 * For now this just excludes isolated cpus, but could be used to 7599 * For now this just excludes isolated cpus, but could be used to
7671 * exclude other special cases in the future. 7600 * exclude other special cases in the future.
7672 */ 7601 */
7673static int arch_init_sched_domains(const struct cpumask *cpu_map) 7602static int init_sched_domains(const struct cpumask *cpu_map)
7674{ 7603{
7675 int err; 7604 int err;
7676 7605
@@ -7681,32 +7610,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7681 doms_cur = &fallback_doms; 7610 doms_cur = &fallback_doms;
7682 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7611 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7683 dattr_cur = NULL; 7612 dattr_cur = NULL;
7684 err = build_sched_domains(doms_cur[0]); 7613 err = build_sched_domains(doms_cur[0], NULL);
7685 register_sched_domain_sysctl(); 7614 register_sched_domain_sysctl();
7686 7615
7687 return err; 7616 return err;
7688} 7617}
7689 7618
7690static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7691 struct cpumask *tmpmask)
7692{
7693 free_sched_groups(cpu_map, tmpmask);
7694}
7695
7696/* 7619/*
7697 * Detach sched domains from a group of cpus specified in cpu_map 7620 * Detach sched domains from a group of cpus specified in cpu_map
7698 * These cpus will now be attached to the NULL domain 7621 * These cpus will now be attached to the NULL domain
7699 */ 7622 */
7700static void detach_destroy_domains(const struct cpumask *cpu_map) 7623static void detach_destroy_domains(const struct cpumask *cpu_map)
7701{ 7624{
7702 /* Save because hotplug lock held. */
7703 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7704 int i; 7625 int i;
7705 7626
7627 rcu_read_lock();
7706 for_each_cpu(i, cpu_map) 7628 for_each_cpu(i, cpu_map)
7707 cpu_attach_domain(NULL, &def_root_domain, i); 7629 cpu_attach_domain(NULL, &def_root_domain, i);
7708 synchronize_sched(); 7630 rcu_read_unlock();
7709 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7710} 7631}
7711 7632
7712/* handle null as "default" */ 7633/* handle null as "default" */
@@ -7795,8 +7716,7 @@ match1:
7795 goto match2; 7716 goto match2;
7796 } 7717 }
7797 /* no match - add a new doms_new */ 7718 /* no match - add a new doms_new */
7798 __build_sched_domains(doms_new[i], 7719 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7799 dattr_new ? dattr_new + i : NULL);
7800match2: 7720match2:
7801 ; 7721 ;
7802 } 7722 }
@@ -7815,7 +7735,7 @@ match2:
7815} 7735}
7816 7736
7817#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7737#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7818static void arch_reinit_sched_domains(void) 7738static void reinit_sched_domains(void)
7819{ 7739{
7820 get_online_cpus(); 7740 get_online_cpus();
7821 7741
@@ -7848,7 +7768,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7848 else 7768 else
7849 sched_mc_power_savings = level; 7769 sched_mc_power_savings = level;
7850 7770
7851 arch_reinit_sched_domains(); 7771 reinit_sched_domains();
7852 7772
7853 return count; 7773 return count;
7854} 7774}
@@ -7967,14 +7887,9 @@ void __init sched_init_smp(void)
7967 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7887 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7968 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7888 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7969 7889
7970#if defined(CONFIG_NUMA)
7971 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7972 GFP_KERNEL);
7973 BUG_ON(sched_group_nodes_bycpu == NULL);
7974#endif
7975 get_online_cpus(); 7890 get_online_cpus();
7976 mutex_lock(&sched_domains_mutex); 7891 mutex_lock(&sched_domains_mutex);
7977 arch_init_sched_domains(cpu_active_mask); 7892 init_sched_domains(cpu_active_mask);
7978 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7893 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7979 if (cpumask_empty(non_isolated_cpus)) 7894 if (cpumask_empty(non_isolated_cpus))
7980 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7895 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -8013,18 +7928,14 @@ int in_sched_functions(unsigned long addr)
8013 && addr < (unsigned long)__sched_text_end); 7928 && addr < (unsigned long)__sched_text_end);
8014} 7929}
8015 7930
8016static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7931static void init_cfs_rq(struct cfs_rq *cfs_rq)
8017{ 7932{
8018 cfs_rq->tasks_timeline = RB_ROOT; 7933 cfs_rq->tasks_timeline = RB_ROOT;
8019 INIT_LIST_HEAD(&cfs_rq->tasks); 7934 INIT_LIST_HEAD(&cfs_rq->tasks);
8020#ifdef CONFIG_FAIR_GROUP_SCHED
8021 cfs_rq->rq = rq;
8022 /* allow initial update_cfs_load() to truncate */
8023#ifdef CONFIG_SMP
8024 cfs_rq->load_stamp = 1;
8025#endif
8026#endif
8027 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7935 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7936#ifndef CONFIG_64BIT
7937 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7938#endif
8028} 7939}
8029 7940
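init_cfs_rq() now mirrors min_vruntime into min_vruntime_copy on 32-bit builds. Elsewhere in this series (sched_fair.c) the writer updates min_vruntime and then, after a write barrier, the copy, while lockless readers load the copy first, then the value, and retry until the two agree, so a torn 64-bit read is never used. The shape of that protocol, modeled with C11 atomics (the kernel uses plain u64 fields plus smp_wmb()/smp_rmb(); only the ordering is the point here):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t value;        /* models min_vruntime, written under the rq lock */
static _Atomic uint64_t value_copy;   /* models min_vruntime_copy, for lockless readers */

static void writer_update(uint64_t v)
{
        atomic_store_explicit(&value, v, memory_order_relaxed);
        /* smp_wmb() in the kernel: the value is visible before the copy */
        atomic_store_explicit(&value_copy, v, memory_order_release);
}

static uint64_t reader_read(void)
{
        uint64_t copy, v;

        do {
                copy = atomic_load_explicit(&value_copy, memory_order_acquire);
                /* smp_rmb() in the kernel sits between these two loads */
                v = atomic_load_explicit(&value, memory_order_relaxed);
        } while (v != copy);          /* a mismatch means the writer raced us */

        return v;
}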
8030static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7941static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -8040,27 +7951,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8040 /* delimiter for bitsearch: */ 7951 /* delimiter for bitsearch: */
8041 __set_bit(MAX_RT_PRIO, array->bitmap); 7952 __set_bit(MAX_RT_PRIO, array->bitmap);
8042 7953
8043#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 7954#if defined CONFIG_SMP
8044 rt_rq->highest_prio.curr = MAX_RT_PRIO; 7955 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8045#ifdef CONFIG_SMP
8046 rt_rq->highest_prio.next = MAX_RT_PRIO; 7956 rt_rq->highest_prio.next = MAX_RT_PRIO;
8047#endif
8048#endif
8049#ifdef CONFIG_SMP
8050 rt_rq->rt_nr_migratory = 0; 7957 rt_rq->rt_nr_migratory = 0;
8051 rt_rq->overloaded = 0; 7958 rt_rq->overloaded = 0;
8052 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 7959 plist_head_init(&rt_rq->pushable_tasks);
8053#endif 7960#endif
8054 7961
8055 rt_rq->rt_time = 0; 7962 rt_rq->rt_time = 0;
8056 rt_rq->rt_throttled = 0; 7963 rt_rq->rt_throttled = 0;
8057 rt_rq->rt_runtime = 0; 7964 rt_rq->rt_runtime = 0;
8058 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 7965 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8059
8060#ifdef CONFIG_RT_GROUP_SCHED
8061 rt_rq->rt_nr_boosted = 0;
8062 rt_rq->rq = rq;
8063#endif
8064} 7966}
8065 7967
8066#ifdef CONFIG_FAIR_GROUP_SCHED 7968#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8069,11 +7971,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8069 struct sched_entity *parent) 7971 struct sched_entity *parent)
8070{ 7972{
8071 struct rq *rq = cpu_rq(cpu); 7973 struct rq *rq = cpu_rq(cpu);
8072 tg->cfs_rq[cpu] = cfs_rq; 7974
8073 init_cfs_rq(cfs_rq, rq);
8074 cfs_rq->tg = tg; 7975 cfs_rq->tg = tg;
7976 cfs_rq->rq = rq;
7977#ifdef CONFIG_SMP
7978 /* allow initial update_cfs_load() to truncate */
7979 cfs_rq->load_stamp = 1;
7980#endif
8075 7981
7982 tg->cfs_rq[cpu] = cfs_rq;
8076 tg->se[cpu] = se; 7983 tg->se[cpu] = se;
7984
8077 /* se could be NULL for root_task_group */ 7985 /* se could be NULL for root_task_group */
8078 if (!se) 7986 if (!se)
8079 return; 7987 return;
@@ -8096,12 +8004,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8096{ 8004{
8097 struct rq *rq = cpu_rq(cpu); 8005 struct rq *rq = cpu_rq(cpu);
8098 8006
8099 tg->rt_rq[cpu] = rt_rq; 8007 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8100 init_rt_rq(rt_rq, rq); 8008 rt_rq->rt_nr_boosted = 0;
8009 rt_rq->rq = rq;
8101 rt_rq->tg = tg; 8010 rt_rq->tg = tg;
8102 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8103 8011
8012 tg->rt_rq[cpu] = rt_rq;
8104 tg->rt_se[cpu] = rt_se; 8013 tg->rt_se[cpu] = rt_se;
8014
8105 if (!rt_se) 8015 if (!rt_se)
8106 return; 8016 return;
8107 8017
@@ -8183,7 +8093,7 @@ void __init sched_init(void)
8183 rq->nr_running = 0; 8093 rq->nr_running = 0;
8184 rq->calc_load_active = 0; 8094 rq->calc_load_active = 0;
8185 rq->calc_load_update = jiffies + LOAD_FREQ; 8095 rq->calc_load_update = jiffies + LOAD_FREQ;
8186 init_cfs_rq(&rq->cfs, rq); 8096 init_cfs_rq(&rq->cfs);
8187 init_rt_rq(&rq->rt, rq); 8097 init_rt_rq(&rq->rt, rq);
8188#ifdef CONFIG_FAIR_GROUP_SCHED 8098#ifdef CONFIG_FAIR_GROUP_SCHED
8189 root_task_group.shares = root_task_group_load; 8099 root_task_group.shares = root_task_group_load;
@@ -8224,7 +8134,7 @@ void __init sched_init(void)
8224#ifdef CONFIG_SMP 8134#ifdef CONFIG_SMP
8225 rq->sd = NULL; 8135 rq->sd = NULL;
8226 rq->rd = NULL; 8136 rq->rd = NULL;
8227 rq->cpu_power = SCHED_LOAD_SCALE; 8137 rq->cpu_power = SCHED_POWER_SCALE;
8228 rq->post_schedule = 0; 8138 rq->post_schedule = 0;
8229 rq->active_balance = 0; 8139 rq->active_balance = 0;
8230 rq->next_balance = jiffies; 8140 rq->next_balance = jiffies;
@@ -8254,7 +8164,7 @@ void __init sched_init(void)
8254#endif 8164#endif
8255 8165
8256#ifdef CONFIG_RT_MUTEXES 8166#ifdef CONFIG_RT_MUTEXES
8257 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 8167 plist_head_init(&init_task.pi_waiters);
8258#endif 8168#endif
8259 8169
8260 /* 8170 /*
@@ -8281,6 +8191,7 @@ void __init sched_init(void)
8281 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8191 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8282 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8192 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8283#ifdef CONFIG_SMP 8193#ifdef CONFIG_SMP
8194 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8284#ifdef CONFIG_NO_HZ 8195#ifdef CONFIG_NO_HZ
8285 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8196 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8286 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8197 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8296,7 +8207,7 @@ void __init sched_init(void)
8296 scheduler_running = 1; 8207 scheduler_running = 1;
8297} 8208}
8298 8209
8299#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8210#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8300static inline int preempt_count_equals(int preempt_offset) 8211static inline int preempt_count_equals(int preempt_offset)
8301{ 8212{
8302 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8213 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8306,7 +8217,6 @@ static inline int preempt_count_equals(int preempt_offset)
8306 8217
8307void __might_sleep(const char *file, int line, int preempt_offset) 8218void __might_sleep(const char *file, int line, int preempt_offset)
8308{ 8219{
8309#ifdef in_atomic
8310 static unsigned long prev_jiffy; /* ratelimiting */ 8220 static unsigned long prev_jiffy; /* ratelimiting */
8311 8221
8312 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8222 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8328,7 +8238,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
8328 if (irqs_disabled()) 8238 if (irqs_disabled())
8329 print_irqtrace_events(current); 8239 print_irqtrace_events(current);
8330 dump_stack(); 8240 dump_stack();
8331#endif
8332} 8241}
8333EXPORT_SYMBOL(__might_sleep); 8242EXPORT_SYMBOL(__might_sleep);
8334#endif 8243#endif
@@ -8340,7 +8249,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8340 int old_prio = p->prio; 8249 int old_prio = p->prio;
8341 int on_rq; 8250 int on_rq;
8342 8251
8343 on_rq = p->se.on_rq; 8252 on_rq = p->on_rq;
8344 if (on_rq) 8253 if (on_rq)
8345 deactivate_task(rq, p, 0); 8254 deactivate_task(rq, p, 0);
8346 __setscheduler(rq, p, SCHED_NORMAL, 0); 8255 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8487,6 +8396,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8487 if (!se) 8396 if (!se)
8488 goto err_free_rq; 8397 goto err_free_rq;
8489 8398
8399 init_cfs_rq(cfs_rq);
8490 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8400 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8491 } 8401 }
8492 8402
@@ -8514,7 +8424,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8514 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8424 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8515 raw_spin_unlock_irqrestore(&rq->lock, flags); 8425 raw_spin_unlock_irqrestore(&rq->lock, flags);
8516} 8426}
8517#else /* !CONFG_FAIR_GROUP_SCHED */ 8427#else /* !CONFIG_FAIR_GROUP_SCHED */
8518static inline void free_fair_sched_group(struct task_group *tg) 8428static inline void free_fair_sched_group(struct task_group *tg)
8519{ 8429{
8520} 8430}
@@ -8535,7 +8445,8 @@ static void free_rt_sched_group(struct task_group *tg)
8535{ 8445{
8536 int i; 8446 int i;
8537 8447
8538 destroy_rt_bandwidth(&tg->rt_bandwidth); 8448 if (tg->rt_se)
8449 destroy_rt_bandwidth(&tg->rt_bandwidth);
8539 8450
8540 for_each_possible_cpu(i) { 8451 for_each_possible_cpu(i) {
8541 if (tg->rt_rq) 8452 if (tg->rt_rq)
@@ -8553,7 +8464,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8553{ 8464{
8554 struct rt_rq *rt_rq; 8465 struct rt_rq *rt_rq;
8555 struct sched_rt_entity *rt_se; 8466 struct sched_rt_entity *rt_se;
8556 struct rq *rq;
8557 int i; 8467 int i;
8558 8468
8559 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8469 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8567,8 +8477,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8567 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8477 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8568 8478
8569 for_each_possible_cpu(i) { 8479 for_each_possible_cpu(i) {
8570 rq = cpu_rq(i);
8571
8572 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8480 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8573 GFP_KERNEL, cpu_to_node(i)); 8481 GFP_KERNEL, cpu_to_node(i));
8574 if (!rt_rq) 8482 if (!rt_rq)
@@ -8579,6 +8487,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8579 if (!rt_se) 8487 if (!rt_se)
8580 goto err_free_rq; 8488 goto err_free_rq;
8581 8489
8490 init_rt_rq(rt_rq, cpu_rq(i));
8491 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8582 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 8492 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8583 } 8493 }
8584 8494
@@ -8683,7 +8593,7 @@ void sched_move_task(struct task_struct *tsk)
8683 rq = task_rq_lock(tsk, &flags); 8593 rq = task_rq_lock(tsk, &flags);
8684 8594
8685 running = task_current(rq, tsk); 8595 running = task_current(rq, tsk);
8686 on_rq = tsk->se.on_rq; 8596 on_rq = tsk->on_rq;
8687 8597
8688 if (on_rq) 8598 if (on_rq)
8689 dequeue_task(rq, tsk, 0); 8599 dequeue_task(rq, tsk, 0);
@@ -8702,7 +8612,7 @@ void sched_move_task(struct task_struct *tsk)
8702 if (on_rq) 8612 if (on_rq)
8703 enqueue_task(rq, tsk, 0); 8613 enqueue_task(rq, tsk, 0);
8704 8614
8705 task_rq_unlock(rq, &flags); 8615 task_rq_unlock(rq, tsk, &flags);
8706} 8616}
8707#endif /* CONFIG_CGROUP_SCHED */ 8617#endif /* CONFIG_CGROUP_SCHED */
8708 8618
@@ -8720,10 +8630,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8720 if (!tg->se[0]) 8630 if (!tg->se[0])
8721 return -EINVAL; 8631 return -EINVAL;
8722 8632
8723 if (shares < MIN_SHARES) 8633 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8724 shares = MIN_SHARES;
8725 else if (shares > MAX_SHARES)
8726 shares = MAX_SHARES;
8727 8634
8728 mutex_lock(&shares_mutex); 8635 mutex_lock(&shares_mutex);
8729 if (tg->shares == shares) 8636 if (tg->shares == shares)
@@ -9073,42 +8980,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9073 return 0; 8980 return 0;
9074} 8981}
9075 8982
9076static int
9077cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9078 struct task_struct *tsk, bool threadgroup)
9079{
9080 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
9081 if (retval)
9082 return retval;
9083 if (threadgroup) {
9084 struct task_struct *c;
9085 rcu_read_lock();
9086 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9087 retval = cpu_cgroup_can_attach_task(cgrp, c);
9088 if (retval) {
9089 rcu_read_unlock();
9090 return retval;
9091 }
9092 }
9093 rcu_read_unlock();
9094 }
9095 return 0;
9096}
9097
9098static void 8983static void
9099cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8984cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9100 struct cgroup *old_cont, struct task_struct *tsk,
9101 bool threadgroup)
9102{ 8985{
9103 sched_move_task(tsk); 8986 sched_move_task(tsk);
9104 if (threadgroup) {
9105 struct task_struct *c;
9106 rcu_read_lock();
9107 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9108 sched_move_task(c);
9109 }
9110 rcu_read_unlock();
9111 }
9112} 8987}
9113 8988
9114static void 8989static void
@@ -9130,14 +9005,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9130static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 9005static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9131 u64 shareval) 9006 u64 shareval)
9132{ 9007{
9133 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 9008 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
9134} 9009}
9135 9010
9136static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 9011static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9137{ 9012{
9138 struct task_group *tg = cgroup_tg(cgrp); 9013 struct task_group *tg = cgroup_tg(cgrp);
9139 9014
9140 return (u64) tg->shares; 9015 return (u64) scale_load_down(tg->shares);
9141} 9016}
9142#endif /* CONFIG_FAIR_GROUP_SCHED */ 9017#endif /* CONFIG_FAIR_GROUP_SCHED */
9143 9018
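cpu_shares_write_u64() and cpu_shares_read_u64() above now convert the cgroup-visible share value with scale_load()/scale_load_down(), and sched_group_set_shares() earlier in this diff clamps the scaled weight in a single clamp() call; together they let the scheduler keep task-group weight at a higher fixed-point resolution than the user interface exposes. A sketch of that conversion; the shift width and limits below are made up for illustration:

#define RESOLUTION   10
#define MIN_SHARES   2UL
#define MAX_SHARES   (1UL << 18)

static inline unsigned long scale_up(unsigned long w)   { return w << RESOLUTION; }
static inline unsigned long scale_down(unsigned long w) { return w >> RESOLUTION; }

/* Write side: scale to internal units, then clamp to the valid range. */
static unsigned long set_shares(unsigned long user_val)
{
        unsigned long shares = scale_up(user_val);

        if (shares < scale_up(MIN_SHARES))
                shares = scale_up(MIN_SHARES);
        else if (shares > scale_up(MAX_SHARES))
                shares = scale_up(MAX_SHARES);
        return shares;                  /* stored, high-resolution weight */
}

/* Read side: report the weight back in user-visible units. */
static unsigned long get_shares(unsigned long stored)
{
        return scale_down(stored);
}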
@@ -9196,8 +9071,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9196 .name = "cpu", 9071 .name = "cpu",
9197 .create = cpu_cgroup_create, 9072 .create = cpu_cgroup_create,
9198 .destroy = cpu_cgroup_destroy, 9073 .destroy = cpu_cgroup_destroy,
9199 .can_attach = cpu_cgroup_can_attach, 9074 .can_attach_task = cpu_cgroup_can_attach_task,
9200 .attach = cpu_cgroup_attach, 9075 .attach_task = cpu_cgroup_attach_task,
9201 .exit = cpu_cgroup_exit, 9076 .exit = cpu_cgroup_exit,
9202 .populate = cpu_cgroup_populate, 9077 .populate = cpu_cgroup_populate,
9203 .subsys_id = cpu_cgroup_subsys_id, 9078 .subsys_id = cpu_cgroup_subsys_id,
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 05577055cfca..c2f0e7248dca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -13,6 +13,7 @@ struct autogroup {
13 int nice; 13 int nice;
14}; 14};
15 15
16static inline bool task_group_is_autogroup(struct task_group *tg);
16static inline struct task_group * 17static inline struct task_group *
17autogroup_task_group(struct task_struct *p, struct task_group *tg); 18autogroup_task_group(struct task_struct *p, struct task_group *tg);
18 19
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 7bacd83a4158..a6710a112b4f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
152 read_lock_irqsave(&tasklist_lock, flags); 152 read_lock_irqsave(&tasklist_lock, flags);
153 153
154 do_each_thread(g, p) { 154 do_each_thread(g, p) {
155 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 155 if (!p->on_rq || task_cpu(p) != rq_cpu)
156 continue; 156 continue;
157 157
158 print_task(m, rq, p); 158 print_task(m, rq, p);
@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
301
302#undef P 299#undef P
303#undef P64 300#undef P64
304#endif 301#endif
@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
441 P(se.statistics.wait_count); 438 P(se.statistics.wait_count);
442 PN(se.statistics.iowait_sum); 439 PN(se.statistics.iowait_sum);
443 P(se.statistics.iowait_count); 440 P(se.statistics.iowait_count);
444 P(sched_info.bkl_count);
445 P(se.nr_migrations); 441 P(se.nr_migrations);
446 P(se.statistics.nr_migrations_cold); 442 P(se.statistics.nr_migrations_cold);
447 P(se.statistics.nr_failed_migrations_affine); 443 P(se.statistics.nr_failed_migrations_affine);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6fa833ab2cb8..bc8ee9993814 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
135 return grp->my_q; 135 return grp->my_q;
136} 136}
137 137
138/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
139 * another cpu ('this_cpu')
140 */
141static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
142{
143 return cfs_rq->tg->cfs_rq[this_cpu];
144}
145
146static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 138static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
147{ 139{
148 if (!cfs_rq->on_list) { 140 if (!cfs_rq->on_list) {
@@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
271 return NULL; 263 return NULL;
272} 264}
273 265
274static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
275{
276 return &cpu_rq(this_cpu)->cfs;
277}
278
279static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 266static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
280{ 267{
281} 268}
@@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a,
334 return (s64)(a->vruntime - b->vruntime) < 0; 321 return (s64)(a->vruntime - b->vruntime) < 0;
335} 322}
336 323
337static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
338{
339 return se->vruntime - cfs_rq->min_vruntime;
340}
341
342static void update_min_vruntime(struct cfs_rq *cfs_rq) 324static void update_min_vruntime(struct cfs_rq *cfs_rq)
343{ 325{
344 u64 vruntime = cfs_rq->min_vruntime; 326 u64 vruntime = cfs_rq->min_vruntime;
@@ -358,6 +340,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
358 } 340 }
359 341
360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 342 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
343#ifndef CONFIG_64BIT
344 smp_wmb();
345 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
346#endif
361} 347}
362 348
363/* 349/*
@@ -368,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
368 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 354 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
369 struct rb_node *parent = NULL; 355 struct rb_node *parent = NULL;
370 struct sched_entity *entry; 356 struct sched_entity *entry;
371 s64 key = entity_key(cfs_rq, se);
372 int leftmost = 1; 357 int leftmost = 1;
373 358
374 /* 359 /*
@@ -381,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
381 * We don't care about collisions. Nodes with 366 * We don't care about collisions. Nodes with
382 * the same key stay together. 367 * the same key stay together.
383 */ 368 */
384 if (key < entity_key(cfs_rq, entry)) { 369 if (entity_before(se, entry)) {
385 link = &parent->rb_left; 370 link = &parent->rb_left;
386 } else { 371 } else {
387 link = &parent->rb_right; 372 link = &parent->rb_right;
@@ -1072,8 +1057,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1072 se->on_rq = 0; 1057 se->on_rq = 0;
1073 update_cfs_load(cfs_rq, 0); 1058 update_cfs_load(cfs_rq, 0);
1074 account_entity_dequeue(cfs_rq, se); 1059 account_entity_dequeue(cfs_rq, se);
1075 update_min_vruntime(cfs_rq);
1076 update_cfs_shares(cfs_rq);
1077 1060
1078 /* 1061 /*
1079 * Normalize the entity after updating the min_vruntime because the 1062 * Normalize the entity after updating the min_vruntime because the
@@ -1082,6 +1065,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1082 */ 1065 */
1083 if (!(flags & DEQUEUE_SLEEP)) 1066 if (!(flags & DEQUEUE_SLEEP))
1084 se->vruntime -= cfs_rq->min_vruntime; 1067 se->vruntime -= cfs_rq->min_vruntime;
1068
1069 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq);
1085} 1071}
1086 1072
1087/* 1073/*
@@ -1331,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1331 } 1317 }
1332 1318
1333 for_each_sched_entity(se) { 1319 for_each_sched_entity(se) {
1334 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1320 cfs_rq = cfs_rq_of(se);
1335 1321
1336 update_cfs_load(cfs_rq, 0); 1322 update_cfs_load(cfs_rq, 0);
1337 update_cfs_shares(cfs_rq); 1323 update_cfs_shares(cfs_rq);
@@ -1340,6 +1326,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1340 hrtick_update(rq); 1326 hrtick_update(rq);
1341} 1327}
1342 1328
1329static void set_next_buddy(struct sched_entity *se);
1330
1343/* 1331/*
1344 * The dequeue_task method is called before nr_running is 1332 * The dequeue_task method is called before nr_running is
1345 * decreased. We remove the task from the rbtree and 1333 * decreased. We remove the task from the rbtree and
@@ -1349,19 +1337,30 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1349{ 1337{
1350 struct cfs_rq *cfs_rq; 1338 struct cfs_rq *cfs_rq;
1351 struct sched_entity *se = &p->se; 1339 struct sched_entity *se = &p->se;
1340 int task_sleep = flags & DEQUEUE_SLEEP;
1352 1341
1353 for_each_sched_entity(se) { 1342 for_each_sched_entity(se) {
1354 cfs_rq = cfs_rq_of(se); 1343 cfs_rq = cfs_rq_of(se);
1355 dequeue_entity(cfs_rq, se, flags); 1344 dequeue_entity(cfs_rq, se, flags);
1356 1345
1357 /* Don't dequeue parent if it has other entities besides us */ 1346 /* Don't dequeue parent if it has other entities besides us */
1358 if (cfs_rq->load.weight) 1347 if (cfs_rq->load.weight) {
1348 /*
1349 * Bias pick_next to pick a task from this cfs_rq, as
1350 * p is sleeping when it is within its sched_slice.
1351 */
1352 if (task_sleep && parent_entity(se))
1353 set_next_buddy(parent_entity(se));
1354
1355 /* avoid re-evaluating load for this entity */
1356 se = parent_entity(se);
1359 break; 1357 break;
1358 }
1360 flags |= DEQUEUE_SLEEP; 1359 flags |= DEQUEUE_SLEEP;
1361 } 1360 }
1362 1361
1363 for_each_sched_entity(se) { 1362 for_each_sched_entity(se) {
1364 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1363 cfs_rq = cfs_rq_of(se);
1365 1364
1366 update_cfs_load(cfs_rq, 0); 1365 update_cfs_load(cfs_rq, 0);
1367 update_cfs_shares(cfs_rq); 1366 update_cfs_shares(cfs_rq);
@@ -1372,12 +1371,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1372 1371
1373#ifdef CONFIG_SMP 1372#ifdef CONFIG_SMP
1374 1373
1375static void task_waking_fair(struct rq *rq, struct task_struct *p) 1374static void task_waking_fair(struct task_struct *p)
1376{ 1375{
1377 struct sched_entity *se = &p->se; 1376 struct sched_entity *se = &p->se;
1378 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1377 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1378 u64 min_vruntime;
1379 1379
1380 se->vruntime -= cfs_rq->min_vruntime; 1380#ifndef CONFIG_64BIT
1381 u64 min_vruntime_copy;
1382
1383 do {
1384 min_vruntime_copy = cfs_rq->min_vruntime_copy;
1385 smp_rmb();
1386 min_vruntime = cfs_rq->min_vruntime;
1387 } while (min_vruntime != min_vruntime_copy);
1388#else
1389 min_vruntime = cfs_rq->min_vruntime;
1390#endif
1391
1392 se->vruntime -= min_vruntime;
1381} 1393}
1382 1394
1383#ifdef CONFIG_FAIR_GROUP_SCHED 1395#ifdef CONFIG_FAIR_GROUP_SCHED
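(Aside, not part of the patch: the hunks above pair the smp_wmb() added to update_min_vruntime() with the smp_rmb() retry loop in task_waking_fair(), so a 32-bit kernel never acts on a torn 64-bit min_vruntime. A minimal userspace sketch of the same copy-and-retry idea follows; C11 fences stand in for the kernel barriers, the variables are left plain rather than _Atomic for brevity, and every name is invented for illustration.)

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t min_vruntime;
static uint64_t min_vruntime_copy;

static void publish(uint64_t v)
{
	min_vruntime = v;
	atomic_thread_fence(memory_order_release);	/* kernel: smp_wmb() */
	min_vruntime_copy = v;
}

static uint64_t snapshot(void)
{
	uint64_t copy, val;

	do {	/* retry until value and copy agree, i.e. no torn read seen */
		copy = min_vruntime_copy;
		atomic_thread_fence(memory_order_acquire);	/* kernel: smp_rmb() */
		val = min_vruntime;
	} while (val != copy);

	return val;
}

int main(void)
{
	publish(123456789ULL);
	printf("snapshot: %llu\n", (unsigned long long)snapshot());
	return 0;
}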
@@ -1453,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1453 * effect of the currently running task from the load 1465 * effect of the currently running task from the load
1454 * of the current CPU: 1466 * of the current CPU:
1455 */ 1467 */
1456 rcu_read_lock();
1457 if (sync) { 1468 if (sync) {
1458 tg = task_group(current); 1469 tg = task_group(current);
1459 weight = current->se.load.weight; 1470 weight = current->se.load.weight;
@@ -1489,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1489 balanced = this_eff_load <= prev_eff_load; 1500 balanced = this_eff_load <= prev_eff_load;
1490 } else 1501 } else
1491 balanced = true; 1502 balanced = true;
1492 rcu_read_unlock();
1493 1503
1494 /* 1504 /*
1495 * If the currently running task will sleep within 1505 * If the currently running task will sleep within
@@ -1557,7 +1567,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1557 } 1567 }
1558 1568
1559 /* Adjust by relative CPU power of the group */ 1569 /* Adjust by relative CPU power of the group */
1560 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1570 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
1561 1571
1562 if (local_group) { 1572 if (local_group) {
1563 this_load = avg_load; 1573 this_load = avg_load;
@@ -1622,6 +1632,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1622 /* 1632 /*
1623 * Otherwise, iterate the domains and find an eligible idle cpu. 1633 * Otherwise, iterate the domains and find an eligible idle cpu.
1624 */ 1634 */
1635 rcu_read_lock();
1625 for_each_domain(target, sd) { 1636 for_each_domain(target, sd) {
1626 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 1637 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1627 break; 1638 break;
@@ -1641,6 +1652,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1641 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 1652 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1642 break; 1653 break;
1643 } 1654 }
1655 rcu_read_unlock();
1644 1656
1645 return target; 1657 return target;
1646} 1658}
@@ -1657,7 +1669,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1657 * preempt must be disabled. 1669 * preempt must be disabled.
1658 */ 1670 */
1659static int 1671static int
1660select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1672select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1661{ 1673{
1662 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1674 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1663 int cpu = smp_processor_id(); 1675 int cpu = smp_processor_id();
@@ -1673,6 +1685,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1673 new_cpu = prev_cpu; 1685 new_cpu = prev_cpu;
1674 } 1686 }
1675 1687
1688 rcu_read_lock();
1676 for_each_domain(cpu, tmp) { 1689 for_each_domain(cpu, tmp) {
1677 if (!(tmp->flags & SD_LOAD_BALANCE)) 1690 if (!(tmp->flags & SD_LOAD_BALANCE))
1678 continue; 1691 continue;
@@ -1692,7 +1705,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1692 nr_running += cpu_rq(i)->cfs.nr_running; 1705 nr_running += cpu_rq(i)->cfs.nr_running;
1693 } 1706 }
1694 1707
1695 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 1708 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
1696 1709
1697 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1710 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1698 nr_running /= 2; 1711 nr_running /= 2;
@@ -1723,9 +1736,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1723 1736
1724 if (affine_sd) { 1737 if (affine_sd) {
1725 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1738 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1726 return select_idle_sibling(p, cpu); 1739 prev_cpu = cpu;
1727 else 1740
1728 return select_idle_sibling(p, prev_cpu); 1741 new_cpu = select_idle_sibling(p, prev_cpu);
1742 goto unlock;
1729 } 1743 }
1730 1744
1731 while (sd) { 1745 while (sd) {
@@ -1766,6 +1780,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1766 } 1780 }
1767 /* while loop will break here if sd == NULL */ 1781 /* while loop will break here if sd == NULL */
1768 } 1782 }
1783unlock:
1784 rcu_read_unlock();
1769 1785
1770 return new_cpu; 1786 return new_cpu;
1771} 1787}
@@ -1789,10 +1805,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1789 * This is especially important for buddies when the leftmost 1805 * This is especially important for buddies when the leftmost
1790 * task is higher priority than the buddy. 1806 * task is higher priority than the buddy.
1791 */ 1807 */
1792 if (unlikely(se->load.weight != NICE_0_LOAD)) 1808 return calc_delta_fair(gran, se);
1793 gran = calc_delta_fair(gran, se);
1794
1795 return gran;
1796} 1809}
1797 1810
1798/* 1811/*
@@ -1826,26 +1839,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1826 1839
1827static void set_last_buddy(struct sched_entity *se) 1840static void set_last_buddy(struct sched_entity *se)
1828{ 1841{
1829 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1842 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1830 for_each_sched_entity(se) 1843 return;
1831 cfs_rq_of(se)->last = se; 1844
1832 } 1845 for_each_sched_entity(se)
1846 cfs_rq_of(se)->last = se;
1833} 1847}
1834 1848
1835static void set_next_buddy(struct sched_entity *se) 1849static void set_next_buddy(struct sched_entity *se)
1836{ 1850{
1837 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1851 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1838 for_each_sched_entity(se) 1852 return;
1839 cfs_rq_of(se)->next = se; 1853
1840 } 1854 for_each_sched_entity(se)
1855 cfs_rq_of(se)->next = se;
1841} 1856}
1842 1857
1843static void set_skip_buddy(struct sched_entity *se) 1858static void set_skip_buddy(struct sched_entity *se)
1844{ 1859{
1845 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1860 for_each_sched_entity(se)
1846 for_each_sched_entity(se) 1861 cfs_rq_of(se)->skip = se;
1847 cfs_rq_of(se)->skip = se;
1848 }
1849} 1862}
1850 1863
1851/* 1864/*
@@ -1857,12 +1870,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 struct sched_entity *se = &curr->se, *pse = &p->se; 1870 struct sched_entity *se = &curr->se, *pse = &p->se;
1858 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1871 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1859 int scale = cfs_rq->nr_running >= sched_nr_latency; 1872 int scale = cfs_rq->nr_running >= sched_nr_latency;
1873 int next_buddy_marked = 0;
1860 1874
1861 if (unlikely(se == pse)) 1875 if (unlikely(se == pse))
1862 return; 1876 return;
1863 1877
1864 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) 1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1865 set_next_buddy(pse); 1879 set_next_buddy(pse);
1880 next_buddy_marked = 1;
1881 }
1866 1882
1867 /* 1883 /*
1868 * We can come here with TIF_NEED_RESCHED already set from new task 1884 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1887,11 +1903,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1887 if (!sched_feat(WAKEUP_PREEMPT)) 1903 if (!sched_feat(WAKEUP_PREEMPT))
1888 return; 1904 return;
1889 1905
1890 update_curr(cfs_rq);
1891 find_matching_se(&se, &pse); 1906 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se));
1892 BUG_ON(!pse); 1908 BUG_ON(!pse);
1893 if (wakeup_preempt_entity(se, pse) == 1) 1909 if (wakeup_preempt_entity(se, pse) == 1) {
1910 /*
1911 * Bias pick_next to pick the sched entity that is
1912 * triggering this preemption.
1913 */
1914 if (!next_buddy_marked)
1915 set_next_buddy(pse);
1894 goto preempt; 1916 goto preempt;
1917 }
1895 1918
1896 return; 1919 return;
1897 1920
@@ -2102,7 +2125,7 @@ static unsigned long
2102balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2125balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2103 unsigned long max_load_move, struct sched_domain *sd, 2126 unsigned long max_load_move, struct sched_domain *sd,
2104 enum cpu_idle_type idle, int *all_pinned, 2127 enum cpu_idle_type idle, int *all_pinned,
2105 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2128 struct cfs_rq *busiest_cfs_rq)
2106{ 2129{
2107 int loops = 0, pulled = 0; 2130 int loops = 0, pulled = 0;
2108 long rem_load_move = max_load_move; 2131 long rem_load_move = max_load_move;
@@ -2140,9 +2163,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2140 */ 2163 */
2141 if (rem_load_move <= 0) 2164 if (rem_load_move <= 0)
2142 break; 2165 break;
2143
2144 if (p->prio < *this_best_prio)
2145 *this_best_prio = p->prio;
2146 } 2166 }
2147out: 2167out:
2148 /* 2168 /*
@@ -2193,26 +2213,56 @@ static void update_shares(int cpu)
2193 struct rq *rq = cpu_rq(cpu); 2213 struct rq *rq = cpu_rq(cpu);
2194 2214
2195 rcu_read_lock(); 2215 rcu_read_lock();
2216 /*
2217 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details.
2219 */
2196 for_each_leaf_cfs_rq(rq, cfs_rq) 2220 for_each_leaf_cfs_rq(rq, cfs_rq)
2197 update_shares_cpu(cfs_rq->tg, cpu); 2221 update_shares_cpu(cfs_rq->tg, cpu);
2198 rcu_read_unlock(); 2222 rcu_read_unlock();
2199} 2223}
2200 2224
2225/*
2226 * Compute the cpu's hierarchical load factor for each task group.
2227 * This needs to be done in a top-down fashion because the load of a child
 2228 * group is a fraction of its parent's load.
2229 */
2230static int tg_load_down(struct task_group *tg, void *data)
2231{
2232 unsigned long load;
2233 long cpu = (long)data;
2234
2235 if (!tg->parent) {
2236 load = cpu_rq(cpu)->load.weight;
2237 } else {
2238 load = tg->parent->cfs_rq[cpu]->h_load;
2239 load *= tg->se[cpu]->load.weight;
2240 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
2241 }
2242
2243 tg->cfs_rq[cpu]->h_load = load;
2244
2245 return 0;
2246}
2247
2248static void update_h_load(long cpu)
2249{
2250 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
2251}
2252
2201static unsigned long 2253static unsigned long
2202load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2254load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2203 unsigned long max_load_move, 2255 unsigned long max_load_move,
2204 struct sched_domain *sd, enum cpu_idle_type idle, 2256 struct sched_domain *sd, enum cpu_idle_type idle,
2205 int *all_pinned, int *this_best_prio) 2257 int *all_pinned)
2206{ 2258{
2207 long rem_load_move = max_load_move; 2259 long rem_load_move = max_load_move;
2208 int busiest_cpu = cpu_of(busiest); 2260 struct cfs_rq *busiest_cfs_rq;
2209 struct task_group *tg;
2210 2261
2211 rcu_read_lock(); 2262 rcu_read_lock();
2212 update_h_load(busiest_cpu); 2263 update_h_load(cpu_of(busiest));
2213 2264
2214 list_for_each_entry_rcu(tg, &task_groups, list) { 2265 for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
2215 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
2216 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 2266 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
2217 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 2267 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
2218 u64 rem_load, moved_load; 2268 u64 rem_load, moved_load;
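(Aside, not part of the patch: tg_load_down() in the hunk above computes each group's hierarchical load top-down as load = parent->h_load * se->load.weight / (parent_cfs_rq->load.weight + 1). With numbers invented for illustration: if the root cfs_rq on a CPU carries load.weight 3072 and a child group's entity accounts for 1024 of that, the child's h_load is 3072 * 1024 / (3072 + 1) = 1023, roughly a third of the CPU's load; the +1 only guards against dividing by zero when the parent queue is momentarily empty.)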
@@ -2227,7 +2277,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2227 rem_load = div_u64(rem_load, busiest_h_load + 1); 2277 rem_load = div_u64(rem_load, busiest_h_load + 1);
2228 2278
2229 moved_load = balance_tasks(this_rq, this_cpu, busiest, 2279 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2230 rem_load, sd, idle, all_pinned, this_best_prio, 2280 rem_load, sd, idle, all_pinned,
2231 busiest_cfs_rq); 2281 busiest_cfs_rq);
2232 2282
2233 if (!moved_load) 2283 if (!moved_load)
@@ -2253,11 +2303,11 @@ static unsigned long
2253load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2303load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2254 unsigned long max_load_move, 2304 unsigned long max_load_move,
2255 struct sched_domain *sd, enum cpu_idle_type idle, 2305 struct sched_domain *sd, enum cpu_idle_type idle,
2256 int *all_pinned, int *this_best_prio) 2306 int *all_pinned)
2257{ 2307{
2258 return balance_tasks(this_rq, this_cpu, busiest, 2308 return balance_tasks(this_rq, this_cpu, busiest,
2259 max_load_move, sd, idle, all_pinned, 2309 max_load_move, sd, idle, all_pinned,
2260 this_best_prio, &busiest->cfs); 2310 &busiest->cfs);
2261} 2311}
2262#endif 2312#endif
2263 2313
@@ -2274,12 +2324,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2274 int *all_pinned) 2324 int *all_pinned)
2275{ 2325{
2276 unsigned long total_load_moved = 0, load_moved; 2326 unsigned long total_load_moved = 0, load_moved;
2277 int this_best_prio = this_rq->curr->prio;
2278 2327
2279 do { 2328 do {
2280 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 2329 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2281 max_load_move - total_load_moved, 2330 max_load_move - total_load_moved,
2282 sd, idle, all_pinned, &this_best_prio); 2331 sd, idle, all_pinned);
2283 2332
2284 total_load_moved += load_moved; 2333 total_load_moved += load_moved;
2285 2334
@@ -2534,7 +2583,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2534 2583
2535unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 2584unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2536{ 2585{
2537 return SCHED_LOAD_SCALE; 2586 return SCHED_POWER_SCALE;
2538} 2587}
2539 2588
2540unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 2589unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -2571,10 +2620,10 @@ unsigned long scale_rt_power(int cpu)
2571 available = total - rq->rt_avg; 2620 available = total - rq->rt_avg;
2572 } 2621 }
2573 2622
2574 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2623 if (unlikely((s64)total < SCHED_POWER_SCALE))
2575 total = SCHED_LOAD_SCALE; 2624 total = SCHED_POWER_SCALE;
2576 2625
2577 total >>= SCHED_LOAD_SHIFT; 2626 total >>= SCHED_POWER_SHIFT;
2578 2627
2579 return div_u64(available, total); 2628 return div_u64(available, total);
2580} 2629}
@@ -2582,7 +2631,7 @@ unsigned long scale_rt_power(int cpu)
2582static void update_cpu_power(struct sched_domain *sd, int cpu) 2631static void update_cpu_power(struct sched_domain *sd, int cpu)
2583{ 2632{
2584 unsigned long weight = sd->span_weight; 2633 unsigned long weight = sd->span_weight;
2585 unsigned long power = SCHED_LOAD_SCALE; 2634 unsigned long power = SCHED_POWER_SCALE;
2586 struct sched_group *sdg = sd->groups; 2635 struct sched_group *sdg = sd->groups;
2587 2636
2588 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2637 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -2591,26 +2640,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2591 else 2640 else
2592 power *= default_scale_smt_power(sd, cpu); 2641 power *= default_scale_smt_power(sd, cpu);
2593 2642
2594 power >>= SCHED_LOAD_SHIFT; 2643 power >>= SCHED_POWER_SHIFT;
2595 } 2644 }
2596 2645
2597 sdg->cpu_power_orig = power; 2646 sdg->sgp->power_orig = power;
2598 2647
2599 if (sched_feat(ARCH_POWER)) 2648 if (sched_feat(ARCH_POWER))
2600 power *= arch_scale_freq_power(sd, cpu); 2649 power *= arch_scale_freq_power(sd, cpu);
2601 else 2650 else
2602 power *= default_scale_freq_power(sd, cpu); 2651 power *= default_scale_freq_power(sd, cpu);
2603 2652
2604 power >>= SCHED_LOAD_SHIFT; 2653 power >>= SCHED_POWER_SHIFT;
2605 2654
2606 power *= scale_rt_power(cpu); 2655 power *= scale_rt_power(cpu);
2607 power >>= SCHED_LOAD_SHIFT; 2656 power >>= SCHED_POWER_SHIFT;
2608 2657
2609 if (!power) 2658 if (!power)
2610 power = 1; 2659 power = 1;
2611 2660
2612 cpu_rq(cpu)->cpu_power = power; 2661 cpu_rq(cpu)->cpu_power = power;
2613 sdg->cpu_power = power; 2662 sdg->sgp->power = power;
2614} 2663}
2615 2664
2616static void update_group_power(struct sched_domain *sd, int cpu) 2665static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2628,11 +2677,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2628 2677
2629 group = child->groups; 2678 group = child->groups;
2630 do { 2679 do {
2631 power += group->cpu_power; 2680 power += group->sgp->power;
2632 group = group->next; 2681 group = group->next;
2633 } while (group != child->groups); 2682 } while (group != child->groups);
2634 2683
2635 sdg->cpu_power = power; 2684 sdg->sgp->power = power;
2636} 2685}
2637 2686
2638/* 2687/*
@@ -2646,15 +2695,15 @@ static inline int
2646fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 2695fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2647{ 2696{
2648 /* 2697 /*
2649 * Only siblings can have significantly less than SCHED_LOAD_SCALE 2698 * Only siblings can have significantly less than SCHED_POWER_SCALE
2650 */ 2699 */
2651 if (sd->level != SD_LV_SIBLING) 2700 if (!(sd->flags & SD_SHARE_CPUPOWER))
2652 return 0; 2701 return 0;
2653 2702
2654 /* 2703 /*
2655 * If ~90% of the cpu_power is still there, we're good. 2704 * If ~90% of the cpu_power is still there, we're good.
2656 */ 2705 */
2657 if (group->cpu_power * 32 > group->cpu_power_orig * 29) 2706 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
2658 return 1; 2707 return 1;
2659 2708
2660 return 0; 2709 return 0;
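(Aside, not part of the patch: the 32/29 comparison above is the "~90%" test from the comment, since power * 32 > power_orig * 29 is the integer form of power > 0.90625 * power_orig (29/32 = 0.90625). With numbers invented for illustration, a sibling whose power dropped from 589 to 560 still passes, 560 * 32 = 17920 > 589 * 29 = 17081, and keeps a capacity of one task, while a drop to 500 fails, 500 * 32 = 16000 < 17081.)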
@@ -2734,7 +2783,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2734 } 2783 }
2735 2784
2736 /* Adjust by relative CPU power of the group */ 2785 /* Adjust by relative CPU power of the group */
2737 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2786 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
2738 2787
2739 /* 2788 /*
2740 * Consider the group unbalanced when the imbalance is larger 2789 * Consider the group unbalanced when the imbalance is larger
@@ -2751,7 +2800,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2751 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 2800 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2752 sgs->group_imb = 1; 2801 sgs->group_imb = 1;
2753 2802
2754 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2803 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
2804 SCHED_POWER_SCALE);
2755 if (!sgs->group_capacity) 2805 if (!sgs->group_capacity)
2756 sgs->group_capacity = fix_small_capacity(sd, group); 2806 sgs->group_capacity = fix_small_capacity(sd, group);
2757 sgs->group_weight = group->group_weight; 2807 sgs->group_weight = group->group_weight;
@@ -2839,7 +2889,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2839 return; 2889 return;
2840 2890
2841 sds->total_load += sgs.group_load; 2891 sds->total_load += sgs.group_load;
2842 sds->total_pwr += sg->cpu_power; 2892 sds->total_pwr += sg->sgp->power;
2843 2893
2844 /* 2894 /*
2845 * In case the child domain prefers tasks go to siblings 2895 * In case the child domain prefers tasks go to siblings
@@ -2924,8 +2974,8 @@ static int check_asym_packing(struct sched_domain *sd,
2924 if (this_cpu > busiest_cpu) 2974 if (this_cpu > busiest_cpu)
2925 return 0; 2975 return 0;
2926 2976
2927 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, 2977 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
2928 SCHED_LOAD_SCALE); 2978 SCHED_POWER_SCALE);
2929 return 1; 2979 return 1;
2930} 2980}
2931 2981
@@ -2954,8 +3004,8 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2954 cpu_avg_load_per_task(this_cpu); 3004 cpu_avg_load_per_task(this_cpu);
2955 3005
2956 scaled_busy_load_per_task = sds->busiest_load_per_task 3006 scaled_busy_load_per_task = sds->busiest_load_per_task
2957 * SCHED_LOAD_SCALE; 3007 * SCHED_POWER_SCALE;
2958 scaled_busy_load_per_task /= sds->busiest->cpu_power; 3008 scaled_busy_load_per_task /= sds->busiest->sgp->power;
2959 3009
2960 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3010 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2961 (scaled_busy_load_per_task * imbn)) { 3011 (scaled_busy_load_per_task * imbn)) {
@@ -2969,30 +3019,30 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2969 * moving them. 3019 * moving them.
2970 */ 3020 */
2971 3021
2972 pwr_now += sds->busiest->cpu_power * 3022 pwr_now += sds->busiest->sgp->power *
2973 min(sds->busiest_load_per_task, sds->max_load); 3023 min(sds->busiest_load_per_task, sds->max_load);
2974 pwr_now += sds->this->cpu_power * 3024 pwr_now += sds->this->sgp->power *
2975 min(sds->this_load_per_task, sds->this_load); 3025 min(sds->this_load_per_task, sds->this_load);
2976 pwr_now /= SCHED_LOAD_SCALE; 3026 pwr_now /= SCHED_POWER_SCALE;
2977 3027
2978 /* Amount of load we'd subtract */ 3028 /* Amount of load we'd subtract */
2979 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3029 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2980 sds->busiest->cpu_power; 3030 sds->busiest->sgp->power;
2981 if (sds->max_load > tmp) 3031 if (sds->max_load > tmp)
2982 pwr_move += sds->busiest->cpu_power * 3032 pwr_move += sds->busiest->sgp->power *
2983 min(sds->busiest_load_per_task, sds->max_load - tmp); 3033 min(sds->busiest_load_per_task, sds->max_load - tmp);
2984 3034
2985 /* Amount of load we'd add */ 3035 /* Amount of load we'd add */
2986 if (sds->max_load * sds->busiest->cpu_power < 3036 if (sds->max_load * sds->busiest->sgp->power <
2987 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3037 sds->busiest_load_per_task * SCHED_POWER_SCALE)
2988 tmp = (sds->max_load * sds->busiest->cpu_power) / 3038 tmp = (sds->max_load * sds->busiest->sgp->power) /
2989 sds->this->cpu_power; 3039 sds->this->sgp->power;
2990 else 3040 else
2991 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3041 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2992 sds->this->cpu_power; 3042 sds->this->sgp->power;
2993 pwr_move += sds->this->cpu_power * 3043 pwr_move += sds->this->sgp->power *
2994 min(sds->this_load_per_task, sds->this_load + tmp); 3044 min(sds->this_load_per_task, sds->this_load + tmp);
2995 pwr_move /= SCHED_LOAD_SCALE; 3045 pwr_move /= SCHED_POWER_SCALE;
2996 3046
2997 /* Move if we gain throughput */ 3047 /* Move if we gain throughput */
2998 if (pwr_move > pwr_now) 3048 if (pwr_move > pwr_now)
@@ -3034,9 +3084,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3034 load_above_capacity = (sds->busiest_nr_running - 3084 load_above_capacity = (sds->busiest_nr_running -
3035 sds->busiest_group_capacity); 3085 sds->busiest_group_capacity);
3036 3086
3037 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); 3087 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
3038 3088
3039 load_above_capacity /= sds->busiest->cpu_power; 3089 load_above_capacity /= sds->busiest->sgp->power;
3040 } 3090 }
3041 3091
3042 /* 3092 /*
@@ -3052,9 +3102,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3052 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3102 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
3053 3103
3054 /* How much load to actually move to equalise the imbalance */ 3104 /* How much load to actually move to equalise the imbalance */
3055 *imbalance = min(max_pull * sds->busiest->cpu_power, 3105 *imbalance = min(max_pull * sds->busiest->sgp->power,
3056 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3106 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
3057 / SCHED_LOAD_SCALE; 3107 / SCHED_POWER_SCALE;
3058 3108
3059 /* 3109 /*
3060 * if *imbalance is less than the average load per runnable task 3110 * if *imbalance is less than the average load per runnable task
@@ -3123,7 +3173,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3123 if (!sds.busiest || sds.busiest_nr_running == 0) 3173 if (!sds.busiest || sds.busiest_nr_running == 0)
3124 goto out_balanced; 3174 goto out_balanced;
3125 3175
3126 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3176 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
3127 3177
3128 /* 3178 /*
3129 * If the busiest group is imbalanced the below checks don't 3179 * If the busiest group is imbalanced the below checks don't
@@ -3202,7 +3252,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3202 3252
3203 for_each_cpu(i, sched_group_cpus(group)) { 3253 for_each_cpu(i, sched_group_cpus(group)) {
3204 unsigned long power = power_of(i); 3254 unsigned long power = power_of(i);
3205 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 3255 unsigned long capacity = DIV_ROUND_CLOSEST(power,
3256 SCHED_POWER_SCALE);
3206 unsigned long wl; 3257 unsigned long wl;
3207 3258
3208 if (!capacity) 3259 if (!capacity)
@@ -3227,7 +3278,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3227 * the load can be moved away from the cpu that is potentially 3278 * the load can be moved away from the cpu that is potentially
3228 * running at a lower capacity. 3279 * running at a lower capacity.
3229 */ 3280 */
3230 wl = (wl * SCHED_LOAD_SCALE) / power; 3281 wl = (wl * SCHED_POWER_SCALE) / power;
3231 3282
3232 if (wl > max_load) { 3283 if (wl > max_load) {
3233 max_load = wl; 3284 max_load = wl;
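(Aside, not part of the patch: the wl scaling above normalizes a runqueue's load by its cpu power so that queues of different capacity compare on equal terms. With SCHED_POWER_SCALE = 1024 and loads invented for illustration, a CPU of power 512 carrying load 2048 scores 2048 * 1024 / 512 = 4096, while a full-power CPU (1024) with the same load scores only 2048, so the lower-capacity CPU is correctly picked as the busier queue.)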
@@ -3465,6 +3516,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3465 raw_spin_unlock(&this_rq->lock); 3516 raw_spin_unlock(&this_rq->lock);
3466 3517
3467 update_shares(this_cpu); 3518 update_shares(this_cpu);
3519 rcu_read_lock();
3468 for_each_domain(this_cpu, sd) { 3520 for_each_domain(this_cpu, sd) {
3469 unsigned long interval; 3521 unsigned long interval;
3470 int balance = 1; 3522 int balance = 1;
@@ -3486,6 +3538,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3486 break; 3538 break;
3487 } 3539 }
3488 } 3540 }
3541 rcu_read_unlock();
3489 3542
3490 raw_spin_lock(&this_rq->lock); 3543 raw_spin_lock(&this_rq->lock);
3491 3544
@@ -3534,6 +3587,7 @@ static int active_load_balance_cpu_stop(void *data)
3534 double_lock_balance(busiest_rq, target_rq); 3587 double_lock_balance(busiest_rq, target_rq);
3535 3588
3536 /* Search for an sd spanning us and the target CPU. */ 3589 /* Search for an sd spanning us and the target CPU. */
3590 rcu_read_lock();
3537 for_each_domain(target_cpu, sd) { 3591 for_each_domain(target_cpu, sd) {
3538 if ((sd->flags & SD_LOAD_BALANCE) && 3592 if ((sd->flags & SD_LOAD_BALANCE) &&
3539 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 3593 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3549,6 +3603,7 @@ static int active_load_balance_cpu_stop(void *data)
3549 else 3603 else
3550 schedstat_inc(sd, alb_failed); 3604 schedstat_inc(sd, alb_failed);
3551 } 3605 }
3606 rcu_read_unlock();
3552 double_unlock_balance(busiest_rq, target_rq); 3607 double_unlock_balance(busiest_rq, target_rq);
3553out_unlock: 3608out_unlock:
3554 busiest_rq->active_balance = 0; 3609 busiest_rq->active_balance = 0;
@@ -3675,6 +3730,7 @@ static int find_new_ilb(int cpu)
3675{ 3730{
3676 struct sched_domain *sd; 3731 struct sched_domain *sd;
3677 struct sched_group *ilb_group; 3732 struct sched_group *ilb_group;
3733 int ilb = nr_cpu_ids;
3678 3734
3679 /* 3735 /*
3680 * Have idle load balancer selection from semi-idle packages only 3736 * Have idle load balancer selection from semi-idle packages only
@@ -3690,20 +3746,25 @@ static int find_new_ilb(int cpu)
3690 if (cpumask_weight(nohz.idle_cpus_mask) < 2) 3746 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3691 goto out_done; 3747 goto out_done;
3692 3748
3749 rcu_read_lock();
3693 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3750 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3694 ilb_group = sd->groups; 3751 ilb_group = sd->groups;
3695 3752
3696 do { 3753 do {
3697 if (is_semi_idle_group(ilb_group)) 3754 if (is_semi_idle_group(ilb_group)) {
3698 return cpumask_first(nohz.grp_idle_mask); 3755 ilb = cpumask_first(nohz.grp_idle_mask);
3756 goto unlock;
3757 }
3699 3758
3700 ilb_group = ilb_group->next; 3759 ilb_group = ilb_group->next;
3701 3760
3702 } while (ilb_group != sd->groups); 3761 } while (ilb_group != sd->groups);
3703 } 3762 }
3763unlock:
3764 rcu_read_unlock();
3704 3765
3705out_done: 3766out_done:
3706 return nr_cpu_ids; 3767 return ilb;
3707} 3768}
3708#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3769#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3709static inline int find_new_ilb(int call_cpu) 3770static inline int find_new_ilb(int call_cpu)
@@ -3848,6 +3909,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3848 3909
3849 update_shares(cpu); 3910 update_shares(cpu);
3850 3911
3912 rcu_read_lock();
3851 for_each_domain(cpu, sd) { 3913 for_each_domain(cpu, sd) {
3852 if (!(sd->flags & SD_LOAD_BALANCE)) 3914 if (!(sd->flags & SD_LOAD_BALANCE))
3853 continue; 3915 continue;
@@ -3893,6 +3955,7 @@ out:
3893 if (!balance) 3955 if (!balance)
3894 break; 3956 break;
3895 } 3957 }
3958 rcu_read_unlock();
3896 3959
3897 /* 3960 /*
3898 * next_balance will be updated only when there is a need. 3961 * next_balance will be updated only when there is a need.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69acc29b9..2e74677cb040 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,6 +61,14 @@ SCHED_FEAT(LB_BIAS, 1)
61SCHED_FEAT(OWNER_SPIN, 1) 61SCHED_FEAT(OWNER_SPIN, 1)
62 62
63/* 63/*
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on time not spent running tasks
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONTASK_POWER, 1)
67
68/*
69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */
72SCHED_FEAT(TTWU_QUEUE, 1)
73
74SCHED_FEAT(FORCE_SD_OVERLAP, 0)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a776a6396427..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
11{ 11{
12 return task_cpu(p); /* IDLE tasks as never migrated */ 12 return task_cpu(p); /* IDLE tasks as never migrated */
13} 13}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e7cebdc65f82..97540f0c9e47 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,26 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186typedef struct task_group *rt_rq_iter_t;
187
188static inline struct task_group *next_task_group(struct task_group *tg)
189{
190 do {
191 tg = list_entry_rcu(tg->list.next,
192 typeof(struct task_group), list);
193 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
194
195 if (&tg->list == &task_groups)
196 tg = NULL;
197
198 return tg;
199}
200
201#define for_each_rt_rq(rt_rq, iter, rq) \
202 for (iter = container_of(&task_groups, typeof(*iter), list); \
203 (iter = next_task_group(iter)) && \
204 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
205
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 206static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{ 207{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list, 208 list_add_rcu(&rt_rq->leaf_rt_rq_list,
@@ -288,6 +308,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
288 return ktime_to_ns(def_rt_bandwidth.rt_period); 308 return ktime_to_ns(def_rt_bandwidth.rt_period);
289} 309}
290 310
311typedef struct rt_rq *rt_rq_iter_t;
312
313#define for_each_rt_rq(rt_rq, iter, rq) \
314 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
315
291static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 316static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
292{ 317{
293} 318}
@@ -402,12 +427,13 @@ next:
402static void __disable_runtime(struct rq *rq) 427static void __disable_runtime(struct rq *rq)
403{ 428{
404 struct root_domain *rd = rq->rd; 429 struct root_domain *rd = rq->rd;
430 rt_rq_iter_t iter;
405 struct rt_rq *rt_rq; 431 struct rt_rq *rt_rq;
406 432
407 if (unlikely(!scheduler_running)) 433 if (unlikely(!scheduler_running))
408 return; 434 return;
409 435
410 for_each_leaf_rt_rq(rt_rq, rq) { 436 for_each_rt_rq(rt_rq, iter, rq) {
411 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 437 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
412 s64 want; 438 s64 want;
413 int i; 439 int i;
@@ -487,6 +513,7 @@ static void disable_runtime(struct rq *rq)
487 513
488static void __enable_runtime(struct rq *rq) 514static void __enable_runtime(struct rq *rq)
489{ 515{
516 rt_rq_iter_t iter;
490 struct rt_rq *rt_rq; 517 struct rt_rq *rt_rq;
491 518
492 if (unlikely(!scheduler_running)) 519 if (unlikely(!scheduler_running))
@@ -495,7 +522,7 @@ static void __enable_runtime(struct rq *rq)
495 /* 522 /*
496 * Reset each runqueue's bandwidth settings 523 * Reset each runqueue's bandwidth settings
497 */ 524 */
498 for_each_leaf_rt_rq(rt_rq, rq) { 525 for_each_rt_rq(rt_rq, iter, rq) {
499 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 526 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
500 527
501 raw_spin_lock(&rt_b->rt_runtime_lock); 528 raw_spin_lock(&rt_b->rt_runtime_lock);
@@ -562,6 +589,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
562 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 589 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
563 rt_rq->rt_throttled = 0; 590 rt_rq->rt_throttled = 0;
564 enqueue = 1; 591 enqueue = 1;
592
593 /*
594 * Force a clock update if the CPU was idle,
595 * lest wakeup -> unthrottle time accumulate.
596 */
597 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
598 rq->skip_clock_update = -1;
565 } 599 }
566 if (rt_rq->rt_time || rt_rq->rt_nr_running) 600 if (rt_rq->rt_time || rt_rq->rt_nr_running)
567 idle = 0; 601 idle = 0;
@@ -977,13 +1011,23 @@ static void yield_task_rt(struct rq *rq)
977static int find_lowest_rq(struct task_struct *task); 1011static int find_lowest_rq(struct task_struct *task);
978 1012
979static int 1013static int
980select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 1014select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
981{ 1015{
1016 struct task_struct *curr;
1017 struct rq *rq;
1018 int cpu;
1019
982 if (sd_flag != SD_BALANCE_WAKE) 1020 if (sd_flag != SD_BALANCE_WAKE)
983 return smp_processor_id(); 1021 return smp_processor_id();
984 1022
1023 cpu = task_cpu(p);
1024 rq = cpu_rq(cpu);
1025
1026 rcu_read_lock();
1027 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1028
985 /* 1029 /*
986 * If the current task is an RT task, then 1030 * If the current task on @p's runqueue is an RT task, then
987 * try to see if we can wake this RT task up on another 1031 * try to see if we can wake this RT task up on another
988 * runqueue. Otherwise simply start this RT task 1032 * runqueue. Otherwise simply start this RT task
989 * on its current runqueue. 1033 * on its current runqueue.
@@ -997,21 +1041,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
997 * lock? 1041 * lock?
998 * 1042 *
999 * For equal prio tasks, we just let the scheduler sort it out. 1043 * For equal prio tasks, we just let the scheduler sort it out.
1044 *
1045 * Otherwise, just let it ride on the affined RQ and the
1046 * post-schedule router will push the preempted task away
1047 *
 1048 * This test is optimistic; if we get it wrong, the load-balancer
1049 * will have to sort it out.
1000 */ 1050 */
1001 if (unlikely(rt_task(rq->curr)) && 1051 if (curr && unlikely(rt_task(curr)) &&
1002 (rq->curr->rt.nr_cpus_allowed < 2 || 1052 (curr->rt.nr_cpus_allowed < 2 ||
1003 rq->curr->prio < p->prio) && 1053 curr->prio < p->prio) &&
1004 (p->rt.nr_cpus_allowed > 1)) { 1054 (p->rt.nr_cpus_allowed > 1)) {
1005 int cpu = find_lowest_rq(p); 1055 int target = find_lowest_rq(p);
1006 1056
1007 return (cpu == -1) ? task_cpu(p) : cpu; 1057 if (target != -1)
1058 cpu = target;
1008 } 1059 }
1060 rcu_read_unlock();
1009 1061
1010 /* 1062 return cpu;
1011 * Otherwise, just let it ride on the affined RQ and the
1012 * post-schedule router will push the preempted task away
1013 */
1014 return task_cpu(p);
1015} 1063}
1016 1064
1017static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1065static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
@@ -1060,7 +1108,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
1060 * to move current somewhere else, making room for our non-migratable 1108 * to move current somewhere else, making room for our non-migratable
1061 * task. 1109 * task.
1062 */ 1110 */
1063 if (p->prio == rq->curr->prio && !need_resched()) 1111 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1064 check_preempt_equal_prio(rq, p); 1112 check_preempt_equal_prio(rq, p);
1065#endif 1113#endif
1066} 1114}
@@ -1090,7 +1138,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1090 1138
1091 rt_rq = &rq->rt; 1139 rt_rq = &rq->rt;
1092 1140
1093 if (unlikely(!rt_rq->rt_nr_running)) 1141 if (!rt_rq->rt_nr_running)
1094 return NULL; 1142 return NULL;
1095 1143
1096 if (rt_rq_throttled(rt_rq)) 1144 if (rt_rq_throttled(rt_rq))
@@ -1136,7 +1184,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1136 * The previous task needs to be made eligible for pushing 1184 * The previous task needs to be made eligible for pushing
1137 * if it is still active 1185 * if it is still active
1138 */ 1186 */
1139 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1187 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1140 enqueue_pushable_task(rq, p); 1188 enqueue_pushable_task(rq, p);
1141} 1189}
1142 1190
@@ -1203,6 +1251,10 @@ static int find_lowest_rq(struct task_struct *task)
1203 int this_cpu = smp_processor_id(); 1251 int this_cpu = smp_processor_id();
1204 int cpu = task_cpu(task); 1252 int cpu = task_cpu(task);
1205 1253
1254 /* Make sure the mask is initialized first */
1255 if (unlikely(!lowest_mask))
1256 return -1;
1257
1206 if (task->rt.nr_cpus_allowed == 1) 1258 if (task->rt.nr_cpus_allowed == 1)
1207 return -1; /* No other targets possible */ 1259 return -1; /* No other targets possible */
1208 1260
@@ -1227,6 +1279,7 @@ static int find_lowest_rq(struct task_struct *task)
1227 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1279 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1228 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1280 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1229 1281
1282 rcu_read_lock();
1230 for_each_domain(cpu, sd) { 1283 for_each_domain(cpu, sd) {
1231 if (sd->flags & SD_WAKE_AFFINE) { 1284 if (sd->flags & SD_WAKE_AFFINE) {
1232 int best_cpu; 1285 int best_cpu;
@@ -1236,15 +1289,20 @@ static int find_lowest_rq(struct task_struct *task)
1236 * remote processor. 1289 * remote processor.
1237 */ 1290 */
1238 if (this_cpu != -1 && 1291 if (this_cpu != -1 &&
1239 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) 1292 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1293 rcu_read_unlock();
1240 return this_cpu; 1294 return this_cpu;
1295 }
1241 1296
1242 best_cpu = cpumask_first_and(lowest_mask, 1297 best_cpu = cpumask_first_and(lowest_mask,
1243 sched_domain_span(sd)); 1298 sched_domain_span(sd));
1244 if (best_cpu < nr_cpu_ids) 1299 if (best_cpu < nr_cpu_ids) {
1300 rcu_read_unlock();
1245 return best_cpu; 1301 return best_cpu;
1302 }
1246 } 1303 }
1247 } 1304 }
1305 rcu_read_unlock();
1248 1306
1249 /* 1307 /*
1250 * And finally, if there were no matches within the domains 1308 * And finally, if there were no matches within the domains
@@ -1287,7 +1345,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1287 !cpumask_test_cpu(lowest_rq->cpu, 1345 !cpumask_test_cpu(lowest_rq->cpu,
1288 &task->cpus_allowed) || 1346 &task->cpus_allowed) ||
1289 task_running(rq, task) || 1347 task_running(rq, task) ||
1290 !task->se.on_rq)) { 1348 !task->on_rq)) {
1291 1349
1292 raw_spin_unlock(&lowest_rq->lock); 1350 raw_spin_unlock(&lowest_rq->lock);
1293 lowest_rq = NULL; 1351 lowest_rq = NULL;
@@ -1321,7 +1379,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1321 BUG_ON(task_current(rq, p)); 1379 BUG_ON(task_current(rq, p));
1322 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1380 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1323 1381
1324 BUG_ON(!p->se.on_rq); 1382 BUG_ON(!p->on_rq);
1325 BUG_ON(!rt_task(p)); 1383 BUG_ON(!rt_task(p));
1326 1384
1327 return p; 1385 return p;
@@ -1467,7 +1525,7 @@ static int pull_rt_task(struct rq *this_rq)
1467 */ 1525 */
1468 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1526 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1469 WARN_ON(p == src_rq->curr); 1527 WARN_ON(p == src_rq->curr);
1470 WARN_ON(!p->se.on_rq); 1528 WARN_ON(!p->on_rq);
1471 1529
1472 /* 1530 /*
1473 * There's a chance that p is higher in priority 1531 * There's a chance that p is higher in priority
@@ -1502,7 +1560,7 @@ skip:
1502static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) 1560static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1503{ 1561{
1504 /* Try to pull RT tasks here if we lower this rq's prio */ 1562 /* Try to pull RT tasks here if we lower this rq's prio */
1505 if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) 1563 if (rq->rt.highest_prio.curr > prev->prio)
1506 pull_rt_task(rq); 1564 pull_rt_task(rq);
1507} 1565}
1508 1566
@@ -1538,7 +1596,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1538 * Update the migration status of the RQ if we have an RT task 1596 * Update the migration status of the RQ if we have an RT task
1539 * which is running AND changing its weight value. 1597 * which is running AND changing its weight value.
1540 */ 1598 */
1541 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1599 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1542 struct rq *rq = task_rq(p); 1600 struct rq *rq = task_rq(p);
1543 1601
1544 if (!task_current(rq, p)) { 1602 if (!task_current(rq, p)) {
@@ -1608,7 +1666,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1608 * we may need to handle the pulling of RT tasks 1666 * we may need to handle the pulling of RT tasks
1609 * now. 1667 * now.
1610 */ 1668 */
1611 if (p->se.on_rq && !rq->rt.rt_nr_running) 1669 if (p->on_rq && !rq->rt.rt_nr_running)
1612 pull_rt_task(rq); 1670 pull_rt_task(rq);
1613} 1671}
1614 1672
@@ -1638,7 +1696,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1638 * If that current running task is also an RT task 1696 * If that current running task is also an RT task
1639 * then see if we can move to another run queue. 1697 * then see if we can move to another run queue.
1640 */ 1698 */
1641 if (p->se.on_rq && rq->curr != p) { 1699 if (p->on_rq && rq->curr != p) {
1642#ifdef CONFIG_SMP 1700#ifdef CONFIG_SMP
1643 if (rq->rt.overloaded && push_rt_task(rq) && 1701 if (rq->rt.overloaded && push_rt_task(rq) &&
1644 /* Don't resched if we changed runqueues */ 1702 /* Don't resched if we changed runqueues */
@@ -1657,7 +1715,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1657static void 1715static void
1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1716prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1659{ 1717{
1660 if (!p->se.on_rq) 1718 if (!p->on_rq)
1661 return; 1719 return;
1662 1720
1663 if (rq->curr == p) { 1721 if (rq->curr == p) {
@@ -1796,10 +1854,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1796 1854
1797static void print_rt_stats(struct seq_file *m, int cpu) 1855static void print_rt_stats(struct seq_file *m, int cpu)
1798{ 1856{
1857 rt_rq_iter_t iter;
1799 struct rt_rq *rt_rq; 1858 struct rt_rq *rt_rq;
1800 1859
1801 rcu_read_lock(); 1860 rcu_read_lock();
1802 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) 1861 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1803 print_rt_rq(m, cpu, rt_rq); 1862 print_rt_rq(m, cpu, rt_rq);
1804 rcu_read_unlock(); 1863 rcu_read_unlock();
1805} 1864}
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 48ddf431db0e..331e01bcd026 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
37 37
38#ifdef CONFIG_SMP 38#ifdef CONFIG_SMP
39 /* domain-specific stats */ 39 /* domain-specific stats */
40 preempt_disable(); 40 rcu_read_lock();
41 for_each_domain(cpu, sd) { 41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype; 42 enum cpu_idle_type itype;
43 43
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
64 sd->ttwu_wake_remote, sd->ttwu_move_affine, 64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance); 65 sd->ttwu_move_balance);
66 } 66 }
67 preempt_enable(); 67 rcu_read_unlock();
68#endif 68#endif
69 } 69 }
70 kfree(mask_str); 70 kfree(mask_str);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 1ba2bd40fdac..6f437632afab 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -9,8 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p, 12select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
13 int sd_flag, int flags)
14{ 13{
15 return task_cpu(p); /* stop tasks as never migrate */ 14 return task_cpu(p); /* stop tasks as never migrate */
16} 15}
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 25{
27 struct task_struct *stop = rq->stop; 26 struct task_struct *stop = rq->stop;
28 27
29 if (stop && stop->se.on_rq) 28 if (stop && stop->on_rq)
30 return stop; 29 return stop;
31 30
32 return NULL; 31 return NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index 7165af5f1b11..291c9700be75 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
87 /* 87 /*
88 * Tracers may want to know about even ignored signals. 88 * Tracers may want to know about even ignored signals.
89 */ 89 */
90 return !tracehook_consider_ignored_signal(t, sig); 90 return !t->ptrace;
91} 91}
92 92
93/* 93/*
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
124 124
125static int recalc_sigpending_tsk(struct task_struct *t) 125static int recalc_sigpending_tsk(struct task_struct *t)
126{ 126{
127 if (t->signal->group_stop_count > 0 || 127 if ((t->jobctl & JOBCTL_PENDING_MASK) ||
128 PENDING(&t->pending, &t->blocked) || 128 PENDING(&t->pending, &t->blocked) ||
129 PENDING(&t->signal->shared_pending, &t->blocked)) { 129 PENDING(&t->signal->shared_pending, &t->blocked)) {
130 set_tsk_thread_flag(t, TIF_SIGPENDING); 130 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
150 150
151void recalc_sigpending(void) 151void recalc_sigpending(void)
152{ 152{
153 if (unlikely(tracehook_force_sigpending())) 153 if (!recalc_sigpending_tsk(current) && !freezing(current))
154 set_thread_flag(TIF_SIGPENDING);
155 else if (!recalc_sigpending_tsk(current) && !freezing(current))
156 clear_thread_flag(TIF_SIGPENDING); 154 clear_thread_flag(TIF_SIGPENDING);
157 155
158} 156}
@@ -223,6 +221,129 @@ static inline void print_dropped_signal(int sig)
223 current->comm, current->pid, sig); 221 current->comm, current->pid, sig);
224} 222}
225 223
224/**
225 * task_set_jobctl_pending - set jobctl pending bits
226 * @task: target task
227 * @mask: pending bits to set
228 *
 229 * Set @mask in @task->jobctl. @mask must be a subset of
230 * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
231 * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
232 * cleared. If @task is already being killed or exiting, this function
 233 * becomes a noop.
234 *
235 * CONTEXT:
236 * Must be called with @task->sighand->siglock held.
237 *
238 * RETURNS:
 239 * %true if @mask was set, %false if it became a noop because @task was dying.
240 */
241bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
242{
243 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
244 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
245 BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
246
247 if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
248 return false;
249
250 if (mask & JOBCTL_STOP_SIGMASK)
251 task->jobctl &= ~JOBCTL_STOP_SIGMASK;
252
253 task->jobctl |= mask;
254 return true;
255}
256
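A hedged sketch of how a caller is expected to use the helper above to queue a stop on another thread; it mirrors the do_signal_stop() hunk further below, and the wrapper function itself is illustrative:

/* Illustrative only: caller must hold t->sighand->siglock, as the
 * kernel-doc above requires.  JOBCTL_* flags are from this patch. */
static void example_queue_stop(struct task_struct *t, int signr)
{
	if (task_set_jobctl_pending(t, signr | JOBCTL_STOP_PENDING |
				       JOBCTL_STOP_CONSUME))
		signal_wake_up(t, 0);	/* make the task notice the pending stop */
}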
257/**
258 * task_clear_jobctl_trapping - clear jobctl trapping bit
259 * @task: target task
260 *
261 * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
262 * Clear it and wake up the ptracer. Note that we don't need any further
263 * locking. @task->siglock guarantees that @task->parent points to the
264 * ptracer.
265 *
266 * CONTEXT:
267 * Must be called with @task->sighand->siglock held.
268 */
269void task_clear_jobctl_trapping(struct task_struct *task)
270{
271 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
272 task->jobctl &= ~JOBCTL_TRAPPING;
273 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
274 }
275}
276
277/**
278 * task_clear_jobctl_pending - clear jobctl pending bits
279 * @task: target task
280 * @mask: pending bits to clear
281 *
282 * Clear @mask from @task->jobctl. @mask must be subset of
283 * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
284 * STOP bits are cleared together.
285 *
286 * If clearing of @mask leaves no stop or trap pending, this function calls
287 * task_clear_jobctl_trapping().
288 *
289 * CONTEXT:
290 * Must be called with @task->sighand->siglock held.
291 */
292void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
293{
294 BUG_ON(mask & ~JOBCTL_PENDING_MASK);
295
296 if (mask & JOBCTL_STOP_PENDING)
297 mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
298
299 task->jobctl &= ~mask;
300
301 if (!(task->jobctl & JOBCTL_PENDING_MASK))
302 task_clear_jobctl_trapping(task);
303}
304
305/**
306 * task_participate_group_stop - participate in a group stop
307 * @task: task participating in a group stop
308 *
309 * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
310 * Group stop states are cleared and the group stop count is consumed if
311 * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
312 * stop, the appropriate %SIGNAL_* flags are set.
313 *
314 * CONTEXT:
315 * Must be called with @task->sighand->siglock held.
316 *
317 * RETURNS:
318 * %true if group stop completion should be notified to the parent, %false
319 * otherwise.
320 */
321static bool task_participate_group_stop(struct task_struct *task)
322{
323 struct signal_struct *sig = task->signal;
324 bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
325
326 WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
327
328 task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
329
330 if (!consume)
331 return false;
332
333 if (!WARN_ON_ONCE(sig->group_stop_count == 0))
334 sig->group_stop_count--;
335
336 /*
337 * Tell the caller to notify completion iff we are entering into a
338 * fresh group stop. Read comment in do_signal_stop() for details.
339 */
340 if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
341 sig->flags = SIGNAL_STOP_STOPPED;
342 return true;
343 }
344 return false;
345}
346
226/* 347/*
227 * allocate a new signal queue record 348 * allocate a new signal queue record
228 * - this may be called without locks if and only if t == current, otherwise an 349 * - this may be called without locks if and only if t == current, otherwise an
@@ -372,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
372 return 1; 493 return 1;
373 if (handler != SIG_IGN && handler != SIG_DFL) 494 if (handler != SIG_IGN && handler != SIG_DFL)
374 return 0; 495 return 0;
375 return !tracehook_consider_fatal_signal(tsk, sig); 496 /* if ptraced, let the tracer determine */
497 return !tsk->ptrace;
376} 498}
377 499
378/* 500/*
@@ -527,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
527 * is to alert stop-signal processing code when another 649 * is to alert stop-signal processing code when another
528 * processor has come along and cleared the flag. 650 * processor has come along and cleared the flag.
529 */ 651 */
530 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 652 current->jobctl |= JOBCTL_STOP_DEQUEUED;
531 } 653 }
532 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 654 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
533 /* 655 /*
@@ -592,7 +714,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
592 if (sigisemptyset(&m)) 714 if (sigisemptyset(&m))
593 return 0; 715 return 0;
594 716
595 signandsets(&s->signal, &s->signal, mask); 717 sigandnsets(&s->signal, &s->signal, mask);
596 list_for_each_entry_safe(q, n, &s->list, list) { 718 list_for_each_entry_safe(q, n, &s->list, list) {
597 if (sigismember(mask, q->info.si_signo)) { 719 if (sigismember(mask, q->info.si_signo)) {
598 list_del_init(&q->list); 720 list_del_init(&q->list);
@@ -696,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info,
696 return security_task_kill(t, info, sig, 0); 818 return security_task_kill(t, info, sig, 0);
697} 819}
698 820
821/**
822 * ptrace_trap_notify - schedule trap to notify ptracer
823 * @t: tracee wanting to notify tracer
824 *
 825 * This function schedules a sticky ptrace trap which is cleared on the next
826 * TRAP_STOP to notify ptracer of an event. @t must have been seized by
827 * ptracer.
828 *
829 * If @t is running, STOP trap will be taken. If trapped for STOP and
830 * ptracer is listening for events, tracee is woken up so that it can
831 * re-trap for the new event. If trapped otherwise, STOP trap will be
832 * eventually taken without returning to userland after the existing traps
833 * are finished by PTRACE_CONT.
834 *
835 * CONTEXT:
836 * Must be called with @task->sighand->siglock held.
837 */
838static void ptrace_trap_notify(struct task_struct *t)
839{
840 WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
841 assert_spin_locked(&t->sighand->siglock);
842
843 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
844 signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
845}
846
699/* 847/*
700 * Handle magic process-wide effects of stop/continue signals. Unlike 848 * Handle magic process-wide effects of stop/continue signals. Unlike
701 * the signal actions, these happen immediately at signal-generation 849 * the signal actions, these happen immediately at signal-generation
@@ -727,34 +875,17 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
727 } else if (sig == SIGCONT) { 875 } else if (sig == SIGCONT) {
728 unsigned int why; 876 unsigned int why;
729 /* 877 /*
730 * Remove all stop signals from all queues, 878 * Remove all stop signals from all queues, wake all threads.
731 * and wake all threads.
732 */ 879 */
733 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 880 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
734 t = p; 881 t = p;
735 do { 882 do {
736 unsigned int state; 883 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
737 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 884 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
738 /* 885 if (likely(!(t->ptrace & PT_SEIZED)))
739 * If there is a handler for SIGCONT, we must make 886 wake_up_state(t, __TASK_STOPPED);
740 * sure that no thread returns to user mode before 887 else
741 * we post the signal, in case it was the only 888 ptrace_trap_notify(t);
742 * thread eligible to run the signal handler--then
743 * it must not do anything between resuming and
744 * running the handler. With the TIF_SIGPENDING
745 * flag set, the thread will pause and acquire the
746 * siglock that we hold now and until we've queued
747 * the pending signal.
748 *
749 * Wake up the stopped thread _after_ setting
750 * TIF_SIGPENDING
751 */
752 state = __TASK_STOPPED;
753 if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
754 set_tsk_thread_flag(t, TIF_SIGPENDING);
755 state |= TASK_INTERRUPTIBLE;
756 }
757 wake_up_state(t, state);
758 } while_each_thread(p, t); 889 } while_each_thread(p, t);
759 890
760 /* 891 /*
@@ -780,13 +911,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
780 signal->flags = why | SIGNAL_STOP_CONTINUED; 911 signal->flags = why | SIGNAL_STOP_CONTINUED;
781 signal->group_stop_count = 0; 912 signal->group_stop_count = 0;
782 signal->group_exit_code = 0; 913 signal->group_exit_code = 0;
783 } else {
784 /*
785 * We are not stopped, but there could be a stop
786 * signal in the middle of being processed after
787 * being removed from the queue. Clear that too.
788 */
789 signal->flags &= ~SIGNAL_STOP_DEQUEUED;
790 } 914 }
791 } 915 }
792 916
@@ -858,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
858 if (sig_fatal(p, sig) && 982 if (sig_fatal(p, sig) &&
859 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 983 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
860 !sigismember(&t->real_blocked, sig) && 984 !sigismember(&t->real_blocked, sig) &&
861 (sig == SIGKILL || 985 (sig == SIGKILL || !t->ptrace)) {
862 !tracehook_consider_fatal_signal(t, sig))) {
863 /* 986 /*
864 * This signal will be fatal to the whole group. 987 * This signal will be fatal to the whole group.
865 */ 988 */
@@ -875,6 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
875 signal->group_stop_count = 0; 998 signal->group_stop_count = 0;
876 t = p; 999 t = p;
877 do { 1000 do {
1001 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
878 sigaddset(&t->pending.signal, SIGKILL); 1002 sigaddset(&t->pending.signal, SIGKILL);
879 signal_wake_up(t, 1); 1003 signal_wake_up(t, 1);
880 } while_each_thread(p, t); 1004 } while_each_thread(p, t);
@@ -1109,6 +1233,7 @@ int zap_other_threads(struct task_struct *p)
1109 p->signal->group_stop_count = 0; 1233 p->signal->group_stop_count = 0;
1110 1234
1111 while_each_thread(p, t) { 1235 while_each_thread(p, t) {
1236 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
1112 count++; 1237 count++;
1113 1238
1114 /* Don't bother with already dead threads */ 1239 /* Don't bother with already dead threads */
@@ -1126,18 +1251,25 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1126{ 1251{
1127 struct sighand_struct *sighand; 1252 struct sighand_struct *sighand;
1128 1253
1129 rcu_read_lock();
1130 for (;;) { 1254 for (;;) {
1255 local_irq_save(*flags);
1256 rcu_read_lock();
1131 sighand = rcu_dereference(tsk->sighand); 1257 sighand = rcu_dereference(tsk->sighand);
1132 if (unlikely(sighand == NULL)) 1258 if (unlikely(sighand == NULL)) {
1259 rcu_read_unlock();
1260 local_irq_restore(*flags);
1133 break; 1261 break;
1262 }
1134 1263
1135 spin_lock_irqsave(&sighand->siglock, *flags); 1264 spin_lock(&sighand->siglock);
1136 if (likely(sighand == tsk->sighand)) 1265 if (likely(sighand == tsk->sighand)) {
1266 rcu_read_unlock();
1137 break; 1267 break;
1138 spin_unlock_irqrestore(&sighand->siglock, *flags); 1268 }
1269 spin_unlock(&sighand->siglock);
1270 rcu_read_unlock();
1271 local_irq_restore(*flags);
1139 } 1272 }
1140 rcu_read_unlock();
1141 1273
1142 return sighand; 1274 return sighand;
1143} 1275}
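The loop above now disables interrupts before entering the RCU read-side section and takes the per-sighand lock raw. Most callers go through the lock_task_sighand()/unlock_task_sighand() wrappers; a sketch of that caller pattern, with an illustrative critical section:

#include <linux/errno.h>
#include <linux/sched.h>

static int example_read_signal_state(struct task_struct *tsk)
{
	struct sighand_struct *sighand;
	unsigned long flags;
	int pending;

	sighand = lock_task_sighand(tsk, &flags);	/* wraps __lock_task_sighand() */
	if (!sighand)
		return -ESRCH;		/* task is exiting, ->sighand already gone */

	pending = signal_pending(tsk);	/* siglock pins the signal state */
	unlock_task_sighand(tsk, &flags);

	return pending;
}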
@@ -1452,22 +1584,22 @@ ret:
1452 * Let a parent know about the death of a child. 1584 * Let a parent know about the death of a child.
1453 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1585 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1454 * 1586 *
1455 * Returns -1 if our parent ignored us and so we've switched to 1587 * Returns true if our parent ignored us and so we've switched to
1456 * self-reaping, or else @sig. 1588 * self-reaping.
1457 */ 1589 */
1458int do_notify_parent(struct task_struct *tsk, int sig) 1590bool do_notify_parent(struct task_struct *tsk, int sig)
1459{ 1591{
1460 struct siginfo info; 1592 struct siginfo info;
1461 unsigned long flags; 1593 unsigned long flags;
1462 struct sighand_struct *psig; 1594 struct sighand_struct *psig;
1463 int ret = sig; 1595 bool autoreap = false;
1464 1596
1465 BUG_ON(sig == -1); 1597 BUG_ON(sig == -1);
1466 1598
1467 /* do_notify_parent_cldstop should have been called instead. */ 1599 /* do_notify_parent_cldstop should have been called instead. */
1468 BUG_ON(task_is_stopped_or_traced(tsk)); 1600 BUG_ON(task_is_stopped_or_traced(tsk));
1469 1601
1470 BUG_ON(!task_ptrace(tsk) && 1602 BUG_ON(!tsk->ptrace &&
1471 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1603 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1472 1604
1473 info.si_signo = sig; 1605 info.si_signo = sig;
@@ -1506,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1506 1638
1507 psig = tsk->parent->sighand; 1639 psig = tsk->parent->sighand;
1508 spin_lock_irqsave(&psig->siglock, flags); 1640 spin_lock_irqsave(&psig->siglock, flags);
1509 if (!task_ptrace(tsk) && sig == SIGCHLD && 1641 if (!tsk->ptrace && sig == SIGCHLD &&
1510 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1642 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1511 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1643 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1512 /* 1644 /*
@@ -1524,28 +1656,42 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1524 * is implementation-defined: we do (if you don't want 1656 * is implementation-defined: we do (if you don't want
1525 * it, just use SIG_IGN instead). 1657 * it, just use SIG_IGN instead).
1526 */ 1658 */
1527 ret = tsk->exit_signal = -1; 1659 autoreap = true;
1528 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) 1660 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1529 sig = -1; 1661 sig = 0;
1530 } 1662 }
1531 if (valid_signal(sig) && sig > 0) 1663 if (valid_signal(sig) && sig)
1532 __group_send_sig_info(sig, &info, tsk->parent); 1664 __group_send_sig_info(sig, &info, tsk->parent);
1533 __wake_up_parent(tsk, tsk->parent); 1665 __wake_up_parent(tsk, tsk->parent);
1534 spin_unlock_irqrestore(&psig->siglock, flags); 1666 spin_unlock_irqrestore(&psig->siglock, flags);
1535 1667
1536 return ret; 1668 return autoreap;
1537} 1669}
1538 1670
1539static void do_notify_parent_cldstop(struct task_struct *tsk, int why) 1671/**
1672 * do_notify_parent_cldstop - notify parent of stopped/continued state change
1673 * @tsk: task reporting the state change
1674 * @for_ptracer: the notification is for ptracer
1675 * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
1676 *
1677 * Notify @tsk's parent that the stopped/continued state has changed. If
 1678 * @for_ptracer is %false, @tsk's group leader notifies its real parent.
1679 * If %true, @tsk reports to @tsk->parent which should be the ptracer.
1680 *
1681 * CONTEXT:
1682 * Must be called with tasklist_lock at least read locked.
1683 */
1684static void do_notify_parent_cldstop(struct task_struct *tsk,
1685 bool for_ptracer, int why)
1540{ 1686{
1541 struct siginfo info; 1687 struct siginfo info;
1542 unsigned long flags; 1688 unsigned long flags;
1543 struct task_struct *parent; 1689 struct task_struct *parent;
1544 struct sighand_struct *sighand; 1690 struct sighand_struct *sighand;
1545 1691
1546 if (task_ptrace(tsk)) 1692 if (for_ptracer) {
1547 parent = tsk->parent; 1693 parent = tsk->parent;
1548 else { 1694 } else {
1549 tsk = tsk->group_leader; 1695 tsk = tsk->group_leader;
1550 parent = tsk->real_parent; 1696 parent = tsk->real_parent;
1551 } 1697 }
@@ -1592,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1592 1738
1593static inline int may_ptrace_stop(void) 1739static inline int may_ptrace_stop(void)
1594{ 1740{
1595 if (!likely(task_ptrace(current))) 1741 if (!likely(current->ptrace))
1596 return 0; 1742 return 0;
1597 /* 1743 /*
1598 * Are we in the middle of do_coredump? 1744 * Are we in the middle of do_coredump?
@@ -1631,10 +1777,12 @@ static int sigkill_pending(struct task_struct *tsk)
1631 * If we actually decide not to stop at all because the tracer 1777 * If we actually decide not to stop at all because the tracer
1632 * is gone, we keep current->exit_code unless clear_code. 1778 * is gone, we keep current->exit_code unless clear_code.
1633 */ 1779 */
1634static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1780static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1635 __releases(&current->sighand->siglock) 1781 __releases(&current->sighand->siglock)
1636 __acquires(&current->sighand->siglock) 1782 __acquires(&current->sighand->siglock)
1637{ 1783{
1784 bool gstop_done = false;
1785
1638 if (arch_ptrace_stop_needed(exit_code, info)) { 1786 if (arch_ptrace_stop_needed(exit_code, info)) {
1639 /* 1787 /*
1640 * The arch code has something special to do before a 1788 * The arch code has something special to do before a
@@ -1655,21 +1803,52 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1655 } 1803 }
1656 1804
1657 /* 1805 /*
1658 * If there is a group stop in progress, 1806 * We're committing to trapping. TRACED should be visible before
1659 * we must participate in the bookkeeping. 1807 * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
1808 * Also, transition to TRACED and updates to ->jobctl should be
1809 * atomic with respect to siglock and should be done after the arch
1810 * hook as siglock is released and regrabbed across it.
1660 */ 1811 */
1661 if (current->signal->group_stop_count > 0) 1812 set_current_state(TASK_TRACED);
1662 --current->signal->group_stop_count;
1663 1813
1664 current->last_siginfo = info; 1814 current->last_siginfo = info;
1665 current->exit_code = exit_code; 1815 current->exit_code = exit_code;
1666 1816
1667 /* Let the debugger run. */ 1817 /*
1668 __set_current_state(TASK_TRACED); 1818 * If @why is CLD_STOPPED, we're trapping to participate in a group
 1819 * stop. Do the bookkeeping. Note that if SIGCONT was delivered
1820 * across siglock relocks since INTERRUPT was scheduled, PENDING
1821 * could be clear now. We act as if SIGCONT is received after
1822 * TASK_TRACED is entered - ignore it.
1823 */
1824 if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
1825 gstop_done = task_participate_group_stop(current);
1826
1827 /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
1828 task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
1829 if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
1830 task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
1831
1832 /* entering a trap, clear TRAPPING */
1833 task_clear_jobctl_trapping(current);
1834
1669 spin_unlock_irq(&current->sighand->siglock); 1835 spin_unlock_irq(&current->sighand->siglock);
1670 read_lock(&tasklist_lock); 1836 read_lock(&tasklist_lock);
1671 if (may_ptrace_stop()) { 1837 if (may_ptrace_stop()) {
1672 do_notify_parent_cldstop(current, CLD_TRAPPED); 1838 /*
1839 * Notify parents of the stop.
1840 *
1841 * While ptraced, there are two parents - the ptracer and
1842 * the real_parent of the group_leader. The ptracer should
1843 * know about every stop while the real parent is only
1844 * interested in the completion of group stop. The states
1845 * for the two don't interact with each other. Notify
 1846 * separately unless they're going to be duplicates.
1847 */
1848 do_notify_parent_cldstop(current, true, why);
1849 if (gstop_done && ptrace_reparented(current))
1850 do_notify_parent_cldstop(current, false, why);
1851
1673 /* 1852 /*
1674 * Don't want to allow preemption here, because 1853 * Don't want to allow preemption here, because
1675 * sys_ptrace() needs this task to be inactive. 1854 * sys_ptrace() needs this task to be inactive.
@@ -1684,7 +1863,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1684 /* 1863 /*
1685 * By the time we got the lock, our tracer went away. 1864 * By the time we got the lock, our tracer went away.
1686 * Don't drop the lock yet, another tracer may come. 1865 * Don't drop the lock yet, another tracer may come.
1866 *
1867 * If @gstop_done, the ptracer went away between group stop
1868 * completion and here. During detach, it would have set
1869 * JOBCTL_STOP_PENDING on us and we'll re-enter
1870 * TASK_STOPPED in do_signal_stop() on return, so notifying
1871 * the real parent of the group stop completion is enough.
1687 */ 1872 */
1873 if (gstop_done)
1874 do_notify_parent_cldstop(current, false, why);
1875
1688 __set_current_state(TASK_RUNNING); 1876 __set_current_state(TASK_RUNNING);
1689 if (clear_code) 1877 if (clear_code)
1690 current->exit_code = 0; 1878 current->exit_code = 0;
@@ -1706,6 +1894,9 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1706 spin_lock_irq(&current->sighand->siglock); 1894 spin_lock_irq(&current->sighand->siglock);
1707 current->last_siginfo = NULL; 1895 current->last_siginfo = NULL;
1708 1896
1897 /* LISTENING can be set only during STOP traps, clear it */
1898 current->jobctl &= ~JOBCTL_LISTENING;
1899
1709 /* 1900 /*
1710 * Queued signals ignored us while we were stopped for tracing. 1901 * Queued signals ignored us while we were stopped for tracing.
1711 * So check for any that we should take before resuming user mode. 1902 * So check for any that we should take before resuming user mode.
@@ -1714,107 +1905,204 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1714 recalc_sigpending_tsk(current); 1905 recalc_sigpending_tsk(current);
1715} 1906}
1716 1907
1717void ptrace_notify(int exit_code) 1908static void ptrace_do_notify(int signr, int exit_code, int why)
1718{ 1909{
1719 siginfo_t info; 1910 siginfo_t info;
1720 1911
1721 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1722
1723 memset(&info, 0, sizeof info); 1912 memset(&info, 0, sizeof info);
1724 info.si_signo = SIGTRAP; 1913 info.si_signo = signr;
1725 info.si_code = exit_code; 1914 info.si_code = exit_code;
1726 info.si_pid = task_pid_vnr(current); 1915 info.si_pid = task_pid_vnr(current);
1727 info.si_uid = current_uid(); 1916 info.si_uid = current_uid();
1728 1917
1729 /* Let the debugger run. */ 1918 /* Let the debugger run. */
1919 ptrace_stop(exit_code, why, 1, &info);
1920}
1921
1922void ptrace_notify(int exit_code)
1923{
1924 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1925
1730 spin_lock_irq(&current->sighand->siglock); 1926 spin_lock_irq(&current->sighand->siglock);
1731 ptrace_stop(exit_code, 1, &info); 1927 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
1732 spin_unlock_irq(&current->sighand->siglock); 1928 spin_unlock_irq(&current->sighand->siglock);
1733} 1929}
1734 1930
1735/* 1931/**
1736 * This performs the stopping for SIGSTOP and other stop signals. 1932 * do_signal_stop - handle group stop for SIGSTOP and other stop signals
1737 * We have to stop all threads in the thread group. 1933 * @signr: signr causing group stop if initiating
1738 * Returns non-zero if we've actually stopped and released the siglock. 1934 *
1739 * Returns zero if we didn't stop and still hold the siglock. 1935 * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
1936 * and participate in it. If already set, participate in the existing
1937 * group stop. If participated in a group stop (and thus slept), %true is
1938 * returned with siglock released.
1939 *
1940 * If ptraced, this function doesn't handle stop itself. Instead,
1941 * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
1942 * untouched. The caller must ensure that INTERRUPT trap handling takes
 1943 * place afterwards.
1944 *
1945 * CONTEXT:
1946 * Must be called with @current->sighand->siglock held, which is released
1947 * on %true return.
1948 *
1949 * RETURNS:
1950 * %false if group stop is already cancelled or ptrace trap is scheduled.
1951 * %true if participated in group stop.
1740 */ 1952 */
1741static int do_signal_stop(int signr) 1953static bool do_signal_stop(int signr)
1954 __releases(&current->sighand->siglock)
1742{ 1955{
1743 struct signal_struct *sig = current->signal; 1956 struct signal_struct *sig = current->signal;
1744 int notify;
1745 1957
1746 if (!sig->group_stop_count) { 1958 if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
1959 unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
1747 struct task_struct *t; 1960 struct task_struct *t;
1748 1961
1749 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1962 /* signr will be recorded in task->jobctl for retries */
1963 WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
1964
1965 if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
1750 unlikely(signal_group_exit(sig))) 1966 unlikely(signal_group_exit(sig)))
1751 return 0; 1967 return false;
1752 /* 1968 /*
1753 * There is no group stop already in progress. 1969 * There is no group stop already in progress. We must
1754 * We must initiate one now. 1970 * initiate one now.
1971 *
1972 * While ptraced, a task may be resumed while group stop is
1973 * still in effect and then receive a stop signal and
1974 * initiate another group stop. This deviates from the
1975 * usual behavior as two consecutive stop signals can't
1976 * cause two group stops when !ptraced. That is why we
1977 * also check !task_is_stopped(t) below.
1978 *
1979 * The condition can be distinguished by testing whether
1980 * SIGNAL_STOP_STOPPED is already set. Don't generate
1981 * group_exit_code in such case.
1982 *
1983 * This is not necessary for SIGNAL_STOP_CONTINUED because
1984 * an intervening stop signal is required to cause two
1985 * continued events regardless of ptrace.
1755 */ 1986 */
1756 sig->group_exit_code = signr; 1987 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1988 sig->group_exit_code = signr;
1989 else
1990 WARN_ON_ONCE(!current->ptrace);
1991
1992 sig->group_stop_count = 0;
1757 1993
1758 sig->group_stop_count = 1; 1994 if (task_set_jobctl_pending(current, signr | gstop))
1759 for (t = next_thread(current); t != current; t = next_thread(t)) 1995 sig->group_stop_count++;
1996
1997 for (t = next_thread(current); t != current;
1998 t = next_thread(t)) {
1760 /* 1999 /*
1761 * Setting state to TASK_STOPPED for a group 2000 * Setting state to TASK_STOPPED for a group
1762 * stop is always done with the siglock held, 2001 * stop is always done with the siglock held,
1763 * so this check has no races. 2002 * so this check has no races.
1764 */ 2003 */
1765 if (!(t->flags & PF_EXITING) && 2004 if (!task_is_stopped(t) &&
1766 !task_is_stopped_or_traced(t)) { 2005 task_set_jobctl_pending(t, signr | gstop)) {
1767 sig->group_stop_count++; 2006 sig->group_stop_count++;
1768 signal_wake_up(t, 0); 2007 if (likely(!(t->ptrace & PT_SEIZED)))
2008 signal_wake_up(t, 0);
2009 else
2010 ptrace_trap_notify(t);
1769 } 2011 }
2012 }
1770 } 2013 }
1771 /* 2014
1772 * If there are no other threads in the group, or if there is 2015 if (likely(!current->ptrace)) {
1773 * a group stop in progress and we are the last to stop, report 2016 int notify = 0;
1774 * to the parent. When ptraced, every thread reports itself. 2017
1775 */ 2018 /*
1776 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; 2019 * If there are no other threads in the group, or if there
1777 notify = tracehook_notify_jctl(notify, CLD_STOPPED); 2020 * is a group stop in progress and we are the last to stop,
1778 /* 2021 * report to the parent.
1779 * tracehook_notify_jctl() can drop and reacquire siglock, so 2022 */
1780 * we keep ->group_stop_count != 0 before the call. If SIGCONT 2023 if (task_participate_group_stop(current))
1781 * or SIGKILL comes in between ->group_stop_count == 0. 2024 notify = CLD_STOPPED;
1782 */ 2025
1783 if (sig->group_stop_count) {
1784 if (!--sig->group_stop_count)
1785 sig->flags = SIGNAL_STOP_STOPPED;
1786 current->exit_code = sig->group_exit_code;
1787 __set_current_state(TASK_STOPPED); 2026 __set_current_state(TASK_STOPPED);
1788 } 2027 spin_unlock_irq(&current->sighand->siglock);
1789 spin_unlock_irq(&current->sighand->siglock);
1790 2028
1791 if (notify) { 2029 /*
1792 read_lock(&tasklist_lock); 2030 * Notify the parent of the group stop completion. Because
1793 do_notify_parent_cldstop(current, notify); 2031 * we're not holding either the siglock or tasklist_lock
1794 read_unlock(&tasklist_lock); 2032 * here, ptracer may attach inbetween; however, this is for
1795 } 2033 * group stop and should always be delivered to the real
2034 * parent of the group leader. The new ptracer will get
2035 * its notification when this task transitions into
2036 * TASK_TRACED.
2037 */
2038 if (notify) {
2039 read_lock(&tasklist_lock);
2040 do_notify_parent_cldstop(current, false, notify);
2041 read_unlock(&tasklist_lock);
2042 }
1796 2043
1797 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2044 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1798 do {
1799 schedule(); 2045 schedule();
1800 } while (try_to_freeze()); 2046 return true;
1801 2047 } else {
1802 tracehook_finish_jctl(); 2048 /*
1803 current->exit_code = 0; 2049 * While ptraced, group stop is handled by STOP trap.
2050 * Schedule it and let the caller deal with it.
2051 */
2052 task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
2053 return false;
2054 }
2055}
1804 2056
1805 return 1; 2057/**
2058 * do_jobctl_trap - take care of ptrace jobctl traps
2059 *
2060 * When PT_SEIZED, it's used for both group stop and explicit
2061 * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with
2062 * accompanying siginfo. If stopped, lower eight bits of exit_code contain
2063 * the stop signal; otherwise, %SIGTRAP.
2064 *
2065 * When !PT_SEIZED, it's used only for group stop trap with stop signal
2066 * number as exit_code and no siginfo.
2067 *
2068 * CONTEXT:
2069 * Must be called with @current->sighand->siglock held, which may be
2070 * released and re-acquired before returning with intervening sleep.
2071 */
2072static void do_jobctl_trap(void)
2073{
2074 struct signal_struct *signal = current->signal;
2075 int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
2076
2077 if (current->ptrace & PT_SEIZED) {
2078 if (!signal->group_stop_count &&
2079 !(signal->flags & SIGNAL_STOP_STOPPED))
2080 signr = SIGTRAP;
2081 WARN_ON_ONCE(!signr);
2082 ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
2083 CLD_STOPPED);
2084 } else {
2085 WARN_ON_ONCE(!signr);
2086 ptrace_stop(signr, CLD_STOPPED, 0, NULL);
2087 current->exit_code = 0;
2088 }
1806} 2089}
1807 2090
1808static int ptrace_signal(int signr, siginfo_t *info, 2091static int ptrace_signal(int signr, siginfo_t *info,
1809 struct pt_regs *regs, void *cookie) 2092 struct pt_regs *regs, void *cookie)
1810{ 2093{
1811 if (!task_ptrace(current))
1812 return signr;
1813
1814 ptrace_signal_deliver(regs, cookie); 2094 ptrace_signal_deliver(regs, cookie);
1815 2095 /*
1816 /* Let the debugger run. */ 2096 * We do not check sig_kernel_stop(signr) but set this marker
1817 ptrace_stop(signr, 0, info); 2097 * unconditionally because we do not know whether debugger will
2098 * change signr. This flag has no meaning unless we are going
2099 * to stop after return from ptrace_stop(). In this case it will
2100 * be checked in do_signal_stop(), we should only stop if it was
2101 * not cleared by SIGCONT while we were sleeping. See also the
2102 * comment in dequeue_signal().
2103 */
2104 current->jobctl |= JOBCTL_STOP_DEQUEUED;
2105 ptrace_stop(signr, CLD_TRAPPED, 0, info);
1818 2106
1819 /* We're back. Did the debugger cancel the sig? */ 2107 /* We're back. Did the debugger cancel the sig? */
1820 signr = current->exit_code; 2108 signr = current->exit_code;
@@ -1869,54 +2157,63 @@ relock:
1869 * the CLD_ si_code into SIGNAL_CLD_MASK bits. 2157 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
1870 */ 2158 */
1871 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { 2159 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
1872 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 2160 int why;
1873 ? CLD_CONTINUED : CLD_STOPPED; 2161
2162 if (signal->flags & SIGNAL_CLD_CONTINUED)
2163 why = CLD_CONTINUED;
2164 else
2165 why = CLD_STOPPED;
2166
1874 signal->flags &= ~SIGNAL_CLD_MASK; 2167 signal->flags &= ~SIGNAL_CLD_MASK;
1875 2168
1876 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1877 spin_unlock_irq(&sighand->siglock); 2169 spin_unlock_irq(&sighand->siglock);
1878 2170
1879 if (why) { 2171 /*
1880 read_lock(&tasklist_lock); 2172 * Notify the parent that we're continuing. This event is
1881 do_notify_parent_cldstop(current->group_leader, why); 2173 * always per-process and doesn't make a whole lot of sense
1882 read_unlock(&tasklist_lock); 2174 * for ptracers, who shouldn't consume the state via
1883 } 2175 * wait(2) either, but, for backward compatibility, notify
 2176 * the ptracer of the group leader too unless it's going to be
2177 * a duplicate.
2178 */
2179 read_lock(&tasklist_lock);
2180 do_notify_parent_cldstop(current, false, why);
2181
2182 if (ptrace_reparented(current->group_leader))
2183 do_notify_parent_cldstop(current->group_leader,
2184 true, why);
2185 read_unlock(&tasklist_lock);
2186
1884 goto relock; 2187 goto relock;
1885 } 2188 }
1886 2189
1887 for (;;) { 2190 for (;;) {
1888 struct k_sigaction *ka; 2191 struct k_sigaction *ka;
1889 /* 2192
1890 * Tracing can induce an artificial signal and choose sigaction. 2193 if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
1891 * The return value in @signr determines the default action, 2194 do_signal_stop(0))
1892 * but @info->si_signo is the signal number we will report.
1893 */
1894 signr = tracehook_get_signal(current, regs, info, return_ka);
1895 if (unlikely(signr < 0))
1896 goto relock; 2195 goto relock;
1897 if (unlikely(signr != 0))
1898 ka = return_ka;
1899 else {
1900 if (unlikely(signal->group_stop_count > 0) &&
1901 do_signal_stop(0))
1902 goto relock;
1903 2196
1904 signr = dequeue_signal(current, &current->blocked, 2197 if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
1905 info); 2198 do_jobctl_trap();
2199 spin_unlock_irq(&sighand->siglock);
2200 goto relock;
2201 }
1906 2202
1907 if (!signr) 2203 signr = dequeue_signal(current, &current->blocked, info);
1908 break; /* will return 0 */
1909 2204
1910 if (signr != SIGKILL) { 2205 if (!signr)
1911 signr = ptrace_signal(signr, info, 2206 break; /* will return 0 */
1912 regs, cookie);
1913 if (!signr)
1914 continue;
1915 }
1916 2207
1917 ka = &sighand->action[signr-1]; 2208 if (unlikely(current->ptrace) && signr != SIGKILL) {
2209 signr = ptrace_signal(signr, info,
2210 regs, cookie);
2211 if (!signr)
2212 continue;
1918 } 2213 }
1919 2214
2215 ka = &sighand->action[signr-1];
2216
1920 /* Trace actually delivered signals. */ 2217 /* Trace actually delivered signals. */
1921 trace_signal_deliver(signr, info, ka); 2218 trace_signal_deliver(signr, info, ka);
1922 2219
@@ -2017,10 +2314,42 @@ relock:
2017 return signr; 2314 return signr;
2018} 2315}
2019 2316
2317/*
2318 * It could be that complete_signal() picked us to notify about the
2319 * group-wide signal. Other threads should be notified now to take
2320 * the shared signals in @which since we will not.
2321 */
2322static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
2323{
2324 sigset_t retarget;
2325 struct task_struct *t;
2326
2327 sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
2328 if (sigisemptyset(&retarget))
2329 return;
2330
2331 t = tsk;
2332 while_each_thread(tsk, t) {
2333 if (t->flags & PF_EXITING)
2334 continue;
2335
2336 if (!has_pending_signals(&retarget, &t->blocked))
2337 continue;
2338 /* Remove the signals this thread can handle. */
2339 sigandsets(&retarget, &retarget, &t->blocked);
2340
2341 if (!signal_pending(t))
2342 signal_wake_up(t, 0);
2343
2344 if (sigisemptyset(&retarget))
2345 break;
2346 }
2347}
2348
2020void exit_signals(struct task_struct *tsk) 2349void exit_signals(struct task_struct *tsk)
2021{ 2350{
2022 int group_stop = 0; 2351 int group_stop = 0;
2023 struct task_struct *t; 2352 sigset_t unblocked;
2024 2353
2025 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2354 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2026 tsk->flags |= PF_EXITING; 2355 tsk->flags |= PF_EXITING;
@@ -2036,26 +2365,23 @@ void exit_signals(struct task_struct *tsk)
2036 if (!signal_pending(tsk)) 2365 if (!signal_pending(tsk))
2037 goto out; 2366 goto out;
2038 2367
2039 /* 2368 unblocked = tsk->blocked;
2040 * It could be that __group_complete_signal() choose us to 2369 signotset(&unblocked);
2041 * notify about group-wide signal. Another thread should be 2370 retarget_shared_pending(tsk, &unblocked);
2042 * woken now to take the signal since we will not.
2043 */
2044 for (t = tsk; (t = next_thread(t)) != tsk; )
2045 if (!signal_pending(t) && !(t->flags & PF_EXITING))
2046 recalc_sigpending_and_wake(t);
2047 2371
2048 if (unlikely(tsk->signal->group_stop_count) && 2372 if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
2049 !--tsk->signal->group_stop_count) { 2373 task_participate_group_stop(tsk))
2050 tsk->signal->flags = SIGNAL_STOP_STOPPED; 2374 group_stop = CLD_STOPPED;
2051 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
2052 }
2053out: 2375out:
2054 spin_unlock_irq(&tsk->sighand->siglock); 2376 spin_unlock_irq(&tsk->sighand->siglock);
2055 2377
2378 /*
2379 * If group stop has completed, deliver the notification. This
2380 * should always go to the real parent of the group leader.
2381 */
2056 if (unlikely(group_stop)) { 2382 if (unlikely(group_stop)) {
2057 read_lock(&tasklist_lock); 2383 read_lock(&tasklist_lock);
2058 do_notify_parent_cldstop(tsk, group_stop); 2384 do_notify_parent_cldstop(tsk, false, group_stop);
2059 read_unlock(&tasklist_lock); 2385 read_unlock(&tasklist_lock);
2060 } 2386 }
2061} 2387}
@@ -2089,11 +2415,33 @@ long do_no_restart_syscall(struct restart_block *param)
2089 return -EINTR; 2415 return -EINTR;
2090} 2416}
2091 2417
2092/* 2418static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2093 * We don't need to get the kernel lock - this is all local to this 2419{
2094 * particular thread.. (and that's good, because this is _heavily_ 2420 if (signal_pending(tsk) && !thread_group_empty(tsk)) {
2095 * used by various programs) 2421 sigset_t newblocked;
2422 /* A set of now blocked but previously unblocked signals. */
2423 sigandnsets(&newblocked, newset, &current->blocked);
2424 retarget_shared_pending(tsk, &newblocked);
2425 }
2426 tsk->blocked = *newset;
2427 recalc_sigpending();
2428}
2429
2430/**
2431 * set_current_blocked - change current->blocked mask
2432 * @newset: new mask
2433 *
 2434 * It is wrong to change ->blocked directly; this helper should be used
2435 * to ensure the process can't miss a shared signal we are going to block.
2096 */ 2436 */
2437void set_current_blocked(const sigset_t *newset)
2438{
2439 struct task_struct *tsk = current;
2440
2441 spin_lock_irq(&tsk->sighand->siglock);
2442 __set_task_blocked(tsk, newset);
2443 spin_unlock_irq(&tsk->sighand->siglock);
2444}
2097 2445
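A short sketch of the intended replacement for open-coded ->blocked updates; the wrapper function is illustrative, while sigaddset() and set_current_blocked() are the real interfaces:

/* Illustrative: block one more signal for the current task.  No
 * locking here; set_current_blocked() takes siglock itself and
 * retargets shared pending signals before the mask changes. */
static void example_block_signal(int sig)
{
	sigset_t newset = current->blocked;

	sigaddset(&newset, sig);
	set_current_blocked(&newset);
}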
2098/* 2446/*
2099 * This is also useful for kernel threads that want to temporarily 2447 * This is also useful for kernel threads that want to temporarily
@@ -2105,73 +2453,66 @@ long do_no_restart_syscall(struct restart_block *param)
2105 */ 2453 */
2106int sigprocmask(int how, sigset_t *set, sigset_t *oldset) 2454int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2107{ 2455{
2108 int error; 2456 struct task_struct *tsk = current;
2457 sigset_t newset;
2109 2458
2110 spin_lock_irq(&current->sighand->siglock); 2459 /* Lockless, only current can change ->blocked, never from irq */
2111 if (oldset) 2460 if (oldset)
2112 *oldset = current->blocked; 2461 *oldset = tsk->blocked;
2113 2462
2114 error = 0;
2115 switch (how) { 2463 switch (how) {
2116 case SIG_BLOCK: 2464 case SIG_BLOCK:
2117 sigorsets(&current->blocked, &current->blocked, set); 2465 sigorsets(&newset, &tsk->blocked, set);
2118 break; 2466 break;
2119 case SIG_UNBLOCK: 2467 case SIG_UNBLOCK:
2120 signandsets(&current->blocked, &current->blocked, set); 2468 sigandnsets(&newset, &tsk->blocked, set);
2121 break; 2469 break;
2122 case SIG_SETMASK: 2470 case SIG_SETMASK:
2123 current->blocked = *set; 2471 newset = *set;
2124 break; 2472 break;
2125 default: 2473 default:
2126 error = -EINVAL; 2474 return -EINVAL;
2127 } 2475 }
2128 recalc_sigpending();
2129 spin_unlock_irq(&current->sighand->siglock);
2130 2476
2131 return error; 2477 set_current_blocked(&newset);
2478 return 0;
2132} 2479}
2133 2480
2134/** 2481/**
2135 * sys_rt_sigprocmask - change the list of currently blocked signals 2482 * sys_rt_sigprocmask - change the list of currently blocked signals
2136 * @how: whether to add, remove, or set signals 2483 * @how: whether to add, remove, or set signals
2137 * @set: stores pending signals 2484 * @nset: stores pending signals
2138 * @oset: previous value of signal mask if non-null 2485 * @oset: previous value of signal mask if non-null
2139 * @sigsetsize: size of sigset_t type 2486 * @sigsetsize: size of sigset_t type
2140 */ 2487 */
2141SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, 2488SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
2142 sigset_t __user *, oset, size_t, sigsetsize) 2489 sigset_t __user *, oset, size_t, sigsetsize)
2143{ 2490{
2144 int error = -EINVAL;
2145 sigset_t old_set, new_set; 2491 sigset_t old_set, new_set;
2492 int error;
2146 2493
2147 /* XXX: Don't preclude handling different sized sigset_t's. */ 2494 /* XXX: Don't preclude handling different sized sigset_t's. */
2148 if (sigsetsize != sizeof(sigset_t)) 2495 if (sigsetsize != sizeof(sigset_t))
2149 goto out; 2496 return -EINVAL;
2150 2497
2151 if (set) { 2498 old_set = current->blocked;
2152 error = -EFAULT; 2499
2153 if (copy_from_user(&new_set, set, sizeof(*set))) 2500 if (nset) {
2154 goto out; 2501 if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
2502 return -EFAULT;
2155 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2503 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2156 2504
2157 error = sigprocmask(how, &new_set, &old_set); 2505 error = sigprocmask(how, &new_set, NULL);
2158 if (error) 2506 if (error)
2159 goto out; 2507 return error;
2160 if (oset) 2508 }
2161 goto set_old;
2162 } else if (oset) {
2163 spin_lock_irq(&current->sighand->siglock);
2164 old_set = current->blocked;
2165 spin_unlock_irq(&current->sighand->siglock);
2166 2509
2167 set_old: 2510 if (oset) {
2168 error = -EFAULT; 2511 if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
2169 if (copy_to_user(oset, &old_set, sizeof(*oset))) 2512 return -EFAULT;
2170 goto out;
2171 } 2513 }
2172 error = 0; 2514
2173out: 2515 return 0;
2174 return error;
2175} 2516}
2176 2517
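From userspace the rework is invisible: sigprocmask(2) keeps its old semantics while funnelling through set_current_blocked(). A runnable example exercising the SIG_BLOCK and SIG_SETMASK paths:

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t block, old;

	sigemptyset(&block);
	sigaddset(&block, SIGUSR1);

	if (sigprocmask(SIG_BLOCK, &block, &old) != 0) {
		perror("sigprocmask");
		return 1;
	}
	printf("SIGUSR1 was %sblocked before\n",
	       sigismember(&old, SIGUSR1) ? "" : "not ");

	sigprocmask(SIG_SETMASK, &old, NULL);	/* restore the original mask */
	return 0;
}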
2177long do_sigpending(void __user *set, unsigned long sigsetsize) 2518long do_sigpending(void __user *set, unsigned long sigsetsize)
@@ -2284,6 +2625,66 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2284#endif 2625#endif
2285 2626
2286/** 2627/**
2628 * do_sigtimedwait - wait for queued signals specified in @which
2629 * @which: queued signals to wait for
2630 * @info: if non-null, the signal's siginfo is returned here
2631 * @ts: upper bound on process time suspension
2632 */
2633int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2634 const struct timespec *ts)
2635{
2636 struct task_struct *tsk = current;
2637 long timeout = MAX_SCHEDULE_TIMEOUT;
2638 sigset_t mask = *which;
2639 int sig;
2640
2641 if (ts) {
2642 if (!timespec_valid(ts))
2643 return -EINVAL;
2644 timeout = timespec_to_jiffies(ts);
2645 /*
2646 * We can be close to the next tick, add another one
2647 * to ensure we will wait at least the time asked for.
2648 */
2649 if (ts->tv_sec || ts->tv_nsec)
2650 timeout++;
2651 }
2652
2653 /*
2654 * Invert the set of allowed signals to get those we want to block.
2655 */
2656 sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
2657 signotset(&mask);
2658
2659 spin_lock_irq(&tsk->sighand->siglock);
2660 sig = dequeue_signal(tsk, &mask, info);
2661 if (!sig && timeout) {
2662 /*
 2663 * None ready, temporarily unblock the signals we're interested
 2664 * in while we sleep, so that we'll be awakened when they
 2665 * arrive. Unblocking is always fine; we can avoid
2666 * set_current_blocked().
2667 */
2668 tsk->real_blocked = tsk->blocked;
2669 sigandsets(&tsk->blocked, &tsk->blocked, &mask);
2670 recalc_sigpending();
2671 spin_unlock_irq(&tsk->sighand->siglock);
2672
2673 timeout = schedule_timeout_interruptible(timeout);
2674
2675 spin_lock_irq(&tsk->sighand->siglock);
2676 __set_task_blocked(tsk, &tsk->real_blocked);
2677 siginitset(&tsk->real_blocked, 0);
2678 sig = dequeue_signal(tsk, &mask, info);
2679 }
2680 spin_unlock_irq(&tsk->sighand->siglock);
2681
2682 if (sig)
2683 return sig;
2684 return timeout ? -EINTR : -EAGAIN;
2685}
2686
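do_sigtimedwait() above backs sigtimedwait(2). A runnable userspace example matching its semantics: the signal must be blocked, then waited for with a timeout, and the -EAGAIN return shows up as errno EAGAIN on expiry:

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
	int sig;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* must be blocked, not handled */

	sig = sigtimedwait(&set, &info, &ts);	/* returns -1 with errno on timeout */
	if (sig < 0)
		perror("sigtimedwait");
	else
		printf("got signal %d from pid %d\n", sig, (int)info.si_pid);
	return 0;
}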
2687/**
2287 * sys_rt_sigtimedwait - synchronously wait for queued signals specified 2688 * sys_rt_sigtimedwait - synchronously wait for queued signals specified
2288 * in @uthese 2689 * in @uthese
2289 * @uthese: queued signals to wait for 2690 * @uthese: queued signals to wait for
@@ -2295,11 +2696,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2295 siginfo_t __user *, uinfo, const struct timespec __user *, uts, 2696 siginfo_t __user *, uinfo, const struct timespec __user *, uts,
2296 size_t, sigsetsize) 2697 size_t, sigsetsize)
2297{ 2698{
2298 int ret, sig;
2299 sigset_t these; 2699 sigset_t these;
2300 struct timespec ts; 2700 struct timespec ts;
2301 siginfo_t info; 2701 siginfo_t info;
2302 long timeout = 0; 2702 int ret;
2303 2703
2304 /* XXX: Don't preclude handling different sized sigset_t's. */ 2704 /* XXX: Don't preclude handling different sized sigset_t's. */
2305 if (sigsetsize != sizeof(sigset_t)) 2705 if (sigsetsize != sizeof(sigset_t))
@@ -2308,61 +2708,16 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2308 if (copy_from_user(&these, uthese, sizeof(these))) 2708 if (copy_from_user(&these, uthese, sizeof(these)))
2309 return -EFAULT; 2709 return -EFAULT;
2310 2710
2311 /*
2312 * Invert the set of allowed signals to get those we
2313 * want to block.
2314 */
2315 sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
2316 signotset(&these);
2317
2318 if (uts) { 2711 if (uts) {
2319 if (copy_from_user(&ts, uts, sizeof(ts))) 2712 if (copy_from_user(&ts, uts, sizeof(ts)))
2320 return -EFAULT; 2713 return -EFAULT;
2321 if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
2322 || ts.tv_sec < 0)
2323 return -EINVAL;
2324 } 2714 }
2325 2715
2326 spin_lock_irq(&current->sighand->siglock); 2716 ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
2327 sig = dequeue_signal(current, &these, &info);
2328 if (!sig) {
2329 timeout = MAX_SCHEDULE_TIMEOUT;
2330 if (uts)
2331 timeout = (timespec_to_jiffies(&ts)
2332 + (ts.tv_sec || ts.tv_nsec));
2333
2334 if (timeout) {
2335 /*
2336 * None ready -- temporarily unblock those we're
2337 * interested while we are sleeping in so that we'll
2338 * be awakened when they arrive.
2339 */
2340 current->real_blocked = current->blocked;
2341 sigandsets(&current->blocked, &current->blocked, &these);
2342 recalc_sigpending();
2343 spin_unlock_irq(&current->sighand->siglock);
2344
2345 timeout = schedule_timeout_interruptible(timeout);
2346
2347 spin_lock_irq(&current->sighand->siglock);
2348 sig = dequeue_signal(current, &these, &info);
2349 current->blocked = current->real_blocked;
2350 siginitset(&current->real_blocked, 0);
2351 recalc_sigpending();
2352 }
2353 }
2354 spin_unlock_irq(&current->sighand->siglock);
2355 2717
2356 if (sig) { 2718 if (ret > 0 && uinfo) {
2357 ret = sig; 2719 if (copy_siginfo_to_user(uinfo, &info))
2358 if (uinfo) { 2720 ret = -EFAULT;
2359 if (copy_siginfo_to_user(uinfo, &info))
2360 ret = -EFAULT;
2361 }
2362 } else {
2363 ret = -EAGAIN;
2364 if (timeout)
2365 ret = -EINTR;
2366 } 2721 }
2367 2722
2368 return ret; 2723 return ret;
@@ -2650,60 +3005,51 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2650/** 3005/**
2651 * sys_sigprocmask - examine and change blocked signals 3006 * sys_sigprocmask - examine and change blocked signals
2652 * @how: whether to add, remove, or set signals 3007 * @how: whether to add, remove, or set signals
2653 * @set: signals to add or remove (if non-null) 3008 * @nset: signals to add or remove (if non-null)
2654 * @oset: previous value of signal mask if non-null 3009 * @oset: previous value of signal mask if non-null
2655 * 3010 *
2656 * Some platforms have their own version with special arguments; 3011 * Some platforms have their own version with special arguments;
2657 * others support only sys_rt_sigprocmask. 3012 * others support only sys_rt_sigprocmask.
2658 */ 3013 */
2659 3014
2660SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, 3015SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
2661 old_sigset_t __user *, oset) 3016 old_sigset_t __user *, oset)
2662{ 3017{
2663 int error;
2664 old_sigset_t old_set, new_set; 3018 old_sigset_t old_set, new_set;
3019 sigset_t new_blocked;
2665 3020
2666 if (set) { 3021 old_set = current->blocked.sig[0];
2667 error = -EFAULT; 3022
2668 if (copy_from_user(&new_set, set, sizeof(*set))) 3023 if (nset) {
2669 goto out; 3024 if (copy_from_user(&new_set, nset, sizeof(*nset)))
3025 return -EFAULT;
2670 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); 3026 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
2671 3027
2672 spin_lock_irq(&current->sighand->siglock); 3028 new_blocked = current->blocked;
2673 old_set = current->blocked.sig[0];
2674 3029
2675 error = 0;
2676 switch (how) { 3030 switch (how) {
2677 default:
2678 error = -EINVAL;
2679 break;
2680 case SIG_BLOCK: 3031 case SIG_BLOCK:
2681 sigaddsetmask(&current->blocked, new_set); 3032 sigaddsetmask(&new_blocked, new_set);
2682 break; 3033 break;
2683 case SIG_UNBLOCK: 3034 case SIG_UNBLOCK:
2684 sigdelsetmask(&current->blocked, new_set); 3035 sigdelsetmask(&new_blocked, new_set);
2685 break; 3036 break;
2686 case SIG_SETMASK: 3037 case SIG_SETMASK:
2687 current->blocked.sig[0] = new_set; 3038 new_blocked.sig[0] = new_set;
2688 break; 3039 break;
3040 default:
3041 return -EINVAL;
2689 } 3042 }
2690 3043
2691 recalc_sigpending(); 3044 set_current_blocked(&new_blocked);
2692 spin_unlock_irq(&current->sighand->siglock); 3045 }
2693 if (error) 3046
2694 goto out; 3047 if (oset) {
2695 if (oset)
2696 goto set_old;
2697 } else if (oset) {
2698 old_set = current->blocked.sig[0];
2699 set_old:
2700 error = -EFAULT;
2701 if (copy_to_user(oset, &old_set, sizeof(*oset))) 3048 if (copy_to_user(oset, &old_set, sizeof(*oset)))
2702 goto out; 3049 return -EFAULT;
2703 } 3050 }
2704 error = 0; 3051
2705out: 3052 return 0;
2706 return error;
2707} 3053}
2708#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 3054#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2709 3055
@@ -2756,15 +3102,11 @@ SYSCALL_DEFINE0(sgetmask)
2756 3102
2757SYSCALL_DEFINE1(ssetmask, int, newmask) 3103SYSCALL_DEFINE1(ssetmask, int, newmask)
2758{ 3104{
2759 int old; 3105 int old = current->blocked.sig[0];
2760 3106 sigset_t newset;
2761 spin_lock_irq(&current->sighand->siglock);
2762 old = current->blocked.sig[0];
2763 3107
2764 siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)| 3108 siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
2765 sigmask(SIGSTOP))); 3109 set_current_blocked(&newset);
2766 recalc_sigpending();
2767 spin_unlock_irq(&current->sighand->siglock);
2768 3110
2769 return old; 3111 return old;
2770} 3112}
@@ -2793,8 +3135,10 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
2793 3135
2794SYSCALL_DEFINE0(pause) 3136SYSCALL_DEFINE0(pause)
2795{ 3137{
2796 current->state = TASK_INTERRUPTIBLE; 3138 while (!signal_pending(current)) {
2797 schedule(); 3139 current->state = TASK_INTERRUPTIBLE;
3140 schedule();
3141 }
2798 return -ERESTARTNOHAND; 3142 return -ERESTARTNOHAND;
2799} 3143}
2800 3144
@@ -2819,11 +3163,8 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
2819 return -EFAULT; 3163 return -EFAULT;
2820 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3164 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2821 3165
2822 spin_lock_irq(&current->sighand->siglock);
2823 current->saved_sigmask = current->blocked; 3166 current->saved_sigmask = current->blocked;
2824 current->blocked = newset; 3167 set_current_blocked(&newset);
2825 recalc_sigpending();
2826 spin_unlock_irq(&current->sighand->siglock);
2827 3168
2828 current->state = TASK_INTERRUPTIBLE; 3169 current->state = TASK_INTERRUPTIBLE;
2829 schedule(); 3170 schedule();
diff --git a/kernel/smp.c b/kernel/smp.c
index 73a195193558..fb67dfa8394e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -74,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
74 .notifier_call = hotplug_cfd, 74 .notifier_call = hotplug_cfd,
75}; 75};
76 76
77static int __cpuinit init_call_single_data(void) 77void __init call_function_init(void)
78{ 78{
79 void *cpu = (void *)(long)smp_processor_id(); 79 void *cpu = (void *)(long)smp_processor_id();
80 int i; 80 int i;
@@ -88,10 +88,7 @@ static int __cpuinit init_call_single_data(void)
88 88
89 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); 89 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
90 register_cpu_notifier(&hotplug_cfd_notifier); 90 register_cpu_notifier(&hotplug_cfd_notifier);
91
92 return 0;
93} 91}
94early_initcall(init_call_single_data);
95 92
96/* 93/*
97 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources 94 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 174f976c2874..fca82c32042b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
64/* 64/*
@@ -315,16 +315,24 @@ static inline void invoke_softirq(void)
315{ 315{
316 if (!force_irqthreads) 316 if (!force_irqthreads)
317 __do_softirq(); 317 __do_softirq();
318 else 318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET);
319 wakeup_softirqd(); 321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET);
323 }
320} 324}
321#else 325#else
322static inline void invoke_softirq(void) 326static inline void invoke_softirq(void)
323{ 327{
324 if (!force_irqthreads) 328 if (!force_irqthreads)
325 do_softirq(); 329 do_softirq();
326 else 330 else {
331 __local_bh_disable((unsigned long)__builtin_return_address(0),
332 SOFTIRQ_OFFSET);
327 wakeup_softirqd(); 333 wakeup_softirqd();
334 __local_bh_enable(SOFTIRQ_OFFSET);
335 }
328} 336}
329#endif 337#endif
330 338
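The hunks above wrap wakeup_softirqd() in the internal __local_bh_disable()/__local_bh_enable() pair. The public API expresses the same kind of bracket around code that must not race with softirq handlers; a hedged sketch of that general pattern with an illustrative per-CPU counter:

#include <linux/bottom_half.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_counter);

/* Illustrative: keep softirqs off this CPU while touching per-CPU
 * state that a softirq handler could also touch. */
static void demo_update(void)
{
	local_bh_disable();
	__this_cpu_inc(demo_counter);
	local_bh_enable();	/* may run pending softirqs on the way out */
}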
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index eb212f8f8bc8..d20c6983aad9 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28/* 28/*
29 * Architectures that do not implement save_stack_trace_tsk get this 29 * Architectures that do not implement save_stack_trace_tsk or
30 * weak alias and a once-per-bootup warning (whenever this facility 30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
31 * is utilized - for example by procfs): 31 * (whenever this facility is utilized - for example by procfs):
32 */ 32 */
33__weak void 33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{ 35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); 36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37} 37}
38
39__weak void
40save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
41{
42 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
43}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e3516b29076c..ba5070ce5765 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -19,7 +19,7 @@
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21 21
22#include <asm/atomic.h> 22#include <linux/atomic.h>
23 23
24/* 24/*
25 * Structure to determine completion condition and record errors. May 25 * Structure to determine completion condition and record errors. May
@@ -136,10 +136,11 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
136static DEFINE_MUTEX(stop_cpus_mutex); 136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); 137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138 138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) 139static void queue_stop_cpus_work(const struct cpumask *cpumask,
140 cpu_stop_fn_t fn, void *arg,
141 struct cpu_stop_done *done)
140{ 142{
141 struct cpu_stop_work *work; 143 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu; 144 unsigned int cpu;
144 145
145 /* initialize works and done */ 146 /* initialize works and done */
@@ -147,9 +148,8 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
147 work = &per_cpu(stop_cpus_work, cpu); 148 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn; 149 work->fn = fn;
149 work->arg = arg; 150 work->arg = arg;
150 work->done = &done; 151 work->done = done;
151 } 152 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153 153
154 /* 154 /*
155 * Disable preemption while queueing to avoid getting 155 * Disable preemption while queueing to avoid getting
@@ -161,7 +161,15 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), 161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu)); 162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable(); 163 preempt_enable();
164}
164 165
166static int __stop_cpus(const struct cpumask *cpumask,
167 cpu_stop_fn_t fn, void *arg)
168{
169 struct cpu_stop_done done;
170
171 cpu_stop_init_done(&done, cpumask_weight(cpumask));
172 queue_stop_cpus_work(cpumask, fn, arg, &done);
165 wait_for_completion(&done.completion); 173 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT; 174 return done.executed ? done.ret : -ENOENT;
167} 175}
@@ -431,8 +439,15 @@ static int stop_machine_cpu_stop(void *data)
431 struct stop_machine_data *smdata = data; 439 struct stop_machine_data *smdata = data;
432 enum stopmachine_state curstate = STOPMACHINE_NONE; 440 enum stopmachine_state curstate = STOPMACHINE_NONE;
433 int cpu = smp_processor_id(), err = 0; 441 int cpu = smp_processor_id(), err = 0;
442 unsigned long flags;
434 bool is_active; 443 bool is_active;
435 444
445 /*
446 * When called from stop_machine_from_inactive_cpu(), irq might
447 * already be disabled. Save the state and restore it on exit.
448 */
449 local_save_flags(flags);
450
436 if (!smdata->active_cpus) 451 if (!smdata->active_cpus)
437 is_active = cpu == cpumask_first(cpu_online_mask); 452 is_active = cpu == cpumask_first(cpu_online_mask);
438 else 453 else
@@ -460,7 +475,7 @@ static int stop_machine_cpu_stop(void *data)
460 } 475 }
461 } while (curstate != STOPMACHINE_EXIT); 476 } while (curstate != STOPMACHINE_EXIT);
462 477
463 local_irq_enable(); 478 local_irq_restore(flags);
464 return err; 479 return err;
465} 480}
466 481
@@ -487,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
487} 502}
488EXPORT_SYMBOL_GPL(stop_machine); 503EXPORT_SYMBOL_GPL(stop_machine);
489 504
505/**
506 * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
507 * @fn: the function to run
508 * @data: the data ptr for the @fn()
509 * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
510 *
511 * This is identical to stop_machine() but can be called from a CPU which
512 * is not active. The local CPU is in the process of hotplug (so no other
513 * CPU hotplug can start) and not marked active and doesn't have enough
514 * context to sleep.
515 *
516 * This function provides stop_machine() functionality for such state by
517 * using busy-wait for synchronization and executing @fn directly for local
518 * CPU.
519 *
520 * CONTEXT:
521 * Local CPU is inactive. Temporarily stops all active CPUs.
522 *
523 * RETURNS:
524 * 0 if all executions of @fn returned 0, any non zero return value if any
525 * returned non zero.
526 */
527int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
528 const struct cpumask *cpus)
529{
530 struct stop_machine_data smdata = { .fn = fn, .data = data,
531 .active_cpus = cpus };
532 struct cpu_stop_done done;
533 int ret;
534
535 /* Local CPU must be inactive and CPU hotplug in progress. */
536 BUG_ON(cpu_active(raw_smp_processor_id()));
537 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */
538
539 /* No proper task established and can't sleep - busy wait for lock. */
540 while (!mutex_trylock(&stop_cpus_mutex))
541 cpu_relax();
542
543 /* Schedule work on other CPUs and execute directly for local CPU */
544 set_state(&smdata, STOPMACHINE_PREPARE);
545 cpu_stop_init_done(&done, num_active_cpus());
546 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
547 &done);
548 ret = stop_machine_cpu_stop(&smdata);
549
550 /* Busy wait for completion. */
551 while (!completion_done(&done.completion))
552 cpu_relax();
553
554 mutex_unlock(&stop_cpus_mutex);
555 return ret ?: done.ret;
556}
557
490#endif /* CONFIG_STOP_MACHINE */ 558#endif /* CONFIG_STOP_MACHINE */
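The kerneldoc above explains that stop_machine_from_inactive_cpu() busy-waits because the caller has no scheduling context. A minimal sketch of how a CPU-bringup path might use it; the function and callback names below are illustrative, not part of this diff:

static int example_sync_state(void *unused)
{
	/* Runs on the CPU(s) selected by the cpumask; every other CPU
	 * spins in the rendezvous until STOPMACHINE_EXIT. */
	return 0;
}

static int example_bringup_sync(void)
{
	/* NULL cpumask: run the callback on any one online CPU. */
	return stop_machine_from_inactive_cpu(example_sync_state, NULL, NULL);
}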
diff --git a/kernel/sys.c b/kernel/sys.c
index af468edf096a..dd948a1fca4c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/notifier.h>
12#include <linux/reboot.h> 11#include <linux/reboot.h>
13#include <linux/prctl.h> 12#include <linux/prctl.h>
14#include <linux/highuid.h> 13#include <linux/highuid.h>
@@ -314,12 +313,43 @@ void kernel_restart_prepare(char *cmd)
314{ 313{
315 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 314 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
316 system_state = SYSTEM_RESTART; 315 system_state = SYSTEM_RESTART;
316 usermodehelper_disable();
317 device_shutdown(); 317 device_shutdown();
318 sysdev_shutdown();
319 syscore_shutdown(); 318 syscore_shutdown();
320} 319}
321 320
322/** 321/**
322 * register_reboot_notifier - Register function to be called at reboot time
323 * @nb: Info about notifier function to be called
324 *
325 * Registers a function with the list of functions
326 * to be called at reboot time.
327 *
328 * Currently always returns zero, as blocking_notifier_chain_register()
329 * always returns zero.
330 */
331int register_reboot_notifier(struct notifier_block *nb)
332{
333 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
334}
335EXPORT_SYMBOL(register_reboot_notifier);
336
337/**
338 * unregister_reboot_notifier - Unregister previously registered reboot notifier
339 * @nb: Hook to be unregistered
340 *
341 * Unregisters a previously registered reboot
342 * notifier function.
343 *
344 * Returns zero on success, or %-ENOENT on failure.
345 */
346int unregister_reboot_notifier(struct notifier_block *nb)
347{
348 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
349}
350EXPORT_SYMBOL(unregister_reboot_notifier);
351
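For context, a hedged sketch of how these two exports are typically consumed; the callback and variable names are illustrative:

static int example_reboot_cb(struct notifier_block *nb,
			     unsigned long action, void *data)
{
	/* action is SYS_RESTART, SYS_HALT or SYS_POWER_OFF; for
	 * SYS_RESTART, data carries the restart command string. */
	return NOTIFY_DONE;
}

static struct notifier_block example_reboot_nb = {
	.notifier_call	= example_reboot_cb,
};

/* register_reboot_notifier(&example_reboot_nb) during init,
 * unregister_reboot_notifier(&example_reboot_nb) on teardown. */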
352/**
323 * kernel_restart - reboot the system 353 * kernel_restart - reboot the system
324 * @cmd: pointer to buffer containing command to execute for restart 354 * @cmd: pointer to buffer containing command to execute for restart
325 * or %NULL 355 * or %NULL
@@ -344,6 +374,7 @@ static void kernel_shutdown_prepare(enum system_states state)
344 blocking_notifier_call_chain(&reboot_notifier_list, 374 blocking_notifier_call_chain(&reboot_notifier_list,
345 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 375 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
346 system_state = state; 376 system_state = state;
377 usermodehelper_disable();
347 device_shutdown(); 378 device_shutdown();
348} 379}
349/** 380/**
@@ -354,7 +385,6 @@ static void kernel_shutdown_prepare(enum system_states state)
354void kernel_halt(void) 385void kernel_halt(void)
355{ 386{
356 kernel_shutdown_prepare(SYSTEM_HALT); 387 kernel_shutdown_prepare(SYSTEM_HALT);
357 sysdev_shutdown();
358 syscore_shutdown(); 388 syscore_shutdown();
359 printk(KERN_EMERG "System halted.\n"); 389 printk(KERN_EMERG "System halted.\n");
360 kmsg_dump(KMSG_DUMP_HALT); 390 kmsg_dump(KMSG_DUMP_HALT);
@@ -374,7 +404,6 @@ void kernel_power_off(void)
374 if (pm_power_off_prepare) 404 if (pm_power_off_prepare)
375 pm_power_off_prepare(); 405 pm_power_off_prepare();
376 disable_nonboot_cpus(); 406 disable_nonboot_cpus();
377 sysdev_shutdown();
378 syscore_shutdown(); 407 syscore_shutdown();
379 printk(KERN_EMERG "Power down.\n"); 408 printk(KERN_EMERG "Power down.\n");
380 kmsg_dump(KMSG_DUMP_POWEROFF); 409 kmsg_dump(KMSG_DUMP_POWEROFF);
@@ -592,11 +621,18 @@ static int set_user(struct cred *new)
592 if (!new_user) 621 if (!new_user)
593 return -EAGAIN; 622 return -EAGAIN;
594 623
624 /*
625 * We don't fail in case of NPROC limit excess here because too many
626 * poorly written programs don't check set*uid() return code, assuming
627 * it never fails if called by root. We may still enforce NPROC limit
628 * for programs doing set*uid()+execve() by harmlessly deferring the
629 * failure to the execve() stage.
630 */
595 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && 631 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
596 new_user != INIT_USER) { 632 new_user != INIT_USER)
597 free_uid(new_user); 633 current->flags |= PF_NPROC_EXCEEDED;
598 return -EAGAIN; 634 else
599 } 635 current->flags &= ~PF_NPROC_EXCEEDED;
600 636
601 free_uid(new->user); 637 free_uid(new->user);
602 new->user = new_user; 638 new->user = new_user;
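The set_user() comment above defers RLIMIT_NPROC enforcement to execve(); that execve-side hunk is not part of this section. A purely illustrative sketch of what such a deferred check could look like, reusing only the PF_NPROC_EXCEEDED flag introduced here (retval and the label belong to an assumed surrounding function):

	if ((current->flags & PF_NPROC_EXCEEDED) &&
	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
		retval = -EAGAIN;	/* fail the execve(), not set*uid() */
		goto out_ret;
	}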
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 25cc41cd8f33..62cbc8877fef 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt);
46cond_syscall(compat_sys_getsockopt); 46cond_syscall(compat_sys_getsockopt);
47cond_syscall(sys_shutdown); 47cond_syscall(sys_shutdown);
48cond_syscall(sys_sendmsg); 48cond_syscall(sys_sendmsg);
49cond_syscall(sys_sendmmsg);
49cond_syscall(compat_sys_sendmsg); 50cond_syscall(compat_sys_sendmsg);
51cond_syscall(compat_sys_sendmmsg);
50cond_syscall(sys_recvmsg); 52cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg); 53cond_syscall(sys_recvmmsg);
52cond_syscall(compat_sys_recvmsg); 54cond_syscall(compat_sys_recvmsg);
@@ -69,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait);
69cond_syscall(sys_semget); 71cond_syscall(sys_semget);
70cond_syscall(sys_semop); 72cond_syscall(sys_semop);
71cond_syscall(sys_semtimedop); 73cond_syscall(sys_semtimedop);
74cond_syscall(compat_sys_semtimedop);
72cond_syscall(sys_semctl); 75cond_syscall(sys_semctl);
76cond_syscall(compat_sys_semctl);
73cond_syscall(sys_msgget); 77cond_syscall(sys_msgget);
74cond_syscall(sys_msgsnd); 78cond_syscall(sys_msgsnd);
79cond_syscall(compat_sys_msgsnd);
75cond_syscall(sys_msgrcv); 80cond_syscall(sys_msgrcv);
81cond_syscall(compat_sys_msgrcv);
76cond_syscall(sys_msgctl); 82cond_syscall(sys_msgctl);
83cond_syscall(compat_sys_msgctl);
77cond_syscall(sys_shmget); 84cond_syscall(sys_shmget);
78cond_syscall(sys_shmat); 85cond_syscall(sys_shmat);
86cond_syscall(compat_sys_shmat);
79cond_syscall(sys_shmdt); 87cond_syscall(sys_shmdt);
80cond_syscall(sys_shmctl); 88cond_syscall(sys_shmctl);
89cond_syscall(compat_sys_shmctl);
81cond_syscall(sys_mq_open); 90cond_syscall(sys_mq_open);
82cond_syscall(sys_mq_unlink); 91cond_syscall(sys_mq_unlink);
83cond_syscall(sys_mq_timedsend); 92cond_syscall(sys_mq_timedsend);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c0bb32414b17..11d65b531e50 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -56,6 +56,7 @@
56#include <linux/kprobes.h> 56#include <linux/kprobes.h>
57#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
58#include <linux/oom.h> 58#include <linux/oom.h>
59#include <linux/kmod.h>
59 60
60#include <asm/uaccess.h> 61#include <asm/uaccess.h>
61#include <asm/processor.h> 62#include <asm/processor.h>
@@ -616,6 +617,11 @@ static struct ctl_table kern_table[] = {
616 .child = random_table, 617 .child = random_table,
617 }, 618 },
618 { 619 {
620 .procname = "usermodehelper",
621 .mode = 0555,
622 .child = usermodehelper_table,
623 },
624 {
619 .procname = "overflowuid", 625 .procname = "overflowuid",
620 .data = &overflowuid, 626 .data = &overflowuid,
621 .maxlen = sizeof(int), 627 .maxlen = sizeof(int),
@@ -730,14 +736,16 @@ static struct ctl_table kern_table[] = {
730 .data = &watchdog_enabled, 736 .data = &watchdog_enabled,
731 .maxlen = sizeof (int), 737 .maxlen = sizeof (int),
732 .mode = 0644, 738 .mode = 0644,
733 .proc_handler = proc_dowatchdog_enabled, 739 .proc_handler = proc_dowatchdog,
740 .extra1 = &zero,
741 .extra2 = &one,
734 }, 742 },
735 { 743 {
736 .procname = "watchdog_thresh", 744 .procname = "watchdog_thresh",
737 .data = &softlockup_thresh, 745 .data = &watchdog_thresh,
738 .maxlen = sizeof(int), 746 .maxlen = sizeof(int),
739 .mode = 0644, 747 .mode = 0644,
740 .proc_handler = proc_dowatchdog_thresh, 748 .proc_handler = proc_dowatchdog,
741 .extra1 = &neg_one, 749 .extra1 = &neg_one,
742 .extra2 = &sixty, 750 .extra2 = &sixty,
743 }, 751 },
@@ -755,7 +763,9 @@ static struct ctl_table kern_table[] = {
755 .data = &watchdog_enabled, 763 .data = &watchdog_enabled,
756 .maxlen = sizeof (int), 764 .maxlen = sizeof (int),
757 .mode = 0644, 765 .mode = 0644,
758 .proc_handler = proc_dowatchdog_enabled, 766 .proc_handler = proc_dowatchdog,
767 .extra1 = &zero,
768 .extra2 = &one,
759 }, 769 },
760#endif 770#endif
761#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 771#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
@@ -928,6 +938,12 @@ static struct ctl_table kern_table[] = {
928 }, 938 },
929#endif 939#endif
930#ifdef CONFIG_PERF_EVENTS 940#ifdef CONFIG_PERF_EVENTS
941 /*
942 * User-space scripts rely on the existence of this file
943 * as a feature check for perf_events being enabled.
944 *
945 * So it's an ABI, do not remove!
946 */
931 { 947 {
932 .procname = "perf_event_paranoid", 948 .procname = "perf_event_paranoid",
933 .data = &sysctl_perf_event_paranoid, 949 .data = &sysctl_perf_event_paranoid,
@@ -1496,7 +1512,7 @@ static struct ctl_table fs_table[] = {
1496 1512
1497static struct ctl_table debug_table[] = { 1513static struct ctl_table debug_table[] = {
1498#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ 1514#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1499 defined(CONFIG_S390) 1515 defined(CONFIG_S390) || defined(CONFIG_TILE)
1500 { 1516 {
1501 .procname = "exception-trace", 1517 .procname = "exception-trace",
1502 .data = &show_unhandled_signals, 1518 .data = &show_unhandled_signals,
@@ -1574,16 +1590,11 @@ void sysctl_head_get(struct ctl_table_header *head)
1574 spin_unlock(&sysctl_lock); 1590 spin_unlock(&sysctl_lock);
1575} 1591}
1576 1592
1577static void free_head(struct rcu_head *rcu)
1578{
1579 kfree(container_of(rcu, struct ctl_table_header, rcu));
1580}
1581
1582void sysctl_head_put(struct ctl_table_header *head) 1593void sysctl_head_put(struct ctl_table_header *head)
1583{ 1594{
1584 spin_lock(&sysctl_lock); 1595 spin_lock(&sysctl_lock);
1585 if (!--head->count) 1596 if (!--head->count)
1586 call_rcu(&head->rcu, free_head); 1597 kfree_rcu(head, rcu);
1587 spin_unlock(&sysctl_lock); 1598 spin_unlock(&sysctl_lock);
1588} 1599}
1589 1600
@@ -1955,10 +1966,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1955 start_unregistering(header); 1966 start_unregistering(header);
1956 if (!--header->parent->count) { 1967 if (!--header->parent->count) {
1957 WARN_ON(1); 1968 WARN_ON(1);
1958 call_rcu(&header->parent->rcu, free_head); 1969 kfree_rcu(header->parent, rcu);
1959 } 1970 }
1960 if (!--header->count) 1971 if (!--header->count)
1961 call_rcu(&header->rcu, free_head); 1972 kfree_rcu(header, rcu);
1962 spin_unlock(&sysctl_lock); 1973 spin_unlock(&sysctl_lock);
1963} 1974}
1964 1975
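Both sysctl hunks above replace an open-coded call_rcu() callback whose only job was kfree() with kfree_rcu(). A minimal sketch of the pattern; the structure and helper are illustrative:

struct example_obj {
	int data;
	struct rcu_head rcu;
};

static void example_obj_release(struct example_obj *obj)
{
	/* Frees obj after an RCU grace period; no hand-written
	 * free_head()-style callback is needed anymore. */
	kfree_rcu(obj, rcu);
}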
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 9ffea360a778..e19ce1454ee1 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -28,7 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <net/genetlink.h> 30#include <net/genetlink.h>
31#include <asm/atomic.h> 31#include <linux/atomic.h>
32 32
33/* 33/*
34 * Maximum length of a cpumask that can be specified in 34 * Maximum length of a cpumask that can be specified in
@@ -285,7 +285,7 @@ ret:
285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
286{ 286{
287 struct listener_list *listeners; 287 struct listener_list *listeners;
288 struct listener *s, *tmp; 288 struct listener *s, *tmp, *s2;
289 unsigned int cpu; 289 unsigned int cpu;
290 290
291 if (!cpumask_subset(mask, cpu_possible_mask)) 291 if (!cpumask_subset(mask, cpu_possible_mask))
@@ -293,18 +293,25 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
293 293
294 if (isadd == REGISTER) { 294 if (isadd == REGISTER) {
295 for_each_cpu(cpu, mask) { 295 for_each_cpu(cpu, mask) {
296 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 296 s = kmalloc_node(sizeof(struct listener),
297 cpu_to_node(cpu)); 297 GFP_KERNEL, cpu_to_node(cpu));
298 if (!s) 298 if (!s)
299 goto cleanup; 299 goto cleanup;
300
300 s->pid = pid; 301 s->pid = pid;
301 INIT_LIST_HEAD(&s->list);
302 s->valid = 1; 302 s->valid = 1;
303 303
304 listeners = &per_cpu(listener_array, cpu); 304 listeners = &per_cpu(listener_array, cpu);
305 down_write(&listeners->sem); 305 down_write(&listeners->sem);
306 list_for_each_entry(s2, &listeners->list, list) {
307 if (s2->pid == pid && s2->valid)
308 goto exists;
309 }
306 list_add(&s->list, &listeners->list); 310 list_add(&s->list, &listeners->list);
311 s = NULL;
312exists:
307 up_write(&listeners->sem); 313 up_write(&listeners->sem);
314 kfree(s); /* nop if NULL */
308 } 315 }
309 return 0; 316 return 0;
310 } 317 }
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index b0425991e9ac..e2fd74b8e8c2 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,5 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o 2obj-y += timeconv.o posix-clock.o alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
new file mode 100644
index 000000000000..59f369f98a04
--- /dev/null
+++ b/kernel/time/alarmtimer.c
@@ -0,0 +1,720 @@
1/*
2 * Alarmtimer interface
3 *
4 * This interface provides a timer which is similar to hrtimers,
5 * but triggers an RTC alarm if the box is suspended.
6 *
7 * This interface is influenced by the Android RTC Alarm timer
8 * interface.
9 *
10 * Copyright (C) 2010 IBM Corporation
11 *
12 * Author: John Stultz <john.stultz@linaro.org>
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License version 2 as
16 * published by the Free Software Foundation.
17 */
18#include <linux/time.h>
19#include <linux/hrtimer.h>
20#include <linux/timerqueue.h>
21#include <linux/rtc.h>
22#include <linux/alarmtimer.h>
23#include <linux/mutex.h>
24#include <linux/platform_device.h>
25#include <linux/posix-timers.h>
26#include <linux/workqueue.h>
27#include <linux/freezer.h>
28
29/**
30 * struct alarm_base - Alarm timer bases
31 * @lock: Lock for synchronized access to the base
32 * @timerqueue: Timerqueue head managing the list of events
33 * @timer: hrtimer used to schedule events while running
34 * @gettime: Function to read the time correlating to the base
35 * @base_clockid: clockid for the base
36 */
37static struct alarm_base {
38 spinlock_t lock;
39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void);
42 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE];
44
45/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
46static ktime_t freezer_delta;
47static DEFINE_SPINLOCK(freezer_delta_lock);
48
49#ifdef CONFIG_RTC_CLASS
50/* rtc timer and device for setting alarm wakeups at suspend */
51static struct rtc_timer rtctimer;
52static struct rtc_device *rtcdev;
53static DEFINE_SPINLOCK(rtcdev_lock);
54
55/**
56 * has_wakealarm - check rtc device has wakealarm ability
57 * @dev: current device
58 * @name_ptr: name to be returned
59 *
60 * This helper function checks to see if the rtc device can wake
61 * from suspend.
62 */
63static int has_wakealarm(struct device *dev, void *name_ptr)
64{
65 struct rtc_device *candidate = to_rtc_device(dev);
66
67 if (!candidate->ops->set_alarm)
68 return 0;
69 if (!device_may_wakeup(candidate->dev.parent))
70 return 0;
71
72 *(const char **)name_ptr = dev_name(dev);
73 return 1;
74}
75
76/**
77 * alarmtimer_get_rtcdev - Return selected rtcdevice
78 *
79 * This function returns the rtc device to use for wakealarms.
80 * If one has not already been chosen, it checks to see if a
81 * functional rtc device is available.
82 */
83static struct rtc_device *alarmtimer_get_rtcdev(void)
84{
85 struct device *dev;
86 char *str;
87 unsigned long flags;
88 struct rtc_device *ret;
89
90 spin_lock_irqsave(&rtcdev_lock, flags);
91 if (!rtcdev) {
92 /* Find an rtc device and init the rtc_timer */
93 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
94 /* If we have a device then str is valid. See has_wakealarm() */
95 if (dev) {
96 rtcdev = rtc_class_open(str);
97 /*
98 * Drop the reference we got in class_find_device,
99 * rtc_open takes its own.
100 */
101 put_device(dev);
102 rtc_timer_init(&rtctimer, NULL, NULL);
103 }
104 }
105 ret = rtcdev;
106 spin_unlock_irqrestore(&rtcdev_lock, flags);
107
108 return ret;
109}
110#else
111#define alarmtimer_get_rtcdev() (0)
112#define rtcdev (0)
113#endif
114
115
116/**
117 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
118 * @base: pointer to the base where the timer is being run
119 * @alarm: pointer to alarm being enqueued.
120 *
121 * Adds the alarm to an alarm_base timerqueue and if necessary sets
122 * an hrtimer to run.
123 *
124 * Must hold base->lock when calling.
125 */
126static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
127{
128 timerqueue_add(&base->timerqueue, &alarm->node);
129 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
130 hrtimer_try_to_cancel(&base->timer);
131 hrtimer_start(&base->timer, alarm->node.expires,
132 HRTIMER_MODE_ABS);
133 }
134}
135
136/**
137 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
138 * @base: pointer to the base where the timer is running
139 * @alarm: pointer to alarm being removed
140 *
141 * Removes the alarm from an alarm_base timerqueue and if necessary sets
142 * a new timer to run.
143 *
144 * Must hold base->lock when calling.
145 */
146static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
147{
148 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
149
150 timerqueue_del(&base->timerqueue, &alarm->node);
151 if (next == &alarm->node) {
152 hrtimer_try_to_cancel(&base->timer);
153 next = timerqueue_getnext(&base->timerqueue);
154 if (!next)
155 return;
156 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
157 }
158}
159
160
161/**
162 * alarmtimer_fired - Handles alarm hrtimer being fired.
163 * @timer: pointer to hrtimer being run
164 *
165 * When an alarm timer fires, this runs through the timerqueue to
166 * see which alarms expired, and runs those. If there are more alarm
167 * timers queued for the future, we set the hrtimer to fire when
168 * the next future alarm timer expires.
169 */
170static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
171{
172 struct alarm_base *base = container_of(timer, struct alarm_base, timer);
173 struct timerqueue_node *next;
174 unsigned long flags;
175 ktime_t now;
176 int ret = HRTIMER_NORESTART;
177
178 spin_lock_irqsave(&base->lock, flags);
179 now = base->gettime();
180 while ((next = timerqueue_getnext(&base->timerqueue))) {
181 struct alarm *alarm;
182 ktime_t expired = next->expires;
183
184 if (expired.tv64 >= now.tv64)
185 break;
186
187 alarm = container_of(next, struct alarm, node);
188
189 timerqueue_del(&base->timerqueue, &alarm->node);
190 alarm->enabled = 0;
191 /* Re-add periodic timers */
192 if (alarm->period.tv64) {
193 alarm->node.expires = ktime_add(expired, alarm->period);
194 timerqueue_add(&base->timerqueue, &alarm->node);
195 alarm->enabled = 1;
196 }
197 spin_unlock_irqrestore(&base->lock, flags);
198 if (alarm->function)
199 alarm->function(alarm);
200 spin_lock_irqsave(&base->lock, flags);
201 }
202
203 if (next) {
204 hrtimer_set_expires(&base->timer, next->expires);
205 ret = HRTIMER_RESTART;
206 }
207 spin_unlock_irqrestore(&base->lock, flags);
208
209 return ret;
210
211}
212
213#ifdef CONFIG_RTC_CLASS
214/**
215 * alarmtimer_suspend - Suspend time callback
216 * @dev: unused
217 * @state: unused
218 *
219 * When we are going into suspend, we look through the bases
220 * to see which is the soonest timer to expire. We then
221 * set an rtc timer to fire that far into the future, which
222 * will wake us from suspend.
223 */
224static int alarmtimer_suspend(struct device *dev)
225{
226 struct rtc_time tm;
227 ktime_t min, now;
228 unsigned long flags;
229 struct rtc_device *rtc;
230 int i;
231
232 spin_lock_irqsave(&freezer_delta_lock, flags);
233 min = freezer_delta;
234 freezer_delta = ktime_set(0, 0);
235 spin_unlock_irqrestore(&freezer_delta_lock, flags);
236
237 rtc = rtcdev;
238 /* If we have no rtcdev, just return */
239 if (!rtc)
240 return 0;
241
242 /* Find the soonest timer to expire */
243 for (i = 0; i < ALARM_NUMTYPE; i++) {
244 struct alarm_base *base = &alarm_bases[i];
245 struct timerqueue_node *next;
246 ktime_t delta;
247
248 spin_lock_irqsave(&base->lock, flags);
249 next = timerqueue_getnext(&base->timerqueue);
250 spin_unlock_irqrestore(&base->lock, flags);
251 if (!next)
252 continue;
253 delta = ktime_sub(next->expires, base->gettime());
254 if (!min.tv64 || (delta.tv64 < min.tv64))
255 min = delta;
256 }
257 if (min.tv64 == 0)
258 return 0;
259
260 /* XXX - Should we enforce a minimum sleep time? */
261 WARN_ON(min.tv64 < NSEC_PER_SEC);
262
263 /* Setup an rtc timer to fire that far in the future */
264 rtc_timer_cancel(rtc, &rtctimer);
265 rtc_read_time(rtc, &tm);
266 now = rtc_tm_to_ktime(tm);
267 now = ktime_add(now, min);
268
269 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
270
271 return 0;
272}
273#else
274static int alarmtimer_suspend(struct device *dev)
275{
276 return 0;
277}
278#endif
279
280static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
281{
282 ktime_t delta;
283 unsigned long flags;
284 struct alarm_base *base = &alarm_bases[type];
285
286 delta = ktime_sub(absexp, base->gettime());
287
288 spin_lock_irqsave(&freezer_delta_lock, flags);
289 if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
290 freezer_delta = delta;
291 spin_unlock_irqrestore(&freezer_delta_lock, flags);
292}
293
294
295/**
296 * alarm_init - Initialize an alarm structure
297 * @alarm: ptr to alarm to be initialized
298 * @type: the type of the alarm
299 * @function: callback that is run when the alarm fires
300 */
301void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
302 void (*function)(struct alarm *))
303{
304 timerqueue_init(&alarm->node);
305 alarm->period = ktime_set(0, 0);
306 alarm->function = function;
307 alarm->type = type;
308 alarm->enabled = 0;
309}
310
311/**
312 * alarm_start - Sets an alarm to fire
313 * @alarm: ptr to alarm to set
314 * @start: time to run the alarm
315 * @period: period at which the alarm will recur
316 */
317void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
318{
319 struct alarm_base *base = &alarm_bases[alarm->type];
320 unsigned long flags;
321
322 spin_lock_irqsave(&base->lock, flags);
323 if (alarm->enabled)
324 alarmtimer_remove(base, alarm);
325 alarm->node.expires = start;
326 alarm->period = period;
327 alarmtimer_enqueue(base, alarm);
328 alarm->enabled = 1;
329 spin_unlock_irqrestore(&base->lock, flags);
330}
331
332/**
333 * alarm_cancel - Tries to cancel an alarm timer
334 * @alarm: ptr to alarm to be canceled
335 */
336void alarm_cancel(struct alarm *alarm)
337{
338 struct alarm_base *base = &alarm_bases[alarm->type];
339 unsigned long flags;
340
341 spin_lock_irqsave(&base->lock, flags);
342 if (alarm->enabled)
343 alarmtimer_remove(base, alarm);
344 alarm->enabled = 0;
345 spin_unlock_irqrestore(&base->lock, flags);
346}
347
348
349/**
350 * clock2alarm - helper that converts from clockid to alarmtypes
351 * @clockid: clockid.
352 */
353static enum alarmtimer_type clock2alarm(clockid_t clockid)
354{
355 if (clockid == CLOCK_REALTIME_ALARM)
356 return ALARM_REALTIME;
357 if (clockid == CLOCK_BOOTTIME_ALARM)
358 return ALARM_BOOTTIME;
359 return -1;
360}
361
362/**
363 * alarm_handle_timer - Callback for posix timers
364 * @alarm: alarm that fired
365 *
366 * Posix timer callback for expired alarm timers.
367 */
368static void alarm_handle_timer(struct alarm *alarm)
369{
370 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
371 it.alarmtimer);
372 if (posix_timer_event(ptr, 0) != 0)
373 ptr->it_overrun++;
374}
375
376/**
377 * alarm_clock_getres - posix getres interface
378 * @which_clock: clockid
379 * @tp: timespec to fill
380 *
381 * Returns the granularity of underlying alarm base clock
382 */
383static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
384{
385 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
386
387 if (!alarmtimer_get_rtcdev())
388 return -ENOTSUPP;
389
390 return hrtimer_get_res(baseid, tp);
391}
392
393/**
394 * alarm_clock_get - posix clock_get interface
395 * @which_clock: clockid
396 * @tp: timespec to fill.
397 *
398 * Provides the underlying alarm base time.
399 */
400static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
401{
402 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
403
404 if (!alarmtimer_get_rtcdev())
405 return -ENOTSUPP;
406
407 *tp = ktime_to_timespec(base->gettime());
408 return 0;
409}
410
411/**
412 * alarm_timer_create - posix timer_create interface
413 * @new_timer: k_itimer pointer to manage
414 *
415 * Initializes the k_itimer structure.
416 */
417static int alarm_timer_create(struct k_itimer *new_timer)
418{
419 enum alarmtimer_type type;
420 struct alarm_base *base;
421
422 if (!alarmtimer_get_rtcdev())
423 return -ENOTSUPP;
424
425 if (!capable(CAP_WAKE_ALARM))
426 return -EPERM;
427
428 type = clock2alarm(new_timer->it_clock);
429 base = &alarm_bases[type];
430 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
431 return 0;
432}
433
434/**
435 * alarm_timer_get - posix timer_get interface
436 * @timr: k_itimer pointer
437 * @cur_setting: itimerspec data to fill
438 *
439 * Copies the itimerspec data out from the k_itimer
440 */
441static void alarm_timer_get(struct k_itimer *timr,
442 struct itimerspec *cur_setting)
443{
444 cur_setting->it_interval =
445 ktime_to_timespec(timr->it.alarmtimer.period);
446 cur_setting->it_value =
447 ktime_to_timespec(timr->it.alarmtimer.node.expires);
448 return;
449}
450
451/**
452 * alarm_timer_del - posix timer_del interface
453 * @timr: k_itimer pointer to be deleted
454 *
455 * Cancels any programmed alarms for the given timer.
456 */
457static int alarm_timer_del(struct k_itimer *timr)
458{
459 if (!rtcdev)
460 return -ENOTSUPP;
461
462 alarm_cancel(&timr->it.alarmtimer);
463 return 0;
464}
465
466/**
467 * alarm_timer_set - posix timer_set interface
468 * @timr: k_itimer pointer to be set
469 * @flags: timer flags
470 * @new_setting: itimerspec to be used
471 * @old_setting: itimerspec being replaced
472 *
473 * Sets the timer to new_setting, and starts the timer.
474 */
475static int alarm_timer_set(struct k_itimer *timr, int flags,
476 struct itimerspec *new_setting,
477 struct itimerspec *old_setting)
478{
479 if (!rtcdev)
480 return -ENOTSUPP;
481
482 /* Save old values */
483 old_setting->it_interval =
484 ktime_to_timespec(timr->it.alarmtimer.period);
485 old_setting->it_value =
486 ktime_to_timespec(timr->it.alarmtimer.node.expires);
487
488 /* If the timer was already set, cancel it */
489 alarm_cancel(&timr->it.alarmtimer);
490
491 /* start the timer */
492 alarm_start(&timr->it.alarmtimer,
493 timespec_to_ktime(new_setting->it_value),
494 timespec_to_ktime(new_setting->it_interval));
495 return 0;
496}
497
498/**
499 * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
500 * @alarm: ptr to alarm that fired
501 *
502 * Wakes up the task that set the alarmtimer
503 */
504static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
505{
506 struct task_struct *task = (struct task_struct *)alarm->data;
507
508 alarm->data = NULL;
509 if (task)
510 wake_up_process(task);
511}
512
513/**
514 * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
515 * @alarm: ptr to alarmtimer
516 * @absexp: absolute expiration time
517 *
518 * Sets the alarm timer and sleeps until it is fired or interrupted.
519 */
520static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
521{
522 alarm->data = (void *)current;
523 do {
524 set_current_state(TASK_INTERRUPTIBLE);
525 alarm_start(alarm, absexp, ktime_set(0, 0));
526 if (likely(alarm->data))
527 schedule();
528
529 alarm_cancel(alarm);
530 } while (alarm->data && !signal_pending(current));
531
532 __set_current_state(TASK_RUNNING);
533
534 return (alarm->data == NULL);
535}
536
537
538/**
539 * update_rmtp - Update remaining timespec value
540 * @exp: expiration time
541 * @type: timer type
542 * @rmtp: user pointer to remaining timespec value
543 *
544 * Helper function that fills in rmtp value with time between
545 * now and the exp value
546 */
547static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
548 struct timespec __user *rmtp)
549{
550 struct timespec rmt;
551 ktime_t rem;
552
553 rem = ktime_sub(exp, alarm_bases[type].gettime());
554
555 if (rem.tv64 <= 0)
556 return 0;
557 rmt = ktime_to_timespec(rem);
558
559 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
560 return -EFAULT;
561
562 return 1;
563
564}
565
566/**
567 * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
568 * @restart: ptr to restart block
569 *
570 * Handles restarted clock_nanosleep calls
571 */
572static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
573{
574 enum alarmtimer_type type = restart->nanosleep.clockid;
575 ktime_t exp;
576 struct timespec __user *rmtp;
577 struct alarm alarm;
578 int ret = 0;
579
580 exp.tv64 = restart->nanosleep.expires;
581 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
582
583 if (alarmtimer_do_nsleep(&alarm, exp))
584 goto out;
585
586 if (freezing(current))
587 alarmtimer_freezerset(exp, type);
588
589 rmtp = restart->nanosleep.rmtp;
590 if (rmtp) {
591 ret = update_rmtp(exp, type, rmtp);
592 if (ret <= 0)
593 goto out;
594 }
595
596
597 /* The other values in restart are already filled in */
598 ret = -ERESTART_RESTARTBLOCK;
599out:
600 return ret;
601}
602
603/**
604 * alarm_timer_nsleep - alarmtimer nanosleep
605 * @which_clock: clockid
606 * @flags: determines abstime or relative
607 * @tsreq: requested sleep time (abs or rel)
608 * @rmtp: remaining sleep time saved
609 *
610 * Handles clock_nanosleep calls against _ALARM clockids
611 */
612static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
613 struct timespec *tsreq, struct timespec __user *rmtp)
614{
615 enum alarmtimer_type type = clock2alarm(which_clock);
616 struct alarm alarm;
617 ktime_t exp;
618 int ret = 0;
619 struct restart_block *restart;
620
621 if (!alarmtimer_get_rtcdev())
622 return -ENOTSUPP;
623
624 if (!capable(CAP_WAKE_ALARM))
625 return -EPERM;
626
627 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
628
629 exp = timespec_to_ktime(*tsreq);
630 /* Convert (if necessary) to absolute time */
631 if (flags != TIMER_ABSTIME) {
632 ktime_t now = alarm_bases[type].gettime();
633 exp = ktime_add(now, exp);
634 }
635
636 if (alarmtimer_do_nsleep(&alarm, exp))
637 goto out;
638
639 if (freezing(current))
640 alarmtimer_freezerset(exp, type);
641
642 /* abs timers don't set remaining time or restart */
643 if (flags == TIMER_ABSTIME) {
644 ret = -ERESTARTNOHAND;
645 goto out;
646 }
647
648 if (rmtp) {
649 ret = update_rmtp(exp, type, rmtp);
650 if (ret <= 0)
651 goto out;
652 }
653
654 restart = &current_thread_info()->restart_block;
655 restart->fn = alarm_timer_nsleep_restart;
656 restart->nanosleep.clockid = type;
657 restart->nanosleep.expires = exp.tv64;
658 restart->nanosleep.rmtp = rmtp;
659 ret = -ERESTART_RESTARTBLOCK;
660
661out:
662 return ret;
663}
664
665
666/* Suspend hook structures */
667static const struct dev_pm_ops alarmtimer_pm_ops = {
668 .suspend = alarmtimer_suspend,
669};
670
671static struct platform_driver alarmtimer_driver = {
672 .driver = {
673 .name = "alarmtimer",
674 .pm = &alarmtimer_pm_ops,
675 }
676};
677
678/**
679 * alarmtimer_init - Initialize alarm timer code
680 *
681 * This function initializes the alarm bases and registers
682 * the posix clock ids.
683 */
684static int __init alarmtimer_init(void)
685{
686 int error = 0;
687 int i;
688 struct k_clock alarm_clock = {
689 .clock_getres = alarm_clock_getres,
690 .clock_get = alarm_clock_get,
691 .timer_create = alarm_timer_create,
692 .timer_set = alarm_timer_set,
693 .timer_del = alarm_timer_del,
694 .timer_get = alarm_timer_get,
695 .nsleep = alarm_timer_nsleep,
696 };
697
698 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
699 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
700
701 /* Initialize alarm bases */
702 alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
703 alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
704 alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
705 alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
706 for (i = 0; i < ALARM_NUMTYPE; i++) {
707 timerqueue_init_head(&alarm_bases[i].timerqueue);
708 spin_lock_init(&alarm_bases[i].lock);
709 hrtimer_init(&alarm_bases[i].timer,
710 alarm_bases[i].base_clockid,
711 HRTIMER_MODE_ABS);
712 alarm_bases[i].timer.function = alarmtimer_fired;
713 }
714 error = platform_driver_register(&alarmtimer_driver);
715 platform_device_register_simple("alarmtimer", -1, NULL, 0);
716
717 return error;
718}
719device_initcall(alarmtimer_init);
720
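Besides the posix-timer plumbing, the new file exposes alarm_init(), alarm_start() and alarm_cancel() as an in-kernel interface. A hedged usage sketch; the names and the 30s/60s values are made up for illustration:

static struct alarm example_alarm;

static void example_alarm_fired(struct alarm *a)
{
	pr_info("example alarmtimer fired\n");
}

static void example_alarm_arm(void)
{
	ktime_t now = ktime_get_real();

	alarm_init(&example_alarm, ALARM_REALTIME, example_alarm_fired);
	/* First expiry 30s from now, then every 60s; if the box suspends,
	 * alarmtimer_suspend() above programs the RTC to wake it up. */
	alarm_start(&example_alarm, ktime_add(now, ktime_set(30, 0)),
		    ktime_set(60, 0));
}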
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 0d74b9ba90c8..e4c699dfa4e8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev)
182 unsigned long flags; 182 unsigned long flags;
183 183
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 185 if (!dev->cpumask) {
186 WARN_ON(num_possible_cpus() > 1);
187 dev->cpumask = cpumask_of(smp_processor_id());
188 }
186 189
187 raw_spin_lock_irqsave(&clockevents_lock, flags); 190 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 191
@@ -194,6 +197,70 @@ void clockevents_register_device(struct clock_event_device *dev)
194} 197}
195EXPORT_SYMBOL_GPL(clockevents_register_device); 198EXPORT_SYMBOL_GPL(clockevents_register_device);
196 199
200static void clockevents_config(struct clock_event_device *dev,
201 u32 freq)
202{
203 u64 sec;
204
205 if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
206 return;
207
208 /*
209 * Calculate the maximum number of seconds we can sleep. Limit
210 * to 10 minutes for hardware which can program more than
211 * 32bit ticks so we still get reasonable conversion values.
212 */
213 sec = dev->max_delta_ticks;
214 do_div(sec, freq);
215 if (!sec)
216 sec = 1;
217 else if (sec > 600 && dev->max_delta_ticks > UINT_MAX)
218 sec = 600;
219
220 clockevents_calc_mult_shift(dev, freq, sec);
221 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
222 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
223}
224
225/**
226 * clockevents_config_and_register - Configure and register a clock event device
227 * @dev: device to register
228 * @freq: The clock frequency
229 * @min_delta: The minimum clock ticks to program in oneshot mode
230 * @max_delta: The maximum clock ticks to program in oneshot mode
231 *
232 * min/max_delta can be 0 for devices which do not support oneshot mode.
233 */
234void clockevents_config_and_register(struct clock_event_device *dev,
235 u32 freq, unsigned long min_delta,
236 unsigned long max_delta)
237{
238 dev->min_delta_ticks = min_delta;
239 dev->max_delta_ticks = max_delta;
240 clockevents_config(dev, freq);
241 clockevents_register_device(dev);
242}
243
244/**
245 * clockevents_update_freq - Update frequency and reprogram a clock event device.
246 * @dev: device to modify
247 * @freq: new device frequency
248 *
249 * Reconfigure and reprogram a clock event device in oneshot
250 * mode. Must be called on the cpu for which the device delivers per
251 * cpu timer events with interrupts disabled! Returns 0 on success,
252 * -ETIME when the event is in the past.
253 */
254int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
255{
256 clockevents_config(dev, freq);
257
258 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
259 return 0;
260
261 return clockevents_program_event(dev, dev->next_event, ktime_get());
262}
263
197/* 264/*
198 * Noop handler when we shut down an event device 265 * Noop handler when we shut down an event device
199 */ 266 */
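clockevents_config_and_register() above folds the mult/shift configuration into registration. A hedged sketch of a timer driver using it; the device name, frequency and delta limits are assumptions, and the two callbacks stand in for the driver's own implementations:

static int example_set_next_event(unsigned long delta,
				  struct clock_event_device *evt)
{
	/* program the hardware comparator 'delta' ticks ahead */
	return 0;
}

static void example_set_mode(enum clock_event_mode mode,
			     struct clock_event_device *evt)
{
	/* switch the hardware between periodic/oneshot/shutdown */
}

static struct clock_event_device example_clockevent = {
	.name		= "example-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 300,
	.set_next_event	= example_set_next_event,
	.set_mode	= example_set_mode,
};

static void __init example_timer_init(void)
{
	/* 32.768 kHz input, programmable between 0xf and 0x7fffffff ticks.
	 * On SMP a real driver would also set .cpumask before this call. */
	clockevents_config_and_register(&example_clockevent, 32768,
					0xf, 0x7fffffff);
}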
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6519cf62d9cd..e0980f0d9a0a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -185,7 +185,6 @@ static struct clocksource *watchdog;
185static struct timer_list watchdog_timer; 185static struct timer_list watchdog_timer;
186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); 186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
187static DEFINE_SPINLOCK(watchdog_lock); 187static DEFINE_SPINLOCK(watchdog_lock);
188static cycle_t watchdog_last;
189static int watchdog_running; 188static int watchdog_running;
190 189
191static int clocksource_watchdog_kthread(void *data); 190static int clocksource_watchdog_kthread(void *data);
@@ -254,11 +253,6 @@ static void clocksource_watchdog(unsigned long data)
254 if (!watchdog_running) 253 if (!watchdog_running)
255 goto out; 254 goto out;
256 255
257 wdnow = watchdog->read(watchdog);
258 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
259 watchdog->mult, watchdog->shift);
260 watchdog_last = wdnow;
261
262 list_for_each_entry(cs, &watchdog_list, wd_list) { 256 list_for_each_entry(cs, &watchdog_list, wd_list) {
263 257
264 /* Clocksource already marked unstable? */ 258 /* Clocksource already marked unstable? */
@@ -268,19 +262,28 @@ static void clocksource_watchdog(unsigned long data)
268 continue; 262 continue;
269 } 263 }
270 264
265 local_irq_disable();
271 csnow = cs->read(cs); 266 csnow = cs->read(cs);
267 wdnow = watchdog->read(watchdog);
268 local_irq_enable();
272 269
273 /* Clocksource initialized ? */ 270 /* Clocksource initialized ? */
274 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 271 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
275 cs->flags |= CLOCK_SOURCE_WATCHDOG; 272 cs->flags |= CLOCK_SOURCE_WATCHDOG;
276 cs->wd_last = csnow; 273 cs->wd_last = wdnow;
274 cs->cs_last = csnow;
277 continue; 275 continue;
278 } 276 }
279 277
280 /* Check the deviation from the watchdog clocksource. */ 278 wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
281 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & 279 watchdog->mult, watchdog->shift);
280
281 cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
282 cs->mask, cs->mult, cs->shift); 282 cs->mask, cs->mult, cs->shift);
283 cs->wd_last = csnow; 283 cs->cs_last = csnow;
284 cs->wd_last = wdnow;
285
286 /* Check the deviation from the watchdog clocksource. */
284 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { 287 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
285 clocksource_unstable(cs, cs_nsec - wd_nsec); 288 clocksource_unstable(cs, cs_nsec - wd_nsec);
286 continue; 289 continue;
@@ -318,7 +321,6 @@ static inline void clocksource_start_watchdog(void)
318 return; 321 return;
319 init_timer(&watchdog_timer); 322 init_timer(&watchdog_timer);
320 watchdog_timer.function = clocksource_watchdog; 323 watchdog_timer.function = clocksource_watchdog;
321 watchdog_last = watchdog->read(watchdog);
322 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 324 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
323 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); 325 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
324 watchdog_running = 1; 326 watchdog_running = 1;
@@ -626,19 +628,6 @@ static void clocksource_enqueue(struct clocksource *cs)
626 list_add(&cs->list, entry); 628 list_add(&cs->list, entry);
627} 629}
628 630
629
630/*
631 * Maximum time we expect to go between ticks. This includes idle
632 * tickless time. It provides the trade off between selecting a
633 * mult/shift pair that is very precise but can only handle a short
634 * period of time, vs. a mult/shift pair that can handle long periods
635 * of time but isn't as precise.
636 *
637 * This is a subsystem constant, and actual hardware limitations
638 * may override it (ie: clocksources that wrap every 3 seconds).
639 */
640#define MAX_UPDATE_LENGTH 5 /* Seconds */
641
642/** 631/**
643 * __clocksource_updatefreq_scale - Used update clocksource with new freq 632 * __clocksource_updatefreq_scale - Used update clocksource with new freq
644 * @t: clocksource to be registered 633 * @t: clocksource to be registered
@@ -652,15 +641,28 @@ static void clocksource_enqueue(struct clocksource *cs)
652 */ 641 */
653void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 642void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
654{ 643{
644 u64 sec;
645
655 /* 646 /*
656 * Ideally we want to use some of the limits used in 647 * Calc the maximum number of seconds which we can run before
657 * clocksource_max_deferment, to provide a more informed 648 * wrapping around. For clocksources which have a mask > 32bit
658 * MAX_UPDATE_LENGTH. But for now this just gets the 649 * we need to limit the max sleep time to have a good
659 * register interface working properly. 650 * conversion precision. 10 minutes is still a reasonable
651 * amount. That results in a shift value of 24 for a
652 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
653 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
654 * margin as we do in clocksource_max_deferment()
660 */ 655 */
656 sec = (cs->mask - (cs->mask >> 5));
657 do_div(sec, freq);
658 do_div(sec, scale);
659 if (!sec)
660 sec = 1;
661 else if (sec > 600 && cs->mask > UINT_MAX)
662 sec = 600;
663
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 664 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale, 665 NSEC_PER_SEC / scale, sec * scale);
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs); 666 cs->max_idle_ns = clocksource_max_deferment(cs);
665} 667}
666EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 668EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
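To make the new limit calculation concrete, a worked example with assumed hardware numbers (not taken from the patch): a free-running 32-bit counter at 24 MHz registered with scale = 1.

	/* sec = (0xffffffff - 0xffffffff/32) / 24000000 / 1  ~= 173
	 * The mask fits in 32 bits, so the 600 second clamp does not
	 * apply; clocks_calc_mult_shift() is then asked to cover about
	 * 173 seconds (mask >> 5 safety margin already subtracted)
	 * without multiplication overflow. */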
@@ -685,8 +687,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
685 /* Add clocksource to the clocksource list */ 687 /* Add clocksource to the clocksource list */
686 mutex_lock(&clocksource_mutex); 688 mutex_lock(&clocksource_mutex);
687 clocksource_enqueue(cs); 689 clocksource_enqueue(cs);
688 clocksource_select();
689 clocksource_enqueue_watchdog(cs); 690 clocksource_enqueue_watchdog(cs);
691 clocksource_select();
690 mutex_unlock(&clocksource_mutex); 692 mutex_unlock(&clocksource_mutex);
691 return 0; 693 return 0;
692} 694}
@@ -706,8 +708,8 @@ int clocksource_register(struct clocksource *cs)
706 708
707 mutex_lock(&clocksource_mutex); 709 mutex_lock(&clocksource_mutex);
708 clocksource_enqueue(cs); 710 clocksource_enqueue(cs);
709 clocksource_select();
710 clocksource_enqueue_watchdog(cs); 711 clocksource_enqueue_watchdog(cs);
712 clocksource_select();
711 mutex_unlock(&clocksource_mutex); 713 mutex_unlock(&clocksource_mutex);
712 return 0; 714 return 0;
713} 715}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index da800ffa810c..c7218d132738 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -456,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason)
456 unsigned long flags; 456 unsigned long flags;
457 int cpu; 457 int cpu;
458 458
459 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
460
461 /* 459 /*
462 * Periodic mode does not care about the enter/exit of power 460 * Periodic mode does not care about the enter/exit of power
463 * states 461 * states
464 */ 462 */
465 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 463 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
466 goto out; 464 return;
467 465
468 bc = tick_broadcast_device.evtdev; 466 /*
467 * We are called with preemption disabled from the depth of the
468 * idle code, so we can't be moved away.
469 */
469 cpu = smp_processor_id(); 470 cpu = smp_processor_id();
470 td = &per_cpu(tick_cpu_device, cpu); 471 td = &per_cpu(tick_cpu_device, cpu);
471 dev = td->evtdev; 472 dev = td->evtdev;
472 473
473 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) 474 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
474 goto out; 475 return;
475 476
477 bc = tick_broadcast_device.evtdev;
478
479 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
476 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 480 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
477 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 481 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
478 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); 482 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
@@ -489,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
489 tick_program_event(dev->next_event, 1); 493 tick_program_event(dev->next_event, 1);
490 } 494 }
491 } 495 }
492
493out:
494 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 496 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
495} 497}
496 498
@@ -522,10 +524,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
522 */ 524 */
523void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 525void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
524{ 526{
527 int cpu = smp_processor_id();
528
525 /* Set it up only once ! */ 529 /* Set it up only once ! */
526 if (bc->event_handler != tick_handle_oneshot_broadcast) { 530 if (bc->event_handler != tick_handle_oneshot_broadcast) {
527 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 531 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
528 int cpu = smp_processor_id();
529 532
530 bc->event_handler = tick_handle_oneshot_broadcast; 533 bc->event_handler = tick_handle_oneshot_broadcast;
531 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 534 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -551,6 +554,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
551 tick_broadcast_set_event(tick_next_period, 1); 554 tick_broadcast_set_event(tick_next_period, 1);
552 } else 555 } else
553 bc->next_event.tv64 = KTIME_MAX; 556 bc->next_event.tv64 = KTIME_MAX;
557 } else {
558 /*
559 * The first cpu which switches to oneshot mode sets
560 * the bit for all other cpus which are in the general
561 * (periodic) broadcast mask. So the bit is set and
562 * would prevent the first broadcast enter after this
563 * to program the bc device.
564 */
565 tick_broadcast_clear_oneshot(cpu);
554 } 566 }
555} 567}
556 568
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8ad5d576755e..2b021b0e8507 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -596,6 +596,64 @@ void __init timekeeping_init(void)
596static struct timespec timekeeping_suspend_time; 596static struct timespec timekeeping_suspend_time;
597 597
598/** 598/**
599 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
600 * @delta: pointer to a timespec delta value
601 *
602 * Takes a timespec offset measuring a suspend interval and properly
603 * adds the sleep offset to the timekeeping variables.
604 */
605static void __timekeeping_inject_sleeptime(struct timespec *delta)
606{
607 if (!timespec_valid(delta)) {
608 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
609 "sleep delta value!\n");
610 return;
611 }
612
613 xtime = timespec_add(xtime, *delta);
614 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
615 total_sleep_time = timespec_add(total_sleep_time, *delta);
616}
617
618
619/**
620 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
621 * @delta: pointer to a timespec delta value
622 *
623 * This hook is for architectures that cannot support read_persistent_clock
624 * because their RTC/persistent clock is only accessible when irqs are enabled.
625 *
626 * This function should only be called by rtc_resume(), and allows
627 * a suspend offset to be injected into the timekeeping values.
628 */
629void timekeeping_inject_sleeptime(struct timespec *delta)
630{
631 unsigned long flags;
632 struct timespec ts;
633
634 /* Make sure we don't set the clock twice */
635 read_persistent_clock(&ts);
636 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
637 return;
638
639 write_seqlock_irqsave(&xtime_lock, flags);
640 timekeeping_forward_now();
641
642 __timekeeping_inject_sleeptime(delta);
643
644 timekeeper.ntp_error = 0;
645 ntp_clear();
646 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
647 timekeeper.mult);
648
649 write_sequnlock_irqrestore(&xtime_lock, flags);
650
651 /* signal hrtimers about time change */
652 clock_was_set();
653}
654
655
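timekeeping_inject_sleeptime() is intended to be driven from rtc_resume(). A hedged sketch of that calling pattern; it is simplified, and example_old_rtc is assumed to have been sampled in the matching suspend hook:

static struct timespec example_old_rtc;	/* saved by the suspend hook */

static void example_rtc_resume_inject(struct timespec *new_rtc)
{
	struct timespec sleep = timespec_sub(*new_rtc, example_old_rtc);

	/* Skip if the RTC appears to have gone backwards. */
	if (sleep.tv_sec >= 0)
		timekeeping_inject_sleeptime(&sleep);
}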
656/**
599 * timekeeping_resume - Resumes the generic timekeeping subsystem. 657 * timekeeping_resume - Resumes the generic timekeeping subsystem.
600 * 658 *
601 * This is for the generic clocksource timekeeping. 659 * This is for the generic clocksource timekeeping.
@@ -615,9 +673,7 @@ static void timekeeping_resume(void)
615 673
616 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 674 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
617 ts = timespec_sub(ts, timekeeping_suspend_time); 675 ts = timespec_sub(ts, timekeeping_suspend_time);
618 xtime = timespec_add(xtime, ts); 676 __timekeeping_inject_sleeptime(&ts);
619 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
620 total_sleep_time = timespec_add(total_sleep_time, ts);
621 } 677 }
622 /* re-base the last cycle value */ 678 /* re-base the last cycle value */
623 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 679 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -630,18 +686,40 @@ static void timekeeping_resume(void)
630 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); 686 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
631 687
632 /* Resume hrtimers */ 688 /* Resume hrtimers */
633 hres_timers_resume(); 689 hrtimers_resume();
634} 690}
635 691
636static int timekeeping_suspend(void) 692static int timekeeping_suspend(void)
637{ 693{
638 unsigned long flags; 694 unsigned long flags;
695 struct timespec delta, delta_delta;
696 static struct timespec old_delta;
639 697
640 read_persistent_clock(&timekeeping_suspend_time); 698 read_persistent_clock(&timekeeping_suspend_time);
641 699
642 write_seqlock_irqsave(&xtime_lock, flags); 700 write_seqlock_irqsave(&xtime_lock, flags);
643 timekeeping_forward_now(); 701 timekeeping_forward_now();
644 timekeeping_suspended = 1; 702 timekeeping_suspended = 1;
703
704 /*
705 * To avoid drift caused by repeated suspend/resumes,
706 * each of which can add ~1 second of drift error,
707 * try to compensate so the difference in system time
708 * and persistent_clock time stays close to constant.
709 */
710 delta = timespec_sub(xtime, timekeeping_suspend_time);
711 delta_delta = timespec_sub(delta, old_delta);
712 if (abs(delta_delta.tv_sec) >= 2) {
713 /*
714 * if delta_delta is too large, assume time correction
715 * has occured and set old_delta to the current delta.
716 */
717 old_delta = delta;
718 } else {
719 /* Otherwise adjust timekeeping_suspend_time to compensate */
720 timekeeping_suspend_time =
721 timespec_add(timekeeping_suspend_time, delta_delta);
722 }
645 write_sequnlock_irqrestore(&xtime_lock, flags); 723 write_sequnlock_irqrestore(&xtime_lock, flags);
646 724
647 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 725 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
@@ -1049,6 +1127,21 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1049} 1127}
1050 1128
1051/** 1129/**
1130 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1131 */
1132ktime_t ktime_get_monotonic_offset(void)
1133{
1134 unsigned long seq;
1135 struct timespec wtom;
1136
1137 do {
1138 seq = read_seqbegin(&xtime_lock);
1139 wtom = wall_to_monotonic;
1140 } while (read_seqretry(&xtime_lock, seq));
1141 return timespec_to_ktime(wtom);
1142}
1143
1144/**
1052 * xtime_update() - advances the timekeeping infrastructure 1145 * xtime_update() - advances the timekeeping infrastructure
1053 * @ticks: number of ticks, that have elapsed since the last call. 1146 * @ticks: number of ticks, that have elapsed since the last call.
1054 * 1147 *
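The timekeeping.c changes above do two related things: timekeeping_inject_sleeptime() lets rtc_resume() feed a suspend interval into xtime/wall_to_monotonic on machines whose persistent clock is unreadable with irqs disabled, and timekeeping_suspend() now nudges timekeeping_suspend_time by delta_delta so that the system-time-versus-persistent-clock offset stays roughly constant across repeated suspend cycles (each cycle can otherwise contribute about a second of error, per the comment in the hunk). The stand-alone sketch below models only that compensation step; it is plain user-space C with doubles standing in for struct timespec, and the names are illustrative, not kernel API.

#include <math.h>
#include <stdio.h>

static double old_delta;        /* persists across cycles, like the static in the patch */

/*
 * Model of the compensation in timekeeping_suspend(): returns the
 * suspend timestamp to remember.  On resume the sleep interval is
 * persistent_now - <returned value>, so folding a small delta_delta
 * into it cancels the jitter of the persistent clock.
 */
static double compensate_suspend(double system_time, double persistent_time)
{
        double delta = system_time - persistent_time;
        double delta_delta = delta - old_delta;

        if (fabs(delta_delta) >= 2.0) {
                /* big jump: assume the clock was deliberately stepped */
                old_delta = delta;
                return persistent_time;
        }
        /* small jitter: keep the system-vs-persistent offset constant */
        return persistent_time + delta_delta;
}

int main(void)
{
        printf("%.1f\n", compensate_suspend(1000.0, 990.0));   /* 990.0: first cycle, records old_delta */
        printf("%.1f\n", compensate_suspend(1060.0, 1049.6));  /* 1050.0: 0.4s of jitter absorbed */
        return 0;
}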
diff --git a/kernel/timer.c b/kernel/timer.c
index fd6198692b57..8cff36119e4d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749 unsigned long expires_limit, mask; 749 unsigned long expires_limit, mask;
750 int bit; 750 int bit;
751 751
752 expires_limit = expires;
753
754 if (timer->slack >= 0) { 752 if (timer->slack >= 0) {
755 expires_limit = expires + timer->slack; 753 expires_limit = expires + timer->slack;
756 } else { 754 } else {
757 unsigned long now = jiffies; 755 long delta = expires - jiffies;
756
757 if (delta < 256)
758 return expires;
758 759
759 /* No slack, if already expired else auto slack 0.4% */ 760 expires_limit = expires + delta / 256;
760 if (time_after(expires, now))
761 expires_limit = expires + (expires - now)/256;
762 } 761 }
763 mask = expires ^ expires_limit; 762 mask = expires ^ expires_limit;
764 if (mask == 0) 763 if (mask == 0)
@@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
795 */ 794 */
796int mod_timer(struct timer_list *timer, unsigned long expires) 795int mod_timer(struct timer_list *timer, unsigned long expires)
797{ 796{
797 expires = apply_slack(timer, expires);
798
798 /* 799 /*
799 * This is a common optimization triggered by the 800 * This is a common optimization triggered by the
800 * networking code - if the timer is re-modified 801 * networking code - if the timer is re-modified
@@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
803 if (timer_pending(timer) && timer->expires == expires) 804 if (timer_pending(timer) && timer->expires == expires)
804 return 1; 805 return 1;
805 806
806 expires = apply_slack(timer, expires);
807
808 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 807 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
809} 808}
810EXPORT_SYMBOL(mod_timer); 809EXPORT_SYMBOL(mod_timer);
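The timer.c hunk simplifies apply_slack(): instead of special-casing already-expired timers it returns early when the timer is less than 256 jiffies away, and otherwise grants roughly 0.4% of the remaining delta as slack before rounding the expiry onto a coarser power-of-two boundary. mod_timer() now applies the slack before its "same expiry, already pending" shortcut, so re-arming with a value that coalesces to the same boundary is skipped. The sketch below is a user-space approximation of the rounding; find_last_bit() is replaced by a portable loop and all names are illustrative.

#include <stdio.h>

/*
 * User-space model of apply_slack() in auto mode (timer->slack < 0):
 * grant delta/256 of slack, then round onto the coarsest power-of-two
 * boundary that falls inside [expires, expires + slack].
 */
static unsigned long apply_auto_slack(unsigned long expires, unsigned long now)
{
        long delta = (long)(expires - now);
        unsigned long expires_limit, mask;
        int bit;

        if (delta < 256)                        /* too close: no slack */
                return expires;

        expires_limit = expires + delta / 256;

        mask = expires ^ expires_limit;
        if (mask == 0)
                return expires;

        /* index of the highest differing bit (portable find_last_bit) */
        for (bit = 0; mask >> (bit + 1); bit++)
                ;

        mask = (1UL << bit) - 1;
        return expires_limit & ~mask;           /* clear the low bits */
}

int main(void)
{
        unsigned long now = 100000;

        /* ~10000 jiffies out: ~39 jiffies of slack, lands on a 64-jiffy boundary */
        printf("%lu\n", apply_auto_slack(now + 10000, now));    /* 110016 */
        /* a nearby timer lands on a 32-jiffy boundary */
        printf("%lu\n", apply_auto_slack(now + 10035, now));    /* 110048 */
        return 0;
}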
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2ad39e556cb4..cd3134510f3d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -82,7 +82,7 @@ config EVENT_POWER_TRACING_DEPRECATED
82 power:power_frequency 82 power:power_frequency
83 This is for userspace compatibility 83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations, 84 and will vanish after 5 kernel iterations,
85 namely 2.6.41. 85 namely 3.1.
86 86
87config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
88 bool 88 bool
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ee24fa1935ac..c3e4575e7829 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,27 +32,32 @@
32 32
33#include <trace/events/sched.h> 33#include <trace/events/sched.h>
34 34
35#include <asm/ftrace.h>
36#include <asm/setup.h> 35#include <asm/setup.h>
37 36
38#include "trace_output.h" 37#include "trace_output.h"
39#include "trace_stat.h" 38#include "trace_stat.h"
40 39
41#define FTRACE_WARN_ON(cond) \ 40#define FTRACE_WARN_ON(cond) \
42 do { \ 41 ({ \
43 if (WARN_ON(cond)) \ 42 int ___r = cond; \
43 if (WARN_ON(___r)) \
44 ftrace_kill(); \ 44 ftrace_kill(); \
45 } while (0) 45 ___r; \
46 })
46 47
47#define FTRACE_WARN_ON_ONCE(cond) \ 48#define FTRACE_WARN_ON_ONCE(cond) \
48 do { \ 49 ({ \
49 if (WARN_ON_ONCE(cond)) \ 50 int ___r = cond; \
51 if (WARN_ON_ONCE(___r)) \
50 ftrace_kill(); \ 52 ftrace_kill(); \
51 } while (0) 53 ___r; \
54 })
52 55
53/* hash bits for specific function selection */ 56/* hash bits for specific function selection */
54#define FTRACE_HASH_BITS 7 57#define FTRACE_HASH_BITS 7
55#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) 58#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
59#define FTRACE_HASH_DEFAULT_BITS 10
60#define FTRACE_HASH_MAX_BITS 12
56 61
57/* ftrace_enabled is a method to turn ftrace on or off */ 62/* ftrace_enabled is a method to turn ftrace on or off */
58int ftrace_enabled __read_mostly; 63int ftrace_enabled __read_mostly;
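The first ftrace.c hunk turns FTRACE_WARN_ON()/FTRACE_WARN_ON_ONCE() from do { } while (0) statements into statement expressions, so the macros now evaluate to the tested condition and can sit directly inside an if (), as later hunks (e.g. FTRACE_WARN_ON(ops == &global_ops)) rely on. Below is a minimal user-space illustration of that pattern using a hypothetical WARN_RET() macro; statement expressions are a GCC/Clang extension.

#include <stdio.h>

/* Returns the tested condition, like the reworked FTRACE_WARN_ON(). */
#define WARN_RET(cond)                                          \
        ({                                                      \
                int ___r = !!(cond);                            \
                if (___r)                                       \
                        fprintf(stderr, "warning: %s\n", #cond);\
                ___r;           /* value of the whole block */  \
        })

int main(void)
{
        int registered = 0;

        if (WARN_RET(registered != 0))  /* usable inside an expression */
                return 1;

        puts("ok");
        return 0;
}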
@@ -76,33 +81,45 @@ static int ftrace_disabled __read_mostly;
76 81
77static DEFINE_MUTEX(ftrace_lock); 82static DEFINE_MUTEX(ftrace_lock);
78 83
79static struct ftrace_ops ftrace_list_end __read_mostly = 84static struct ftrace_ops ftrace_list_end __read_mostly = {
80{
81 .func = ftrace_stub, 85 .func = ftrace_stub,
82}; 86};
83 87
84static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 88static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
89static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
85ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 90ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
91static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 93ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
94static struct ftrace_ops global_ops;
95
96static void
97ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
88 98
89/* 99/*
90 * Traverse the ftrace_list, invoking all entries. The reason that we 100 * Traverse the ftrace_global_list, invoking all entries. The reason that we
91 * can use rcu_dereference_raw() is that elements removed from this list 101 * can use rcu_dereference_raw() is that elements removed from this list
92 * are simply leaked, so there is no need to interact with a grace-period 102 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle 103 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list. 104 * concurrent insertions into the ftrace_global_list.
95 * 105 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations! 106 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */ 107 */
98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 108static void ftrace_global_list_func(unsigned long ip,
109 unsigned long parent_ip)
99{ 110{
100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ 111 struct ftrace_ops *op;
112
113 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
114 return;
101 115
116 trace_recursion_set(TRACE_GLOBAL_BIT);
117 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
102 while (op != &ftrace_list_end) { 118 while (op != &ftrace_list_end) {
103 op->func(ip, parent_ip); 119 op->func(ip, parent_ip);
104 op = rcu_dereference_raw(op->next); /*see above*/ 120 op = rcu_dereference_raw(op->next); /*see above*/
105 }; 121 };
122 trace_recursion_clear(TRACE_GLOBAL_BIT);
106} 123}
107 124
108static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) 125static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
@@ -130,9 +147,11 @@ void clear_ftrace_function(void)
130{ 147{
131 ftrace_trace_function = ftrace_stub; 148 ftrace_trace_function = ftrace_stub;
132 __ftrace_trace_function = ftrace_stub; 149 __ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function_delay = ftrace_stub;
133 ftrace_pid_function = ftrace_stub; 151 ftrace_pid_function = ftrace_stub;
134} 152}
135 153
154#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
136#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 155#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
137/* 156/*
138 * For those archs that do not test ftrace_trace_stop in their 157 * For those archs that do not test ftrace_trace_stop in their
@@ -147,46 +166,74 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
147} 166}
148#endif 167#endif
149 168
150static int __register_ftrace_function(struct ftrace_ops *ops) 169static void update_global_ops(void)
151{ 170{
152 ops->next = ftrace_list; 171 ftrace_func_t func;
172
153 /* 173 /*
154 * We are entering ops into the ftrace_list but another 174 * If there's only one function registered, then call that
155 * CPU might be walking that list. We need to make sure 175 * function directly. Otherwise, we need to iterate over the
156 * the ops->next pointer is valid before another CPU sees 176 * registered callers.
157 * the ops pointer included into the ftrace_list.
158 */ 177 */
159 rcu_assign_pointer(ftrace_list, ops); 178 if (ftrace_global_list == &ftrace_list_end ||
179 ftrace_global_list->next == &ftrace_list_end)
180 func = ftrace_global_list->func;
181 else
182 func = ftrace_global_list_func;
160 183
161 if (ftrace_enabled) { 184 /* If we filter on pids, update to use the pid function */
162 ftrace_func_t func; 185 if (!list_empty(&ftrace_pids)) {
186 set_ftrace_pid_function(func);
187 func = ftrace_pid_func;
188 }
163 189
164 if (ops->next == &ftrace_list_end) 190 global_ops.func = func;
165 func = ops->func; 191}
166 else
167 func = ftrace_list_func;
168 192
169 if (!list_empty(&ftrace_pids)) { 193static void update_ftrace_function(void)
170 set_ftrace_pid_function(func); 194{
171 func = ftrace_pid_func; 195 ftrace_func_t func;
172 } 196
197 update_global_ops();
198
199 /*
200 * If we are at the end of the list and this ops is
201 * not dynamic, then have the mcount trampoline call
202 * the function directly
203 */
204 if (ftrace_ops_list == &ftrace_list_end ||
205 (ftrace_ops_list->next == &ftrace_list_end &&
206 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
207 func = ftrace_ops_list->func;
208 else
209 func = ftrace_ops_list_func;
173 210
174 /*
175 * For one func, simply call it directly.
176 * For more than one func, call the chain.
177 */
178#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 211#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
179 ftrace_trace_function = func; 212 ftrace_trace_function = func;
180#else 213#else
181 __ftrace_trace_function = func; 214#ifdef CONFIG_DYNAMIC_FTRACE
182 ftrace_trace_function = ftrace_test_stop_func; 215 /* do not update till all functions have been modified */
216 __ftrace_trace_function_delay = func;
217#else
218 __ftrace_trace_function = func;
183#endif 219#endif
184 } 220 ftrace_trace_function = ftrace_test_stop_func;
221#endif
222}
185 223
186 return 0; 224static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
225{
226 ops->next = *list;
227 /*
228 * We are entering ops into the list but another
229 * CPU might be walking that list. We need to make sure
230 * the ops->next pointer is valid before another CPU sees
231 * the ops pointer included into the list.
232 */
233 rcu_assign_pointer(*list, ops);
187} 234}
188 235
189static int __unregister_ftrace_function(struct ftrace_ops *ops) 236static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
190{ 237{
191 struct ftrace_ops **p; 238 struct ftrace_ops **p;
192 239
@@ -194,13 +241,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
194 * If we are removing the last function, then simply point 241 * If we are removing the last function, then simply point
195 * to the ftrace_stub. 242 * to the ftrace_stub.
196 */ 243 */
197 if (ftrace_list == ops && ops->next == &ftrace_list_end) { 244 if (*list == ops && ops->next == &ftrace_list_end) {
198 ftrace_trace_function = ftrace_stub; 245 *list = &ftrace_list_end;
199 ftrace_list = &ftrace_list_end;
200 return 0; 246 return 0;
201 } 247 }
202 248
203 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) 249 for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
204 if (*p == ops) 250 if (*p == ops)
205 break; 251 break;
206 252
@@ -208,53 +254,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
208 return -1; 254 return -1;
209 255
210 *p = (*p)->next; 256 *p = (*p)->next;
257 return 0;
258}
211 259
212 if (ftrace_enabled) { 260static int __register_ftrace_function(struct ftrace_ops *ops)
213 /* If we only have one func left, then call that directly */ 261{
214 if (ftrace_list->next == &ftrace_list_end) { 262 if (ftrace_disabled)
215 ftrace_func_t func = ftrace_list->func; 263 return -ENODEV;
216 264
217 if (!list_empty(&ftrace_pids)) { 265 if (FTRACE_WARN_ON(ops == &global_ops))
218 set_ftrace_pid_function(func); 266 return -EINVAL;
219 func = ftrace_pid_func; 267
220 } 268 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
221#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 269 return -EBUSY;
222 ftrace_trace_function = func; 270
223#else 271 if (!core_kernel_data((unsigned long)ops))
224 __ftrace_trace_function = func; 272 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
225#endif 273
226 } 274 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
227 } 275 int first = ftrace_global_list == &ftrace_list_end;
276 add_ftrace_ops(&ftrace_global_list, ops);
277 ops->flags |= FTRACE_OPS_FL_ENABLED;
278 if (first)
279 add_ftrace_ops(&ftrace_ops_list, &global_ops);
280 } else
281 add_ftrace_ops(&ftrace_ops_list, ops);
282
283 if (ftrace_enabled)
284 update_ftrace_function();
228 285
229 return 0; 286 return 0;
230} 287}
231 288
232static void ftrace_update_pid_func(void) 289static int __unregister_ftrace_function(struct ftrace_ops *ops)
233{ 290{
234 ftrace_func_t func; 291 int ret;
235 292
236 if (ftrace_trace_function == ftrace_stub) 293 if (ftrace_disabled)
237 return; 294 return -ENODEV;
238 295
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 296 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
240 func = ftrace_trace_function; 297 return -EBUSY;
241#else
242 func = __ftrace_trace_function;
243#endif
244 298
245 if (!list_empty(&ftrace_pids)) { 299 if (FTRACE_WARN_ON(ops == &global_ops))
246 set_ftrace_pid_function(func); 300 return -EINVAL;
247 func = ftrace_pid_func;
248 } else {
249 if (func == ftrace_pid_func)
250 func = ftrace_pid_function;
251 }
252 301
253#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 302 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
254 ftrace_trace_function = func; 303 ret = remove_ftrace_ops(&ftrace_global_list, ops);
255#else 304 if (!ret && ftrace_global_list == &ftrace_list_end)
256 __ftrace_trace_function = func; 305 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
257#endif 306 if (!ret)
307 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
308 } else
309 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
310
311 if (ret < 0)
312 return ret;
313
314 if (ftrace_enabled)
315 update_ftrace_function();
316
317 /*
318 * Dynamic ops may be freed, we must make sure that all
319 * callers are done before leaving this function.
320 */
321 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
322 synchronize_sched();
323
324 return 0;
325}
326
327static void ftrace_update_pid_func(void)
328{
329 /* Only do something if we are tracing something */
330 if (ftrace_trace_function == ftrace_stub)
331 return;
332
333 update_ftrace_function();
258} 334}
259 335
260#ifdef CONFIG_FUNCTION_PROFILER 336#ifdef CONFIG_FUNCTION_PROFILER
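The hunks above generalize the old single ftrace_list into add_ftrace_ops()/remove_ftrace_ops() helpers that work on any sentinel-terminated list of ftrace_ops (ftrace_global_list and ftrace_ops_list), publishing the new head only after ops->next is valid. The following sketch models just that list discipline in plain user-space C; the rcu_assign_pointer() ordering is reduced to an ordinary store and the struct is illustrative.

#include <stdio.h>

struct ops {
        const char *name;
        struct ops *next;
};

static struct ops list_end = { "end", NULL };   /* sentinel, never removed */
static struct ops *ops_list = &list_end;

static void add_ops(struct ops *op)
{
        op->next = ops_list;    /* make op->next valid first ...      */
        ops_list = op;          /* ... then publish the new list head */
}

static int remove_ops(struct ops *op)
{
        struct ops **p;

        for (p = &ops_list; *p != &list_end; p = &(*p)->next)
                if (*p == op) {
                        *p = op->next;          /* unlink in place */
                        return 0;
                }
        return -1;                              /* not on the list */
}

int main(void)
{
        struct ops a = { "a", NULL }, b = { "b", NULL };
        struct ops *op;

        add_ops(&a);
        add_ops(&b);
        remove_ops(&a);
        for (op = ops_list; op != &list_end; op = op->next)
                printf("%s\n", op->name);       /* prints: b */
        return 0;
}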
@@ -715,8 +791,7 @@ static void unregister_ftrace_profiler(void)
715 unregister_ftrace_graph(); 791 unregister_ftrace_graph();
716} 792}
717#else 793#else
718static struct ftrace_ops ftrace_profile_ops __read_mostly = 794static struct ftrace_ops ftrace_profile_ops __read_mostly = {
719{
720 .func = function_profile_call, 795 .func = function_profile_call,
721}; 796};
722 797
@@ -736,19 +811,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
736 size_t cnt, loff_t *ppos) 811 size_t cnt, loff_t *ppos)
737{ 812{
738 unsigned long val; 813 unsigned long val;
739 char buf[64]; /* big enough to hold a number */
740 int ret; 814 int ret;
741 815
742 if (cnt >= sizeof(buf)) 816 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
743 return -EINVAL; 817 if (ret)
744
745 if (copy_from_user(&buf, ubuf, cnt))
746 return -EFAULT;
747
748 buf[cnt] = 0;
749
750 ret = strict_strtoul(buf, 10, &val);
751 if (ret < 0)
752 return ret; 818 return ret;
753 819
754 val = !!val; 820 val = !!val;
@@ -888,8 +954,35 @@ enum {
888 FTRACE_START_FUNC_RET = (1 << 3), 954 FTRACE_START_FUNC_RET = (1 << 3),
889 FTRACE_STOP_FUNC_RET = (1 << 4), 955 FTRACE_STOP_FUNC_RET = (1 << 4),
890}; 956};
957struct ftrace_func_entry {
958 struct hlist_node hlist;
959 unsigned long ip;
960};
891 961
892static int ftrace_filtered; 962struct ftrace_hash {
963 unsigned long size_bits;
964 struct hlist_head *buckets;
965 unsigned long count;
966 struct rcu_head rcu;
967};
968
969/*
970 * We make these constant because no one should touch them,
971 * but they are used as the default "empty hash", to avoid allocating
972 * it all the time. These are in a read only section such that if
973 * anyone does try to modify it, it will cause an exception.
974 */
975static const struct hlist_head empty_buckets[1];
976static const struct ftrace_hash empty_hash = {
977 .buckets = (struct hlist_head *)empty_buckets,
978};
979#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
980
981static struct ftrace_ops global_ops = {
982 .func = ftrace_stub,
983 .notrace_hash = EMPTY_HASH,
984 .filter_hash = EMPTY_HASH,
985};
893 986
894static struct dyn_ftrace *ftrace_new_addrs; 987static struct dyn_ftrace *ftrace_new_addrs;
895 988
@@ -912,6 +1005,292 @@ static struct ftrace_page *ftrace_pages;
912 1005
913static struct dyn_ftrace *ftrace_free_records; 1006static struct dyn_ftrace *ftrace_free_records;
914 1007
1008static struct ftrace_func_entry *
1009ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1010{
1011 unsigned long key;
1012 struct ftrace_func_entry *entry;
1013 struct hlist_head *hhd;
1014 struct hlist_node *n;
1015
1016 if (!hash->count)
1017 return NULL;
1018
1019 if (hash->size_bits > 0)
1020 key = hash_long(ip, hash->size_bits);
1021 else
1022 key = 0;
1023
1024 hhd = &hash->buckets[key];
1025
1026 hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
1027 if (entry->ip == ip)
1028 return entry;
1029 }
1030 return NULL;
1031}
1032
1033static void __add_hash_entry(struct ftrace_hash *hash,
1034 struct ftrace_func_entry *entry)
1035{
1036 struct hlist_head *hhd;
1037 unsigned long key;
1038
1039 if (hash->size_bits)
1040 key = hash_long(entry->ip, hash->size_bits);
1041 else
1042 key = 0;
1043
1044 hhd = &hash->buckets[key];
1045 hlist_add_head(&entry->hlist, hhd);
1046 hash->count++;
1047}
1048
1049static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
1050{
1051 struct ftrace_func_entry *entry;
1052
1053 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1054 if (!entry)
1055 return -ENOMEM;
1056
1057 entry->ip = ip;
1058 __add_hash_entry(hash, entry);
1059
1060 return 0;
1061}
1062
1063static void
1064free_hash_entry(struct ftrace_hash *hash,
1065 struct ftrace_func_entry *entry)
1066{
1067 hlist_del(&entry->hlist);
1068 kfree(entry);
1069 hash->count--;
1070}
1071
1072static void
1073remove_hash_entry(struct ftrace_hash *hash,
1074 struct ftrace_func_entry *entry)
1075{
1076 hlist_del(&entry->hlist);
1077 hash->count--;
1078}
1079
1080static void ftrace_hash_clear(struct ftrace_hash *hash)
1081{
1082 struct hlist_head *hhd;
1083 struct hlist_node *tp, *tn;
1084 struct ftrace_func_entry *entry;
1085 int size = 1 << hash->size_bits;
1086 int i;
1087
1088 if (!hash->count)
1089 return;
1090
1091 for (i = 0; i < size; i++) {
1092 hhd = &hash->buckets[i];
1093 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
1094 free_hash_entry(hash, entry);
1095 }
1096 FTRACE_WARN_ON(hash->count);
1097}
1098
1099static void free_ftrace_hash(struct ftrace_hash *hash)
1100{
1101 if (!hash || hash == EMPTY_HASH)
1102 return;
1103 ftrace_hash_clear(hash);
1104 kfree(hash->buckets);
1105 kfree(hash);
1106}
1107
1108static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
1109{
1110 struct ftrace_hash *hash;
1111
1112 hash = container_of(rcu, struct ftrace_hash, rcu);
1113 free_ftrace_hash(hash);
1114}
1115
1116static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1117{
1118 if (!hash || hash == EMPTY_HASH)
1119 return;
1120 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1121}
1122
1123static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1124{
1125 struct ftrace_hash *hash;
1126 int size;
1127
1128 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
1129 if (!hash)
1130 return NULL;
1131
1132 size = 1 << size_bits;
1133 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
1134
1135 if (!hash->buckets) {
1136 kfree(hash);
1137 return NULL;
1138 }
1139
1140 hash->size_bits = size_bits;
1141
1142 return hash;
1143}
1144
1145static struct ftrace_hash *
1146alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1147{
1148 struct ftrace_func_entry *entry;
1149 struct ftrace_hash *new_hash;
1150 struct hlist_node *tp;
1151 int size;
1152 int ret;
1153 int i;
1154
1155 new_hash = alloc_ftrace_hash(size_bits);
1156 if (!new_hash)
1157 return NULL;
1158
1159 /* Empty hash? */
1160 if (!hash || !hash->count)
1161 return new_hash;
1162
1163 size = 1 << hash->size_bits;
1164 for (i = 0; i < size; i++) {
1165 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
1166 ret = add_hash_entry(new_hash, entry->ip);
1167 if (ret < 0)
1168 goto free_hash;
1169 }
1170 }
1171
1172 FTRACE_WARN_ON(new_hash->count != hash->count);
1173
1174 return new_hash;
1175
1176 free_hash:
1177 free_ftrace_hash(new_hash);
1178 return NULL;
1179}
1180
1181static void
1182ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
1183static void
1184ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
1185
1186static int
1187ftrace_hash_move(struct ftrace_ops *ops, int enable,
1188 struct ftrace_hash **dst, struct ftrace_hash *src)
1189{
1190 struct ftrace_func_entry *entry;
1191 struct hlist_node *tp, *tn;
1192 struct hlist_head *hhd;
1193 struct ftrace_hash *old_hash;
1194 struct ftrace_hash *new_hash;
1195 unsigned long key;
1196 int size = src->count;
1197 int bits = 0;
1198 int ret;
1199 int i;
1200
1201 /*
1202 * Remove the current set, update the hash and add
1203 * them back.
1204 */
1205 ftrace_hash_rec_disable(ops, enable);
1206
1207 /*
1208 * If the new source is empty, just free dst and assign it
1209 * the empty_hash.
1210 */
1211 if (!src->count) {
1212 free_ftrace_hash_rcu(*dst);
1213 rcu_assign_pointer(*dst, EMPTY_HASH);
1214 return 0;
1215 }
1216
1217 /*
1218 * Make the hash size about 1/2 the # found
1219 */
1220 for (size /= 2; size; size >>= 1)
1221 bits++;
1222
1223 /* Don't allocate too much */
1224 if (bits > FTRACE_HASH_MAX_BITS)
1225 bits = FTRACE_HASH_MAX_BITS;
1226
1227 ret = -ENOMEM;
1228 new_hash = alloc_ftrace_hash(bits);
1229 if (!new_hash)
1230 goto out;
1231
1232 size = 1 << src->size_bits;
1233 for (i = 0; i < size; i++) {
1234 hhd = &src->buckets[i];
1235 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
1236 if (bits > 0)
1237 key = hash_long(entry->ip, bits);
1238 else
1239 key = 0;
1240 remove_hash_entry(src, entry);
1241 __add_hash_entry(new_hash, entry);
1242 }
1243 }
1244
1245 old_hash = *dst;
1246 rcu_assign_pointer(*dst, new_hash);
1247 free_ftrace_hash_rcu(old_hash);
1248
1249 ret = 0;
1250 out:
1251 /*
1252 * Enable regardless of ret:
1253 * On success, we enable the new hash.
1254 * On failure, we re-enable the original hash.
1255 */
1256 ftrace_hash_rec_enable(ops, enable);
1257
1258 return ret;
1259}
1260
1261/*
1262 * Test the hashes for this ops to see if we want to call
1263 * the ops->func or not.
1264 *
1265 * It's a match if the ip is in the ops->filter_hash or
1266 * the filter_hash does not exist or is empty,
1267 * AND
1268 * the ip is not in the ops->notrace_hash.
1269 *
1270 * This needs to be called with preemption disabled as
1271 * the hashes are freed with call_rcu_sched().
1272 */
1273static int
1274ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1275{
1276 struct ftrace_hash *filter_hash;
1277 struct ftrace_hash *notrace_hash;
1278 int ret;
1279
1280 filter_hash = rcu_dereference_raw(ops->filter_hash);
1281 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1282
1283 if ((!filter_hash || !filter_hash->count ||
1284 ftrace_lookup_ip(filter_hash, ip)) &&
1285 (!notrace_hash || !notrace_hash->count ||
1286 !ftrace_lookup_ip(notrace_hash, ip)))
1287 ret = 1;
1288 else
1289 ret = 0;
1290
1291 return ret;
1292}
1293
915/* 1294/*
916 * This is a double for. Do not use 'break' to break out of the loop, 1295 * This is a double for. Do not use 'break' to break out of the loop,
917 * you must use a goto. 1296 * you must use a goto.
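The large hunk above introduces struct ftrace_hash, a hash set of instruction pointers, to replace the old per-record FTRACE_FL_FILTER/FTRACE_FL_NOTRACE flags: ftrace_ops_test() traces an ip when the filter hash is empty or contains it and the notrace hash does not, and ftrace_hash_move() sizes the new table at about half the entry count, capped at FTRACE_HASH_MAX_BITS. The stand-alone sketch below models those two rules with a flat array instead of hlist buckets; everything here is illustrative, not the kernel data structure.

#include <stdbool.h>
#include <stdio.h>

#define HASH_MAX_BITS 12        /* same cap as FTRACE_HASH_MAX_BITS above */

struct ip_set {
        const unsigned long *ips;
        int count;              /* 0 means "empty hash" */
};

static bool set_has(const struct ip_set *s, unsigned long ip)
{
        int i;

        for (i = 0; i < s->count; i++)
                if (s->ips[i] == ip)
                        return true;
        return false;
}

/* Mirrors ftrace_ops_test(): an empty filter means "trace everything". */
static bool ops_traces(const struct ip_set *filter,
                       const struct ip_set *notrace, unsigned long ip)
{
        bool in_filter = !filter->count || set_has(filter, ip);
        bool in_notrace = notrace->count && set_has(notrace, ip);

        return in_filter && !in_notrace;
}

/* Mirrors the sizing loop in ftrace_hash_move(): ~count/2 buckets. */
static int hash_bits_for(int count)
{
        int bits = 0;
        int size;

        for (size = count / 2; size; size >>= 1)
                bits++;
        return bits > HASH_MAX_BITS ? HASH_MAX_BITS : bits;
}

int main(void)
{
        const unsigned long filt[] = { 0x1000, 0x2000 };
        const unsigned long notr[] = { 0x2000 };
        struct ip_set filter = { filt, 2 }, notrace = { notr, 1 };

        printf("%d %d %d\n",
               ops_traces(&filter, &notrace, 0x1000),   /* 1: in filter     */
               ops_traces(&filter, &notrace, 0x2000),   /* 0: notrace wins  */
               ops_traces(&filter, &notrace, 0x3000));  /* 0: not in filter */
        printf("%d\n", hash_bits_for(300));             /* 8 -> 256 buckets */
        return 0;
}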
@@ -926,6 +1305,105 @@ static struct dyn_ftrace *ftrace_free_records;
926 } \ 1305 } \
927 } 1306 }
928 1307
1308static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1309 int filter_hash,
1310 bool inc)
1311{
1312 struct ftrace_hash *hash;
1313 struct ftrace_hash *other_hash;
1314 struct ftrace_page *pg;
1315 struct dyn_ftrace *rec;
1316 int count = 0;
1317 int all = 0;
1318
1319 /* Only update if the ops has been registered */
1320 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1321 return;
1322
1323 /*
1324 * In the filter_hash case:
1325 * If the count is zero, we update all records.
1326 * Otherwise we just update the items in the hash.
1327 *
1328 * In the notrace_hash case:
1329 * We enable the update in the hash.
1330 * As disabling notrace means enabling the tracing,
1331 * and enabling notrace means disabling, the inc variable
1332 * gets inverted.
1333 */
1334 if (filter_hash) {
1335 hash = ops->filter_hash;
1336 other_hash = ops->notrace_hash;
1337 if (!hash || !hash->count)
1338 all = 1;
1339 } else {
1340 inc = !inc;
1341 hash = ops->notrace_hash;
1342 other_hash = ops->filter_hash;
1343 /*
1344 * If the notrace hash has no items,
1345 * then there's nothing to do.
1346 */
1347 if (hash && !hash->count)
1348 return;
1349 }
1350
1351 do_for_each_ftrace_rec(pg, rec) {
1352 int in_other_hash = 0;
1353 int in_hash = 0;
1354 int match = 0;
1355
1356 if (all) {
1357 /*
1358 * Only the filter_hash affects all records.
1359 * Update if the record is not in the notrace hash.
1360 */
1361 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1362 match = 1;
1363 } else {
1364 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
1365 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
1366
1367 /*
1368	 * Only update records that this hash actually affects for this ops.
1369 */
1370 if (filter_hash && in_hash && !in_other_hash)
1371 match = 1;
1372 else if (!filter_hash && in_hash &&
1373 (in_other_hash || !other_hash->count))
1374 match = 1;
1375 }
1376 if (!match)
1377 continue;
1378
1379 if (inc) {
1380 rec->flags++;
1381 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1382 return;
1383 } else {
1384 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1385 return;
1386 rec->flags--;
1387 }
1388 count++;
1389 /* Shortcut, if we handled all records, we are done. */
1390 if (!all && count == hash->count)
1391 return;
1392 } while_for_each_ftrace_rec();
1393}
1394
1395static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
1396 int filter_hash)
1397{
1398 __ftrace_hash_rec_update(ops, filter_hash, 0);
1399}
1400
1401static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1402 int filter_hash)
1403{
1404 __ftrace_hash_rec_update(ops, filter_hash, 1);
1405}
1406
929static void ftrace_free_rec(struct dyn_ftrace *rec) 1407static void ftrace_free_rec(struct dyn_ftrace *rec)
930{ 1408{
931 rec->freelist = ftrace_free_records; 1409 rec->freelist = ftrace_free_records;
@@ -1047,18 +1525,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1047 ftrace_addr = (unsigned long)FTRACE_ADDR; 1525 ftrace_addr = (unsigned long)FTRACE_ADDR;
1048 1526
1049 /* 1527 /*
1050 * If this record is not to be traced or we want to disable it, 1528 * If we are enabling tracing:
1051 * then disable it. 1529 *
1530 * If the record has a ref count, then we need to enable it
1531 * because someone is using it.
1052 * 1532 *
1053 * If we want to enable it and filtering is off, then enable it. 1533 * Otherwise we make sure it's disabled.
1054 * 1534 *
1055 * If we want to enable it and filtering is on, enable it only if 1535 * If we are disabling tracing, then disable all records that
1056 * it's filtered 1536 * are enabled.
1057 */ 1537 */
1058 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { 1538 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1059 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) 1539 flag = FTRACE_FL_ENABLED;
1060 flag = FTRACE_FL_ENABLED;
1061 }
1062 1540
1063 /* If the state of this record hasn't changed, then do nothing */ 1541 /* If the state of this record hasn't changed, then do nothing */
1064 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1542 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1079,19 +1557,16 @@ static void ftrace_replace_code(int enable)
1079 struct ftrace_page *pg; 1557 struct ftrace_page *pg;
1080 int failed; 1558 int failed;
1081 1559
1560 if (unlikely(ftrace_disabled))
1561 return;
1562
1082 do_for_each_ftrace_rec(pg, rec) { 1563 do_for_each_ftrace_rec(pg, rec) {
1083 /* 1564 /* Skip over free records */
1084 * Skip over free records, records that have 1565 if (rec->flags & FTRACE_FL_FREE)
1085 * failed and not converted.
1086 */
1087 if (rec->flags & FTRACE_FL_FREE ||
1088 rec->flags & FTRACE_FL_FAILED ||
1089 !(rec->flags & FTRACE_FL_CONVERTED))
1090 continue; 1566 continue;
1091 1567
1092 failed = __ftrace_replace_code(rec, enable); 1568 failed = __ftrace_replace_code(rec, enable);
1093 if (failed) { 1569 if (failed) {
1094 rec->flags |= FTRACE_FL_FAILED;
1095 ftrace_bug(failed, rec->ip); 1570 ftrace_bug(failed, rec->ip);
1096 /* Stop processing */ 1571 /* Stop processing */
1097 return; 1572 return;
@@ -1107,10 +1582,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1107 1582
1108 ip = rec->ip; 1583 ip = rec->ip;
1109 1584
1585 if (unlikely(ftrace_disabled))
1586 return 0;
1587
1110 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 1588 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
1111 if (ret) { 1589 if (ret) {
1112 ftrace_bug(ret, ip); 1590 ftrace_bug(ret, ip);
1113 rec->flags |= FTRACE_FL_FAILED;
1114 return 0; 1591 return 0;
1115 } 1592 }
1116 return 1; 1593 return 1;
@@ -1138,6 +1615,12 @@ static int __ftrace_modify_code(void *data)
1138{ 1615{
1139 int *command = data; 1616 int *command = data;
1140 1617
1618 /*
1619 * Do not call function tracer while we update the code.
1620 * We are in stop_machine(), so there is no need to worry about races.
1621 */
1622 function_trace_stop++;
1623
1141 if (*command & FTRACE_ENABLE_CALLS) 1624 if (*command & FTRACE_ENABLE_CALLS)
1142 ftrace_replace_code(1); 1625 ftrace_replace_code(1);
1143 else if (*command & FTRACE_DISABLE_CALLS) 1626 else if (*command & FTRACE_DISABLE_CALLS)
@@ -1151,6 +1634,18 @@ static int __ftrace_modify_code(void *data)
1151 else if (*command & FTRACE_STOP_FUNC_RET) 1634 else if (*command & FTRACE_STOP_FUNC_RET)
1152 ftrace_disable_ftrace_graph_caller(); 1635 ftrace_disable_ftrace_graph_caller();
1153 1636
1637#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1638 /*
1639 * For archs that call ftrace_test_stop_func(), we must
1640 * wait till after we update all the function callers
1641 * before we update the callback. This keeps different
1642 * ops that record different functions from corrupting
1643 * each other.
1644 */
1645 __ftrace_trace_function = __ftrace_trace_function_delay;
1646#endif
1647 function_trace_stop--;
1648
1154 return 0; 1649 return 0;
1155} 1650}
1156 1651
@@ -1171,6 +1666,7 @@ static void ftrace_run_update_code(int command)
1171 1666
1172static ftrace_func_t saved_ftrace_func; 1667static ftrace_func_t saved_ftrace_func;
1173static int ftrace_start_up; 1668static int ftrace_start_up;
1669static int global_start_up;
1174 1670
1175static void ftrace_startup_enable(int command) 1671static void ftrace_startup_enable(int command)
1176{ 1672{
@@ -1185,19 +1681,38 @@ static void ftrace_startup_enable(int command)
1185 ftrace_run_update_code(command); 1681 ftrace_run_update_code(command);
1186} 1682}
1187 1683
1188static void ftrace_startup(int command) 1684static int ftrace_startup(struct ftrace_ops *ops, int command)
1189{ 1685{
1686 bool hash_enable = true;
1687
1190 if (unlikely(ftrace_disabled)) 1688 if (unlikely(ftrace_disabled))
1191 return; 1689 return -ENODEV;
1192 1690
1193 ftrace_start_up++; 1691 ftrace_start_up++;
1194 command |= FTRACE_ENABLE_CALLS; 1692 command |= FTRACE_ENABLE_CALLS;
1195 1693
1694 /* ops marked global share the filter hashes */
1695 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1696 ops = &global_ops;
1697 /* Don't update hash if global is already set */
1698 if (global_start_up)
1699 hash_enable = false;
1700 global_start_up++;
1701 }
1702
1703 ops->flags |= FTRACE_OPS_FL_ENABLED;
1704 if (hash_enable)
1705 ftrace_hash_rec_enable(ops, 1);
1706
1196 ftrace_startup_enable(command); 1707 ftrace_startup_enable(command);
1708
1709 return 0;
1197} 1710}
1198 1711
1199static void ftrace_shutdown(int command) 1712static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1200{ 1713{
1714 bool hash_disable = true;
1715
1201 if (unlikely(ftrace_disabled)) 1716 if (unlikely(ftrace_disabled))
1202 return; 1717 return;
1203 1718
@@ -1209,6 +1724,23 @@ static void ftrace_shutdown(int command)
1209 */ 1724 */
1210 WARN_ON_ONCE(ftrace_start_up < 0); 1725 WARN_ON_ONCE(ftrace_start_up < 0);
1211 1726
1727 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1728 ops = &global_ops;
1729 global_start_up--;
1730 WARN_ON_ONCE(global_start_up < 0);
1731 /* Don't update hash if global still has users */
1732 if (global_start_up) {
1733 WARN_ON_ONCE(!ftrace_start_up);
1734 hash_disable = false;
1735 }
1736 }
1737
1738 if (hash_disable)
1739 ftrace_hash_rec_disable(ops, 1);
1740
1741 if (ops != &global_ops || !global_start_up)
1742 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1743
1212 if (!ftrace_start_up) 1744 if (!ftrace_start_up)
1213 command |= FTRACE_DISABLE_CALLS; 1745 command |= FTRACE_DISABLE_CALLS;
1214 1746
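ftrace_startup()/ftrace_shutdown() above now take the ops being enabled and, for FTRACE_OPS_FL_GLOBAL ops, fold them into the shared global_ops while counting users in global_start_up, so the hash record accounting is touched only on the first global registration and when the last one goes away. A trivial sketch of that first-user/last-user gating (plain C, illustrative names):

#include <stdio.h>

static int global_start_up;     /* counts registered GLOBAL ops */

static void startup_global(void)
{
        if (global_start_up++ == 0)
                puts("enable shared hash accounting");
}

static void shutdown_global(void)
{
        if (--global_start_up == 0)
                puts("disable shared hash accounting");
}

int main(void)
{
        startup_global();       /* first global user: enable */
        startup_global();       /* second user: no-op        */
        shutdown_global();      /* one user left: no-op      */
        shutdown_global();      /* last user gone: disable   */
        return 0;
}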
@@ -1249,10 +1781,36 @@ static cycle_t ftrace_update_time;
1249static unsigned long ftrace_update_cnt; 1781static unsigned long ftrace_update_cnt;
1250unsigned long ftrace_update_tot_cnt; 1782unsigned long ftrace_update_tot_cnt;
1251 1783
1784static int ops_traces_mod(struct ftrace_ops *ops)
1785{
1786 struct ftrace_hash *hash;
1787
1788 hash = ops->filter_hash;
1789 return !!(!hash || !hash->count);
1790}
1791
1252static int ftrace_update_code(struct module *mod) 1792static int ftrace_update_code(struct module *mod)
1253{ 1793{
1254 struct dyn_ftrace *p; 1794 struct dyn_ftrace *p;
1255 cycle_t start, stop; 1795 cycle_t start, stop;
1796 unsigned long ref = 0;
1797
1798 /*
1799 * When adding a module, we need to check if tracers are
1800 * currently enabled and if they are set to trace all functions.
1801 * If they are, we need to enable the module functions as well
1802 * as update the reference counts for those function records.
1803 */
1804 if (mod) {
1805 struct ftrace_ops *ops;
1806
1807 for (ops = ftrace_ops_list;
1808 ops != &ftrace_list_end; ops = ops->next) {
1809 if (ops->flags & FTRACE_OPS_FL_ENABLED &&
1810 ops_traces_mod(ops))
1811 ref++;
1812 }
1813 }
1256 1814
1257 start = ftrace_now(raw_smp_processor_id()); 1815 start = ftrace_now(raw_smp_processor_id());
1258 ftrace_update_cnt = 0; 1816 ftrace_update_cnt = 0;
@@ -1265,7 +1823,7 @@ static int ftrace_update_code(struct module *mod)
1265 1823
1266 p = ftrace_new_addrs; 1824 p = ftrace_new_addrs;
1267 ftrace_new_addrs = p->newlist; 1825 ftrace_new_addrs = p->newlist;
1268 p->flags = 0L; 1826 p->flags = ref;
1269 1827
1270 /* 1828 /*
1271 * Do the initial record conversion from mcount jump 1829 * Do the initial record conversion from mcount jump
@@ -1273,10 +1831,10 @@ static int ftrace_update_code(struct module *mod)
1273 */ 1831 */
1274 if (!ftrace_code_disable(mod, p)) { 1832 if (!ftrace_code_disable(mod, p)) {
1275 ftrace_free_rec(p); 1833 ftrace_free_rec(p);
1276 continue; 1834 /* Game over */
1835 break;
1277 } 1836 }
1278 1837
1279 p->flags |= FTRACE_FL_CONVERTED;
1280 ftrace_update_cnt++; 1838 ftrace_update_cnt++;
1281 1839
1282 /* 1840 /*
@@ -1288,7 +1846,7 @@ static int ftrace_update_code(struct module *mod)
1288 * conversion puts the module to the correct state, thus 1846 * conversion puts the module to the correct state, thus
1289 * passing the ftrace_make_call check. 1847 * passing the ftrace_make_call check.
1290 */ 1848 */
1291 if (ftrace_start_up) { 1849 if (ftrace_start_up && ref) {
1292 int failed = __ftrace_replace_code(p, 1); 1850 int failed = __ftrace_replace_code(p, 1);
1293 if (failed) { 1851 if (failed) {
1294 ftrace_bug(failed, p->ip); 1852 ftrace_bug(failed, p->ip);
@@ -1351,9 +1909,9 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1351enum { 1909enum {
1352 FTRACE_ITER_FILTER = (1 << 0), 1910 FTRACE_ITER_FILTER = (1 << 0),
1353 FTRACE_ITER_NOTRACE = (1 << 1), 1911 FTRACE_ITER_NOTRACE = (1 << 1),
1354 FTRACE_ITER_FAILURES = (1 << 2), 1912 FTRACE_ITER_PRINTALL = (1 << 2),
1355 FTRACE_ITER_PRINTALL = (1 << 3), 1913 FTRACE_ITER_HASH = (1 << 3),
1356 FTRACE_ITER_HASH = (1 << 4), 1914 FTRACE_ITER_ENABLED = (1 << 4),
1357}; 1915};
1358 1916
1359#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1917#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1365,6 +1923,8 @@ struct ftrace_iterator {
1365 struct dyn_ftrace *func; 1923 struct dyn_ftrace *func;
1366 struct ftrace_func_probe *probe; 1924 struct ftrace_func_probe *probe;
1367 struct trace_parser parser; 1925 struct trace_parser parser;
1926 struct ftrace_hash *hash;
1927 struct ftrace_ops *ops;
1368 int hidx; 1928 int hidx;
1369 int idx; 1929 int idx;
1370 unsigned flags; 1930 unsigned flags;
@@ -1461,8 +2021,12 @@ static void *
1461t_next(struct seq_file *m, void *v, loff_t *pos) 2021t_next(struct seq_file *m, void *v, loff_t *pos)
1462{ 2022{
1463 struct ftrace_iterator *iter = m->private; 2023 struct ftrace_iterator *iter = m->private;
2024 struct ftrace_ops *ops = &global_ops;
1464 struct dyn_ftrace *rec = NULL; 2025 struct dyn_ftrace *rec = NULL;
1465 2026
2027 if (unlikely(ftrace_disabled))
2028 return NULL;
2029
1466 if (iter->flags & FTRACE_ITER_HASH) 2030 if (iter->flags & FTRACE_ITER_HASH)
1467 return t_hash_next(m, pos); 2031 return t_hash_next(m, pos);
1468 2032
@@ -1483,17 +2047,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1483 rec = &iter->pg->records[iter->idx++]; 2047 rec = &iter->pg->records[iter->idx++];
1484 if ((rec->flags & FTRACE_FL_FREE) || 2048 if ((rec->flags & FTRACE_FL_FREE) ||
1485 2049
1486 (!(iter->flags & FTRACE_ITER_FAILURES) &&
1487 (rec->flags & FTRACE_FL_FAILED)) ||
1488
1489 ((iter->flags & FTRACE_ITER_FAILURES) &&
1490 !(rec->flags & FTRACE_FL_FAILED)) ||
1491
1492 ((iter->flags & FTRACE_ITER_FILTER) && 2050 ((iter->flags & FTRACE_ITER_FILTER) &&
1493 !(rec->flags & FTRACE_FL_FILTER)) || 2051 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
1494 2052
1495 ((iter->flags & FTRACE_ITER_NOTRACE) && 2053 ((iter->flags & FTRACE_ITER_NOTRACE) &&
1496 !(rec->flags & FTRACE_FL_NOTRACE))) { 2054 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
2055
2056 ((iter->flags & FTRACE_ITER_ENABLED) &&
2057 !(rec->flags & ~FTRACE_FL_MASK))) {
2058
1497 rec = NULL; 2059 rec = NULL;
1498 goto retry; 2060 goto retry;
1499 } 2061 }
@@ -1517,10 +2079,15 @@ static void reset_iter_read(struct ftrace_iterator *iter)
1517static void *t_start(struct seq_file *m, loff_t *pos) 2079static void *t_start(struct seq_file *m, loff_t *pos)
1518{ 2080{
1519 struct ftrace_iterator *iter = m->private; 2081 struct ftrace_iterator *iter = m->private;
2082 struct ftrace_ops *ops = &global_ops;
1520 void *p = NULL; 2083 void *p = NULL;
1521 loff_t l; 2084 loff_t l;
1522 2085
1523 mutex_lock(&ftrace_lock); 2086 mutex_lock(&ftrace_lock);
2087
2088 if (unlikely(ftrace_disabled))
2089 return NULL;
2090
1524 /* 2091 /*
1525 * If an lseek was done, then reset and start from beginning. 2092 * If an lseek was done, then reset and start from beginning.
1526 */ 2093 */
@@ -1532,7 +2099,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1532 * off, we can short cut and just print out that all 2099 * off, we can short cut and just print out that all
1533 * functions are enabled. 2100 * functions are enabled.
1534 */ 2101 */
1535 if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { 2102 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
1536 if (*pos > 0) 2103 if (*pos > 0)
1537 return t_hash_start(m, pos); 2104 return t_hash_start(m, pos);
1538 iter->flags |= FTRACE_ITER_PRINTALL; 2105 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -1590,7 +2157,11 @@ static int t_show(struct seq_file *m, void *v)
1590 if (!rec) 2157 if (!rec)
1591 return 0; 2158 return 0;
1592 2159
1593 seq_printf(m, "%ps\n", (void *)rec->ip); 2160 seq_printf(m, "%ps", (void *)rec->ip);
2161 if (iter->flags & FTRACE_ITER_ENABLED)
2162 seq_printf(m, " (%ld)",
2163 rec->flags & ~FTRACE_FL_MASK);
2164 seq_printf(m, "\n");
1594 2165
1595 return 0; 2166 return 0;
1596} 2167}
@@ -1630,44 +2201,46 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1630} 2201}
1631 2202
1632static int 2203static int
1633ftrace_failures_open(struct inode *inode, struct file *file) 2204ftrace_enabled_open(struct inode *inode, struct file *file)
1634{ 2205{
1635 int ret;
1636 struct seq_file *m;
1637 struct ftrace_iterator *iter; 2206 struct ftrace_iterator *iter;
2207 int ret;
1638 2208
1639 ret = ftrace_avail_open(inode, file); 2209 if (unlikely(ftrace_disabled))
2210 return -ENODEV;
2211
2212 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2213 if (!iter)
2214 return -ENOMEM;
2215
2216 iter->pg = ftrace_pages_start;
2217 iter->flags = FTRACE_ITER_ENABLED;
2218
2219 ret = seq_open(file, &show_ftrace_seq_ops);
1640 if (!ret) { 2220 if (!ret) {
1641 m = file->private_data; 2221 struct seq_file *m = file->private_data;
1642 iter = m->private; 2222
1643 iter->flags = FTRACE_ITER_FAILURES; 2223 m->private = iter;
2224 } else {
2225 kfree(iter);
1644 } 2226 }
1645 2227
1646 return ret; 2228 return ret;
1647} 2229}
1648 2230
1649 2231static void ftrace_filter_reset(struct ftrace_hash *hash)
1650static void ftrace_filter_reset(int enable)
1651{ 2232{
1652 struct ftrace_page *pg;
1653 struct dyn_ftrace *rec;
1654 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1655
1656 mutex_lock(&ftrace_lock); 2233 mutex_lock(&ftrace_lock);
1657 if (enable) 2234 ftrace_hash_clear(hash);
1658 ftrace_filtered = 0;
1659 do_for_each_ftrace_rec(pg, rec) {
1660 if (rec->flags & FTRACE_FL_FAILED)
1661 continue;
1662 rec->flags &= ~type;
1663 } while_for_each_ftrace_rec();
1664 mutex_unlock(&ftrace_lock); 2235 mutex_unlock(&ftrace_lock);
1665} 2236}
1666 2237
1667static int 2238static int
1668ftrace_regex_open(struct inode *inode, struct file *file, int enable) 2239ftrace_regex_open(struct ftrace_ops *ops, int flag,
2240 struct inode *inode, struct file *file)
1669{ 2241{
1670 struct ftrace_iterator *iter; 2242 struct ftrace_iterator *iter;
2243 struct ftrace_hash *hash;
1671 int ret = 0; 2244 int ret = 0;
1672 2245
1673 if (unlikely(ftrace_disabled)) 2246 if (unlikely(ftrace_disabled))
@@ -1682,21 +2255,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1682 return -ENOMEM; 2255 return -ENOMEM;
1683 } 2256 }
1684 2257
2258 if (flag & FTRACE_ITER_NOTRACE)
2259 hash = ops->notrace_hash;
2260 else
2261 hash = ops->filter_hash;
2262
2263 iter->ops = ops;
2264 iter->flags = flag;
2265
2266 if (file->f_mode & FMODE_WRITE) {
2267 mutex_lock(&ftrace_lock);
2268 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
2269 mutex_unlock(&ftrace_lock);
2270
2271 if (!iter->hash) {
2272 trace_parser_put(&iter->parser);
2273 kfree(iter);
2274 return -ENOMEM;
2275 }
2276 }
2277
1685 mutex_lock(&ftrace_regex_lock); 2278 mutex_lock(&ftrace_regex_lock);
2279
1686 if ((file->f_mode & FMODE_WRITE) && 2280 if ((file->f_mode & FMODE_WRITE) &&
1687 (file->f_flags & O_TRUNC)) 2281 (file->f_flags & O_TRUNC))
1688 ftrace_filter_reset(enable); 2282 ftrace_filter_reset(iter->hash);
1689 2283
1690 if (file->f_mode & FMODE_READ) { 2284 if (file->f_mode & FMODE_READ) {
1691 iter->pg = ftrace_pages_start; 2285 iter->pg = ftrace_pages_start;
1692 iter->flags = enable ? FTRACE_ITER_FILTER :
1693 FTRACE_ITER_NOTRACE;
1694 2286
1695 ret = seq_open(file, &show_ftrace_seq_ops); 2287 ret = seq_open(file, &show_ftrace_seq_ops);
1696 if (!ret) { 2288 if (!ret) {
1697 struct seq_file *m = file->private_data; 2289 struct seq_file *m = file->private_data;
1698 m->private = iter; 2290 m->private = iter;
1699 } else { 2291 } else {
2292 /* Failed */
2293 free_ftrace_hash(iter->hash);
1700 trace_parser_put(&iter->parser); 2294 trace_parser_put(&iter->parser);
1701 kfree(iter); 2295 kfree(iter);
1702 } 2296 }
@@ -1710,13 +2304,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1710static int 2304static int
1711ftrace_filter_open(struct inode *inode, struct file *file) 2305ftrace_filter_open(struct inode *inode, struct file *file)
1712{ 2306{
1713 return ftrace_regex_open(inode, file, 1); 2307 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
2308 inode, file);
1714} 2309}
1715 2310
1716static int 2311static int
1717ftrace_notrace_open(struct inode *inode, struct file *file) 2312ftrace_notrace_open(struct inode *inode, struct file *file)
1718{ 2313{
1719 return ftrace_regex_open(inode, file, 0); 2314 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
2315 inode, file);
1720} 2316}
1721 2317
1722static loff_t 2318static loff_t
@@ -1761,86 +2357,99 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1761} 2357}
1762 2358
1763static int 2359static int
1764ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) 2360enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
2361{
2362 struct ftrace_func_entry *entry;
2363 int ret = 0;
2364
2365 entry = ftrace_lookup_ip(hash, rec->ip);
2366 if (not) {
2367 /* Do nothing if it doesn't exist */
2368 if (!entry)
2369 return 0;
2370
2371 free_hash_entry(hash, entry);
2372 } else {
2373 /* Do nothing if it exists */
2374 if (entry)
2375 return 0;
2376
2377 ret = add_hash_entry(hash, rec->ip);
2378 }
2379 return ret;
2380}
2381
2382static int
2383ftrace_match_record(struct dyn_ftrace *rec, char *mod,
2384 char *regex, int len, int type)
1765{ 2385{
1766 char str[KSYM_SYMBOL_LEN]; 2386 char str[KSYM_SYMBOL_LEN];
2387 char *modname;
2388
2389 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
2390
2391 if (mod) {
2392 /* module lookup requires matching the module */
2393 if (!modname || strcmp(modname, mod))
2394 return 0;
2395
2396 /* blank search means to match all funcs in the mod */
2397 if (!len)
2398 return 1;
2399 }
1767 2400
1768 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1769 return ftrace_match(str, regex, len, type); 2401 return ftrace_match(str, regex, len, type);
1770} 2402}
1771 2403
1772static int ftrace_match_records(char *buff, int len, int enable) 2404static int
2405match_records(struct ftrace_hash *hash, char *buff,
2406 int len, char *mod, int not)
1773{ 2407{
1774 unsigned int search_len; 2408 unsigned search_len = 0;
1775 struct ftrace_page *pg; 2409 struct ftrace_page *pg;
1776 struct dyn_ftrace *rec; 2410 struct dyn_ftrace *rec;
1777 unsigned long flag; 2411 int type = MATCH_FULL;
1778 char *search; 2412 char *search = buff;
1779 int type;
1780 int not;
1781 int found = 0; 2413 int found = 0;
2414 int ret;
1782 2415
1783 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 2416 if (len) {
1784 type = filter_parse_regex(buff, len, &search, &not); 2417 type = filter_parse_regex(buff, len, &search, &not);
1785 2418 search_len = strlen(search);
1786 search_len = strlen(search); 2419 }
1787 2420
1788 mutex_lock(&ftrace_lock); 2421 mutex_lock(&ftrace_lock);
1789 do_for_each_ftrace_rec(pg, rec) {
1790 2422
1791 if (rec->flags & FTRACE_FL_FAILED) 2423 if (unlikely(ftrace_disabled))
1792 continue; 2424 goto out_unlock;
1793 2425
1794 if (ftrace_match_record(rec, search, search_len, type)) { 2426 do_for_each_ftrace_rec(pg, rec) {
1795 if (not) 2427
1796 rec->flags &= ~flag; 2428 if (ftrace_match_record(rec, mod, search, search_len, type)) {
1797 else 2429 ret = enter_record(hash, rec, not);
1798 rec->flags |= flag; 2430 if (ret < 0) {
2431 found = ret;
2432 goto out_unlock;
2433 }
1799 found = 1; 2434 found = 1;
1800 } 2435 }
1801 /*
1802 * Only enable filtering if we have a function that
1803 * is filtered on.
1804 */
1805 if (enable && (rec->flags & FTRACE_FL_FILTER))
1806 ftrace_filtered = 1;
1807 } while_for_each_ftrace_rec(); 2436 } while_for_each_ftrace_rec();
2437 out_unlock:
1808 mutex_unlock(&ftrace_lock); 2438 mutex_unlock(&ftrace_lock);
1809 2439
1810 return found; 2440 return found;
1811} 2441}
1812 2442
1813static int 2443static int
1814ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, 2444ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
1815 char *regex, int len, int type)
1816{ 2445{
1817 char str[KSYM_SYMBOL_LEN]; 2446 return match_records(hash, buff, len, NULL, 0);
1818 char *modname;
1819
1820 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
1821
1822 if (!modname || strcmp(modname, mod))
1823 return 0;
1824
1825 /* blank search means to match all funcs in the mod */
1826 if (len)
1827 return ftrace_match(str, regex, len, type);
1828 else
1829 return 1;
1830} 2447}
1831 2448
1832static int ftrace_match_module_records(char *buff, char *mod, int enable) 2449static int
2450ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
1833{ 2451{
1834 unsigned search_len = 0;
1835 struct ftrace_page *pg;
1836 struct dyn_ftrace *rec;
1837 int type = MATCH_FULL;
1838 char *search = buff;
1839 unsigned long flag;
1840 int not = 0; 2452 int not = 0;
1841 int found = 0;
1842
1843 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1844 2453
1845 /* blank or '*' mean the same */ 2454 /* blank or '*' mean the same */
1846 if (strcmp(buff, "*") == 0) 2455 if (strcmp(buff, "*") == 0)
@@ -1852,32 +2461,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1852 not = 1; 2461 not = 1;
1853 } 2462 }
1854 2463
1855 if (strlen(buff)) { 2464 return match_records(hash, buff, strlen(buff), mod, not);
1856 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1857 search_len = strlen(search);
1858 }
1859
1860 mutex_lock(&ftrace_lock);
1861 do_for_each_ftrace_rec(pg, rec) {
1862
1863 if (rec->flags & FTRACE_FL_FAILED)
1864 continue;
1865
1866 if (ftrace_match_module_record(rec, mod,
1867 search, search_len, type)) {
1868 if (not)
1869 rec->flags &= ~flag;
1870 else
1871 rec->flags |= flag;
1872 found = 1;
1873 }
1874 if (enable && (rec->flags & FTRACE_FL_FILTER))
1875 ftrace_filtered = 1;
1876
1877 } while_for_each_ftrace_rec();
1878 mutex_unlock(&ftrace_lock);
1879
1880 return found;
1881} 2465}
1882 2466
1883/* 2467/*
@@ -1886,9 +2470,11 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1886 */ 2470 */
1887 2471
1888static int 2472static int
1889ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2473ftrace_mod_callback(struct ftrace_hash *hash,
2474 char *func, char *cmd, char *param, int enable)
1890{ 2475{
1891 char *mod; 2476 char *mod;
2477 int ret = -EINVAL;
1892 2478
1893 /* 2479 /*
1894 * cmd == 'mod' because we only registered this func 2480 * cmd == 'mod' because we only registered this func
@@ -1900,15 +2486,19 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1900 2486
1901 /* we must have a module name */ 2487 /* we must have a module name */
1902 if (!param) 2488 if (!param)
1903 return -EINVAL; 2489 return ret;
1904 2490
1905 mod = strsep(&param, ":"); 2491 mod = strsep(&param, ":");
1906 if (!strlen(mod)) 2492 if (!strlen(mod))
1907 return -EINVAL; 2493 return ret;
1908 2494
1909 if (ftrace_match_module_records(func, mod, enable)) 2495 ret = ftrace_match_module_records(hash, func, mod);
1910 return 0; 2496 if (!ret)
1911 return -EINVAL; 2497 ret = -EINVAL;
2498 if (ret < 0)
2499 return ret;
2500
2501 return 0;
1912} 2502}
1913 2503
1914static struct ftrace_func_command ftrace_mod_cmd = { 2504static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1959,6 +2549,7 @@ static int ftrace_probe_registered;
1959 2549
1960static void __enable_ftrace_function_probe(void) 2550static void __enable_ftrace_function_probe(void)
1961{ 2551{
2552 int ret;
1962 int i; 2553 int i;
1963 2554
1964 if (ftrace_probe_registered) 2555 if (ftrace_probe_registered)
@@ -1973,13 +2564,16 @@ static void __enable_ftrace_function_probe(void)
1973 if (i == FTRACE_FUNC_HASHSIZE) 2564 if (i == FTRACE_FUNC_HASHSIZE)
1974 return; 2565 return;
1975 2566
1976 __register_ftrace_function(&trace_probe_ops); 2567 ret = __register_ftrace_function(&trace_probe_ops);
1977 ftrace_startup(0); 2568 if (!ret)
2569 ret = ftrace_startup(&trace_probe_ops, 0);
2570
1978 ftrace_probe_registered = 1; 2571 ftrace_probe_registered = 1;
1979} 2572}
1980 2573
1981static void __disable_ftrace_function_probe(void) 2574static void __disable_ftrace_function_probe(void)
1982{ 2575{
2576 int ret;
1983 int i; 2577 int i;
1984 2578
1985 if (!ftrace_probe_registered) 2579 if (!ftrace_probe_registered)
@@ -1992,8 +2586,10 @@ static void __disable_ftrace_function_probe(void)
1992 } 2586 }
1993 2587
1994 /* no more funcs left */ 2588 /* no more funcs left */
1995 __unregister_ftrace_function(&trace_probe_ops); 2589 ret = __unregister_ftrace_function(&trace_probe_ops);
1996 ftrace_shutdown(0); 2590 if (!ret)
2591 ftrace_shutdown(&trace_probe_ops, 0);
2592
1997 ftrace_probe_registered = 0; 2593 ftrace_probe_registered = 0;
1998} 2594}
1999 2595
@@ -2029,12 +2625,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2029 return -EINVAL; 2625 return -EINVAL;
2030 2626
2031 mutex_lock(&ftrace_lock); 2627 mutex_lock(&ftrace_lock);
2032 do_for_each_ftrace_rec(pg, rec) {
2033 2628
2034 if (rec->flags & FTRACE_FL_FAILED) 2629 if (unlikely(ftrace_disabled))
2035 continue; 2630 goto out_unlock;
2036 2631
2037 if (!ftrace_match_record(rec, search, len, type)) 2632 do_for_each_ftrace_rec(pg, rec) {
2633
2634 if (!ftrace_match_record(rec, NULL, search, len, type))
2038 continue; 2635 continue;
2039 2636
2040 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2637 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -2195,7 +2792,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)
2195 return ret; 2792 return ret;
2196} 2793}
2197 2794
2198static int ftrace_process_regex(char *buff, int len, int enable) 2795static int ftrace_process_regex(struct ftrace_hash *hash,
2796 char *buff, int len, int enable)
2199{ 2797{
2200 char *func, *command, *next = buff; 2798 char *func, *command, *next = buff;
2201 struct ftrace_func_command *p; 2799 struct ftrace_func_command *p;
@@ -2204,9 +2802,12 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2204 func = strsep(&next, ":"); 2802 func = strsep(&next, ":");
2205 2803
2206 if (!next) { 2804 if (!next) {
2207 if (ftrace_match_records(func, len, enable)) 2805 ret = ftrace_match_records(hash, func, len);
2208 return 0; 2806 if (!ret)
2209 return ret; 2807 ret = -EINVAL;
2808 if (ret < 0)
2809 return ret;
2810 return 0;
2210 } 2811 }
2211 2812
2212 /* command found */ 2813 /* command found */
@@ -2216,7 +2817,7 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2216 mutex_lock(&ftrace_cmd_mutex); 2817 mutex_lock(&ftrace_cmd_mutex);
2217 list_for_each_entry(p, &ftrace_commands, list) { 2818 list_for_each_entry(p, &ftrace_commands, list) {
2218 if (strcmp(p->name, command) == 0) { 2819 if (strcmp(p->name, command) == 0) {
2219 ret = p->func(func, command, next, enable); 2820 ret = p->func(hash, func, command, next, enable);
2220 goto out_unlock; 2821 goto out_unlock;
2221 } 2822 }
2222 } 2823 }
@@ -2239,6 +2840,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2239 2840
2240 mutex_lock(&ftrace_regex_lock); 2841 mutex_lock(&ftrace_regex_lock);
2241 2842
2843 ret = -ENODEV;
2844 if (unlikely(ftrace_disabled))
2845 goto out_unlock;
2846
2242 if (file->f_mode & FMODE_READ) { 2847 if (file->f_mode & FMODE_READ) {
2243 struct seq_file *m = file->private_data; 2848 struct seq_file *m = file->private_data;
2244 iter = m->private; 2849 iter = m->private;
@@ -2250,7 +2855,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2250 2855
2251 if (read >= 0 && trace_parser_loaded(parser) && 2856 if (read >= 0 && trace_parser_loaded(parser) &&
2252 !trace_parser_cont(parser)) { 2857 !trace_parser_cont(parser)) {
2253 ret = ftrace_process_regex(parser->buffer, 2858 ret = ftrace_process_regex(iter->hash, parser->buffer,
2254 parser->idx, enable); 2859 parser->idx, enable);
2255 trace_parser_clear(parser); 2860 trace_parser_clear(parser);
2256 if (ret) 2861 if (ret)
@@ -2278,22 +2883,53 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
2278 return ftrace_regex_write(file, ubuf, cnt, ppos, 0); 2883 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
2279} 2884}
2280 2885
2281static void 2886static int
2282ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) 2887ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2888 int reset, int enable)
2283{ 2889{
2890 struct ftrace_hash **orig_hash;
2891 struct ftrace_hash *hash;
2892 int ret;
2893
2894 /* All global ops use the global ops filters */
2895 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
2896 ops = &global_ops;
2897
2284 if (unlikely(ftrace_disabled)) 2898 if (unlikely(ftrace_disabled))
2285 return; 2899 return -ENODEV;
2900
2901 if (enable)
2902 orig_hash = &ops->filter_hash;
2903 else
2904 orig_hash = &ops->notrace_hash;
2905
2906 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2907 if (!hash)
2908 return -ENOMEM;
2286 2909
2287 mutex_lock(&ftrace_regex_lock); 2910 mutex_lock(&ftrace_regex_lock);
2288 if (reset) 2911 if (reset)
2289 ftrace_filter_reset(enable); 2912 ftrace_filter_reset(hash);
2290 if (buf) 2913 if (buf)
2291 ftrace_match_records(buf, len, enable); 2914 ftrace_match_records(hash, buf, len);
2915
2916 mutex_lock(&ftrace_lock);
2917 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
2918 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
2919 && ftrace_enabled)
2920 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2921
2922 mutex_unlock(&ftrace_lock);
2923
2292 mutex_unlock(&ftrace_regex_lock); 2924 mutex_unlock(&ftrace_regex_lock);
2925
2926 free_ftrace_hash(hash);
2927 return ret;
2293} 2928}
2294 2929
2295/** 2930/**
2296 * ftrace_set_filter - set a function to filter on in ftrace 2931 * ftrace_set_filter - set a function to filter on in ftrace
2932 * @ops - the ops to set the filter with
2297 * @buf - the string that holds the function filter text. 2933 * @buf - the string that holds the function filter text.
2298 * @len - the length of the string. 2934 * @len - the length of the string.
2299 * @reset - non zero to reset all filters before applying this filter. 2935 * @reset - non zero to reset all filters before applying this filter.
@@ -2301,13 +2937,16 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
2301 * Filters denote which functions should be enabled when tracing is enabled. 2937 * Filters denote which functions should be enabled when tracing is enabled.
2302 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 2938 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2303 */ 2939 */
2304void ftrace_set_filter(unsigned char *buf, int len, int reset) 2940void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
2941 int len, int reset)
2305{ 2942{
2306 ftrace_set_regex(buf, len, reset, 1); 2943 ftrace_set_regex(ops, buf, len, reset, 1);
2307} 2944}
2945EXPORT_SYMBOL_GPL(ftrace_set_filter);
2308 2946
2309/** 2947/**
2310 * ftrace_set_notrace - set a function to not trace in ftrace 2948 * ftrace_set_notrace - set a function to not trace in ftrace
2949 * @ops - the ops to set the notrace filter with
2311 * @buf - the string that holds the function notrace text. 2950 * @buf - the string that holds the function notrace text.
2312 * @len - the length of the string. 2951 * @len - the length of the string.
2313 * @reset - non zero to reset all filters before applying this filter. 2952 * @reset - non zero to reset all filters before applying this filter.
@@ -2316,10 +2955,44 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset)
2316 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 2955 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2317 * for tracing. 2956 * for tracing.
2318 */ 2957 */
2319void ftrace_set_notrace(unsigned char *buf, int len, int reset) 2958void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
2959 int len, int reset)
2320{ 2960{
2321 ftrace_set_regex(buf, len, reset, 0); 2961 ftrace_set_regex(ops, buf, len, reset, 0);
2322} 2962}
2963EXPORT_SYMBOL_GPL(ftrace_set_notrace);
2964/**
2965 * ftrace_set_filter - set a function to filter on in ftrace
2966 * @ops - the ops to set the filter with
2967 * @buf - the string that holds the function filter text.
2968 * @len - the length of the string.
2969 * @reset - non zero to reset all filters before applying this filter.
2970 *
2971 * Filters denote which functions should be enabled when tracing is enabled.
2972 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2973 */
2974void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
2975{
2976 ftrace_set_regex(&global_ops, buf, len, reset, 1);
2977}
2978EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
2979
2980/**
2981 * ftrace_set_notrace - set a function to not trace in ftrace
2982 * @ops - the ops to set the notrace filter with
2983 * @buf - the string that holds the function notrace text.
2984 * @len - the length of the string.
2985 * @reset - non zero to reset all filters before applying this filter.
2986 *
2987 * Notrace Filters denote which functions should not be enabled when tracing
2988 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2989 * for tracing.
2990 */
2991void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
2992{
2993 ftrace_set_regex(&global_ops, buf, len, reset, 0);
2994}
2995EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
2323 2996
2324/* 2997/*
2325 * command line interface to allow users to set filters on boot up. 2998 * command line interface to allow users to set filters on boot up.
@@ -2370,22 +3043,23 @@ static void __init set_ftrace_early_graph(char *buf)
2370} 3043}
2371#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3044#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2372 3045
2373static void __init set_ftrace_early_filter(char *buf, int enable) 3046static void __init
3047set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
2374{ 3048{
2375 char *func; 3049 char *func;
2376 3050
2377 while (buf) { 3051 while (buf) {
2378 func = strsep(&buf, ","); 3052 func = strsep(&buf, ",");
2379 ftrace_set_regex(func, strlen(func), 0, enable); 3053 ftrace_set_regex(ops, func, strlen(func), 0, enable);
2380 } 3054 }
2381} 3055}
2382 3056
2383static void __init set_ftrace_early_filters(void) 3057static void __init set_ftrace_early_filters(void)
2384{ 3058{
2385 if (ftrace_filter_buf[0]) 3059 if (ftrace_filter_buf[0])
2386 set_ftrace_early_filter(ftrace_filter_buf, 1); 3060 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
2387 if (ftrace_notrace_buf[0]) 3061 if (ftrace_notrace_buf[0])
2388 set_ftrace_early_filter(ftrace_notrace_buf, 0); 3062 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
2389#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3063#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2390 if (ftrace_graph_buf[0]) 3064 if (ftrace_graph_buf[0])
2391 set_ftrace_early_graph(ftrace_graph_buf); 3065 set_ftrace_early_graph(ftrace_graph_buf);
@@ -2393,11 +3067,14 @@ static void __init set_ftrace_early_filters(void)
2393} 3067}
2394 3068
2395static int 3069static int
2396ftrace_regex_release(struct inode *inode, struct file *file, int enable) 3070ftrace_regex_release(struct inode *inode, struct file *file)
2397{ 3071{
2398 struct seq_file *m = (struct seq_file *)file->private_data; 3072 struct seq_file *m = (struct seq_file *)file->private_data;
2399 struct ftrace_iterator *iter; 3073 struct ftrace_iterator *iter;
3074 struct ftrace_hash **orig_hash;
2400 struct trace_parser *parser; 3075 struct trace_parser *parser;
3076 int filter_hash;
3077 int ret;
2401 3078
2402 mutex_lock(&ftrace_regex_lock); 3079 mutex_lock(&ftrace_regex_lock);
2403 if (file->f_mode & FMODE_READ) { 3080 if (file->f_mode & FMODE_READ) {
@@ -2410,33 +3087,35 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2410 parser = &iter->parser; 3087 parser = &iter->parser;
2411 if (trace_parser_loaded(parser)) { 3088 if (trace_parser_loaded(parser)) {
2412 parser->buffer[parser->idx] = 0; 3089 parser->buffer[parser->idx] = 0;
2413 ftrace_match_records(parser->buffer, parser->idx, enable); 3090 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
2414 } 3091 }
2415 3092
2416 mutex_lock(&ftrace_lock);
2417 if (ftrace_start_up && ftrace_enabled)
2418 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2419 mutex_unlock(&ftrace_lock);
2420
2421 trace_parser_put(parser); 3093 trace_parser_put(parser);
3094
3095 if (file->f_mode & FMODE_WRITE) {
3096 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3097
3098 if (filter_hash)
3099 orig_hash = &iter->ops->filter_hash;
3100 else
3101 orig_hash = &iter->ops->notrace_hash;
3102
3103 mutex_lock(&ftrace_lock);
3104 ret = ftrace_hash_move(iter->ops, filter_hash,
3105 orig_hash, iter->hash);
3106 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3107 && ftrace_enabled)
3108 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3109
3110 mutex_unlock(&ftrace_lock);
3111 }
3112 free_ftrace_hash(iter->hash);
2422 kfree(iter); 3113 kfree(iter);
2423 3114
2424 mutex_unlock(&ftrace_regex_lock); 3115 mutex_unlock(&ftrace_regex_lock);
2425 return 0; 3116 return 0;
2426} 3117}
2427 3118
2428static int
2429ftrace_filter_release(struct inode *inode, struct file *file)
2430{
2431 return ftrace_regex_release(inode, file, 1);
2432}
2433
2434static int
2435ftrace_notrace_release(struct inode *inode, struct file *file)
2436{
2437 return ftrace_regex_release(inode, file, 0);
2438}
2439
2440static const struct file_operations ftrace_avail_fops = { 3119static const struct file_operations ftrace_avail_fops = {
2441 .open = ftrace_avail_open, 3120 .open = ftrace_avail_open,
2442 .read = seq_read, 3121 .read = seq_read,
@@ -2444,8 +3123,8 @@ static const struct file_operations ftrace_avail_fops = {
2444 .release = seq_release_private, 3123 .release = seq_release_private,
2445}; 3124};
2446 3125
2447static const struct file_operations ftrace_failures_fops = { 3126static const struct file_operations ftrace_enabled_fops = {
2448 .open = ftrace_failures_open, 3127 .open = ftrace_enabled_open,
2449 .read = seq_read, 3128 .read = seq_read,
2450 .llseek = seq_lseek, 3129 .llseek = seq_lseek,
2451 .release = seq_release_private, 3130 .release = seq_release_private,
@@ -2456,7 +3135,7 @@ static const struct file_operations ftrace_filter_fops = {
2456 .read = seq_read, 3135 .read = seq_read,
2457 .write = ftrace_filter_write, 3136 .write = ftrace_filter_write,
2458 .llseek = ftrace_regex_lseek, 3137 .llseek = ftrace_regex_lseek,
2459 .release = ftrace_filter_release, 3138 .release = ftrace_regex_release,
2460}; 3139};
2461 3140
2462static const struct file_operations ftrace_notrace_fops = { 3141static const struct file_operations ftrace_notrace_fops = {
@@ -2464,7 +3143,7 @@ static const struct file_operations ftrace_notrace_fops = {
2464 .read = seq_read, 3143 .read = seq_read,
2465 .write = ftrace_notrace_write, 3144 .write = ftrace_notrace_write,
2466 .llseek = ftrace_regex_lseek, 3145 .llseek = ftrace_regex_lseek,
2467 .release = ftrace_notrace_release, 3146 .release = ftrace_regex_release,
2468}; 3147};
2469 3148
2470#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3149#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -2573,9 +3252,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2573 bool exists; 3252 bool exists;
2574 int i; 3253 int i;
2575 3254
2576 if (ftrace_disabled)
2577 return -ENODEV;
2578
2579 /* decode regex */ 3255 /* decode regex */
2580 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3256 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2581 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3257 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
@@ -2584,12 +3260,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2584 search_len = strlen(search); 3260 search_len = strlen(search);
2585 3261
2586 mutex_lock(&ftrace_lock); 3262 mutex_lock(&ftrace_lock);
3263
3264 if (unlikely(ftrace_disabled)) {
3265 mutex_unlock(&ftrace_lock);
3266 return -ENODEV;
3267 }
3268
2587 do_for_each_ftrace_rec(pg, rec) { 3269 do_for_each_ftrace_rec(pg, rec) {
2588 3270
2589 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 3271 if (rec->flags & FTRACE_FL_FREE)
2590 continue; 3272 continue;
2591 3273
2592 if (ftrace_match_record(rec, search, search_len, type)) { 3274 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
2593 /* if it is in the array */ 3275 /* if it is in the array */
2594 exists = false; 3276 exists = false;
2595 for (i = 0; i < *idx; i++) { 3277 for (i = 0; i < *idx; i++) {
@@ -2679,8 +3361,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2679 trace_create_file("available_filter_functions", 0444, 3361 trace_create_file("available_filter_functions", 0444,
2680 d_tracer, NULL, &ftrace_avail_fops); 3362 d_tracer, NULL, &ftrace_avail_fops);
2681 3363
2682 trace_create_file("failures", 0444, 3364 trace_create_file("enabled_functions", 0444,
2683 d_tracer, NULL, &ftrace_failures_fops); 3365 d_tracer, NULL, &ftrace_enabled_fops);
2684 3366
2685 trace_create_file("set_ftrace_filter", 0644, d_tracer, 3367 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2686 NULL, &ftrace_filter_fops); 3368 NULL, &ftrace_filter_fops);
@@ -2703,7 +3385,7 @@ static int ftrace_process_locs(struct module *mod,
2703{ 3385{
2704 unsigned long *p; 3386 unsigned long *p;
2705 unsigned long addr; 3387 unsigned long addr;
2706 unsigned long flags; 3388 unsigned long flags = 0; /* Shut up gcc */
2707 3389
2708 mutex_lock(&ftrace_lock); 3390 mutex_lock(&ftrace_lock);
2709 p = start; 3391 p = start;
@@ -2720,10 +3402,19 @@ static int ftrace_process_locs(struct module *mod,
2720 ftrace_record_ip(addr); 3402 ftrace_record_ip(addr);
2721 } 3403 }
2722 3404
2723 /* disable interrupts to prevent kstop machine */ 3405 /*
2724 local_irq_save(flags); 3406 * We only need to disable interrupts on start up
3407 * because we are modifying code that an interrupt
3408 * may execute, and the modification is not atomic.
3409 * But for modules, nothing runs the code we modify
3410 * until we are finished with it, and there's no
3411 * reason to cause large interrupt latencies while we do it.
3412 */
3413 if (!mod)
3414 local_irq_save(flags);
2725 ftrace_update_code(mod); 3415 ftrace_update_code(mod);
2726 local_irq_restore(flags); 3416 if (!mod)
3417 local_irq_restore(flags);
2727 mutex_unlock(&ftrace_lock); 3418 mutex_unlock(&ftrace_lock);
2728 3419
2729 return 0; 3420 return 0;
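The hunk above takes the irq-save path only when the text being patched is already reachable; freshly loaded module code cannot run yet, so the interrupt-latency cost is skipped there. A minimal sketch of that conditional pattern, with made-up names (patch_text_example and text_is_live are illustrative, not kernel symbols):

#include <linux/irqflags.h>
#include <linux/types.h>

static void patch_text_example(bool text_is_live)
{
	unsigned long flags = 0;	/* initialized only to quiet gcc, as in the hunk */

	if (text_is_live)
		local_irq_save(flags);	/* the rewrite below is not atomic */

	/* ... non-atomic modification of kernel text goes here ... */

	if (text_is_live)
		local_irq_restore(flags);
}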
@@ -2735,10 +3426,11 @@ void ftrace_release_mod(struct module *mod)
2735 struct dyn_ftrace *rec; 3426 struct dyn_ftrace *rec;
2736 struct ftrace_page *pg; 3427 struct ftrace_page *pg;
2737 3428
3429 mutex_lock(&ftrace_lock);
3430
2738 if (ftrace_disabled) 3431 if (ftrace_disabled)
2739 return; 3432 goto out_unlock;
2740 3433
2741 mutex_lock(&ftrace_lock);
2742 do_for_each_ftrace_rec(pg, rec) { 3434 do_for_each_ftrace_rec(pg, rec) {
2743 if (within_module_core(rec->ip, mod)) { 3435 if (within_module_core(rec->ip, mod)) {
2744 /* 3436 /*
@@ -2749,6 +3441,7 @@ void ftrace_release_mod(struct module *mod)
2749 ftrace_free_rec(rec); 3441 ftrace_free_rec(rec);
2750 } 3442 }
2751 } while_for_each_ftrace_rec(); 3443 } while_for_each_ftrace_rec();
3444 out_unlock:
2752 mutex_unlock(&ftrace_lock); 3445 mutex_unlock(&ftrace_lock);
2753} 3446}
2754 3447
@@ -2835,6 +3528,10 @@ void __init ftrace_init(void)
2835 3528
2836#else 3529#else
2837 3530
3531static struct ftrace_ops global_ops = {
3532 .func = ftrace_stub,
3533};
3534
2838static int __init ftrace_nodyn_init(void) 3535static int __init ftrace_nodyn_init(void)
2839{ 3536{
2840 ftrace_enabled = 1; 3537 ftrace_enabled = 1;
@@ -2845,12 +3542,47 @@ device_initcall(ftrace_nodyn_init);
2845static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3542static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
2846static inline void ftrace_startup_enable(int command) { } 3543static inline void ftrace_startup_enable(int command) { }
2847/* Keep as macros so we do not need to define the commands */ 3544/* Keep as macros so we do not need to define the commands */
2848# define ftrace_startup(command) do { } while (0) 3545# define ftrace_startup(ops, command) \
2849# define ftrace_shutdown(command) do { } while (0) 3546 ({ \
3547 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
3548 0; \
3549 })
3550# define ftrace_shutdown(ops, command) do { } while (0)
2850# define ftrace_startup_sysctl() do { } while (0) 3551# define ftrace_startup_sysctl() do { } while (0)
2851# define ftrace_shutdown_sysctl() do { } while (0) 3552# define ftrace_shutdown_sysctl() do { } while (0)
3553
3554static inline int
3555ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3556{
3557 return 1;
3558}
3559
2852#endif /* CONFIG_DYNAMIC_FTRACE */ 3560#endif /* CONFIG_DYNAMIC_FTRACE */
2853 3561
3562static void
3563ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3564{
3565 struct ftrace_ops *op;
3566
3567 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
3568 return;
3569
3570 trace_recursion_set(TRACE_INTERNAL_BIT);
3571 /*
3572 * Some of the ops may be dynamically allocated,
3573 * they must be freed after a synchronize_sched().
3574 */
3575 preempt_disable_notrace();
3576 op = rcu_dereference_raw(ftrace_ops_list);
3577 while (op != &ftrace_list_end) {
3578 if (ftrace_ops_test(op, ip))
3579 op->func(ip, parent_ip);
3580 op = rcu_dereference_raw(op->next);
3581 };
3582 preempt_enable_notrace();
3583 trace_recursion_clear(TRACE_INTERNAL_BIT);
3584}
3585
2854static void clear_ftrace_swapper(void) 3586static void clear_ftrace_swapper(void)
2855{ 3587{
2856 struct task_struct *p; 3588 struct task_struct *p;
@@ -3143,19 +3875,23 @@ void ftrace_kill(void)
3143 */ 3875 */
3144int register_ftrace_function(struct ftrace_ops *ops) 3876int register_ftrace_function(struct ftrace_ops *ops)
3145{ 3877{
3146 int ret; 3878 int ret = -1;
3147
3148 if (unlikely(ftrace_disabled))
3149 return -1;
3150 3879
3151 mutex_lock(&ftrace_lock); 3880 mutex_lock(&ftrace_lock);
3152 3881
3882 if (unlikely(ftrace_disabled))
3883 goto out_unlock;
3884
3153 ret = __register_ftrace_function(ops); 3885 ret = __register_ftrace_function(ops);
3154 ftrace_startup(0); 3886 if (!ret)
3887 ret = ftrace_startup(ops, 0);
3888
3155 3889
3890 out_unlock:
3156 mutex_unlock(&ftrace_lock); 3891 mutex_unlock(&ftrace_lock);
3157 return ret; 3892 return ret;
3158} 3893}
3894EXPORT_SYMBOL_GPL(register_ftrace_function);
3159 3895
3160/** 3896/**
3161 * unregister_ftrace_function - unregister a function for profiling. 3897 * unregister_ftrace_function - unregister a function for profiling.
@@ -3169,25 +3905,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3169 3905
3170 mutex_lock(&ftrace_lock); 3906 mutex_lock(&ftrace_lock);
3171 ret = __unregister_ftrace_function(ops); 3907 ret = __unregister_ftrace_function(ops);
3172 ftrace_shutdown(0); 3908 if (!ret)
3909 ftrace_shutdown(ops, 0);
3173 mutex_unlock(&ftrace_lock); 3910 mutex_unlock(&ftrace_lock);
3174 3911
3175 return ret; 3912 return ret;
3176} 3913}
3914EXPORT_SYMBOL_GPL(unregister_ftrace_function);
3177 3915
3178int 3916int
3179ftrace_enable_sysctl(struct ctl_table *table, int write, 3917ftrace_enable_sysctl(struct ctl_table *table, int write,
3180 void __user *buffer, size_t *lenp, 3918 void __user *buffer, size_t *lenp,
3181 loff_t *ppos) 3919 loff_t *ppos)
3182{ 3920{
3183 int ret; 3921 int ret = -ENODEV;
3184
3185 if (unlikely(ftrace_disabled))
3186 return -ENODEV;
3187 3922
3188 mutex_lock(&ftrace_lock); 3923 mutex_lock(&ftrace_lock);
3189 3924
3190 ret = proc_dointvec(table, write, buffer, lenp, ppos); 3925 if (unlikely(ftrace_disabled))
3926 goto out;
3927
3928 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3191 3929
3192 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3930 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3193 goto out; 3931 goto out;
@@ -3199,11 +3937,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3199 ftrace_startup_sysctl(); 3937 ftrace_startup_sysctl();
3200 3938
3201 /* we are starting ftrace again */ 3939 /* we are starting ftrace again */
3202 if (ftrace_list != &ftrace_list_end) { 3940 if (ftrace_ops_list != &ftrace_list_end) {
3203 if (ftrace_list->next == &ftrace_list_end) 3941 if (ftrace_ops_list->next == &ftrace_list_end)
3204 ftrace_trace_function = ftrace_list->func; 3942 ftrace_trace_function = ftrace_ops_list->func;
3205 else 3943 else
3206 ftrace_trace_function = ftrace_list_func; 3944 ftrace_trace_function = ftrace_ops_list_func;
3207 } 3945 }
3208 3946
3209 } else { 3947 } else {
@@ -3392,7 +4130,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
3392 ftrace_graph_return = retfunc; 4130 ftrace_graph_return = retfunc;
3393 ftrace_graph_entry = entryfunc; 4131 ftrace_graph_entry = entryfunc;
3394 4132
3395 ftrace_startup(FTRACE_START_FUNC_RET); 4133 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
3396 4134
3397out: 4135out:
3398 mutex_unlock(&ftrace_lock); 4136 mutex_unlock(&ftrace_lock);
@@ -3409,7 +4147,7 @@ void unregister_ftrace_graph(void)
3409 ftrace_graph_active--; 4147 ftrace_graph_active--;
3410 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 4148 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3411 ftrace_graph_entry = ftrace_graph_entry_stub; 4149 ftrace_graph_entry = ftrace_graph_entry_stub;
3412 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 4150 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
3413 unregister_pm_notifier(&ftrace_suspend_notifier); 4151 unregister_pm_notifier(&ftrace_suspend_notifier);
3414 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4152 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3415 4153
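Taken together, the ftrace.c changes above hang a filter_hash and notrace_hash off each struct ftrace_ops and export register_ftrace_function()/ftrace_set_filter(), so every ops carries its own filter instead of sharing one global set. A hedged sketch of how a client might use the per-ops interface (my_ops, my_callback and the module boilerplate are hypothetical, not part of this patch):

#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/string.h>

/* called for every function that matches this ops' filter hash */
static void my_callback(unsigned long ip, unsigned long parent_ip)
{
	/* must be notrace-safe: no sleeping, no recursing into the tracer */
}

static struct ftrace_ops my_ops = {
	.func = my_callback,
};

static int __init my_init(void)
{
	static char func[] = "schedule";

	/* this ops traces only "schedule"; other ops keep their own filters */
	ftrace_set_filter(&my_ops, (unsigned char *)func, strlen(func), 1);
	return register_ftrace_function(&my_ops);
}

static void __exit my_exit(void)
{
	unregister_ftrace_function(&my_ops);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");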
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0ef7b4b2a1f7..731201bf4acc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -997,15 +997,21 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 unsigned nr_pages) 997 unsigned nr_pages)
998{ 998{
999 struct buffer_page *bpage, *tmp; 999 struct buffer_page *bpage, *tmp;
1000 unsigned long addr;
1001 LIST_HEAD(pages); 1000 LIST_HEAD(pages);
1002 unsigned i; 1001 unsigned i;
1003 1002
1004 WARN_ON(!nr_pages); 1003 WARN_ON(!nr_pages);
1005 1004
1006 for (i = 0; i < nr_pages; i++) { 1005 for (i = 0; i < nr_pages; i++) {
1006 struct page *page;
1007 /*
1008 * The __GFP_NORETRY flag makes sure that the allocation fails
1009 * gracefully without invoking the OOM killer, so the system is
1010 * not destabilized.
1011 */
1007 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1012 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1008 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 1013 GFP_KERNEL | __GFP_NORETRY,
1014 cpu_to_node(cpu_buffer->cpu));
1009 if (!bpage) 1015 if (!bpage)
1010 goto free_pages; 1016 goto free_pages;
1011 1017
@@ -1013,10 +1019,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1013 1019
1014 list_add(&bpage->list, &pages); 1020 list_add(&bpage->list, &pages);
1015 1021
1016 addr = __get_free_page(GFP_KERNEL); 1022 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
1017 if (!addr) 1023 GFP_KERNEL | __GFP_NORETRY, 0);
1024 if (!page)
1018 goto free_pages; 1025 goto free_pages;
1019 bpage->page = (void *)addr; 1026 bpage->page = page_address(page);
1020 rb_init_page(bpage->page); 1027 rb_init_page(bpage->page);
1021 } 1028 }
1022 1029
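The allocation above switches from __get_free_page() to alloc_pages_node() so each CPU's buffer pages come from that CPU's NUMA node, and adds __GFP_NORETRY so an oversized request fails instead of pushing the box into the OOM killer. The same pattern in isolation, as a hedged sketch (alloc_cpu_local_page and free_cpu_local_page are hypothetical helper names):

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/topology.h>

/* allocate one page on the NUMA node backing @cpu, failing gracefully */
static void *alloc_cpu_local_page(int cpu)
{
	struct page *page;

	page = alloc_pages_node(cpu_to_node(cpu),
				GFP_KERNEL | __GFP_NORETRY, 0);
	if (!page)
		return NULL;		/* caller must cope with the failure */

	return page_address(page);	/* kernel virtual address of the page */
}

static void free_cpu_local_page(void *addr)
{
	free_page((unsigned long)addr);
}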
@@ -1045,7 +1052,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1045{ 1052{
1046 struct ring_buffer_per_cpu *cpu_buffer; 1053 struct ring_buffer_per_cpu *cpu_buffer;
1047 struct buffer_page *bpage; 1054 struct buffer_page *bpage;
1048 unsigned long addr; 1055 struct page *page;
1049 int ret; 1056 int ret;
1050 1057
1051 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), 1058 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
@@ -1067,10 +1074,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1067 rb_check_bpage(cpu_buffer, bpage); 1074 rb_check_bpage(cpu_buffer, bpage);
1068 1075
1069 cpu_buffer->reader_page = bpage; 1076 cpu_buffer->reader_page = bpage;
1070 addr = __get_free_page(GFP_KERNEL); 1077 page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
1071 if (!addr) 1078 if (!page)
1072 goto fail_free_reader; 1079 goto fail_free_reader;
1073 bpage->page = (void *)addr; 1080 bpage->page = page_address(page);
1074 rb_init_page(bpage->page); 1081 rb_init_page(bpage->page);
1075 1082
1076 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1083 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -1314,7 +1321,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1314 unsigned nr_pages, rm_pages, new_pages; 1321 unsigned nr_pages, rm_pages, new_pages;
1315 struct buffer_page *bpage, *tmp; 1322 struct buffer_page *bpage, *tmp;
1316 unsigned long buffer_size; 1323 unsigned long buffer_size;
1317 unsigned long addr;
1318 LIST_HEAD(pages); 1324 LIST_HEAD(pages);
1319 int i, cpu; 1325 int i, cpu;
1320 1326
@@ -1375,16 +1381,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1375 1381
1376 for_each_buffer_cpu(buffer, cpu) { 1382 for_each_buffer_cpu(buffer, cpu) {
1377 for (i = 0; i < new_pages; i++) { 1383 for (i = 0; i < new_pages; i++) {
1384 struct page *page;
1385 /*
1386 * The __GFP_NORETRY flag makes sure that the allocation
1387 * fails gracefully without invoking the OOM killer, so
1388 * the system is not destabilized.
1389 */
1378 bpage = kzalloc_node(ALIGN(sizeof(*bpage), 1390 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
1379 cache_line_size()), 1391 cache_line_size()),
1380 GFP_KERNEL, cpu_to_node(cpu)); 1392 GFP_KERNEL | __GFP_NORETRY,
1393 cpu_to_node(cpu));
1381 if (!bpage) 1394 if (!bpage)
1382 goto free_pages; 1395 goto free_pages;
1383 list_add(&bpage->list, &pages); 1396 list_add(&bpage->list, &pages);
1384 addr = __get_free_page(GFP_KERNEL); 1397 page = alloc_pages_node(cpu_to_node(cpu),
1385 if (!addr) 1398 GFP_KERNEL | __GFP_NORETRY, 0);
1399 if (!page)
1386 goto free_pages; 1400 goto free_pages;
1387 bpage->page = (void *)addr; 1401 bpage->page = page_address(page);
1388 rb_init_page(bpage->page); 1402 rb_init_page(bpage->page);
1389 } 1403 }
1390 } 1404 }
@@ -2216,7 +2230,7 @@ static noinline void trace_recursive_fail(void)
2216 2230
2217 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" 2231 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2218 "HC[%lu]:SC[%lu]:NMI[%lu]\n", 2232 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2219 current->trace_recursion, 2233 trace_recursion_buffer(),
2220 hardirq_count() >> HARDIRQ_SHIFT, 2234 hardirq_count() >> HARDIRQ_SHIFT,
2221 softirq_count() >> SOFTIRQ_SHIFT, 2235 softirq_count() >> SOFTIRQ_SHIFT,
2222 in_nmi()); 2236 in_nmi());
@@ -2226,9 +2240,9 @@ static noinline void trace_recursive_fail(void)
2226 2240
2227static inline int trace_recursive_lock(void) 2241static inline int trace_recursive_lock(void)
2228{ 2242{
2229 current->trace_recursion++; 2243 trace_recursion_inc();
2230 2244
2231 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) 2245 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
2232 return 0; 2246 return 0;
2233 2247
2234 trace_recursive_fail(); 2248 trace_recursive_fail();
@@ -2238,9 +2252,9 @@ static inline int trace_recursive_lock(void)
2238 2252
2239static inline void trace_recursive_unlock(void) 2253static inline void trace_recursive_unlock(void)
2240{ 2254{
2241 WARN_ON_ONCE(!current->trace_recursion); 2255 WARN_ON_ONCE(!trace_recursion_buffer());
2242 2256
2243 current->trace_recursion--; 2257 trace_recursion_dec();
2244} 2258}
2245 2259
2246#else 2260#else
@@ -3730,16 +3744,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3730 * Returns: 3744 * Returns:
3731 * The page allocated, or NULL on error. 3745 * The page allocated, or NULL on error.
3732 */ 3746 */
3733void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) 3747void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
3734{ 3748{
3735 struct buffer_data_page *bpage; 3749 struct buffer_data_page *bpage;
3736 unsigned long addr; 3750 struct page *page;
3737 3751
3738 addr = __get_free_page(GFP_KERNEL); 3752 page = alloc_pages_node(cpu_to_node(cpu),
3739 if (!addr) 3753 GFP_KERNEL | __GFP_NORETRY, 0);
3754 if (!page)
3740 return NULL; 3755 return NULL;
3741 3756
3742 bpage = (void *)addr; 3757 bpage = page_address(page);
3743 3758
3744 rb_init_page(bpage); 3759 rb_init_page(bpage);
3745 3760
@@ -3978,20 +3993,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
3978 size_t cnt, loff_t *ppos) 3993 size_t cnt, loff_t *ppos)
3979{ 3994{
3980 unsigned long *p = filp->private_data; 3995 unsigned long *p = filp->private_data;
3981 char buf[64];
3982 unsigned long val; 3996 unsigned long val;
3983 int ret; 3997 int ret;
3984 3998
3985 if (cnt >= sizeof(buf)) 3999 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3986 return -EINVAL; 4000 if (ret)
3987
3988 if (copy_from_user(&buf, ubuf, cnt))
3989 return -EFAULT;
3990
3991 buf[cnt] = 0;
3992
3993 ret = strict_strtoul(buf, 10, &val);
3994 if (ret < 0)
3995 return ret; 4001 return ret;
3996 4002
3997 if (val) 4003 if (val)
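rb_simple_write() above is one of several write handlers in this series converted from the open-coded copy_from_user()/strict_strtoul() sequence to kstrtoul_from_user(), which copies, terminates and parses the user buffer in one call. A sketch of the resulting handler shape (example_write and example_value are illustrative names only):

#include <linux/fs.h>
#include <linux/kernel.h>

static unsigned long example_value;

static ssize_t example_write(struct file *filp, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret;

	/* no local buffer, length check or NUL termination needed here */
	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
	if (ret)
		return ret;

	example_value = val;
	*ppos += cnt;
	return cnt;
}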
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 302f8a614635..a5457d577b98 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -106,7 +106,7 @@ static enum event_status read_page(int cpu)
106 int inc; 106 int inc;
107 int i; 107 int i;
108 108
109 bpage = ring_buffer_alloc_read_page(buffer); 109 bpage = ring_buffer_alloc_read_page(buffer, cpu);
110 if (!bpage) 110 if (!bpage)
111 return EVENT_DROPPED; 111 return EVENT_DROPPED;
112 112
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1cb49be7c7fb..e5df02c69b1d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
343static int trace_stop_count; 343static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
345 345
346static void wakeup_work_handler(struct work_struct *work)
347{
348 wake_up(&trace_wait);
349}
350
351static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
352
346/** 353/**
347 * trace_wake_up - wake up tasks waiting for trace input 354 * trace_wake_up - wake up tasks waiting for trace input
348 * 355 *
349 * Simply wakes up any task that is blocked on the trace_wait 356 * Schedules a delayed work to wake up any task that is blocked on the
350 * queue. This is used with trace_poll for tasks polling the trace. 357 * trace_wait queue. This is used with trace_poll for tasks polling the
358 * trace.
351 */ 359 */
352void trace_wake_up(void) 360void trace_wake_up(void)
353{ 361{
354 int cpu; 362 const unsigned long delay = msecs_to_jiffies(2);
355 363
356 if (trace_flags & TRACE_ITER_BLOCK) 364 if (trace_flags & TRACE_ITER_BLOCK)
357 return; 365 return;
358 /* 366 schedule_delayed_work(&wakeup_work, delay);
359 * The runqueue_is_locked() can fail, but this is the best we
360 * have for now:
361 */
362 cpu = get_cpu();
363 if (!runqueue_is_locked(cpu))
364 wake_up(&trace_wait);
365 put_cpu();
366} 367}
367 368
368static int __init set_buf_size(char *str) 369static int __init set_buf_size(char *str)
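The trace_wake_up() rewrite above drops the runqueue_is_locked() heuristic entirely: the hot path merely queues a delayed work item, and the real wake_up() runs a couple of milliseconds later in process context, where taking the waitqueue lock is always safe. A minimal sketch of that deferred-wakeup pattern with made-up names:

#include <linux/jiffies.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wait);

static void example_wakeup_fn(struct work_struct *work)
{
	wake_up(&example_wait);		/* safe: runs in process context */
}
static DECLARE_DELAYED_WORK(example_wakeup_work, example_wakeup_fn);

/* cheap to call from most contexts; repeated calls coalesce into one wakeup */
static void example_poke_waiters(void)
{
	schedule_delayed_work(&example_wakeup_work, msecs_to_jiffies(2));
}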
@@ -424,6 +425,7 @@ static const char *trace_options[] = {
424 "graph-time", 425 "graph-time",
425 "record-cmd", 426 "record-cmd",
426 "overwrite", 427 "overwrite",
428 "disable_on_free",
427 NULL 429 NULL
428}; 430};
429 431
@@ -1191,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
1191} 1193}
1192EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 1194EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1193 1195
1196void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event,
1198 unsigned long flags, int pc,
1199 struct pt_regs *regs)
1200{
1201 ring_buffer_unlock_commit(buffer, event);
1202
1203 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1204 ftrace_trace_userstack(buffer, flags, pc);
1205}
1206EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
1207
1194void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1208void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1195 struct ring_buffer_event *event) 1209 struct ring_buffer_event *event)
1196{ 1210{
@@ -1234,30 +1248,103 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1234} 1248}
1235 1249
1236#ifdef CONFIG_STACKTRACE 1250#ifdef CONFIG_STACKTRACE
1251
1252#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
1253struct ftrace_stack {
1254 unsigned long calls[FTRACE_STACK_MAX_ENTRIES];
1255};
1256
1257static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
1258static DEFINE_PER_CPU(int, ftrace_stack_reserve);
1259
1237static void __ftrace_trace_stack(struct ring_buffer *buffer, 1260static void __ftrace_trace_stack(struct ring_buffer *buffer,
1238 unsigned long flags, 1261 unsigned long flags,
1239 int skip, int pc) 1262 int skip, int pc, struct pt_regs *regs)
1240{ 1263{
1241 struct ftrace_event_call *call = &event_kernel_stack; 1264 struct ftrace_event_call *call = &event_kernel_stack;
1242 struct ring_buffer_event *event; 1265 struct ring_buffer_event *event;
1243 struct stack_entry *entry; 1266 struct stack_entry *entry;
1244 struct stack_trace trace; 1267 struct stack_trace trace;
1268 int use_stack;
1269 int size = FTRACE_STACK_ENTRIES;
1270
1271 trace.nr_entries = 0;
1272 trace.skip = skip;
1273
1274 /*
1275 * Since events can happen in NMIs there's no safe way to
1276 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
1277 * or NMI comes in, it will just have to use the default
1278 * FTRACE_STACK_SIZE.
1279 */
1280 preempt_disable_notrace();
1281
1282 use_stack = ++__get_cpu_var(ftrace_stack_reserve);
1283 /*
1284 * We don't need any atomic variables, just a barrier.
1285 * If an interrupt comes in, we don't care, because it would
1286 * have exited and put the counter back to what we want.
1287 * We just need a barrier to keep gcc from moving things
1288 * around.
1289 */
1290 barrier();
1291 if (use_stack == 1) {
1292 trace.entries = &__get_cpu_var(ftrace_stack).calls[0];
1293 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
1294
1295 if (regs)
1296 save_stack_trace_regs(regs, &trace);
1297 else
1298 save_stack_trace(&trace);
1299
1300 if (trace.nr_entries > size)
1301 size = trace.nr_entries;
1302 } else
1303 /* From now on, use_stack is a boolean */
1304 use_stack = 0;
1305
1306 size *= sizeof(unsigned long);
1245 1307
1246 event = trace_buffer_lock_reserve(buffer, TRACE_STACK, 1308 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1247 sizeof(*entry), flags, pc); 1309 sizeof(*entry) + size, flags, pc);
1248 if (!event) 1310 if (!event)
1249 return; 1311 goto out;
1250 entry = ring_buffer_event_data(event); 1312 entry = ring_buffer_event_data(event);
1251 memset(&entry->caller, 0, sizeof(entry->caller));
1252 1313
1253 trace.nr_entries = 0; 1314 memset(&entry->caller, 0, size);
1254 trace.max_entries = FTRACE_STACK_ENTRIES; 1315
1255 trace.skip = skip; 1316 if (use_stack)
1256 trace.entries = entry->caller; 1317 memcpy(&entry->caller, trace.entries,
1318 trace.nr_entries * sizeof(unsigned long));
1319 else {
1320 trace.max_entries = FTRACE_STACK_ENTRIES;
1321 trace.entries = entry->caller;
1322 if (regs)
1323 save_stack_trace_regs(regs, &trace);
1324 else
1325 save_stack_trace(&trace);
1326 }
1327
1328 entry->size = trace.nr_entries;
1257 1329
1258 save_stack_trace(&trace);
1259 if (!filter_check_discard(call, entry, buffer, event)) 1330 if (!filter_check_discard(call, entry, buffer, event))
1260 ring_buffer_unlock_commit(buffer, event); 1331 ring_buffer_unlock_commit(buffer, event);
1332
1333 out:
1334 /* Again, don't let gcc optimize things here */
1335 barrier();
1336 __get_cpu_var(ftrace_stack_reserve)--;
1337 preempt_enable_notrace();
1338
1339}
1340
1341void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
1342 int skip, int pc, struct pt_regs *regs)
1343{
1344 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1345 return;
1346
1347 __ftrace_trace_stack(buffer, flags, skip, pc, regs);
1261} 1348}
1262 1349
1263void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 1350void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
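The new __ftrace_trace_stack() above reserves its per-cpu scratch area with a plain counter and a compiler barrier instead of atomics: only code on the local CPU touches the counter, so a nested irq or NMI user simply observes a depth greater than one and takes the small fallback path. A hedged sketch of that reservation pattern (struct example_scratch and the function name are illustrative):

#include <linux/compiler.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

struct example_scratch {
	unsigned long slot[64];
};

static DEFINE_PER_CPU(struct example_scratch, example_scratch);
static DEFINE_PER_CPU(int, example_scratch_depth);

static void example_use_scratch(void)
{
	int depth;

	preempt_disable_notrace();
	depth = ++__get_cpu_var(example_scratch_depth);
	barrier();		/* keep gcc from reordering the use below */

	if (depth == 1) {
		/* sole user on this CPU: the large per-cpu area is ours */
		struct example_scratch *s = &__get_cpu_var(example_scratch);

		s->slot[0] = 0;
		/* ... fill the scratch area ... */
	} else {
		/* nested irq/NMI user: fall back to a small local buffer */
	}

	barrier();
	__get_cpu_var(example_scratch_depth)--;
	preempt_enable_notrace();
}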
@@ -1266,13 +1353,13 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1266 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1353 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1267 return; 1354 return;
1268 1355
1269 __ftrace_trace_stack(buffer, flags, skip, pc); 1356 __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
1270} 1357}
1271 1358
1272void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1359void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1273 int pc) 1360 int pc)
1274{ 1361{
1275 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1362 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
1276} 1363}
1277 1364
1278/** 1365/**
@@ -1288,7 +1375,7 @@ void trace_dump_stack(void)
1288 local_save_flags(flags); 1375 local_save_flags(flags);
1289 1376
1290 /* skipping 3 traces, seems to get us at the caller of this function */ 1377 /* skipping 3 traces, seems to get us at the caller of this function */
1291 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1378 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
1292} 1379}
1293 1380
1294static DEFINE_PER_CPU(int, user_stack_count); 1381static DEFINE_PER_CPU(int, user_stack_count);
@@ -1536,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1536 1623
1537 ftrace_enable_cpu(); 1624 ftrace_enable_cpu();
1538 1625
1539 return event ? ring_buffer_event_data(event) : NULL; 1626 if (event) {
1627 iter->ent_size = ring_buffer_event_length(event);
1628 return ring_buffer_event_data(event);
1629 }
1630 iter->ent_size = 0;
1631 return NULL;
1540} 1632}
1541 1633
1542static struct trace_entry * 1634static struct trace_entry *
@@ -2014,9 +2106,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2014{ 2106{
2015 enum print_line_t ret; 2107 enum print_line_t ret;
2016 2108
2017 if (iter->lost_events) 2109 if (iter->lost_events &&
2018 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2110 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2019 iter->cpu, iter->lost_events); 2111 iter->cpu, iter->lost_events))
2112 return TRACE_TYPE_PARTIAL_LINE;
2020 2113
2021 if (iter->trace && iter->trace->print_line) { 2114 if (iter->trace && iter->trace->print_line) {
2022 ret = iter->trace->print_line(iter); 2115 ret = iter->trace->print_line(iter);
@@ -2050,6 +2143,9 @@ void trace_default_header(struct seq_file *m)
2050{ 2143{
2051 struct trace_iterator *iter = m->private; 2144 struct trace_iterator *iter = m->private;
2052 2145
2146 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
2147 return;
2148
2053 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2149 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2054 /* print nothing if the buffers are empty */ 2150 /* print nothing if the buffers are empty */
2055 if (trace_empty(iter)) 2151 if (trace_empty(iter))
@@ -2700,20 +2796,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2700 size_t cnt, loff_t *ppos) 2796 size_t cnt, loff_t *ppos)
2701{ 2797{
2702 struct trace_array *tr = filp->private_data; 2798 struct trace_array *tr = filp->private_data;
2703 char buf[64];
2704 unsigned long val; 2799 unsigned long val;
2705 int ret; 2800 int ret;
2706 2801
2707 if (cnt >= sizeof(buf)) 2802 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2708 return -EINVAL; 2803 if (ret)
2709
2710 if (copy_from_user(&buf, ubuf, cnt))
2711 return -EFAULT;
2712
2713 buf[cnt] = 0;
2714
2715 ret = strict_strtoul(buf, 10, &val);
2716 if (ret < 0)
2717 return ret; 2804 return ret;
2718 2805
2719 val = !!val; 2806 val = !!val;
@@ -2766,7 +2853,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
2766 return t->init(tr); 2853 return t->init(tr);
2767} 2854}
2768 2855
2769static int tracing_resize_ring_buffer(unsigned long size) 2856static int __tracing_resize_ring_buffer(unsigned long size)
2770{ 2857{
2771 int ret; 2858 int ret;
2772 2859
@@ -2818,6 +2905,41 @@ static int tracing_resize_ring_buffer(unsigned long size)
2818 return ret; 2905 return ret;
2819} 2906}
2820 2907
2908static ssize_t tracing_resize_ring_buffer(unsigned long size)
2909{
2910 int cpu, ret = size;
2911
2912 mutex_lock(&trace_types_lock);
2913
2914 tracing_stop();
2915
2916 /* disable all cpu buffers */
2917 for_each_tracing_cpu(cpu) {
2918 if (global_trace.data[cpu])
2919 atomic_inc(&global_trace.data[cpu]->disabled);
2920 if (max_tr.data[cpu])
2921 atomic_inc(&max_tr.data[cpu]->disabled);
2922 }
2923
2924 if (size != global_trace.entries)
2925 ret = __tracing_resize_ring_buffer(size);
2926
2927 if (ret < 0)
2928 ret = -ENOMEM;
2929
2930 for_each_tracing_cpu(cpu) {
2931 if (global_trace.data[cpu])
2932 atomic_dec(&global_trace.data[cpu]->disabled);
2933 if (max_tr.data[cpu])
2934 atomic_dec(&max_tr.data[cpu]->disabled);
2935 }
2936
2937 tracing_start();
2938 mutex_unlock(&trace_types_lock);
2939
2940 return ret;
2941}
2942
2821 2943
2822/** 2944/**
2823 * tracing_update_buffers - used by tracing facility to expand ring buffers 2945 * tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -2835,7 +2957,7 @@ int tracing_update_buffers(void)
2835 2957
2836 mutex_lock(&trace_types_lock); 2958 mutex_lock(&trace_types_lock);
2837 if (!ring_buffer_expanded) 2959 if (!ring_buffer_expanded)
2838 ret = tracing_resize_ring_buffer(trace_buf_size); 2960 ret = __tracing_resize_ring_buffer(trace_buf_size);
2839 mutex_unlock(&trace_types_lock); 2961 mutex_unlock(&trace_types_lock);
2840 2962
2841 return ret; 2963 return ret;
@@ -2859,7 +2981,7 @@ static int tracing_set_tracer(const char *buf)
2859 mutex_lock(&trace_types_lock); 2981 mutex_lock(&trace_types_lock);
2860 2982
2861 if (!ring_buffer_expanded) { 2983 if (!ring_buffer_expanded) {
2862 ret = tracing_resize_ring_buffer(trace_buf_size); 2984 ret = __tracing_resize_ring_buffer(trace_buf_size);
2863 if (ret < 0) 2985 if (ret < 0)
2864 goto out; 2986 goto out;
2865 ret = 0; 2987 ret = 0;
@@ -2965,20 +3087,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
2965 size_t cnt, loff_t *ppos) 3087 size_t cnt, loff_t *ppos)
2966{ 3088{
2967 unsigned long *ptr = filp->private_data; 3089 unsigned long *ptr = filp->private_data;
2968 char buf[64];
2969 unsigned long val; 3090 unsigned long val;
2970 int ret; 3091 int ret;
2971 3092
2972 if (cnt >= sizeof(buf)) 3093 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2973 return -EINVAL; 3094 if (ret)
2974
2975 if (copy_from_user(&buf, ubuf, cnt))
2976 return -EFAULT;
2977
2978 buf[cnt] = 0;
2979
2980 ret = strict_strtoul(buf, 10, &val);
2981 if (ret < 0)
2982 return ret; 3095 return ret;
2983 3096
2984 *ptr = val * 1000; 3097 *ptr = val * 1000;
@@ -3230,6 +3343,14 @@ waitagain:
3230 3343
3231 if (iter->seq.len >= cnt) 3344 if (iter->seq.len >= cnt)
3232 break; 3345 break;
3346
3347 /*
3348 * Setting the full flag means we reached the trace_seq buffer
3349 * size and we should have left via the partial-output condition above.
3350 * One of the trace_seq_* functions is not used properly.
3351 */
3352 WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
3353 iter->ent->type);
3233 } 3354 }
3234 trace_access_unlock(iter->cpu_file); 3355 trace_access_unlock(iter->cpu_file);
3235 trace_event_read_unlock(); 3356 trace_event_read_unlock();
@@ -3425,67 +3546,54 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3425 size_t cnt, loff_t *ppos) 3546 size_t cnt, loff_t *ppos)
3426{ 3547{
3427 unsigned long val; 3548 unsigned long val;
3428 char buf[64]; 3549 int ret;
3429 int ret, cpu;
3430
3431 if (cnt >= sizeof(buf))
3432 return -EINVAL;
3433
3434 if (copy_from_user(&buf, ubuf, cnt))
3435 return -EFAULT;
3436
3437 buf[cnt] = 0;
3438 3550
3439 ret = strict_strtoul(buf, 10, &val); 3551 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3440 if (ret < 0) 3552 if (ret)
3441 return ret; 3553 return ret;
3442 3554
3443 /* must have at least 1 entry */ 3555 /* must have at least 1 entry */
3444 if (!val) 3556 if (!val)
3445 return -EINVAL; 3557 return -EINVAL;
3446 3558
3447 mutex_lock(&trace_types_lock);
3448
3449 tracing_stop();
3450
3451 /* disable all cpu buffers */
3452 for_each_tracing_cpu(cpu) {
3453 if (global_trace.data[cpu])
3454 atomic_inc(&global_trace.data[cpu]->disabled);
3455 if (max_tr.data[cpu])
3456 atomic_inc(&max_tr.data[cpu]->disabled);
3457 }
3458
3459 /* value is in KB */ 3559 /* value is in KB */
3460 val <<= 10; 3560 val <<= 10;
3461 3561
3462 if (val != global_trace.entries) { 3562 ret = tracing_resize_ring_buffer(val);
3463 ret = tracing_resize_ring_buffer(val); 3563 if (ret < 0)
3464 if (ret < 0) { 3564 return ret;
3465 cnt = ret;
3466 goto out;
3467 }
3468 }
3469 3565
3470 *ppos += cnt; 3566 *ppos += cnt;
3471 3567
3472 /* If check pages failed, return ENOMEM */ 3568 return cnt;
3473 if (tracing_disabled) 3569}
3474 cnt = -ENOMEM;
3475 out:
3476 for_each_tracing_cpu(cpu) {
3477 if (global_trace.data[cpu])
3478 atomic_dec(&global_trace.data[cpu]->disabled);
3479 if (max_tr.data[cpu])
3480 atomic_dec(&max_tr.data[cpu]->disabled);
3481 }
3482 3570
3483 tracing_start(); 3571static ssize_t
3484 mutex_unlock(&trace_types_lock); 3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos)
3574{
3575 /*
3576 * There is no need to read what the user has written; this function
3577 * exists just so that writing here with "echo" does not report an error
3578 */
3579
3580 *ppos += cnt;
3485 3581
3486 return cnt; 3582 return cnt;
3487} 3583}
3488 3584
3585static int
3586tracing_free_buffer_release(struct inode *inode, struct file *filp)
3587{
3588 /* disable tracing ? */
3589 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3590 tracing_off();
3591 /* resize the ring buffer to 0 */
3592 tracing_resize_ring_buffer(0);
3593
3594 return 0;
3595}
3596
3489static int mark_printk(const char *fmt, ...) 3597static int mark_printk(const char *fmt, ...)
3490{ 3598{
3491 int ret; 3599 int ret;
@@ -3631,6 +3739,11 @@ static const struct file_operations tracing_entries_fops = {
3631 .llseek = generic_file_llseek, 3739 .llseek = generic_file_llseek,
3632}; 3740};
3633 3741
3742static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release,
3745};
3746
3634static const struct file_operations tracing_mark_fops = { 3747static const struct file_operations tracing_mark_fops = {
3635 .open = tracing_open_generic, 3748 .open = tracing_open_generic,
3636 .write = tracing_mark_write, 3749 .write = tracing_mark_write,
@@ -3687,7 +3800,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3687 return 0; 3800 return 0;
3688 3801
3689 if (!info->spare) 3802 if (!info->spare)
3690 info->spare = ring_buffer_alloc_read_page(info->tr->buffer); 3803 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
3691 if (!info->spare) 3804 if (!info->spare)
3692 return -ENOMEM; 3805 return -ENOMEM;
3693 3806
@@ -3844,7 +3957,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3844 3957
3845 ref->ref = 1; 3958 ref->ref = 1;
3846 ref->buffer = info->tr->buffer; 3959 ref->buffer = info->tr->buffer;
3847 ref->page = ring_buffer_alloc_read_page(ref->buffer); 3960 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
3848 if (!ref->page) { 3961 if (!ref->page) {
3849 kfree(ref); 3962 kfree(ref);
3850 break; 3963 break;
@@ -3853,8 +3966,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3853 r = ring_buffer_read_page(ref->buffer, &ref->page, 3966 r = ring_buffer_read_page(ref->buffer, &ref->page,
3854 len, info->cpu, 1); 3967 len, info->cpu, 1);
3855 if (r < 0) { 3968 if (r < 0) {
3856 ring_buffer_free_read_page(ref->buffer, 3969 ring_buffer_free_read_page(ref->buffer, ref->page);
3857 ref->page);
3858 kfree(ref); 3970 kfree(ref);
3859 break; 3971 break;
3860 } 3972 }
@@ -4090,19 +4202,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4090{ 4202{
4091 struct trace_option_dentry *topt = filp->private_data; 4203 struct trace_option_dentry *topt = filp->private_data;
4092 unsigned long val; 4204 unsigned long val;
4093 char buf[64];
4094 int ret; 4205 int ret;
4095 4206
4096 if (cnt >= sizeof(buf)) 4207 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4097 return -EINVAL; 4208 if (ret)
4098
4099 if (copy_from_user(&buf, ubuf, cnt))
4100 return -EFAULT;
4101
4102 buf[cnt] = 0;
4103
4104 ret = strict_strtoul(buf, 10, &val);
4105 if (ret < 0)
4106 return ret; 4209 return ret;
4107 4210
4108 if (val != 0 && val != 1) 4211 if (val != 0 && val != 1)
@@ -4150,20 +4253,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4150 loff_t *ppos) 4253 loff_t *ppos)
4151{ 4254{
4152 long index = (long)filp->private_data; 4255 long index = (long)filp->private_data;
4153 char buf[64];
4154 unsigned long val; 4256 unsigned long val;
4155 int ret; 4257 int ret;
4156 4258
4157 if (cnt >= sizeof(buf)) 4259 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4158 return -EINVAL; 4260 if (ret)
4159
4160 if (copy_from_user(&buf, ubuf, cnt))
4161 return -EFAULT;
4162
4163 buf[cnt] = 0;
4164
4165 ret = strict_strtoul(buf, 10, &val);
4166 if (ret < 0)
4167 return ret; 4261 return ret;
4168 4262
4169 if (val != 0 && val != 1) 4263 if (val != 0 && val != 1)
@@ -4356,6 +4450,9 @@ static __init int tracer_init_debugfs(void)
4356 trace_create_file("buffer_size_kb", 0644, d_tracer, 4450 trace_create_file("buffer_size_kb", 0644, d_tracer,
4357 &global_trace, &tracing_entries_fops); 4451 &global_trace, &tracing_entries_fops);
4358 4452
4453 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops);
4455
4359 trace_create_file("trace_marker", 0220, d_tracer, 4456 trace_create_file("trace_marker", 0220, d_tracer,
4360 NULL, &tracing_mark_fops); 4457 NULL, &tracing_mark_fops);
4361 4458
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5e9dfc6286dd..616846bcfee5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -2,7 +2,7 @@
2#define _LINUX_KERNEL_TRACE_H 2#define _LINUX_KERNEL_TRACE_H
3 3
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <asm/atomic.h> 5#include <linux/atomic.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
@@ -278,6 +278,29 @@ struct tracer {
278}; 278};
279 279
280 280
281/* Only current can touch trace_recursion */
282#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
283#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
284
285/* Ring buffer has the 10 LSB bits to count */
286#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
287
288/* for function tracing recursion */
289#define TRACE_INTERNAL_BIT (1<<11)
290#define TRACE_GLOBAL_BIT (1<<12)
291/*
292 * Abuse of the trace_recursion.
293 * We need a way to maintain state when tracing the function graph
294 * from irq context: we may want to trace a particular function that
295 * was called in irq even though irq tracing is off. Since this field
296 * can only be modified by current, we can reuse trace_recursion.
297 */
298#define TRACE_IRQ_BIT (1<<13)
299
300#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
301#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
302#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
303
281#define TRACE_PIPE_ALL_CPU -1 304#define TRACE_PIPE_ALL_CPU -1
282 305
283int tracer_init(struct tracer *t, struct trace_array *tr); 306int tracer_init(struct tracer *t, struct trace_array *tr);
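These per-task recursion bits are what ftrace_ops_list_func() in the ftrace.c hunk earlier uses to avoid re-entering itself. A minimal sketch of the same guard around a hypothetical callback (example_callback is not a real kernel function):

static void example_callback(unsigned long ip, unsigned long parent_ip)
{
	/* already inside this callback on this task: refuse to recurse */
	if (trace_recursion_test(TRACE_INTERNAL_BIT))
		return;

	trace_recursion_set(TRACE_INTERNAL_BIT);

	/*
	 * ... real work here; anything that ends up back in this callback
	 * sees the bit set and returns immediately ...
	 */

	trace_recursion_clear(TRACE_INTERNAL_BIT);
}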
@@ -389,6 +412,9 @@ void update_max_tr_single(struct trace_array *tr,
389void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 412void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
390 int skip, int pc); 413 int skip, int pc);
391 414
415void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
416 int skip, int pc, struct pt_regs *regs);
417
392void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, 418void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
393 int pc); 419 int pc);
394 420
@@ -400,6 +426,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer,
400{ 426{
401} 427}
402 428
429static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
430 unsigned long flags, int skip,
431 int pc, struct pt_regs *regs)
432{
433}
434
403static inline void ftrace_trace_userstack(struct ring_buffer *buffer, 435static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
404 unsigned long flags, int pc) 436 unsigned long flags, int pc)
405{ 437{
@@ -419,6 +451,8 @@ extern void trace_find_cmdline(int pid, char comm[]);
419extern unsigned long ftrace_update_tot_cnt; 451extern unsigned long ftrace_update_tot_cnt;
420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 452#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
421extern int DYN_FTRACE_TEST_NAME(void); 453extern int DYN_FTRACE_TEST_NAME(void);
454#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
455extern int DYN_FTRACE_TEST_NAME2(void);
422#endif 456#endif
423 457
424extern int ring_buffer_expanded; 458extern int ring_buffer_expanded;
@@ -505,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr)
505 return 1; 539 return 1;
506 540
507 for (i = 0; i < ftrace_graph_count; i++) { 541 for (i = 0; i < ftrace_graph_count; i++) {
508 if (addr == ftrace_graph_funcs[i]) 542 if (addr == ftrace_graph_funcs[i]) {
543 /*
544 * If no irqs are to be traced, but a set_graph_function
545 * is set, and called by an interrupt handler, we still
546 * want to trace it.
547 */
548 if (in_irq())
549 trace_recursion_set(TRACE_IRQ_BIT);
550 else
551 trace_recursion_clear(TRACE_IRQ_BIT);
509 return 1; 552 return 1;
553 }
510 } 554 }
511 555
512 return 0; 556 return 0;
@@ -607,6 +651,7 @@ enum trace_iterator_flags {
607 TRACE_ITER_GRAPH_TIME = 0x80000, 651 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000, 652 TRACE_ITER_RECORD_CMD = 0x100000,
609 TRACE_ITER_OVERWRITE = 0x200000, 653 TRACE_ITER_OVERWRITE = 0x200000,
654 TRACE_ITER_STOP_ON_FREE = 0x400000,
610}; 655};
611 656
612/* 657/*
@@ -675,6 +720,7 @@ struct event_subsystem {
675 struct dentry *entry; 720 struct dentry *entry;
676 struct event_filter *filter; 721 struct event_filter *filter;
677 int nr_events; 722 int nr_events;
723 int ref_count;
678}; 724};
679 725
680#define FILTER_PRED_INVALID ((unsigned short)-1) 726#define FILTER_PRED_INVALID ((unsigned short)-1)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e32744c84d94..93365907f219 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
161 TRACE_STACK, 161 TRACE_STACK,
162 162
163 F_STRUCT( 163 F_STRUCT(
164 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 164 __field( int, size )
165 __dynamic_array(unsigned long, caller )
165 ), 166 ),
166 167
167 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 168 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
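With the stack entry now carrying an explicit size field and a __dynamic_array of callers, readers can no longer assume FTRACE_STACK_ENTRIES slots. A sketch of the expected walk, assuming iter is the surrounding struct trace_iterator (this mirrors the trace_output.c hunk further down in this patch):

	struct stack_entry *field;
	unsigned long *p, *end;

	field = (struct stack_entry *)iter->ent;
	end   = (unsigned long *)((long)iter->ent + iter->ent_size);

	/* stop at the recorded end of the entry or at the ULONG_MAX terminator */
	for (p = field->caller; p < end && *p != ULONG_MAX; p++)
		printk(KERN_INFO " => %pS\n", (void *)*p);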
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2fe110341359..581876f9f387 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -244,6 +244,35 @@ static void ftrace_clear_events(void)
244 mutex_unlock(&event_mutex); 244 mutex_unlock(&event_mutex);
245} 245}
246 246
247static void __put_system(struct event_subsystem *system)
248{
249 struct event_filter *filter = system->filter;
250
251 WARN_ON_ONCE(system->ref_count == 0);
252 if (--system->ref_count)
253 return;
254
255 if (filter) {
256 kfree(filter->filter_string);
257 kfree(filter);
258 }
259 kfree(system->name);
260 kfree(system);
261}
262
263static void __get_system(struct event_subsystem *system)
264{
265 WARN_ON_ONCE(system->ref_count == 0);
266 system->ref_count++;
267}
268
269static void put_system(struct event_subsystem *system)
270{
271 mutex_lock(&event_mutex);
272 __put_system(system);
273 mutex_unlock(&event_mutex);
274}
275
247/* 276/*
248 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 277 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
249 */ 278 */
@@ -486,20 +515,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
486 loff_t *ppos) 515 loff_t *ppos)
487{ 516{
488 struct ftrace_event_call *call = filp->private_data; 517 struct ftrace_event_call *call = filp->private_data;
489 char buf[64];
490 unsigned long val; 518 unsigned long val;
491 int ret; 519 int ret;
492 520
493 if (cnt >= sizeof(buf)) 521 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
494 return -EINVAL; 522 if (ret)
495
496 if (copy_from_user(&buf, ubuf, cnt))
497 return -EFAULT;
498
499 buf[cnt] = 0;
500
501 ret = strict_strtoul(buf, 10, &val);
502 if (ret < 0)
503 return ret; 523 return ret;
504 524
505 ret = tracing_update_buffers(); 525 ret = tracing_update_buffers();
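For context, a sketch of the pattern kstrtoul_from_user() replaces here: a single call copies the user buffer, NUL-terminates it and parses the number, so the local buf/copy_from_user/strict_strtoul dance goes away. example_write and the 0/1 range check are illustrative, not part of the patch:

#include <linux/fs.h>
#include <linux/kernel.h>	/* kstrtoul_from_user() */
#include <linux/uaccess.h>

static ssize_t example_write(struct file *filp, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret;

	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);	/* copy + parse */
	if (ret)
		return ret;

	if (val > 1)		/* this file only accepts 0 or 1 */
		return -EINVAL;

	*ppos += cnt;
	return cnt;
}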
@@ -528,7 +548,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
528 loff_t *ppos) 548 loff_t *ppos)
529{ 549{
530 const char set_to_char[4] = { '?', '0', '1', 'X' }; 550 const char set_to_char[4] = { '?', '0', '1', 'X' };
531 const char *system = filp->private_data; 551 struct event_subsystem *system = filp->private_data;
532 struct ftrace_event_call *call; 552 struct ftrace_event_call *call;
533 char buf[2]; 553 char buf[2];
534 int set = 0; 554 int set = 0;
@@ -539,7 +559,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
539 if (!call->name || !call->class || !call->class->reg) 559 if (!call->name || !call->class || !call->class->reg)
540 continue; 560 continue;
541 561
542 if (system && strcmp(call->class->system, system) != 0) 562 if (system && strcmp(call->class->system, system->name) != 0)
543 continue; 563 continue;
544 564
545 /* 565 /*
@@ -569,21 +589,13 @@ static ssize_t
569system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 589system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
570 loff_t *ppos) 590 loff_t *ppos)
571{ 591{
572 const char *system = filp->private_data; 592 struct event_subsystem *system = filp->private_data;
593 const char *name = NULL;
573 unsigned long val; 594 unsigned long val;
574 char buf[64];
575 ssize_t ret; 595 ssize_t ret;
576 596
577 if (cnt >= sizeof(buf)) 597 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
578 return -EINVAL; 598 if (ret)
579
580 if (copy_from_user(&buf, ubuf, cnt))
581 return -EFAULT;
582
583 buf[cnt] = 0;
584
585 ret = strict_strtoul(buf, 10, &val);
586 if (ret < 0)
587 return ret; 599 return ret;
588 600
589 ret = tracing_update_buffers(); 601 ret = tracing_update_buffers();
@@ -593,7 +605,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
593 if (val != 0 && val != 1) 605 if (val != 0 && val != 1)
594 return -EINVAL; 606 return -EINVAL;
595 607
596 ret = __ftrace_set_clr_event(NULL, system, NULL, val); 608 /*
609 * Opening of "enable" adds a ref count to system,
610 * so the name is safe to use.
611 */
612 if (system)
613 name = system->name;
614
615 ret = __ftrace_set_clr_event(NULL, name, NULL, val);
597 if (ret) 616 if (ret)
598 goto out; 617 goto out;
599 618
@@ -826,6 +845,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
826 return cnt; 845 return cnt;
827} 846}
828 847
848static LIST_HEAD(event_subsystems);
849
850static int subsystem_open(struct inode *inode, struct file *filp)
851{
852 struct event_subsystem *system = NULL;
853 int ret;
854
855 if (!inode->i_private)
856 goto skip_search;
857
858 /* Make sure the system still exists */
859 mutex_lock(&event_mutex);
860 list_for_each_entry(system, &event_subsystems, list) {
861 if (system == inode->i_private) {
862 /* Don't open systems with no events */
863 if (!system->nr_events) {
864 system = NULL;
865 break;
866 }
867 __get_system(system);
868 break;
869 }
870 }
871 mutex_unlock(&event_mutex);
872
873 if (system != inode->i_private)
874 return -ENODEV;
875
876 skip_search:
877 ret = tracing_open_generic(inode, filp);
878 if (ret < 0 && system)
879 put_system(system);
880
881 return ret;
882}
883
884static int subsystem_release(struct inode *inode, struct file *file)
885{
886 struct event_subsystem *system = inode->i_private;
887
888 if (system)
889 put_system(system);
890
891 return 0;
892}
893
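The intent of the new reference counting, as an informal lifecycle sketch (illustrative, not code from the patch): the subsystem directory itself holds one reference, and every open of its "enable" or "filter" file takes another, so the event_subsystem cannot be freed while a file handle is still live even if the directory goes away in the meantime.

static void example_subsystem_lifetime(struct event_subsystem *system)
{
	__get_system(system);	/* what subsystem_open() does per open file */

	/*
	 * ... the directory may be removed here; remove_subsystem_dir()
	 * only drops its own reference via __put_system(), so "system"
	 * stays valid for this reader/writer ...
	 */

	put_system(system);	/* what subsystem_release() does on close */
}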
829static ssize_t 894static ssize_t
830subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 895subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
831 loff_t *ppos) 896 loff_t *ppos)
@@ -963,17 +1028,19 @@ static const struct file_operations ftrace_event_filter_fops = {
963}; 1028};
964 1029
965static const struct file_operations ftrace_subsystem_filter_fops = { 1030static const struct file_operations ftrace_subsystem_filter_fops = {
966 .open = tracing_open_generic, 1031 .open = subsystem_open,
967 .read = subsystem_filter_read, 1032 .read = subsystem_filter_read,
968 .write = subsystem_filter_write, 1033 .write = subsystem_filter_write,
969 .llseek = default_llseek, 1034 .llseek = default_llseek,
1035 .release = subsystem_release,
970}; 1036};
971 1037
972static const struct file_operations ftrace_system_enable_fops = { 1038static const struct file_operations ftrace_system_enable_fops = {
973 .open = tracing_open_generic, 1039 .open = subsystem_open,
974 .read = system_enable_read, 1040 .read = system_enable_read,
975 .write = system_enable_write, 1041 .write = system_enable_write,
976 .llseek = default_llseek, 1042 .llseek = default_llseek,
1043 .release = subsystem_release,
977}; 1044};
978 1045
979static const struct file_operations ftrace_show_header_fops = { 1046static const struct file_operations ftrace_show_header_fops = {
@@ -1002,8 +1069,6 @@ static struct dentry *event_trace_events_dir(void)
1002 return d_events; 1069 return d_events;
1003} 1070}
1004 1071
1005static LIST_HEAD(event_subsystems);
1006
1007static struct dentry * 1072static struct dentry *
1008event_subsystem_dir(const char *name, struct dentry *d_events) 1073event_subsystem_dir(const char *name, struct dentry *d_events)
1009{ 1074{
@@ -1013,6 +1078,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1013 /* First see if we did not already create this dir */ 1078 /* First see if we did not already create this dir */
1014 list_for_each_entry(system, &event_subsystems, list) { 1079 list_for_each_entry(system, &event_subsystems, list) {
1015 if (strcmp(system->name, name) == 0) { 1080 if (strcmp(system->name, name) == 0) {
1081 __get_system(system);
1016 system->nr_events++; 1082 system->nr_events++;
1017 return system->entry; 1083 return system->entry;
1018 } 1084 }
@@ -1035,6 +1101,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1035 } 1101 }
1036 1102
1037 system->nr_events = 1; 1103 system->nr_events = 1;
1104 system->ref_count = 1;
1038 system->name = kstrdup(name, GFP_KERNEL); 1105 system->name = kstrdup(name, GFP_KERNEL);
1039 if (!system->name) { 1106 if (!system->name) {
1040 debugfs_remove(system->entry); 1107 debugfs_remove(system->entry);
@@ -1062,8 +1129,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1062 "'%s/filter' entry\n", name); 1129 "'%s/filter' entry\n", name);
1063 } 1130 }
1064 1131
1065 trace_create_file("enable", 0644, system->entry, 1132 trace_create_file("enable", 0644, system->entry, system,
1066 (void *)system->name,
1067 &ftrace_system_enable_fops); 1133 &ftrace_system_enable_fops);
1068 1134
1069 return system->entry; 1135 return system->entry;
@@ -1184,16 +1250,9 @@ static void remove_subsystem_dir(const char *name)
1184 list_for_each_entry(system, &event_subsystems, list) { 1250 list_for_each_entry(system, &event_subsystems, list) {
1185 if (strcmp(system->name, name) == 0) { 1251 if (strcmp(system->name, name) == 0) {
1186 if (!--system->nr_events) { 1252 if (!--system->nr_events) {
1187 struct event_filter *filter = system->filter;
1188
1189 debugfs_remove_recursive(system->entry); 1253 debugfs_remove_recursive(system->entry);
1190 list_del(&system->list); 1254 list_del(&system->list);
1191 if (filter) { 1255 __put_system(system);
1192 kfree(filter->filter_string);
1193 kfree(filter);
1194 }
1195 kfree(system->name);
1196 kfree(system);
1197 } 1256 }
1198 break; 1257 break;
1199 } 1258 }
@@ -1657,7 +1716,12 @@ static struct ftrace_ops trace_ops __initdata =
1657 1716
1658static __init void event_trace_self_test_with_function(void) 1717static __init void event_trace_self_test_with_function(void)
1659{ 1718{
1660 register_ftrace_function(&trace_ops); 1719 int ret;
1720 ret = register_ftrace_function(&trace_ops);
1721 if (WARN_ON(ret < 0)) {
1722 pr_info("Failed to enable function tracer for event tests\n");
1723 return;
1724 }
1661 pr_info("Running tests again, along with the function tracer\n"); 1725 pr_info("Running tests again, along with the function tracer\n");
1662 event_trace_self_tests(); 1726 event_trace_self_tests();
1663 unregister_ftrace_function(&trace_ops); 1727 unregister_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8008ddcfbf20..256764ecccd6 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1886 1886
1887 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1888 1888
1889 /* Make sure the system still has events */
1890 if (!system->nr_events) {
1891 err = -ENODEV;
1892 goto out_unlock;
1893 }
1894
1889 if (!strcmp(strstrip(filter_string), "0")) { 1895 if (!strcmp(strstrip(filter_string), "0")) {
1890 filter_free_subsystem_preds(system); 1896 filter_free_subsystem_preds(system);
1891 remove_filter_string(system->filter); 1897 remove_filter_string(system->filter);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 16aee4d44e8f..c7b0c6a7db09 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
149static struct ftrace_ops trace_ops __read_mostly = 149static struct ftrace_ops trace_ops __read_mostly =
150{ 150{
151 .func = function_trace_call, 151 .func = function_trace_call,
152 .flags = FTRACE_OPS_FL_GLOBAL,
152}; 153};
153 154
154static struct ftrace_ops trace_stack_ops __read_mostly = 155static struct ftrace_ops trace_stack_ops __read_mostly =
155{ 156{
156 .func = function_stack_trace_call, 157 .func = function_stack_trace_call,
158 .flags = FTRACE_OPS_FL_GLOBAL,
157}; 159};
158 160
159/* Our two options */ 161/* Our two options */
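Setting FTRACE_OPS_FL_GLOBAL here appears to keep these callbacks on the shared global filter (set_ftrace_filter) now that ftrace_ops can carry their own filter hash. A minimal sketch of an ops registration that opts into the same behaviour; my_func_probe and my_ops are hypothetical names:

static void my_func_probe(unsigned long ip, unsigned long parent_ip)
{
	/* runs for every function the global set_ftrace_filter allows */
}

static struct ftrace_ops my_ops __read_mostly = {
	.func	= my_func_probe,
	.flags	= FTRACE_OPS_FL_GLOBAL,
};

/* in init code:  register_ftrace_function(&my_ops);   */
/* in exit code:  unregister_ftrace_function(&my_ops); */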
@@ -322,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
322} 324}
323 325
324static int 326static int
325ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) 327ftrace_trace_onoff_callback(struct ftrace_hash *hash,
328 char *glob, char *cmd, char *param, int enable)
326{ 329{
327 struct ftrace_probe_ops *ops; 330 struct ftrace_probe_ops *ops;
328 void *count = (void *)-1; 331 void *count = (void *)-1;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 962cdb24ed81..a7d2a4c653d8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = {
74 74
75static struct trace_array *graph_array; 75static struct trace_array *graph_array;
76 76
77/*
78 * The DURATION column is also used to display IRQ signs;
79 * the following values are used by print_graph_irq and others
80 * to fill in space in the DURATION column.
81 */
82enum {
83 DURATION_FILL_FULL = -1,
84 DURATION_FILL_START = -2,
85 DURATION_FILL_END = -3,
86};
87
88static enum print_line_t
89print_graph_duration(unsigned long long duration, struct trace_seq *s,
90 u32 flags);
77 91
78/* Add a function return address to the trace stack on thread info.*/ 92/* Add a function return address to the trace stack on thread info.*/
79int 93int
@@ -213,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr,
213 227
214static inline int ftrace_graph_ignore_irqs(void) 228static inline int ftrace_graph_ignore_irqs(void)
215{ 229{
216 if (!ftrace_graph_skip_irqs) 230 if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
217 return 0; 231 return 0;
218 232
219 return in_irq(); 233 return in_irq();
@@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter,
577 return next; 591 return next;
578} 592}
579 593
580/* Signal a overhead of time execution to the output */
581static int
582print_graph_overhead(unsigned long long duration, struct trace_seq *s,
583 u32 flags)
584{
585 /* If duration disappear, we don't need anything */
586 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
587 return 1;
588
589 /* Non nested entry or return */
590 if (duration == -1)
591 return trace_seq_printf(s, " ");
592
593 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
594 /* Duration exceeded 100 msecs */
595 if (duration > 100000ULL)
596 return trace_seq_printf(s, "! ");
597
598 /* Duration exceeded 10 msecs */
599 if (duration > 10000ULL)
600 return trace_seq_printf(s, "+ ");
601 }
602
603 return trace_seq_printf(s, " ");
604}
605
606static int print_graph_abs_time(u64 t, struct trace_seq *s) 594static int print_graph_abs_time(u64 t, struct trace_seq *s)
607{ 595{
608 unsigned long usecs_rem; 596 unsigned long usecs_rem;
@@ -625,34 +613,36 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
625 addr >= (unsigned long)__irqentry_text_end) 613 addr >= (unsigned long)__irqentry_text_end)
626 return TRACE_TYPE_UNHANDLED; 614 return TRACE_TYPE_UNHANDLED;
627 615
628 /* Absolute time */ 616 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
629 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 617 /* Absolute time */
630 ret = print_graph_abs_time(iter->ts, s); 618 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
631 if (!ret) 619 ret = print_graph_abs_time(iter->ts, s);
632 return TRACE_TYPE_PARTIAL_LINE; 620 if (!ret)
633 } 621 return TRACE_TYPE_PARTIAL_LINE;
622 }
634 623
635 /* Cpu */ 624 /* Cpu */
636 if (flags & TRACE_GRAPH_PRINT_CPU) { 625 if (flags & TRACE_GRAPH_PRINT_CPU) {
637 ret = print_graph_cpu(s, cpu); 626 ret = print_graph_cpu(s, cpu);
638 if (ret == TRACE_TYPE_PARTIAL_LINE) 627 if (ret == TRACE_TYPE_PARTIAL_LINE)
639 return TRACE_TYPE_PARTIAL_LINE; 628 return TRACE_TYPE_PARTIAL_LINE;
640 } 629 }
641 630
642 /* Proc */ 631 /* Proc */
643 if (flags & TRACE_GRAPH_PRINT_PROC) { 632 if (flags & TRACE_GRAPH_PRINT_PROC) {
644 ret = print_graph_proc(s, pid); 633 ret = print_graph_proc(s, pid);
645 if (ret == TRACE_TYPE_PARTIAL_LINE) 634 if (ret == TRACE_TYPE_PARTIAL_LINE)
646 return TRACE_TYPE_PARTIAL_LINE; 635 return TRACE_TYPE_PARTIAL_LINE;
647 ret = trace_seq_printf(s, " | "); 636 ret = trace_seq_printf(s, " | ");
648 if (!ret) 637 if (!ret)
649 return TRACE_TYPE_PARTIAL_LINE; 638 return TRACE_TYPE_PARTIAL_LINE;
639 }
650 } 640 }
651 641
652 /* No overhead */ 642 /* No overhead */
653 ret = print_graph_overhead(-1, s, flags); 643 ret = print_graph_duration(DURATION_FILL_START, s, flags);
654 if (!ret) 644 if (ret != TRACE_TYPE_HANDLED)
655 return TRACE_TYPE_PARTIAL_LINE; 645 return ret;
656 646
657 if (type == TRACE_GRAPH_ENT) 647 if (type == TRACE_GRAPH_ENT)
658 ret = trace_seq_printf(s, "==========>"); 648 ret = trace_seq_printf(s, "==========>");
@@ -662,9 +652,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
662 if (!ret) 652 if (!ret)
663 return TRACE_TYPE_PARTIAL_LINE; 653 return TRACE_TYPE_PARTIAL_LINE;
664 654
665 /* Don't close the duration column if haven't one */ 655 ret = print_graph_duration(DURATION_FILL_END, s, flags);
666 if (flags & TRACE_GRAPH_PRINT_DURATION) 656 if (ret != TRACE_TYPE_HANDLED)
667 trace_seq_printf(s, " |"); 657 return ret;
658
668 ret = trace_seq_printf(s, "\n"); 659 ret = trace_seq_printf(s, "\n");
669 660
670 if (!ret) 661 if (!ret)
@@ -716,9 +707,49 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
716} 707}
717 708
718static enum print_line_t 709static enum print_line_t
719print_graph_duration(unsigned long long duration, struct trace_seq *s) 710print_graph_duration(unsigned long long duration, struct trace_seq *s,
711 u32 flags)
720{ 712{
721 int ret; 713 int ret = -1;
714
715 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
716 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
717 return TRACE_TYPE_HANDLED;
718
719 /* No real data, just filling the column with spaces */
720 switch (duration) {
721 case DURATION_FILL_FULL:
722 ret = trace_seq_printf(s, " | ");
723 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
724 case DURATION_FILL_START:
725 ret = trace_seq_printf(s, " ");
726 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
727 case DURATION_FILL_END:
728 ret = trace_seq_printf(s, " |");
729 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
730 }
731
732 /* Signal an execution time overhead to the output */
733 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
734 /* Duration exceeded 100 usecs */
735 if (duration > 100000ULL)
736 ret = trace_seq_printf(s, "! ");
737 /* Duration exceeded 10 usecs */
738 else if (duration > 10000ULL)
739 ret = trace_seq_printf(s, "+ ");
740 }
741
742 /*
743 * The -1 means we either did not exceed the duration thresholds
744 * or we don't want to print out the overhead. Either way we need
745 * to fill out the space.
746 */
747 if (ret == -1)
748 ret = trace_seq_printf(s, " ");
749
750 /* Catch any failure that happened above */
751 if (!ret)
752 return TRACE_TYPE_PARTIAL_LINE;
722 753
723 ret = trace_print_graph_duration(duration, s); 754 ret = trace_print_graph_duration(duration, s);
724 if (ret != TRACE_TYPE_HANDLED) 755 if (ret != TRACE_TYPE_HANDLED)
@@ -767,18 +798,11 @@ print_graph_entry_leaf(struct trace_iterator *iter,
767 cpu_data->enter_funcs[call->depth] = 0; 798 cpu_data->enter_funcs[call->depth] = 0;
768 } 799 }
769 800
770 /* Overhead */ 801 /* Overhead and duration */
771 ret = print_graph_overhead(duration, s, flags); 802 ret = print_graph_duration(duration, s, flags);
772 if (!ret) 803 if (ret == TRACE_TYPE_PARTIAL_LINE)
773 return TRACE_TYPE_PARTIAL_LINE; 804 return TRACE_TYPE_PARTIAL_LINE;
774 805
775 /* Duration */
776 if (flags & TRACE_GRAPH_PRINT_DURATION) {
777 ret = print_graph_duration(duration, s);
778 if (ret == TRACE_TYPE_PARTIAL_LINE)
779 return TRACE_TYPE_PARTIAL_LINE;
780 }
781
782 /* Function */ 806 /* Function */
783 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 807 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
784 ret = trace_seq_printf(s, " "); 808 ret = trace_seq_printf(s, " ");
@@ -815,17 +839,10 @@ print_graph_entry_nested(struct trace_iterator *iter,
815 cpu_data->enter_funcs[call->depth] = call->func; 839 cpu_data->enter_funcs[call->depth] = call->func;
816 } 840 }
817 841
818 /* No overhead */
819 ret = print_graph_overhead(-1, s, flags);
820 if (!ret)
821 return TRACE_TYPE_PARTIAL_LINE;
822
823 /* No time */ 842 /* No time */
824 if (flags & TRACE_GRAPH_PRINT_DURATION) { 843 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
825 ret = trace_seq_printf(s, " | "); 844 if (ret != TRACE_TYPE_HANDLED)
826 if (!ret) 845 return ret;
827 return TRACE_TYPE_PARTIAL_LINE;
828 }
829 846
830 /* Function */ 847 /* Function */
831 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 848 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -865,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
865 return TRACE_TYPE_PARTIAL_LINE; 882 return TRACE_TYPE_PARTIAL_LINE;
866 } 883 }
867 884
885 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
886 return 0;
887
868 /* Absolute time */ 888 /* Absolute time */
869 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 889 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
870 ret = print_graph_abs_time(iter->ts, s); 890 ret = print_graph_abs_time(iter->ts, s);
@@ -1078,18 +1098,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1078 if (print_graph_prologue(iter, s, 0, 0, flags)) 1098 if (print_graph_prologue(iter, s, 0, 0, flags))
1079 return TRACE_TYPE_PARTIAL_LINE; 1099 return TRACE_TYPE_PARTIAL_LINE;
1080 1100
1081 /* Overhead */ 1101 /* Overhead and duration */
1082 ret = print_graph_overhead(duration, s, flags); 1102 ret = print_graph_duration(duration, s, flags);
1083 if (!ret) 1103 if (ret == TRACE_TYPE_PARTIAL_LINE)
1084 return TRACE_TYPE_PARTIAL_LINE; 1104 return TRACE_TYPE_PARTIAL_LINE;
1085 1105
1086 /* Duration */
1087 if (flags & TRACE_GRAPH_PRINT_DURATION) {
1088 ret = print_graph_duration(duration, s);
1089 if (ret == TRACE_TYPE_PARTIAL_LINE)
1090 return TRACE_TYPE_PARTIAL_LINE;
1091 }
1092
1093 /* Closing brace */ 1106 /* Closing brace */
1094 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1107 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
1095 ret = trace_seq_printf(s, " "); 1108 ret = trace_seq_printf(s, " ");
@@ -1146,17 +1159,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1146 if (print_graph_prologue(iter, s, 0, 0, flags)) 1159 if (print_graph_prologue(iter, s, 0, 0, flags))
1147 return TRACE_TYPE_PARTIAL_LINE; 1160 return TRACE_TYPE_PARTIAL_LINE;
1148 1161
1149 /* No overhead */
1150 ret = print_graph_overhead(-1, s, flags);
1151 if (!ret)
1152 return TRACE_TYPE_PARTIAL_LINE;
1153
1154 /* No time */ 1162 /* No time */
1155 if (flags & TRACE_GRAPH_PRINT_DURATION) { 1163 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
1156 ret = trace_seq_printf(s, " | "); 1164 if (ret != TRACE_TYPE_HANDLED)
1157 if (!ret) 1165 return ret;
1158 return TRACE_TYPE_PARTIAL_LINE;
1159 }
1160 1166
1161 /* Indentation */ 1167 /* Indentation */
1162 if (depth > 0) 1168 if (depth > 0)
@@ -1207,7 +1213,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1207 1213
1208 1214
1209enum print_line_t 1215enum print_line_t
1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1216print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1211{ 1217{
1212 struct ftrace_graph_ent_entry *field; 1218 struct ftrace_graph_ent_entry *field;
1213 struct fgraph_data *data = iter->private; 1219 struct fgraph_data *data = iter->private;
@@ -1270,18 +1276,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1270static enum print_line_t 1276static enum print_line_t
1271print_graph_function(struct trace_iterator *iter) 1277print_graph_function(struct trace_iterator *iter)
1272{ 1278{
1273 return __print_graph_function_flags(iter, tracer_flags.val); 1279 return print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1285} 1280}
1286 1281
1287static enum print_line_t 1282static enum print_line_t
@@ -1309,8 +1304,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1309 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); 1304 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
1310 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); 1305 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
1311 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); 1306 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
1312 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); 1307 seq_printf(s, "#%.*s||| / \n", size, spaces);
1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1314} 1308}
1315 1309
1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags) 1310static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
@@ -1329,7 +1323,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1329 if (flags & TRACE_GRAPH_PRINT_PROC) 1323 if (flags & TRACE_GRAPH_PRINT_PROC)
1330 seq_printf(s, " TASK/PID "); 1324 seq_printf(s, " TASK/PID ");
1331 if (lat) 1325 if (lat)
1332 seq_printf(s, "|||||"); 1326 seq_printf(s, "||||");
1333 if (flags & TRACE_GRAPH_PRINT_DURATION) 1327 if (flags & TRACE_GRAPH_PRINT_DURATION)
1334 seq_printf(s, " DURATION "); 1328 seq_printf(s, " DURATION ");
1335 seq_printf(s, " FUNCTION CALLS\n"); 1329 seq_printf(s, " FUNCTION CALLS\n");
@@ -1343,7 +1337,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1343 if (flags & TRACE_GRAPH_PRINT_PROC) 1337 if (flags & TRACE_GRAPH_PRINT_PROC)
1344 seq_printf(s, " | | "); 1338 seq_printf(s, " | | ");
1345 if (lat) 1339 if (lat)
1346 seq_printf(s, "|||||"); 1340 seq_printf(s, "||||");
1347 if (flags & TRACE_GRAPH_PRINT_DURATION) 1341 if (flags & TRACE_GRAPH_PRINT_DURATION)
1348 seq_printf(s, " | | "); 1342 seq_printf(s, " | | ");
1349 seq_printf(s, " | | | |\n"); 1343 seq_printf(s, " | | | |\n");
@@ -1358,15 +1352,16 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{ 1352{
1359 struct trace_iterator *iter = s->private; 1353 struct trace_iterator *iter = s->private;
1360 1354
1355 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
1356 return;
1357
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 1358 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */ 1359 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter)) 1360 if (trace_empty(iter))
1364 return; 1361 return;
1365 1362
1366 print_trace_header(s, iter); 1363 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION; 1364 }
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370 1365
1371 __print_graph_headers_flags(s, flags); 1366 __print_graph_headers_flags(s, flags);
1372} 1367}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a4969b47afc1..667aa8cc0cfc 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
153static struct ftrace_ops trace_ops __read_mostly = 153static struct ftrace_ops trace_ops __read_mostly =
154{ 154{
155 .func = irqsoff_tracer_call, 155 .func = irqsoff_tracer_call,
156 .flags = FTRACE_OPS_FL_GLOBAL,
156}; 157};
157#endif /* CONFIG_FUNCTION_TRACER */ 158#endif /* CONFIG_FUNCTION_TRACER */
158 159
@@ -225,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
225} 226}
226 227
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ 228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC) 229 TRACE_GRAPH_PRINT_PROC | \
230 TRACE_GRAPH_PRINT_ABS_TIME | \
231 TRACE_GRAPH_PRINT_DURATION)
229 232
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 233static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{ 234{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 35d55a386145..5fb3697bf0e5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -53,7 +53,6 @@ const char *reserved_field_names[] = {
53 "common_preempt_count", 53 "common_preempt_count",
54 "common_pid", 54 "common_pid",
55 "common_tgid", 55 "common_tgid",
56 "common_lock_depth",
57 FIELD_STRING_IP, 56 FIELD_STRING_IP,
58 FIELD_STRING_RETIP, 57 FIELD_STRING_RETIP,
59 FIELD_STRING_FUNC, 58 FIELD_STRING_FUNC,
@@ -344,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref)
344DEFINE_FETCH_deref(string) 343DEFINE_FETCH_deref(string)
345DEFINE_FETCH_deref(string_size) 344DEFINE_FETCH_deref(string_size)
346 345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
347static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
348{ 355{
349 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
@@ -378,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield)
378#define fetch_bitfield_string_size NULL 385#define fetch_bitfield_string_size NULL
379 386
380static __kprobes void 387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
381free_bitfield_fetch_param(struct bitfield_fetch_param *data) 401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
382{ 402{
383 /* 403 /*
@@ -390,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
390 free_symbol_cache(data->orig.data); 410 free_symbol_cache(data->orig.data);
391 kfree(data); 411 kfree(data);
392} 412}
413
393/* Default (unsigned long) fetch type */ 414/* Default (unsigned long) fetch type */
394#define __DEFAULT_FETCH_TYPE(t) u##t 415#define __DEFAULT_FETCH_TYPE(t) u##t
395#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -537,6 +558,7 @@ struct probe_arg {
537/* Flags for trace_probe */ 558/* Flags for trace_probe */
538#define TP_FLAG_TRACE 1 559#define TP_FLAG_TRACE 1
539#define TP_FLAG_PROFILE 2 560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
540 562
541struct trace_probe { 563struct trace_probe {
542 struct list_head list; 564 struct list_head list;
@@ -556,16 +578,49 @@ struct trace_probe {
556 (sizeof(struct probe_arg) * (n))) 578 (sizeof(struct probe_arg) * (n)))
557 579
558 580
559static __kprobes int probe_is_return(struct trace_probe *tp) 581static __kprobes int trace_probe_is_return(struct trace_probe *tp)
560{ 582{
561 return tp->rp.handler != NULL; 583 return tp->rp.handler != NULL;
562} 584}
563 585
564static __kprobes const char *probe_symbol(struct trace_probe *tp) 586static __kprobes const char *trace_probe_symbol(struct trace_probe *tp)
565{ 587{
566 return tp->symbol ? tp->symbol : "unknown"; 588 return tp->symbol ? tp->symbol : "unknown";
567} 589}
568 590
591static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp)
592{
593 return tp->rp.kp.offset;
594}
595
596static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp)
597{
598 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
599}
600
601static __kprobes bool trace_probe_is_registered(struct trace_probe *tp)
602{
603 return !!(tp->flags & TP_FLAG_REGISTERED);
604}
605
606static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
607{
608 return !!(kprobe_gone(&tp->rp.kp));
609}
610
611static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
612 struct module *mod)
613{
614 int len = strlen(mod->name);
615 const char *name = trace_probe_symbol(tp);
616 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
617}
618
619static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
620{
621 return !!strchr(trace_probe_symbol(tp), ':');
622}
623
569static int register_probe_event(struct trace_probe *tp); 624static int register_probe_event(struct trace_probe *tp);
570static void unregister_probe_event(struct trace_probe *tp); 625static void unregister_probe_event(struct trace_probe *tp);
571 626
@@ -647,6 +702,16 @@ error:
647 return ERR_PTR(ret); 702 return ERR_PTR(ret);
648} 703}
649 704
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
650static void free_probe_arg(struct probe_arg *arg) 715static void free_probe_arg(struct probe_arg *arg)
651{ 716{
652 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) 717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
@@ -672,7 +737,7 @@ static void free_trace_probe(struct trace_probe *tp)
672 kfree(tp); 737 kfree(tp);
673} 738}
674 739
675static struct trace_probe *find_probe_event(const char *event, 740static struct trace_probe *find_trace_probe(const char *event,
676 const char *group) 741 const char *group)
677{ 742{
678 struct trace_probe *tp; 743 struct trace_probe *tp;
@@ -684,13 +749,96 @@ static struct trace_probe *find_probe_event(const char *event,
684 return NULL; 749 return NULL;
685} 750}
686 751
752/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
753static int enable_trace_probe(struct trace_probe *tp, int flag)
754{
755 int ret = 0;
756
757 tp->flags |= flag;
758 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&
759 !trace_probe_has_gone(tp)) {
760 if (trace_probe_is_return(tp))
761 ret = enable_kretprobe(&tp->rp);
762 else
763 ret = enable_kprobe(&tp->rp.kp);
764 }
765
766 return ret;
767}
768
769/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
770static void disable_trace_probe(struct trace_probe *tp, int flag)
771{
772 tp->flags &= ~flag;
773 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {
774 if (trace_probe_is_return(tp))
775 disable_kretprobe(&tp->rp);
776 else
777 disable_kprobe(&tp->rp.kp);
778 }
779}
780
781/* Internal register function - just handle k*probes and flags */
782static int __register_trace_probe(struct trace_probe *tp)
783{
784 int i, ret;
785
786 if (trace_probe_is_registered(tp))
787 return -EINVAL;
788
789 for (i = 0; i < tp->nr_args; i++)
790 update_probe_arg(&tp->args[i]);
791
792 /* Set/clear disabled flag according to tp->flag */
793 if (trace_probe_is_enabled(tp))
794 tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
795 else
796 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
797
798 if (trace_probe_is_return(tp))
799 ret = register_kretprobe(&tp->rp);
800 else
801 ret = register_kprobe(&tp->rp.kp);
802
803 if (ret == 0)
804 tp->flags |= TP_FLAG_REGISTERED;
805 else {
806 pr_warning("Could not insert probe at %s+%lu: %d\n",
807 trace_probe_symbol(tp), trace_probe_offset(tp), ret);
808 if (ret == -ENOENT && trace_probe_is_on_module(tp)) {
809 pr_warning("This probe might be able to register after "
810 "target module is loaded. Continue.\n");
811 ret = 0;
812 } else if (ret == -EILSEQ) {
813 pr_warning("Probing address(0x%p) is not an "
814 "instruction boundary.\n",
815 tp->rp.kp.addr);
816 ret = -EINVAL;
817 }
818 }
819
820 return ret;
821}
822
823/* Internal unregister function - just handle k*probes and flags */
824static void __unregister_trace_probe(struct trace_probe *tp)
825{
826 if (trace_probe_is_registered(tp)) {
827 if (trace_probe_is_return(tp))
828 unregister_kretprobe(&tp->rp);
829 else
830 unregister_kprobe(&tp->rp.kp);
831 tp->flags &= ~TP_FLAG_REGISTERED;
832 /* Cleanup kprobe for reuse */
833 if (tp->rp.kp.symbol_name)
834 tp->rp.kp.addr = NULL;
835 }
836}
837
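How the two flag bits are meant to combine with the enable/disable helpers above (illustrative call sequence, not code from the patch): the underlying k*probe stays armed while either the ftrace user or the perf user is active, and is only disarmed once both are gone.

	enable_trace_probe(tp, TP_FLAG_TRACE);    /* ftrace "enable" written: arms the kprobe   */
	enable_trace_probe(tp, TP_FLAG_PROFILE);  /* perf event attached: already armed, no-op  */
	disable_trace_probe(tp, TP_FLAG_TRACE);   /* ftrace "enable" cleared: still armed       */
	disable_trace_probe(tp, TP_FLAG_PROFILE); /* perf event detached: kprobe disarmed now   */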
687/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
688static void unregister_trace_probe(struct trace_probe *tp) 839static void unregister_trace_probe(struct trace_probe *tp)
689{ 840{
690 if (probe_is_return(tp)) 841 __unregister_trace_probe(tp);
691 unregister_kretprobe(&tp->rp);
692 else
693 unregister_kprobe(&tp->rp.kp);
694 list_del(&tp->list); 842 list_del(&tp->list);
695 unregister_probe_event(tp); 843 unregister_probe_event(tp);
696} 844}
@@ -703,41 +851,65 @@ static int register_trace_probe(struct trace_probe *tp)
703 851
704 mutex_lock(&probe_lock); 852 mutex_lock(&probe_lock);
705 853
706 /* register as an event */ 854 /* Delete old (same name) event if exist */
707 old_tp = find_probe_event(tp->call.name, tp->call.class->system); 855 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
708 if (old_tp) { 856 if (old_tp) {
709 /* delete old event */
710 unregister_trace_probe(old_tp); 857 unregister_trace_probe(old_tp);
711 free_trace_probe(old_tp); 858 free_trace_probe(old_tp);
712 } 859 }
860
861 /* Register new event */
713 ret = register_probe_event(tp); 862 ret = register_probe_event(tp);
714 if (ret) { 863 if (ret) {
715 pr_warning("Failed to register probe event(%d)\n", ret); 864 pr_warning("Failed to register probe event(%d)\n", ret);
716 goto end; 865 goto end;
717 } 866 }
718 867
719 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; 868 /* Register k*probe */
720 if (probe_is_return(tp)) 869 ret = __register_trace_probe(tp);
721 ret = register_kretprobe(&tp->rp); 870 if (ret < 0)
722 else
723 ret = register_kprobe(&tp->rp.kp);
724
725 if (ret) {
726 pr_warning("Could not insert probe(%d)\n", ret);
727 if (ret == -EILSEQ) {
728 pr_warning("Probing address(0x%p) is not an "
729 "instruction boundary.\n",
730 tp->rp.kp.addr);
731 ret = -EINVAL;
732 }
733 unregister_probe_event(tp); 871 unregister_probe_event(tp);
734 } else 872 else
735 list_add_tail(&tp->list, &probe_list); 873 list_add_tail(&tp->list, &probe_list);
874
736end: 875end:
737 mutex_unlock(&probe_lock); 876 mutex_unlock(&probe_lock);
738 return ret; 877 return ret;
739} 878}
740 879
880/* Module notifier call back, checking event on the module */
881static int trace_probe_module_callback(struct notifier_block *nb,
882 unsigned long val, void *data)
883{
884 struct module *mod = data;
885 struct trace_probe *tp;
886 int ret;
887
888 if (val != MODULE_STATE_COMING)
889 return NOTIFY_DONE;
890
891 /* Update probes on coming module */
892 mutex_lock(&probe_lock);
893 list_for_each_entry(tp, &probe_list, list) {
894 if (trace_probe_within_module(tp, mod)) {
895 __unregister_trace_probe(tp);
896 ret = __register_trace_probe(tp);
897 if (ret)
898 pr_warning("Failed to re-register probe %s on "
899 "%s: %d\n",
900 tp->call.name, mod->name, ret);
901 }
902 }
903 mutex_unlock(&probe_lock);
904
905 return NOTIFY_DONE;
906}
907
908static struct notifier_block trace_probe_module_nb = {
909 .notifier_call = trace_probe_module_callback,
910 .priority = 1 /* Invoked after kprobe module callback */
911};
912
741/* Split symbol and offset. */ 913/* Split symbol and offset. */
742static int split_symbol_offset(char *symbol, unsigned long *offset) 914static int split_symbol_offset(char *symbol, unsigned long *offset)
743{ 915{
@@ -963,8 +1135,8 @@ static int create_trace_probe(int argc, char **argv)
963{ 1135{
964 /* 1136 /*
965 * Argument syntax: 1137 * Argument syntax:
966 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 1138 * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
967 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 1139 * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
968 * Fetch args: 1140 * Fetch args:
969 * $retval : fetch return value 1141 * $retval : fetch return value
970 * $stack : fetch stack address 1142 * $stack : fetch stack address
@@ -1026,7 +1198,7 @@ static int create_trace_probe(int argc, char **argv)
1026 return -EINVAL; 1198 return -EINVAL;
1027 } 1199 }
1028 mutex_lock(&probe_lock); 1200 mutex_lock(&probe_lock);
1029 tp = find_probe_event(event, group); 1201 tp = find_trace_probe(event, group);
1030 if (!tp) { 1202 if (!tp) {
1031 mutex_unlock(&probe_lock); 1203 mutex_unlock(&probe_lock);
1032 pr_info("Event %s/%s doesn't exist.\n", group, event); 1204 pr_info("Event %s/%s doesn't exist.\n", group, event);
@@ -1145,7 +1317,7 @@ error:
1145 return ret; 1317 return ret;
1146} 1318}
1147 1319
1148static void cleanup_all_probes(void) 1320static void release_all_trace_probes(void)
1149{ 1321{
1150 struct trace_probe *tp; 1322 struct trace_probe *tp;
1151 1323
@@ -1159,7 +1331,6 @@ static void cleanup_all_probes(void)
1159 mutex_unlock(&probe_lock); 1331 mutex_unlock(&probe_lock);
1160} 1332}
1161 1333
1162
1163/* Probes listing interfaces */ 1334/* Probes listing interfaces */
1164static void *probes_seq_start(struct seq_file *m, loff_t *pos) 1335static void *probes_seq_start(struct seq_file *m, loff_t *pos)
1165{ 1336{
@@ -1182,15 +1353,16 @@ static int probes_seq_show(struct seq_file *m, void *v)
1182 struct trace_probe *tp = v; 1353 struct trace_probe *tp = v;
1183 int i; 1354 int i;
1184 1355
1185 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 1356 seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p');
1186 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); 1357 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
1187 1358
1188 if (!tp->symbol) 1359 if (!tp->symbol)
1189 seq_printf(m, " 0x%p", tp->rp.kp.addr); 1360 seq_printf(m, " 0x%p", tp->rp.kp.addr);
1190 else if (tp->rp.kp.offset) 1361 else if (tp->rp.kp.offset)
1191 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); 1362 seq_printf(m, " %s+%u", trace_probe_symbol(tp),
1363 tp->rp.kp.offset);
1192 else 1364 else
1193 seq_printf(m, " %s", probe_symbol(tp)); 1365 seq_printf(m, " %s", trace_probe_symbol(tp));
1194 1366
1195 for (i = 0; i < tp->nr_args; i++) 1367 for (i = 0; i < tp->nr_args; i++)
1196 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); 1368 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
@@ -1210,7 +1382,7 @@ static int probes_open(struct inode *inode, struct file *file)
1210{ 1382{
1211 if ((file->f_mode & FMODE_WRITE) && 1383 if ((file->f_mode & FMODE_WRITE) &&
1212 (file->f_flags & O_TRUNC)) 1384 (file->f_flags & O_TRUNC))
1213 cleanup_all_probes(); 1385 release_all_trace_probes();
1214 1386
1215 return seq_open(file, &probes_seq_op); 1387 return seq_open(file, &probes_seq_op);
1216} 1388}
@@ -1398,7 +1570,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1398 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1570 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1399 1571
1400 if (!filter_current_check_discard(buffer, call, entry, event)) 1572 if (!filter_current_check_discard(buffer, call, entry, event))
1401 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1573 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1574 irq_flags, pc, regs);
1402} 1575}
1403 1576
1404/* Kretprobe handler */ 1577/* Kretprobe handler */
@@ -1430,7 +1603,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1430 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1603 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1431 1604
1432 if (!filter_current_check_discard(buffer, call, entry, event)) 1605 if (!filter_current_check_discard(buffer, call, entry, event))
1433 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1606 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1607 irq_flags, pc, regs);
1434} 1608}
1435 1609
1436/* Event entry printers */ 1610/* Event entry printers */
@@ -1512,30 +1686,6 @@ partial:
1512 return TRACE_TYPE_PARTIAL_LINE; 1686 return TRACE_TYPE_PARTIAL_LINE;
1513} 1687}
1514 1688
1515static int probe_event_enable(struct ftrace_event_call *call)
1516{
1517 struct trace_probe *tp = (struct trace_probe *)call->data;
1518
1519 tp->flags |= TP_FLAG_TRACE;
1520 if (probe_is_return(tp))
1521 return enable_kretprobe(&tp->rp);
1522 else
1523 return enable_kprobe(&tp->rp.kp);
1524}
1525
1526static void probe_event_disable(struct ftrace_event_call *call)
1527{
1528 struct trace_probe *tp = (struct trace_probe *)call->data;
1529
1530 tp->flags &= ~TP_FLAG_TRACE;
1531 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1532 if (probe_is_return(tp))
1533 disable_kretprobe(&tp->rp);
1534 else
1535 disable_kprobe(&tp->rp.kp);
1536 }
1537}
1538
1539#undef DEFINE_FIELD 1689#undef DEFINE_FIELD
1540#define DEFINE_FIELD(type, item, name, is_signed) \ 1690#define DEFINE_FIELD(type, item, name, is_signed) \
1541 do { \ 1691 do { \
@@ -1597,7 +1747,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1597 1747
1598 const char *fmt, *arg; 1748 const char *fmt, *arg;
1599 1749
1600 if (!probe_is_return(tp)) { 1750 if (!trace_probe_is_return(tp)) {
1601 fmt = "(%lx)"; 1751 fmt = "(%lx)";
1602 arg = "REC->" FIELD_STRING_IP; 1752 arg = "REC->" FIELD_STRING_IP;
1603 } else { 1753 } else {
@@ -1714,49 +1864,25 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1714 head = this_cpu_ptr(call->perf_events); 1864 head = this_cpu_ptr(call->perf_events);
1715 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1865 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1716} 1866}
1717
1718static int probe_perf_enable(struct ftrace_event_call *call)
1719{
1720 struct trace_probe *tp = (struct trace_probe *)call->data;
1721
1722 tp->flags |= TP_FLAG_PROFILE;
1723
1724 if (probe_is_return(tp))
1725 return enable_kretprobe(&tp->rp);
1726 else
1727 return enable_kprobe(&tp->rp.kp);
1728}
1729
1730static void probe_perf_disable(struct ftrace_event_call *call)
1731{
1732 struct trace_probe *tp = (struct trace_probe *)call->data;
1733
1734 tp->flags &= ~TP_FLAG_PROFILE;
1735
1736 if (!(tp->flags & TP_FLAG_TRACE)) {
1737 if (probe_is_return(tp))
1738 disable_kretprobe(&tp->rp);
1739 else
1740 disable_kprobe(&tp->rp.kp);
1741 }
1742}
1743#endif /* CONFIG_PERF_EVENTS */ 1867#endif /* CONFIG_PERF_EVENTS */
1744 1868
1745static __kprobes 1869static __kprobes
1746int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) 1870int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1747{ 1871{
1872 struct trace_probe *tp = (struct trace_probe *)event->data;
1873
1748 switch (type) { 1874 switch (type) {
1749 case TRACE_REG_REGISTER: 1875 case TRACE_REG_REGISTER:
1750 return probe_event_enable(event); 1876 return enable_trace_probe(tp, TP_FLAG_TRACE);
1751 case TRACE_REG_UNREGISTER: 1877 case TRACE_REG_UNREGISTER:
1752 probe_event_disable(event); 1878 disable_trace_probe(tp, TP_FLAG_TRACE);
1753 return 0; 1879 return 0;
1754 1880
1755#ifdef CONFIG_PERF_EVENTS 1881#ifdef CONFIG_PERF_EVENTS
1756 case TRACE_REG_PERF_REGISTER: 1882 case TRACE_REG_PERF_REGISTER:
1757 return probe_perf_enable(event); 1883 return enable_trace_probe(tp, TP_FLAG_PROFILE);
1758 case TRACE_REG_PERF_UNREGISTER: 1884 case TRACE_REG_PERF_UNREGISTER:
1759 probe_perf_disable(event); 1885 disable_trace_probe(tp, TP_FLAG_PROFILE);
1760 return 0; 1886 return 0;
1761#endif 1887#endif
1762 } 1888 }
@@ -1806,7 +1932,7 @@ static int register_probe_event(struct trace_probe *tp)
1806 1932
1807 /* Initialize ftrace_event_call */ 1933 /* Initialize ftrace_event_call */
1808 INIT_LIST_HEAD(&call->class->fields); 1934 INIT_LIST_HEAD(&call->class->fields);
1809 if (probe_is_return(tp)) { 1935 if (trace_probe_is_return(tp)) {
1810 call->event.funcs = &kretprobe_funcs; 1936 call->event.funcs = &kretprobe_funcs;
1811 call->class->define_fields = kretprobe_event_define_fields; 1937 call->class->define_fields = kretprobe_event_define_fields;
1812 } else { 1938 } else {
@@ -1845,6 +1971,9 @@ static __init int init_kprobe_trace(void)
1845 struct dentry *d_tracer; 1971 struct dentry *d_tracer;
1846 struct dentry *entry; 1972 struct dentry *entry;
1847 1973
1974 if (register_module_notifier(&trace_probe_module_nb))
1975 return -EINVAL;
1976
1848 d_tracer = tracing_init_dentry(); 1977 d_tracer = tracing_init_dentry();
1849 if (!d_tracer) 1978 if (!d_tracer)
1850 return 0; 1979 return 0;
@@ -1871,8 +2000,12 @@ fs_initcall(init_kprobe_trace);
1871 2000
1872#ifdef CONFIG_FTRACE_STARTUP_TEST 2001#ifdef CONFIG_FTRACE_STARTUP_TEST
1873 2002
1874static int kprobe_trace_selftest_target(int a1, int a2, int a3, 2003/*
1875 int a4, int a5, int a6) 2004 * The "__used" keeps gcc from removing the function symbol
2005 * from the kallsyms table.
2006 */
2007static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
2008 int a4, int a5, int a6)
1876{ 2009{
1877 return a1 + a2 + a3 + a4 + a5 + a6; 2010 return a1 + a2 + a3 + a4 + a5 + a6;
1878} 2011}
@@ -1894,12 +2027,12 @@ static __init int kprobe_trace_self_tests_init(void)
1894 warn++; 2027 warn++;
1895 } else { 2028 } else {
1896 /* Enable trace point */ 2029 /* Enable trace point */
1897 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); 2030 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
1898 if (WARN_ON_ONCE(tp == NULL)) { 2031 if (WARN_ON_ONCE(tp == NULL)) {
1899 pr_warning("error on getting new probe.\n"); 2032 pr_warning("error on getting new probe.\n");
1900 warn++; 2033 warn++;
1901 } else 2034 } else
1902 probe_event_enable(&tp->call); 2035 enable_trace_probe(tp, TP_FLAG_TRACE);
1903 } 2036 }
1904 2037
1905 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 2038 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
@@ -1909,12 +2042,12 @@ static __init int kprobe_trace_self_tests_init(void)
1909 warn++; 2042 warn++;
1910 } else { 2043 } else {
1911 /* Enable trace point */ 2044 /* Enable trace point */
1912 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); 2045 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
1913 if (WARN_ON_ONCE(tp == NULL)) { 2046 if (WARN_ON_ONCE(tp == NULL)) {
1914 pr_warning("error on getting new probe.\n"); 2047 pr_warning("error on getting new probe.\n");
1915 warn++; 2048 warn++;
1916 } else 2049 } else
1917 probe_event_enable(&tp->call); 2050 enable_trace_probe(tp, TP_FLAG_TRACE);
1918 } 2051 }
1919 2052
1920 if (warn) 2053 if (warn)
@@ -1935,7 +2068,7 @@ static __init int kprobe_trace_self_tests_init(void)
1935 } 2068 }
1936 2069
1937end: 2070end:
1938 cleanup_all_probes(); 2071 release_all_trace_probes();
1939 if (warn) 2072 if (warn)
1940 pr_cont("NG: Some tests are failed. Please check them.\n"); 2073 pr_cont("NG: Some tests are failed. Please check them.\n");
1941 else 2074 else
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 017fa376505d..fd3c8aae55e5 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/time.h> 13#include <linux/time.h>
14 14
15#include <asm/atomic.h> 15#include <linux/atomic.h>
16 16
17#include "trace.h" 17#include "trace.h"
18#include "trace_output.h" 18#include "trace_output.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 456be9063c2d..51999309a6cf 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
353} 353}
354EXPORT_SYMBOL(ftrace_print_symbols_seq); 354EXPORT_SYMBOL(ftrace_print_symbols_seq);
355 355
356#if BITS_PER_LONG == 32
357const char *
358ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
359 const struct trace_print_flags_u64 *symbol_array)
360{
361 int i;
362 const char *ret = p->buffer + p->len;
363
364 for (i = 0; symbol_array[i].name; i++) {
365
366 if (val != symbol_array[i].mask)
367 continue;
368
369 trace_seq_puts(p, symbol_array[i].name);
370 break;
371 }
372
373 if (!p->len)
374 trace_seq_printf(p, "0x%llx", val);
375
376 trace_seq_putc(p, 0);
377
378 return ret;
379}
380EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
381#endif
382
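A sketch of how a 32-bit build would use the new helper to print a symbolic name for a 64-bit value; the my_flags table is made up for illustration, and a 64-bit kernel can keep using ftrace_print_symbols_seq() directly since unsigned long is already 64 bits there:

static const struct trace_print_flags_u64 my_flags[] = {
	{ .mask = 0x0000000100000000ULL,	.name = "HIGH_BIT_SET" },
	{ .mask = 0x0000000200000000ULL,	.name = "OTHER_FLAG"   },
	{ .mask = -1ULL,			.name = NULL }	/* terminator */
};

/* inside an event's output callback: */
/*	ftrace_print_symbols_seq_u64(p, field->val64, my_flags); */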
356const char * 383const char *
357ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 384ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
358{ 385{
@@ -830,6 +857,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
830enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 857enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
831 struct trace_event *event) 858 struct trace_event *event)
832{ 859{
860 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
861 return TRACE_TYPE_PARTIAL_LINE;
862
833 return TRACE_TYPE_HANDLED; 863 return TRACE_TYPE_HANDLED;
834} 864}
835 865
@@ -1077,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1077{ 1107{
1078 struct stack_entry *field; 1108 struct stack_entry *field;
1079 struct trace_seq *s = &iter->seq; 1109 struct trace_seq *s = &iter->seq;
1080 int i; 1110 unsigned long *p;
1111 unsigned long *end;
1081 1112
1082 trace_assign_type(field, iter->ent); 1113 trace_assign_type(field, iter->ent);
1114 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1083 1115
1084 if (!trace_seq_puts(s, "<stack trace>\n")) 1116 if (!trace_seq_puts(s, "<stack trace>\n"))
1085 goto partial; 1117 goto partial;
1086 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1118
1087 if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) 1119 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1088 break;
1089 if (!trace_seq_puts(s, " => ")) 1120 if (!trace_seq_puts(s, " => "))
1090 goto partial; 1121 goto partial;
1091 1122
1092 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1123 if (!seq_print_ip_sym(s, *p, flags))
1093 goto partial; 1124 goto partial;
1094 if (!trace_seq_puts(s, "\n")) 1125 if (!trace_seq_puts(s, "\n"))
1095 goto partial; 1126 goto partial;
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf0..1f06468a10d7 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex);
32 32
33struct trace_bprintk_fmt { 33struct trace_bprintk_fmt {
34 struct list_head list; 34 struct list_head list;
35 char fmt[0]; 35 const char *fmt;
36}; 36};
37 37
38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) 38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
@@ -49,6 +49,7 @@ static
49void hold_module_trace_bprintk_format(const char **start, const char **end) 49void hold_module_trace_bprintk_format(const char **start, const char **end)
50{ 50{
51 const char **iter; 51 const char **iter;
52 char *fmt;
52 53
53 mutex_lock(&btrace_mutex); 54 mutex_lock(&btrace_mutex);
54 for (iter = start; iter < end; iter++) { 55 for (iter = start; iter < end; iter++) {
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
58 continue; 59 continue;
59 } 60 }
60 61
61 tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) 62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
62 + strlen(*iter) + 1, GFP_KERNEL); 63 if (tb_fmt)
63 if (tb_fmt) { 64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) {
64 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
65 strcpy(tb_fmt->fmt, *iter); 67 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt;
66 *iter = tb_fmt->fmt; 69 *iter = tb_fmt->fmt;
67 } else 70 } else {
71 kfree(tb_fmt);
68 *iter = NULL; 72 *iter = NULL;
73 }
69 } 74 }
70 mutex_unlock(&btrace_mutex); 75 mutex_unlock(&btrace_mutex);
71} 76}
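For clarity, here is the node-then-string allocation pattern this hunk switches to, pulled out into a hypothetical helper (a sketch only; alloc_fmt_entry() does not exist in the kernel). Keeping the copied string in its own allocation gives every entry a stable const char *fmt member whose address can later be handed to the printk_formats iterator added below.

/*
 * Sketch only: the two-step allocation used above, factored into a
 * hypothetical helper. Assumes <linux/slab.h>, <linux/string.h> and the
 * trace_bprintk_fmt definition from this file.
 */
static struct trace_bprintk_fmt *alloc_fmt_entry(const char *src)
{
	struct trace_bprintk_fmt *tb_fmt;
	char *fmt;

	tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
	if (!tb_fmt)
		return NULL;

	fmt = kmalloc(strlen(src) + 1, GFP_KERNEL);
	if (!fmt) {
		kfree(tb_fmt);		/* don't leak the node on failure */
		return NULL;
	}

	strcpy(fmt, src);
	tb_fmt->fmt = fmt;
	return tb_fmt;
}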
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,
84 return 0; 89 return 0;
85} 90}
86 91
92/*
93 * The debugfs/tracing/printk_formats file maps the addresses to the
94 * ASCII formats that are used in the bprintk events in the ring
95 * buffer. For userspace tools to decode the events from the buffer,
96 * they need to be able to map each address to its format.
97 *
98 * The addresses of the bprintk formats live in their own section,
99 * __trace_printk_fmt. For modules we copy them into a linked list.
100 * The code that prints the formats and their addresses passes around
101 * the address of the fmt string. If the fmt address passed into the
102 * seq functions is within the kernel core __trace_printk_fmt section,
103 * the next format is simply the next entry in that array.
104 *
105 * When the fmt pointer is outside the kernel core __trace_printk_fmt
106 * section, we need to follow the linked-list pointers instead. The
107 * trick is that we pass the address of the string to the seq function
108 * just as we do for the kernel core formats. To get back the structure
109 * that holds the format, we simply use container_of() and then move to
110 * the next format in the list.
111 */
112static const char **
113find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
114{
115 struct trace_bprintk_fmt *mod_fmt;
116
117 if (list_empty(&trace_bprintk_fmt_list))
118 return NULL;
119
120	/*
121	 * v points to the address of the fmt record when we come from
122	 * t_next and is NULL when we come from t_start. If v is NULL,
123	 * or *pos has just crossed into the first module format, walk
124	 * the list from the beginning.
125	 */
126 if (!v || start_index == *pos) {
127 struct trace_bprintk_fmt *p;
128
129 /* search the module list */
130 list_for_each_entry(p, &trace_bprintk_fmt_list, list) {
131 if (start_index == *pos)
132 return &p->fmt;
133 start_index++;
134 }
135 /* pos > index */
136 return NULL;
137 }
138
139	/*
140	 * v points to the address of the fmt field of the list entry
141	 * that holds the module's print format.
142	 */
143 mod_fmt = container_of(v, typeof(*mod_fmt), fmt);
144 if (mod_fmt->list.next == &trace_bprintk_fmt_list)
145 return NULL;
146
147 mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list);
148
149 return &mod_fmt->fmt;
150}
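The container_of() step described in the comment above can be hard to picture, so here is a minimal standalone sketch (illustrative names, not code from the patch) of how the &fmt address handed to the seq_file iterator is turned back into its list node:

/*
 * Sketch: recover the list node from the &fmt pointer the iterator hands out.
 * Assumes <linux/kernel.h> for container_of() and the struct from this file.
 */
static struct trace_bprintk_fmt *fmt_to_entry(const char **v)
{
	/* v is the address of the fmt member, so container_of() yields the node */
	return container_of(v, struct trace_bprintk_fmt, fmt);
}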
151
152static void format_mod_start(void)
153{
154 mutex_lock(&btrace_mutex);
155}
156
157static void format_mod_stop(void)
158{
159 mutex_unlock(&btrace_mutex);
160}
161
87#else /* !CONFIG_MODULES */ 162#else /* !CONFIG_MODULES */
88__init static int 163__init static int
89module_trace_bprintk_format_notify(struct notifier_block *self, 164module_trace_bprintk_format_notify(struct notifier_block *self,
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self,
91{ 166{
92 return 0; 167 return 0;
93} 168}
169static inline const char **
170find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
171{
172 return NULL;
173}
174static inline void format_mod_start(void) { }
175static inline void format_mod_stop(void) { }
94#endif /* CONFIG_MODULES */ 176#endif /* CONFIG_MODULES */
95 177
96 178
@@ -153,20 +235,30 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
153} 235}
154EXPORT_SYMBOL_GPL(__ftrace_vprintk); 236EXPORT_SYMBOL_GPL(__ftrace_vprintk);
155 237
238static const char **find_next(void *v, loff_t *pos)
239{
240 const char **fmt = v;
241 int start_index;
242
243 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
244
245 if (*pos < start_index)
246 return __start___trace_bprintk_fmt + *pos;
247
248 return find_next_mod_format(start_index, v, fmt, pos);
249}
250
156static void * 251static void *
157t_start(struct seq_file *m, loff_t *pos) 252t_start(struct seq_file *m, loff_t *pos)
158{ 253{
159 const char **fmt = __start___trace_bprintk_fmt + *pos; 254 format_mod_start();
160 255 return find_next(NULL, pos);
161 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
162 return NULL;
163 return fmt;
164} 256}
165 257
166static void *t_next(struct seq_file *m, void * v, loff_t *pos) 258static void *t_next(struct seq_file *m, void * v, loff_t *pos)
167{ 259{
168 (*pos)++; 260 (*pos)++;
169 return t_start(m, pos); 261 return find_next(v, pos);
170} 262}
171 263
172static int t_show(struct seq_file *m, void *v) 264static int t_show(struct seq_file *m, void *v)
@@ -205,6 +297,7 @@ static int t_show(struct seq_file *m, void *v)
205 297
206static void t_stop(struct seq_file *m, void *p) 298static void t_stop(struct seq_file *m, void *p)
207{ 299{
300 format_mod_stop();
208} 301}
209 302
210static const struct seq_operations show_format_seq_ops = { 303static const struct seq_operations show_format_seq_ops = {
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7319559ed59f..e4a70c0c71b6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
129static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
130{ 130{
131 .func = wakeup_tracer_call, 131 .func = wakeup_tracer_call,
132 .flags = FTRACE_OPS_FL_GLOBAL,
132}; 133};
133#endif /* CONFIG_FUNCTION_TRACER */ 134#endif /* CONFIG_FUNCTION_TRACER */
134 135
@@ -226,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter)
226 graph_trace_close(iter); 227 graph_trace_close(iter);
227} 228}
228 229
229#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) 230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \
231 TRACE_GRAPH_PRINT_ABS_TIME | \
232 TRACE_GRAPH_PRINT_DURATION)
230 233
231static enum print_line_t wakeup_print_line(struct trace_iterator *iter) 234static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
232{ 235{
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 659732eba07c..288541f977fb 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
101 101
102#ifdef CONFIG_DYNAMIC_FTRACE 102#ifdef CONFIG_DYNAMIC_FTRACE
103 103
104static int trace_selftest_test_probe1_cnt;
105static void trace_selftest_test_probe1_func(unsigned long ip,
106 unsigned long pip)
107{
108 trace_selftest_test_probe1_cnt++;
109}
110
111static int trace_selftest_test_probe2_cnt;
112static void trace_selftest_test_probe2_func(unsigned long ip,
113 unsigned long pip)
114{
115 trace_selftest_test_probe2_cnt++;
116}
117
118static int trace_selftest_test_probe3_cnt;
119static void trace_selftest_test_probe3_func(unsigned long ip,
120 unsigned long pip)
121{
122 trace_selftest_test_probe3_cnt++;
123}
124
125static int trace_selftest_test_global_cnt;
126static void trace_selftest_test_global_func(unsigned long ip,
127 unsigned long pip)
128{
129 trace_selftest_test_global_cnt++;
130}
131
132static int trace_selftest_test_dyn_cnt;
133static void trace_selftest_test_dyn_func(unsigned long ip,
134 unsigned long pip)
135{
136 trace_selftest_test_dyn_cnt++;
137}
138
139static struct ftrace_ops test_probe1 = {
140 .func = trace_selftest_test_probe1_func,
141};
142
143static struct ftrace_ops test_probe2 = {
144 .func = trace_selftest_test_probe2_func,
145};
146
147static struct ftrace_ops test_probe3 = {
148 .func = trace_selftest_test_probe3_func,
149};
150
151static struct ftrace_ops test_global = {
152 .func = trace_selftest_test_global_func,
153 .flags = FTRACE_OPS_FL_GLOBAL,
154};
155
156static void print_counts(void)
157{
158 printk("(%d %d %d %d %d) ",
159 trace_selftest_test_probe1_cnt,
160 trace_selftest_test_probe2_cnt,
161 trace_selftest_test_probe3_cnt,
162 trace_selftest_test_global_cnt,
163 trace_selftest_test_dyn_cnt);
164}
165
166static void reset_counts(void)
167{
168 trace_selftest_test_probe1_cnt = 0;
169 trace_selftest_test_probe2_cnt = 0;
170 trace_selftest_test_probe3_cnt = 0;
171 trace_selftest_test_global_cnt = 0;
172 trace_selftest_test_dyn_cnt = 0;
173}
174
175static int trace_selftest_ops(int cnt)
176{
177 int save_ftrace_enabled = ftrace_enabled;
178 struct ftrace_ops *dyn_ops;
179 char *func1_name;
180 char *func2_name;
181 int len1;
182 int len2;
183 int ret = -1;
184
185 printk(KERN_CONT "PASSED\n");
186 pr_info("Testing dynamic ftrace ops #%d: ", cnt);
187
188 ftrace_enabled = 1;
189 reset_counts();
190
191 /* Handle PPC64 '.' name */
192 func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
193 func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2);
194 len1 = strlen(func1_name);
195 len2 = strlen(func2_name);
196
197 /*
198 * Probe 1 will trace function 1.
199 * Probe 2 will trace function 2.
200 * Probe 3 will trace functions 1 and 2.
201 */
202 ftrace_set_filter(&test_probe1, func1_name, len1, 1);
203 ftrace_set_filter(&test_probe2, func2_name, len2, 1);
204 ftrace_set_filter(&test_probe3, func1_name, len1, 1);
205 ftrace_set_filter(&test_probe3, func2_name, len2, 0);
206
207 register_ftrace_function(&test_probe1);
208 register_ftrace_function(&test_probe2);
209 register_ftrace_function(&test_probe3);
210 register_ftrace_function(&test_global);
211
212 DYN_FTRACE_TEST_NAME();
213
214 print_counts();
215
216 if (trace_selftest_test_probe1_cnt != 1)
217 goto out;
218 if (trace_selftest_test_probe2_cnt != 0)
219 goto out;
220 if (trace_selftest_test_probe3_cnt != 1)
221 goto out;
222 if (trace_selftest_test_global_cnt == 0)
223 goto out;
224
225 DYN_FTRACE_TEST_NAME2();
226
227 print_counts();
228
229 if (trace_selftest_test_probe1_cnt != 1)
230 goto out;
231 if (trace_selftest_test_probe2_cnt != 1)
232 goto out;
233 if (trace_selftest_test_probe3_cnt != 2)
234 goto out;
235
236 /* Add a dynamic probe */
237 dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL);
238 if (!dyn_ops) {
239 printk("MEMORY ERROR ");
240 goto out;
241 }
242
243 dyn_ops->func = trace_selftest_test_dyn_func;
244
245 register_ftrace_function(dyn_ops);
246
247 trace_selftest_test_global_cnt = 0;
248
249 DYN_FTRACE_TEST_NAME();
250
251 print_counts();
252
253 if (trace_selftest_test_probe1_cnt != 2)
254 goto out_free;
255 if (trace_selftest_test_probe2_cnt != 1)
256 goto out_free;
257 if (trace_selftest_test_probe3_cnt != 3)
258 goto out_free;
259 if (trace_selftest_test_global_cnt == 0)
260 goto out;
261 if (trace_selftest_test_dyn_cnt == 0)
262 goto out_free;
263
264 DYN_FTRACE_TEST_NAME2();
265
266 print_counts();
267
268 if (trace_selftest_test_probe1_cnt != 2)
269 goto out_free;
270 if (trace_selftest_test_probe2_cnt != 2)
271 goto out_free;
272 if (trace_selftest_test_probe3_cnt != 4)
273 goto out_free;
274
275 ret = 0;
276 out_free:
277 unregister_ftrace_function(dyn_ops);
278 kfree(dyn_ops);
279
280 out:
281 /* Purposely unregister in the same order */
282 unregister_ftrace_function(&test_probe1);
283 unregister_ftrace_function(&test_probe2);
284 unregister_ftrace_function(&test_probe3);
285 unregister_ftrace_function(&test_global);
286
287 /* Make sure everything is off */
288 reset_counts();
289 DYN_FTRACE_TEST_NAME();
290 DYN_FTRACE_TEST_NAME();
291
292 if (trace_selftest_test_probe1_cnt ||
293 trace_selftest_test_probe2_cnt ||
294 trace_selftest_test_probe3_cnt ||
295 trace_selftest_test_global_cnt ||
296 trace_selftest_test_dyn_cnt)
297 ret = -1;
298
299 ftrace_enabled = save_ftrace_enabled;
300
301 return ret;
302}
303
104/* Test dynamic code modification and ftrace filters */ 304/* Test dynamic code modification and ftrace filters */
105int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 305int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
106 struct trace_array *tr, 306 struct trace_array *tr,
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
131 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 331 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
132 332
133 /* filter only on our function */ 333 /* filter only on our function */
134 ftrace_set_filter(func_name, strlen(func_name), 1); 334 ftrace_set_global_filter(func_name, strlen(func_name), 1);
135 335
136 /* enable tracing */ 336 /* enable tracing */
137 ret = tracer_init(trace, tr); 337 ret = tracer_init(trace, tr);
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
166 366
167 /* check the trace buffer */ 367 /* check the trace buffer */
168 ret = trace_test_buffer(tr, &count); 368 ret = trace_test_buffer(tr, &count);
169 trace->reset(tr);
170 tracing_start(); 369 tracing_start();
171 370
172 /* we should only have one item */ 371 /* we should only have one item */
173 if (!ret && count != 1) { 372 if (!ret && count != 1) {
373 trace->reset(tr);
174 printk(KERN_CONT ".. filter failed count=%ld ..", count); 374 printk(KERN_CONT ".. filter failed count=%ld ..", count);
175 ret = -1; 375 ret = -1;
176 goto out; 376 goto out;
177 } 377 }
178 378
379 /* Test the ops with global tracing running */
380 ret = trace_selftest_ops(1);
381 trace->reset(tr);
382
179 out: 383 out:
180 ftrace_enabled = save_ftrace_enabled; 384 ftrace_enabled = save_ftrace_enabled;
181 tracer_enabled = save_tracer_enabled; 385 tracer_enabled = save_tracer_enabled;
182 386
183 /* Enable tracing on all functions again */ 387 /* Enable tracing on all functions again */
184 ftrace_set_filter(NULL, 0, 1); 388 ftrace_set_global_filter(NULL, 0, 1);
389
390 /* Test the ops with global tracing off */
391 if (!ret)
392 ret = trace_selftest_ops(2);
185 393
186 return ret; 394 return ret;
187} 395}
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 54dd77cce5bf..b4c475a0a48b 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void)
5 /* used to call mcount */ 5 /* used to call mcount */
6 return 0; 6 return 0;
7} 7}
8
9int DYN_FTRACE_TEST_NAME2(void)
10{
11 /* used to call mcount */
12 return 0;
13}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4c5dead0c239..77575b386d97 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
134{ 134{
135 .func = stack_trace_call, 135 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_GLOBAL,
136}; 137};
137 138
138static ssize_t 139static ssize_t
@@ -155,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
155{ 156{
156 long *ptr = filp->private_data; 157 long *ptr = filp->private_data;
157 unsigned long val, flags; 158 unsigned long val, flags;
158 char buf[64];
159 int ret; 159 int ret;
160 int cpu; 160 int cpu;
161 161
162 if (count >= sizeof(buf)) 162 ret = kstrtoul_from_user(ubuf, count, 10, &val);
163 return -EINVAL; 163 if (ret)
164
165 if (copy_from_user(&buf, ubuf, count))
166 return -EFAULT;
167
168 buf[count] = 0;
169
170 ret = strict_strtoul(buf, 10, &val);
171 if (ret < 0)
172 return ret; 164 return ret;
173 165
174 local_irq_save(flags); 166 local_irq_save(flags);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 68187af4889e..b219f1449c54 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
251{ 251{
252 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 252 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
253 253
254 if (elem->regfunc && !elem->state && active) 254 if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
255 elem->regfunc(); 255 elem->regfunc();
256 else if (elem->unregfunc && elem->state && !active) 256 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
257 elem->unregfunc(); 257 elem->unregfunc();
258 258
259 /* 259 /*
@@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
264 * is used. 264 * is used.
265 */ 265 */
266 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
267 if (!elem->state && active) { 267 if (active && !jump_label_enabled(&elem->key))
268 jump_label_enable(&elem->state); 268 jump_label_inc(&elem->key);
269 elem->state = active; 269 else if (!active && jump_label_enabled(&elem->key))
270 } else if (elem->state && !active) { 270 jump_label_dec(&elem->key);
271 jump_label_disable(&elem->state);
272 elem->state = active;
273 }
274} 271}
275 272
276/* 273/*
@@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
281 */ 278 */
282static void disable_tracepoint(struct tracepoint *elem) 279static void disable_tracepoint(struct tracepoint *elem)
283{ 280{
284 if (elem->unregfunc && elem->state) 281 if (elem->unregfunc && jump_label_enabled(&elem->key))
285 elem->unregfunc(); 282 elem->unregfunc();
286 283
287 if (elem->state) { 284 if (jump_label_enabled(&elem->key))
288 jump_label_disable(&elem->state); 285 jump_label_dec(&elem->key);
289 elem->state = 0;
290 }
291 rcu_assign_pointer(elem->funcs, NULL); 286 rcu_assign_pointer(elem->funcs, NULL);
292} 287}
293 288
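The tracepoint hunks above drop the open-coded elem->state flag in favour of reference-counted jump labels. A rough sketch of the enable/disable pattern, using the jump_label API as it exists in this kernel series (sketch only, not code from the patch):

/*
 * Sketch: refcounted branch enable/disable with jump_label_inc()/dec().
 * Assumes <linux/jump_label.h> of this series (struct jump_label_key).
 */
static void example_set_active(struct jump_label_key *key, bool active)
{
	if (active && !jump_label_enabled(key))
		jump_label_inc(key);	/* 0 -> 1: patch the static branch in */
	else if (!active && jump_label_enabled(key))
		jump_label_dec(key);	/* 1 -> 0: patch the static branch out */
}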
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 44646179eaba..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,7 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h>
18 19
19static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
20{ 21{
@@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref)
79 put_user_ns(ns->user_ns); 80 put_user_ns(ns->user_ns);
80 kfree(ns); 81 kfree(ns);
81} 82}
83
84static void *utsns_get(struct task_struct *task)
85{
86 struct uts_namespace *ns = NULL;
87 struct nsproxy *nsproxy;
88
89 rcu_read_lock();
90 nsproxy = task_nsproxy(task);
91 if (nsproxy) {
92 ns = nsproxy->uts_ns;
93 get_uts_ns(ns);
94 }
95 rcu_read_unlock();
96
97 return ns;
98}
99
100static void utsns_put(void *ns)
101{
102 put_uts_ns(ns);
103}
104
105static int utsns_install(struct nsproxy *nsproxy, void *ns)
106{
107 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns;
110 return 0;
111}
112
113const struct proc_ns_operations utsns_operations = {
114 .name = "uts",
115 .type = CLONE_NEWUTS,
116 .get = utsns_get,
117 .put = utsns_put,
118 .install = utsns_install,
119};
120
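The utsns_operations above plug the UTS namespace into the generic /proc/<pid>/ns machinery: .get pins the target task's namespace, .put drops the reference, and .install swaps it into the caller's nsproxy for setns(2). A userspace-side illustration of what this enables (hypothetical helper, assuming a libc that exposes setns()):

/* Sketch: join another task's UTS namespace through /proc/<pid>/ns/uts. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static int join_uts_ns(pid_t pid)
{
	char path[64];
	int fd, ret;

	snprintf(path, sizeof(path), "/proc/%d/ns/uts", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	ret = setns(fd, CLONE_NEWUTS);	/* ends up in utsns_install() above */
	close(fd);
	return ret;
}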
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14733d4d156b..36491cd5b7d4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -28,7 +28,7 @@
28#include <linux/perf_event.h> 28#include <linux/perf_event.h>
29 29
30int watchdog_enabled = 1; 30int watchdog_enabled = 1;
31int __read_mostly softlockup_thresh = 60; 31int __read_mostly watchdog_thresh = 10;
32 32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str)
91__setup("nosoftlockup", nosoftlockup_setup); 91__setup("nosoftlockup", nosoftlockup_setup);
92/* */ 92/* */
93 93
94/*
95 * Hard-lockup warnings should be triggered after just a few seconds, while
96 * soft lockups can produce false positives under extreme conditions, so we
97 * generally want a higher threshold for soft lockups than for hard lockups.
98 * The two thresholds are therefore coupled by a fixed factor: the soft
99 * threshold is twice the hard threshold.
100 */
101static int get_softlockup_thresh(void)
102{
103 return watchdog_thresh * 2;
104}
94 105
95/* 106/*
96 * Returns seconds, approximately. We don't need nanosecond 107 * Returns seconds, approximately. We don't need nanosecond
@@ -105,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu)
105static unsigned long get_sample_period(void) 116static unsigned long get_sample_period(void)
106{ 117{
107 /* 118 /*
108 * convert softlockup_thresh from seconds to ns 119 * convert watchdog_thresh from seconds to ns
109 * the divide by 5 is to give hrtimer 5 chances to 120 * the divide by 5 is to give hrtimer 5 chances to
110 * increment before the hardlockup detector generates 121 * increment before the hardlockup detector generates
111 * a warning 122 * a warning
112 */ 123 */
113 return softlockup_thresh / 5 * NSEC_PER_SEC; 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
114} 125}
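As a worked example with the new default watchdog_thresh = 10: get_softlockup_thresh() returns 2 * 10 = 20, so a CPU is reported soft-locked after roughly 20 seconds without the watchdog task running, and get_sample_period() yields 20 * (NSEC_PER_SEC / 5) = 4 seconds, i.e. the per-CPU hrtimer fires five times within that window. The hard-lockup perf event is scaled separately from the raw 10-second threshold via hw_nmi_get_sample_period(watchdog_thresh).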
115 126
116/* Commands for resetting the watchdog */ 127/* Commands for resetting the watchdog */
@@ -182,13 +193,14 @@ static int is_softlockup(unsigned long touch_ts)
182 unsigned long now = get_timestamp(smp_processor_id()); 193 unsigned long now = get_timestamp(smp_processor_id());
183 194
184 /* Warn about unreasonable delays: */ 195 /* Warn about unreasonable delays: */
185 if (time_after(now, touch_ts + softlockup_thresh)) 196 if (time_after(now, touch_ts + get_softlockup_thresh()))
186 return now - touch_ts; 197 return now - touch_ts;
187 198
188 return 0; 199 return 0;
189} 200}
190 201
191#ifdef CONFIG_HARDLOCKUP_DETECTOR 202#ifdef CONFIG_HARDLOCKUP_DETECTOR
203
192static struct perf_event_attr wd_hw_attr = { 204static struct perf_event_attr wd_hw_attr = {
193 .type = PERF_TYPE_HARDWARE, 205 .type = PERF_TYPE_HARDWARE,
194 .config = PERF_COUNT_HW_CPU_CYCLES, 206 .config = PERF_COUNT_HW_CPU_CYCLES,
@@ -198,7 +210,7 @@ static struct perf_event_attr wd_hw_attr = {
198}; 210};
199 211
200/* Callback function for perf event subsystem */ 212/* Callback function for perf event subsystem */
201static void watchdog_overflow_callback(struct perf_event *event, int nmi, 213static void watchdog_overflow_callback(struct perf_event *event,
202 struct perf_sample_data *data, 214 struct perf_sample_data *data,
203 struct pt_regs *regs) 215 struct pt_regs *regs)
204{ 216{
@@ -357,10 +369,11 @@ static int watchdog_nmi_enable(int cpu)
357 if (event != NULL) 369 if (event != NULL)
358 goto out_enable; 370 goto out_enable;
359 371
360 /* Try to register using hardware perf events */
361 wd_attr = &wd_hw_attr; 372 wd_attr = &wd_hw_attr;
362 wd_attr->sample_period = hw_nmi_get_sample_period(); 373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
363 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); 374
375 /* Try to register using hardware perf events */
376 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
364 if (!IS_ERR(event)) { 377 if (!IS_ERR(event)) {
365 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 378 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
366 goto out_save; 379 goto out_save;
@@ -404,15 +417,13 @@ static void watchdog_nmi_disable(int cpu) { return; }
404#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 417#endif /* CONFIG_HARDLOCKUP_DETECTOR */
405 418
406/* prepare/enable/disable routines */ 419/* prepare/enable/disable routines */
407static int watchdog_prepare_cpu(int cpu) 420static void watchdog_prepare_cpu(int cpu)
408{ 421{
409 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); 422 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
410 423
411 WARN_ON(per_cpu(softlockup_watchdog, cpu)); 424 WARN_ON(per_cpu(softlockup_watchdog, cpu));
412 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 425 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
413 hrtimer->function = watchdog_timer_fn; 426 hrtimer->function = watchdog_timer_fn;
414
415 return 0;
416} 427}
417 428
418static int watchdog_enable(int cpu) 429static int watchdog_enable(int cpu)
@@ -501,28 +512,25 @@ static void watchdog_disable_all_cpus(void)
501/* sysctl functions */ 512/* sysctl functions */
502#ifdef CONFIG_SYSCTL 513#ifdef CONFIG_SYSCTL
503/* 514/*
504 * proc handler for /proc/sys/kernel/nmi_watchdog	515 * proc handler for /proc/sys/kernel/nmi_watchdog and watchdog_thresh
505 */ 516 */
506 517
507int proc_dowatchdog_enabled(struct ctl_table *table, int write, 518int proc_dowatchdog(struct ctl_table *table, int write,
508 void __user *buffer, size_t *length, loff_t *ppos) 519 void __user *buffer, size_t *lenp, loff_t *ppos)
509{ 520{
510 proc_dointvec(table, write, buffer, length, ppos); 521 int ret;
511 522
512 if (write) { 523 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
513 if (watchdog_enabled) 524 if (ret || !write)
514 watchdog_enable_all_cpus(); 525 goto out;
515 else
516 watchdog_disable_all_cpus();
517 }
518 return 0;
519}
520 526
521int proc_dowatchdog_thresh(struct ctl_table *table, int write, 527 if (watchdog_enabled && watchdog_thresh)
522 void __user *buffer, 528 watchdog_enable_all_cpus();
523 size_t *lenp, loff_t *ppos) 529 else
524{ 530 watchdog_disable_all_cpus();
525 return proc_dointvec_minmax(table, write, buffer, lenp, ppos); 531
532out:
533 return ret;
526} 534}
527#endif /* CONFIG_SYSCTL */ 535#endif /* CONFIG_SYSCTL */
528 536
@@ -534,17 +542,16 @@ static int __cpuinit
534cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 542cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
535{ 543{
536 int hotcpu = (unsigned long)hcpu; 544 int hotcpu = (unsigned long)hcpu;
537 int err = 0;
538 545
539 switch (action) { 546 switch (action) {
540 case CPU_UP_PREPARE: 547 case CPU_UP_PREPARE:
541 case CPU_UP_PREPARE_FROZEN: 548 case CPU_UP_PREPARE_FROZEN:
542 err = watchdog_prepare_cpu(hotcpu); 549 watchdog_prepare_cpu(hotcpu);
543 break; 550 break;
544 case CPU_ONLINE: 551 case CPU_ONLINE:
545 case CPU_ONLINE_FROZEN: 552 case CPU_ONLINE_FROZEN:
546 if (watchdog_enabled) 553 if (watchdog_enabled)
547 err = watchdog_enable(hotcpu); 554 watchdog_enable(hotcpu);
548 break; 555 break;
549#ifdef CONFIG_HOTPLUG_CPU 556#ifdef CONFIG_HOTPLUG_CPU
550 case CPU_UP_CANCELED: 557 case CPU_UP_CANCELED:
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e3378e8d3a5c..25fb1b0e53fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t;
221 * per-CPU workqueues: 221 * per-CPU workqueues:
222 */ 222 */
223struct workqueue_struct { 223struct workqueue_struct {
224 unsigned int flags; /* I: WQ_* flags */ 224 unsigned int flags; /* W: WQ_* flags */
225 union { 225 union {
226 struct cpu_workqueue_struct __percpu *pcpu; 226 struct cpu_workqueue_struct __percpu *pcpu;
227 struct cpu_workqueue_struct *single; 227 struct cpu_workqueue_struct *single;
@@ -240,6 +240,7 @@ struct workqueue_struct {
240 mayday_mask_t mayday_mask; /* cpus requesting rescue */ 240 mayday_mask_t mayday_mask; /* cpus requesting rescue */
241 struct worker *rescuer; /* I: rescue worker */ 241 struct worker *rescuer; /* I: rescue worker */
242 242
243 int nr_drainers; /* W: drain in progress */
243 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
244 const char *name; /* I: workqueue name */ 245 const char *name; /* I: workqueue name */
245#ifdef CONFIG_LOCKDEP 246#ifdef CONFIG_LOCKDEP
@@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
990 debug_work_activate(work); 991 debug_work_activate(work);
991 992
992 /* if dying, only works from the same workqueue are allowed */ 993 /* if dying, only works from the same workqueue are allowed */
993 if (unlikely(wq->flags & WQ_DYING) && 994 if (unlikely(wq->flags & WQ_DRAINING) &&
994 WARN_ON_ONCE(!is_chained_work(wq))) 995 WARN_ON_ONCE(!is_chained_work(wq)))
995 return; 996 return;
996 997
@@ -2381,6 +2382,54 @@ out_unlock:
2381} 2382}
2382EXPORT_SYMBOL_GPL(flush_workqueue); 2383EXPORT_SYMBOL_GPL(flush_workqueue);
2383 2384
2385/**
2386 * drain_workqueue - drain a workqueue
2387 * @wq: workqueue to drain
2388 *
2389 * Wait until the workqueue becomes empty. While draining is in progress,
2390 * only chain queueing is allowed; in other words, only work items that are
2391 * currently pending or running on @wq may queue further work items on it.
2392 * @wq is flushed repeatedly until it becomes empty. The number of flushes
2393 * is determined by the depth of chaining and should be relatively small.
2394 * Whine if it takes too long.
2395 */
2396void drain_workqueue(struct workqueue_struct *wq)
2397{
2398 unsigned int flush_cnt = 0;
2399 unsigned int cpu;
2400
2401 /*
2402	 * __queue_work() needs to test whether there are drainers. It is much
2403	 * hotter than drain_workqueue() and already looks at @wq->flags, so use
2404	 * a WQ_DRAINING flag rather than making it check nr_drainers as well.
2405 */
2406 spin_lock(&workqueue_lock);
2407 if (!wq->nr_drainers++)
2408 wq->flags |= WQ_DRAINING;
2409 spin_unlock(&workqueue_lock);
2410reflush:
2411 flush_workqueue(wq);
2412
2413 for_each_cwq_cpu(cpu, wq) {
2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2415
2416 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
2417 continue;
2418
2419 if (++flush_cnt == 10 ||
2420 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2421	pr_warning("workqueue %s: drain isn't complete after %u tries\n",
2422 wq->name, flush_cnt);
2423 goto reflush;
2424 }
2425
2426 spin_lock(&workqueue_lock);
2427 if (!--wq->nr_drainers)
2428 wq->flags &= ~WQ_DRAINING;
2429 spin_unlock(&workqueue_lock);
2430}
2431EXPORT_SYMBOL_GPL(drain_workqueue);
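A minimal usage sketch of the new helper (illustrative names, not from the patch): a subsystem whose work items may requeue themselves can now quiesce its queue without open-coding the repeated-flush loop that destroy_workqueue() used to carry.

/* Sketch: quiescing a workqueue whose works may requeue themselves. */
static struct workqueue_struct *example_wq;

static void example_quiesce(void)
{
	/*
	 * Flushes repeatedly; while WQ_DRAINING is set, only chained
	 * queueing (work queued from work already on example_wq) gets in.
	 */
	drain_workqueue(example_wq);

	/* example_wq is now empty; safe to reconfigure or destroy. */
}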
2432
2384static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 2433static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2385 bool wait_executing) 2434 bool wait_executing)
2386{ 2435{
@@ -2866,9 +2915,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2866 } 2915 }
2867 } 2916 }
2868 2917
2869 /* just in case, make sure it's actually aligned 2918 /* just in case, make sure it's actually aligned */
2870 * - this is affected by PERCPU() alignment in vmlinux.lds.S
2871 */
2872 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 2919 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2873 return wq->cpu_wq.v ? 0 : -ENOMEM; 2920 return wq->cpu_wq.v ? 0 : -ENOMEM;
2874} 2921}
@@ -3011,34 +3058,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
3011 */ 3058 */
3012void destroy_workqueue(struct workqueue_struct *wq) 3059void destroy_workqueue(struct workqueue_struct *wq)
3013{ 3060{
3014 unsigned int flush_cnt = 0;
3015 unsigned int cpu; 3061 unsigned int cpu;
3016 3062
3017 /* 3063 /* drain it before proceeding with destruction */
3018 * Mark @wq dying and drain all pending works. Once WQ_DYING is 3064 drain_workqueue(wq);
3019 * set, only chain queueing is allowed. IOW, only currently
3020 * pending or running work items on @wq can queue further work
3021 * items on it. @wq is flushed repeatedly until it becomes empty.
3022 * The number of flushing is detemined by the depth of chaining and
3023 * should be relatively short. Whine if it takes too long.
3024 */
3025 wq->flags |= WQ_DYING;
3026reflush:
3027 flush_workqueue(wq);
3028
3029 for_each_cwq_cpu(cpu, wq) {
3030 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3031
3032 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3033 continue;
3034
3035 if (++flush_cnt == 10 ||
3036 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3037 printk(KERN_WARNING "workqueue %s: flush on "
3038 "destruction isn't complete after %u tries\n",
3039 wq->name, flush_cnt);
3040 goto reflush;
3041 }
3042 3065
3043 /* 3066 /*
3044 * wq list is used to freeze wq, remove from list after 3067 * wq list is used to freeze wq, remove from list after