path: root/kernel
author    Eric Paris <eparis@redhat.com>  2014-03-07 11:41:32 -0500
committer Eric Paris <eparis@redhat.com>  2014-03-07 11:41:32 -0500
commit    b7d3622a39fde7658170b7f3cf6c6889bb8db30d (patch)
tree      64f4e781ecb2a85d675e234072b988560bcd25f1 /kernel
parent    f3411cb2b2e396a41ed3a439863f028db7140a34 (diff)
parent    d8ec26d7f8287f5788a494f56e8814210f0e64be (diff)
Merge tag 'v3.13' into for-3.15
Linux 3.13

Conflicts:
	include/net/xfrm.h

Simple merge where v3.13 removed 'extern' from definitions and the audit tree
did s/u32/unsigned int/ to the same definitions.
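As an aside, a minimal sketch of the kind of resolution described above, using a made-up declaration (xfrm_audit_example and its parameter are hypothetical; the real include/net/xfrm.h hunk is not reproduced here). The merge simply keeps both sides' edits to the same declaration:

	/* Hypothetical declaration, for illustration only. */
	typedef unsigned int u32;                              /* kernel typedef, shown so this compiles standalone */

	extern void xfrm_audit_example(u32 auditid);           /* common ancestor                      */
	void xfrm_audit_example(u32 auditid);                  /* v3.13 side: 'extern' dropped         */
	extern void xfrm_audit_example(unsigned int auditid);  /* audit-tree side: u32 -> unsigned int */
	void xfrm_audit_example(unsigned int auditid);         /* merged result keeps both changes     */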
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 1
-rw-r--r--  kernel/Kconfig.hz | 2
-rw-r--r--  kernel/Makefile | 82
-rw-r--r--  kernel/bounds.c | 6
-rw-r--r--  kernel/cgroup.c | 340
-rw-r--r--  kernel/context_tracking.c | 2
-rw-r--r--  kernel/cpu.c | 49
-rw-r--r--  kernel/cpu/idle.c | 16
-rw-r--r--  kernel/cpuset.c | 8
-rw-r--r--  kernel/debug/debug_core.c | 32
-rw-r--r--  kernel/debug/debug_core.h | 3
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 5
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 3
-rw-r--r--  kernel/delayacct.c | 7
-rw-r--r--  kernel/elfcore.c | 10
-rw-r--r--  kernel/events/core.c | 201
-rw-r--r--  kernel/events/internal.h | 35
-rw-r--r--  kernel/events/ring_buffer.c | 101
-rw-r--r--  kernel/events/uprobes.c | 223
-rw-r--r--  kernel/extable.c | 4
-rw-r--r--  kernel/fork.c | 16
-rw-r--r--  kernel/freezer.c | 6
-rw-r--r--  kernel/futex.c | 9
-rw-r--r--  kernel/gcov/Kconfig | 30
-rw-r--r--  kernel/gcov/Makefile | 32
-rw-r--r--  kernel/gcov/base.c | 32
-rw-r--r--  kernel/gcov/fs.c | 52
-rw-r--r--  kernel/gcov/gcc_3_4.c | 115
-rw-r--r--  kernel/gcov/gcc_4_7.c | 560
-rw-r--r--  kernel/gcov/gcov.h | 65
-rw-r--r--  kernel/hung_task.c | 17
-rw-r--r--  kernel/irq/chip.c | 2
-rw-r--r--  kernel/irq/irqdomain.c | 13
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/pm.c | 2
-rw-r--r--  kernel/irq/settings.h | 7
-rw-r--r--  kernel/irq/spurious.c | 12
-rw-r--r--  kernel/jump_label.c | 5
-rw-r--r--  kernel/kexec.c | 7
-rw-r--r--  kernel/kprobes.c | 4
-rw-r--r--  kernel/kthread.c | 73
-rw-r--r--  kernel/locking/Makefile | 25
-rw-r--r--  kernel/locking/lglock.c (renamed from kernel/lglock.c) | 0
-rw-r--r--  kernel/locking/lockdep.c (renamed from kernel/lockdep.c) | 8
-rw-r--r--  kernel/locking/lockdep_internals.h (renamed from kernel/lockdep_internals.h) | 0
-rw-r--r--  kernel/locking/lockdep_proc.c (renamed from kernel/lockdep_proc.c) | 15
-rw-r--r--  kernel/locking/lockdep_states.h (renamed from kernel/lockdep_states.h) | 0
-rw-r--r--  kernel/locking/mutex-debug.c (renamed from kernel/mutex-debug.c) | 0
-rw-r--r--  kernel/locking/mutex-debug.h (renamed from kernel/mutex-debug.h) | 0
-rw-r--r--  kernel/locking/mutex.c (renamed from kernel/mutex.c) | 2
-rw-r--r--  kernel/locking/mutex.h (renamed from kernel/mutex.h) | 0
-rw-r--r--  kernel/locking/percpu-rwsem.c | 165
-rw-r--r--  kernel/locking/rtmutex-debug.c (renamed from kernel/rtmutex-debug.c) | 0
-rw-r--r--  kernel/locking/rtmutex-debug.h (renamed from kernel/rtmutex-debug.h) | 0
-rw-r--r--  kernel/locking/rtmutex-tester.c (renamed from kernel/rtmutex-tester.c) | 0
-rw-r--r--  kernel/locking/rtmutex.c (renamed from kernel/rtmutex.c) | 0
-rw-r--r--  kernel/locking/rtmutex.h (renamed from kernel/rtmutex.h) | 0
-rw-r--r--  kernel/locking/rtmutex_common.h (renamed from kernel/rtmutex_common.h) | 0
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 296
-rw-r--r--  kernel/locking/rwsem-xadd.c | 293
-rw-r--r--  kernel/locking/rwsem.c (renamed from kernel/rwsem.c) | 0
-rw-r--r--  kernel/locking/semaphore.c (renamed from kernel/semaphore.c) | 0
-rw-r--r--  kernel/locking/spinlock.c (renamed from kernel/spinlock.c) | 0
-rw-r--r--  kernel/locking/spinlock_debug.c | 302
-rw-r--r--  kernel/modsign_certificate.S | 12
-rw-r--r--  kernel/modsign_pubkey.c | 104
-rw-r--r--  kernel/module-internal.h | 2
-rw-r--r--  kernel/module.c | 169
-rw-r--r--  kernel/module_signing.c | 11
-rw-r--r--  kernel/padata.c | 9
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/pid_namespace.c | 8
-rw-r--r--  kernel/power/Kconfig | 16
-rw-r--r--  kernel/power/console.c | 1
-rw-r--r--  kernel/power/qos.c | 26
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/user.c | 21
-rw-r--r--  kernel/printk/printk.c | 35
-rw-r--r--  kernel/ptrace.c | 3
-rw-r--r--  kernel/rcu/Makefile | 6
-rw-r--r--  kernel/rcu/rcu.h (renamed from kernel/rcu.h) | 7
-rw-r--r--  kernel/rcu/srcu.c (renamed from kernel/srcu.c) | 0
-rw-r--r--  kernel/rcu/tiny.c (renamed from kernel/rcutiny.c) | 37
-rw-r--r--  kernel/rcu/tiny_plugin.h (renamed from kernel/rcutiny_plugin.h) | 0
-rw-r--r--  kernel/rcu/torture.c (renamed from kernel/rcutorture.c) | 6
-rw-r--r--  kernel/rcu/tree.c (renamed from kernel/rcutree.c) | 200
-rw-r--r--  kernel/rcu/tree.h (renamed from kernel/rcutree.h) | 2
-rw-r--r--  kernel/rcu/tree_plugin.h (renamed from kernel/rcutree_plugin.h) | 88
-rw-r--r--  kernel/rcu/tree_trace.c (renamed from kernel/rcutree_trace.c) | 2
-rw-r--r--  kernel/rcu/update.c (renamed from kernel/rcupdate.c) | 10
-rw-r--r--  kernel/reboot.c | 2
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/completion.c | 299
-rw-r--r--  kernel/sched/core.c | 705
-rw-r--r--  kernel/sched/debug.c | 68
-rw-r--r--  kernel/sched/fair.c | 1594
-rw-r--r--  kernel/sched/features.h | 19
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 36
-rw-r--r--  kernel/sched/sched.h | 54
-rw-r--r--  kernel/sched/stats.h | 46
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/sched/wait.c (renamed from kernel/wait.c) | 127
-rw-r--r--  kernel/signal.c | 2
-rw-r--r--  kernel/smp.c | 19
-rw-r--r--  kernel/softirq.c | 184
-rw-r--r--  kernel/stop_machine.c | 303
-rw-r--r--  kernel/sys.c | 1
-rw-r--r--  kernel/sysctl.c | 34
-rw-r--r--  kernel/sysctl_binary.c | 6
-rw-r--r--  kernel/system_certificates.S | 20
-rw-r--r--  kernel/system_keyring.c | 105
-rw-r--r--  kernel/taskstats.c | 54
-rw-r--r--  kernel/time/Kconfig | 2
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/clocksource.c | 52
-rw-r--r--  kernel/time/ntp.c | 3
-rw-r--r--  kernel/time/sched_clock.c | 114
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/tick-common.c | 15
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/tick-sched.c | 25
-rw-r--r--  kernel/time/timekeeping.c | 5
-rw-r--r--  kernel/time/timer_stats.c | 8
-rw-r--r--  kernel/timer.c | 13
-rw-r--r--  kernel/trace/blktrace.c | 36
-rw-r--r--  kernel/trace/ftrace.c | 227
-rw-r--r--  kernel/trace/trace.c | 85
-rw-r--r--  kernel/trace/trace.h | 51
-rw-r--r--  kernel/trace/trace_branch.c | 2
-rw-r--r--  kernel/trace/trace_event_perf.c | 10
-rw-r--r--  kernel/trace/trace_events.c | 35
-rw-r--r--  kernel/trace/trace_events_filter.c | 218
-rw-r--r--  kernel/trace/trace_export.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 82
-rw-r--r--  kernel/trace/trace_kprobe.c | 4
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 4
-rw-r--r--  kernel/trace/trace_output.c | 19
-rw-r--r--  kernel/trace/trace_sched_switch.c | 4
-rw-r--r--  kernel/trace/trace_stat.c | 41
-rw-r--r--  kernel/trace/trace_syscalls.c | 32
-rw-r--r--  kernel/trace/trace_uprobe.c | 3
-rw-r--r--  kernel/up.c | 11
-rw-r--r--  kernel/user.c | 4
-rw-r--r--  kernel/user_namespace.c | 6
-rw-r--r--  kernel/workqueue.c | 82
147 files changed, 6384 insertions, 2588 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..790d83c7d160 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,3 +5,4 @@ config_data.h
 config_data.gz
 timeconst.h
 hz.bc
+x509_certificate_list
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 94fabd534b03..2a202a846757 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
+	def_bool HIGH_RES_TIMERS
diff --git a/kernel/Makefile b/kernel/Makefile
index 1ce47553fb02..bc010ee272b6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,56 +6,44 @@ obj-y = fork.o exec_domain.o panic.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 extable.o params.o posix-timers.o \
10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o sys_ni.o posix-cpu-timers.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o nsproxy.o \
12 notifier.o ksysfs.o cred.o reboot.o \ 12 notifier.o ksysfs.o cred.o reboot.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o groups.o smpboot.o
14 14
15ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
17CFLAGS_REMOVE_lockdep.o = -pg
18CFLAGS_REMOVE_lockdep_proc.o = -pg
19CFLAGS_REMOVE_mutex-debug.o = -pg
20CFLAGS_REMOVE_rtmutex-debug.o = -pg
21CFLAGS_REMOVE_cgroup-debug.o = -pg 17CFLAGS_REMOVE_cgroup-debug.o = -pg
22CFLAGS_REMOVE_irq_work.o = -pg 18CFLAGS_REMOVE_irq_work.o = -pg
23endif 19endif
24 20
25obj-y += sched/ 21obj-y += sched/
22obj-y += locking/
26obj-y += power/ 23obj-y += power/
27obj-y += printk/ 24obj-y += printk/
28obj-y += cpu/ 25obj-y += cpu/
29obj-y += irq/ 26obj-y += irq/
27obj-y += rcu/
30 28
31obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o 29obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
32obj-$(CONFIG_FREEZER) += freezer.o 30obj-$(CONFIG_FREEZER) += freezer.o
33obj-$(CONFIG_PROFILING) += profile.o 31obj-$(CONFIG_PROFILING) += profile.o
34obj-$(CONFIG_STACKTRACE) += stacktrace.o 32obj-$(CONFIG_STACKTRACE) += stacktrace.o
35obj-y += time/ 33obj-y += time/
36obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
37obj-$(CONFIG_LOCKDEP) += lockdep.o
38ifeq ($(CONFIG_PROC_FS),y)
39obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
40endif
41obj-$(CONFIG_FUTEX) += futex.o 34obj-$(CONFIG_FUTEX) += futex.o
42ifeq ($(CONFIG_COMPAT),y) 35ifeq ($(CONFIG_COMPAT),y)
43obj-$(CONFIG_FUTEX) += futex_compat.o 36obj-$(CONFIG_FUTEX) += futex_compat.o
44endif 37endif
45obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
46obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
47obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
48obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 38obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
49obj-$(CONFIG_SMP) += smp.o 39obj-$(CONFIG_SMP) += smp.o
50ifneq ($(CONFIG_SMP),y) 40ifneq ($(CONFIG_SMP),y)
51obj-y += up.o 41obj-y += up.o
52endif 42endif
53obj-$(CONFIG_SMP) += spinlock.o
54obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
55obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
56obj-$(CONFIG_UID16) += uid16.o 43obj-$(CONFIG_UID16) += uid16.o
44obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
57obj-$(CONFIG_MODULES) += module.o 45obj-$(CONFIG_MODULES) += module.o
58obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o 46obj-$(CONFIG_MODULE_SIG) += module_signing.o
59obj-$(CONFIG_KALLSYMS) += kallsyms.o 47obj-$(CONFIG_KALLSYMS) += kallsyms.o
60obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 48obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
61obj-$(CONFIG_KEXEC) += kexec.o 49obj-$(CONFIG_KEXEC) += kexec.o
@@ -81,12 +69,6 @@ obj-$(CONFIG_KGDB) += debug/
81obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 69obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
82obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o 70obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
83obj-$(CONFIG_SECCOMP) += seccomp.o 71obj-$(CONFIG_SECCOMP) += seccomp.o
84obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
85obj-$(CONFIG_TREE_RCU) += rcutree.o
86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
88obj-$(CONFIG_TINY_RCU) += rcutiny.o
89obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
90obj-$(CONFIG_RELAY) += relay.o 72obj-$(CONFIG_RELAY) += relay.o
91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 73obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 74obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -141,19 +123,53 @@ targets += timeconst.h
141$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE 123$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
142 $(call if_changed,bc) 124 $(call if_changed,bc)
143 125
144ifeq ($(CONFIG_MODULE_SIG),y) 126###############################################################################
127#
128# Roll all the X.509 certificates that we can find together and pull them into
129# the kernel so that they get loaded into the system trusted keyring during
130# boot.
145# 131#
146# Pull the signing certificate and any extra certificates into the kernel 132# We look in the source root and the build root for all files whose name ends
133# in ".x509". Unfortunately, this will generate duplicate filenames, so we
134# have make canonicalise the pathnames and then sort them to discard the
135# duplicates.
147# 136#
137###############################################################################
138ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
139X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
140X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
141X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
142 $(or $(realpath $(CERT)),$(CERT))))
143X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
144
145ifeq ($(X509_CERTIFICATES),)
146$(warning *** No X.509 certificates found ***)
147endif
148 148
149quiet_cmd_touch = TOUCH $@ 149ifneq ($(wildcard $(obj)/.x509.list),)
150 cmd_touch = touch $@ 150ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
151$(info X.509 certificate list changed)
152$(shell rm $(obj)/.x509.list)
153endif
154endif
155
156kernel/system_certificates.o: $(obj)/x509_certificate_list
157
158quiet_cmd_x509certs = CERTS $@
159 cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
151 160
152extra_certificates: 161targets += $(obj)/x509_certificate_list
153 $(call cmd,touch) 162$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
163 $(call if_changed,x509certs)
154 164
155kernel/modsign_certificate.o: signing_key.x509 extra_certificates 165targets += $(obj)/.x509.list
166$(obj)/.x509.list:
167 @echo $(X509_CERTIFICATES) >$@
168endif
169
170clean-files := x509_certificate_list .x509.list
156 171
172ifeq ($(CONFIG_MODULE_SIG),y)
157############################################################################### 173###############################################################################
158# 174#
159# If module signing is requested, say by allyesconfig, but a key has not been 175# If module signing is requested, say by allyesconfig, but a key has not been
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..9fd4246b04b8 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,8 @@
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
 #include <linux/page_cgroup.h>
+#include <linux/log2.h>
+#include <linux/spinlock_types.h>
 
 void foo(void)
 {
@@ -17,5 +19,9 @@ void foo(void)
 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
 	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
+#ifdef CONFIG_SMP
+	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+#endif
+	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
 	/* End of constants */
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8bd9cfdc70d7..bc1dcabe9217 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex);
90static DEFINE_MUTEX(cgroup_root_mutex); 90static DEFINE_MUTEX(cgroup_root_mutex);
91 91
92/* 92/*
93 * cgroup destruction makes heavy use of work items and there can be a lot
94 * of concurrent destructions. Use a separate workqueue so that cgroup
95 * destruction work items don't end up filling up max_active of system_wq
96 * which may lead to deadlock.
97 */
98static struct workqueue_struct *cgroup_destroy_wq;
99
100/*
93 * Generate an array of cgroup subsystem pointers. At boot time, this is 101 * Generate an array of cgroup subsystem pointers. At boot time, this is
94 * populated with the built in subsystems, and modular subsystems are 102 * populated with the built in subsystems, and modular subsystems are
95 * registered after that. The mutable section of this array is protected by 103 * registered after that. The mutable section of this array is protected by
@@ -125,38 +133,6 @@ struct cfent {
125}; 133};
126 134
127/* 135/*
128 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
129 * cgroup_subsys->use_id != 0.
130 */
131#define CSS_ID_MAX (65535)
132struct css_id {
133 /*
134 * The css to which this ID points. This pointer is set to valid value
135 * after cgroup is populated. If cgroup is removed, this will be NULL.
136 * This pointer is expected to be RCU-safe because destroy()
137 * is called after synchronize_rcu(). But for safe use, css_tryget()
138 * should be used for avoiding race.
139 */
140 struct cgroup_subsys_state __rcu *css;
141 /*
142 * ID of this css.
143 */
144 unsigned short id;
145 /*
146 * Depth in hierarchy which this ID belongs to.
147 */
148 unsigned short depth;
149 /*
150 * ID is freed by RCU. (and lookup routine is RCU safe.)
151 */
152 struct rcu_head rcu_head;
153 /*
154 * Hierarchy of CSS ID belongs to.
155 */
156 unsigned short stack[0]; /* Array of Length (depth+1) */
157};
158
159/*
160 * cgroup_event represents events which userspace want to receive. 136 * cgroup_event represents events which userspace want to receive.
161 */ 137 */
162struct cgroup_event { 138struct cgroup_event {
@@ -223,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
223static int cgroup_destroy_locked(struct cgroup *cgrp); 199static int cgroup_destroy_locked(struct cgroup *cgrp);
224static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 200static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
225 bool is_add); 201 bool is_add);
202static int cgroup_file_release(struct inode *inode, struct file *file);
226 203
227/** 204/**
228 * cgroup_css - obtain a cgroup's css for the specified subsystem 205 * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -387,9 +364,6 @@ struct cgrp_cset_link {
387static struct css_set init_css_set; 364static struct css_set init_css_set;
388static struct cgrp_cset_link init_cgrp_cset_link; 365static struct cgrp_cset_link init_cgrp_cset_link;
389 366
390static int cgroup_init_idr(struct cgroup_subsys *ss,
391 struct cgroup_subsys_state *css);
392
393/* 367/*
394 * css_set_lock protects the list of css_set objects, and the chain of 368 * css_set_lock protects the list of css_set objects, and the chain of
395 * tasks off each css_set. Nests outside task->alloc_lock due to 369 * tasks off each css_set. Nests outside task->alloc_lock due to
@@ -841,8 +815,6 @@ static struct backing_dev_info cgroup_backing_dev_info = {
841 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 815 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
842}; 816};
843 817
844static int alloc_css_id(struct cgroup_subsys_state *child_css);
845
846static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 818static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
847{ 819{
848 struct inode *inode = new_inode(sb); 820 struct inode *inode = new_inode(sb);
@@ -908,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
908 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 880 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
909 881
910 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); 882 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
911 schedule_work(&cgrp->destroy_work); 883 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
912} 884}
913 885
914static void cgroup_diput(struct dentry *dentry, struct inode *inode) 886static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -918,6 +890,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
918 struct cgroup *cgrp = dentry->d_fsdata; 890 struct cgroup *cgrp = dentry->d_fsdata;
919 891
920 BUG_ON(!(cgroup_is_dead(cgrp))); 892 BUG_ON(!(cgroup_is_dead(cgrp)));
893
894 /*
895 * XXX: cgrp->id is only used to look up css's. As cgroup
896 * and css's lifetimes will be decoupled, it should be made
897 * per-subsystem and moved to css->id so that lookups are
898 * successful until the target css is released.
899 */
900 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
901 cgrp->id = -1;
902
921 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 903 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
922 } else { 904 } else {
923 struct cfent *cfe = __d_cfe(dentry); 905 struct cfent *cfe = __d_cfe(dentry);
@@ -932,11 +914,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
932 iput(inode); 914 iput(inode);
933} 915}
934 916
935static int cgroup_delete(const struct dentry *d)
936{
937 return 1;
938}
939
940static void remove_dir(struct dentry *d) 917static void remove_dir(struct dentry *d)
941{ 918{
942 struct dentry *parent = dget(d->d_parent); 919 struct dentry *parent = dget(d->d_parent);
@@ -1523,7 +1500,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
1523{ 1500{
1524 static const struct dentry_operations cgroup_dops = { 1501 static const struct dentry_operations cgroup_dops = {
1525 .d_iput = cgroup_diput, 1502 .d_iput = cgroup_diput,
1526 .d_delete = cgroup_delete, 1503 .d_delete = always_delete_dentry,
1527 }; 1504 };
1528 1505
1529 struct inode *inode = 1506 struct inode *inode =
@@ -2463,7 +2440,7 @@ static const struct file_operations cgroup_seqfile_operations = {
2463 .read = seq_read, 2440 .read = seq_read,
2464 .write = cgroup_file_write, 2441 .write = cgroup_file_write,
2465 .llseek = seq_lseek, 2442 .llseek = seq_lseek,
2466 .release = single_release, 2443 .release = cgroup_file_release,
2467}; 2444};
2468 2445
2469static int cgroup_file_open(struct inode *inode, struct file *file) 2446static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2524,6 +2501,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
2524 ret = cft->release(inode, file); 2501 ret = cft->release(inode, file);
2525 if (css->ss) 2502 if (css->ss)
2526 css_put(css); 2503 css_put(css);
2504 if (file->f_op == &cgroup_seqfile_operations)
2505 single_release(inode, file);
2527 return ret; 2506 return ret;
2528} 2507}
2529 2508
@@ -4240,21 +4219,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4240 goto err; 4219 goto err;
4241 } 4220 }
4242 } 4221 }
4243
4244 /* This cgroup is ready now */
4245 for_each_root_subsys(cgrp->root, ss) {
4246 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4247 struct css_id *id = rcu_dereference_protected(css->id, true);
4248
4249 /*
4250 * Update id->css pointer and make this css visible from
4251 * CSS ID functions. This pointer will be dereferened
4252 * from RCU-read-side without locks.
4253 */
4254 if (id)
4255 rcu_assign_pointer(id->css, css);
4256 }
4257
4258 return 0; 4222 return 0;
4259err: 4223err:
4260 cgroup_clear_dir(cgrp, subsys_mask); 4224 cgroup_clear_dir(cgrp, subsys_mask);
@@ -4306,7 +4270,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
4306 * css_put(). dput() requires process context which we don't have. 4270 * css_put(). dput() requires process context which we don't have.
4307 */ 4271 */
4308 INIT_WORK(&css->destroy_work, css_free_work_fn); 4272 INIT_WORK(&css->destroy_work, css_free_work_fn);
4309 schedule_work(&css->destroy_work); 4273 queue_work(cgroup_destroy_wq, &css->destroy_work);
4310} 4274}
4311 4275
4312static void css_release(struct percpu_ref *ref) 4276static void css_release(struct percpu_ref *ref)
@@ -4314,6 +4278,7 @@ static void css_release(struct percpu_ref *ref)
4314 struct cgroup_subsys_state *css = 4278 struct cgroup_subsys_state *css =
4315 container_of(ref, struct cgroup_subsys_state, refcnt); 4279 container_of(ref, struct cgroup_subsys_state, refcnt);
4316 4280
4281 rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
4317 call_rcu(&css->rcu_head, css_free_rcu_fn); 4282 call_rcu(&css->rcu_head, css_free_rcu_fn);
4318} 4283}
4319 4284
@@ -4323,7 +4288,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4323 css->cgroup = cgrp; 4288 css->cgroup = cgrp;
4324 css->ss = ss; 4289 css->ss = ss;
4325 css->flags = 0; 4290 css->flags = 0;
4326 css->id = NULL;
4327 4291
4328 if (cgrp->parent) 4292 if (cgrp->parent)
4329 css->parent = cgroup_css(cgrp->parent, ss); 4293 css->parent = cgroup_css(cgrp->parent, ss);
@@ -4455,12 +4419,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4455 goto err_free_all; 4419 goto err_free_all;
4456 4420
4457 init_css(css, ss, cgrp); 4421 init_css(css, ss, cgrp);
4458
4459 if (ss->use_id) {
4460 err = alloc_css_id(css);
4461 if (err)
4462 goto err_free_all;
4463 }
4464 } 4422 }
4465 4423
4466 /* 4424 /*
@@ -4479,14 +4437,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4479 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4437 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4480 root->number_of_cgroups++; 4438 root->number_of_cgroups++;
4481 4439
4482 /* each css holds a ref to the cgroup's dentry and the parent css */
4483 for_each_root_subsys(root, ss) {
4484 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4485
4486 dget(dentry);
4487 css_get(css->parent);
4488 }
4489
4490 /* hold a ref to the parent's dentry */ 4440 /* hold a ref to the parent's dentry */
4491 dget(parent->dentry); 4441 dget(parent->dentry);
4492 4442
@@ -4498,6 +4448,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4498 if (err) 4448 if (err)
4499 goto err_destroy; 4449 goto err_destroy;
4500 4450
4451 /* each css holds a ref to the cgroup's dentry and parent css */
4452 dget(dentry);
4453 css_get(css->parent);
4454
4455 /* mark it consumed for error path */
4456 css_ar[ss->subsys_id] = NULL;
4457
4501 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4458 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4502 parent->parent) { 4459 parent->parent) {
4503 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 4460 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -4544,6 +4501,14 @@ err_free_cgrp:
4544 return err; 4501 return err;
4545 4502
4546err_destroy: 4503err_destroy:
4504 for_each_root_subsys(root, ss) {
4505 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4506
4507 if (css) {
4508 percpu_ref_cancel_init(&css->refcnt);
4509 ss->css_free(css);
4510 }
4511 }
4547 cgroup_destroy_locked(cgrp); 4512 cgroup_destroy_locked(cgrp);
4548 mutex_unlock(&cgroup_mutex); 4513 mutex_unlock(&cgroup_mutex);
4549 mutex_unlock(&dentry->d_inode->i_mutex); 4514 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -4603,7 +4568,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
4603 container_of(ref, struct cgroup_subsys_state, refcnt); 4568 container_of(ref, struct cgroup_subsys_state, refcnt);
4604 4569
4605 INIT_WORK(&css->destroy_work, css_killed_work_fn); 4570 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4606 schedule_work(&css->destroy_work); 4571 queue_work(cgroup_destroy_wq, &css->destroy_work);
4607} 4572}
4608 4573
4609/** 4574/**
@@ -4705,8 +4670,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4705 * will be invoked to perform the rest of destruction once the 4670 * will be invoked to perform the rest of destruction once the
4706 * percpu refs of all css's are confirmed to be killed. 4671 * percpu refs of all css's are confirmed to be killed.
4707 */ 4672 */
4708 for_each_root_subsys(cgrp->root, ss) 4673 for_each_root_subsys(cgrp->root, ss) {
4709 kill_css(cgroup_css(cgrp, ss)); 4674 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4675
4676 if (css)
4677 kill_css(css);
4678 }
4710 4679
4711 /* 4680 /*
4712 * Mark @cgrp dead. This prevents further task migration and child 4681 * Mark @cgrp dead. This prevents further task migration and child
@@ -4775,14 +4744,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4775 /* delete this cgroup from parent->children */ 4744 /* delete this cgroup from parent->children */
4776 list_del_rcu(&cgrp->sibling); 4745 list_del_rcu(&cgrp->sibling);
4777 4746
4778 /*
4779 * We should remove the cgroup object from idr before its grace
4780 * period starts, so we won't be looking up a cgroup while the
4781 * cgroup is being freed.
4782 */
4783 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4784 cgrp->id = -1;
4785
4786 dput(d); 4747 dput(d);
4787 4748
4788 set_bit(CGRP_RELEASABLE, &parent->flags); 4749 set_bit(CGRP_RELEASABLE, &parent->flags);
@@ -4925,12 +4886,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4925 4886
4926 /* our new subsystem will be attached to the dummy hierarchy. */ 4887 /* our new subsystem will be attached to the dummy hierarchy. */
4927 init_css(css, ss, cgroup_dummy_top); 4888 init_css(css, ss, cgroup_dummy_top);
4928 /* init_idr must be after init_css() because it sets css->id. */
4929 if (ss->use_id) {
4930 ret = cgroup_init_idr(ss, css);
4931 if (ret)
4932 goto err_unload;
4933 }
4934 4889
4935 /* 4890 /*
4936 * Now we need to entangle the css into the existing css_sets. unlike 4891 * Now we need to entangle the css into the existing css_sets. unlike
@@ -4996,9 +4951,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4996 4951
4997 offline_css(cgroup_css(cgroup_dummy_top, ss)); 4952 offline_css(cgroup_css(cgroup_dummy_top, ss));
4998 4953
4999 if (ss->use_id)
5000 idr_destroy(&ss->idr);
5001
5002 /* deassign the subsys_id */ 4954 /* deassign the subsys_id */
5003 cgroup_subsys[ss->subsys_id] = NULL; 4955 cgroup_subsys[ss->subsys_id] = NULL;
5004 4956
@@ -5025,8 +4977,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
5025 /* 4977 /*
5026 * remove subsystem's css from the cgroup_dummy_top and free it - 4978 * remove subsystem's css from the cgroup_dummy_top and free it -
5027 * need to free before marking as null because ss->css_free needs 4979 * need to free before marking as null because ss->css_free needs
5028 * the cgrp->subsys pointer to find their state. note that this 4980 * the cgrp->subsys pointer to find their state.
5029 * also takes care of freeing the css_id.
5030 */ 4981 */
5031 ss->css_free(cgroup_css(cgroup_dummy_top, ss)); 4982 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
5032 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); 4983 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
@@ -5097,8 +5048,6 @@ int __init cgroup_init(void)
5097 for_each_builtin_subsys(ss, i) { 5048 for_each_builtin_subsys(ss, i) {
5098 if (!ss->early_init) 5049 if (!ss->early_init)
5099 cgroup_init_subsys(ss); 5050 cgroup_init_subsys(ss);
5100 if (ss->use_id)
5101 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
5102 } 5051 }
5103 5052
5104 /* allocate id for the dummy hierarchy */ 5053 /* allocate id for the dummy hierarchy */
@@ -5139,6 +5088,22 @@ out:
5139 return err; 5088 return err;
5140} 5089}
5141 5090
5091static int __init cgroup_wq_init(void)
5092{
5093 /*
5094 * There isn't much point in executing destruction path in
5095 * parallel. Good chunk is serialized with cgroup_mutex anyway.
5096 * Use 1 for @max_active.
5097 *
5098 * We would prefer to do this in cgroup_init() above, but that
5099 * is called before init_workqueues(): so leave this until after.
5100 */
5101 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5102 BUG_ON(!cgroup_destroy_wq);
5103 return 0;
5104}
5105core_initcall(cgroup_wq_init);
5106
5142/* 5107/*
5143 * proc_cgroup_show() 5108 * proc_cgroup_show()
5144 * - Print task's cgroup paths into seq_file, one line for each hierarchy 5109 * - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -5518,181 +5483,6 @@ static int __init cgroup_disable(char *str)
5518} 5483}
5519__setup("cgroup_disable=", cgroup_disable); 5484__setup("cgroup_disable=", cgroup_disable);
5520 5485
5521/*
5522 * Functons for CSS ID.
5523 */
5524
5525/* to get ID other than 0, this should be called when !cgroup_is_dead() */
5526unsigned short css_id(struct cgroup_subsys_state *css)
5527{
5528 struct css_id *cssid;
5529
5530 /*
5531 * This css_id() can return correct value when somone has refcnt
5532 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5533 * it's unchanged until freed.
5534 */
5535 cssid = rcu_dereference_raw(css->id);
5536
5537 if (cssid)
5538 return cssid->id;
5539 return 0;
5540}
5541EXPORT_SYMBOL_GPL(css_id);
5542
5543/**
5544 * css_is_ancestor - test "root" css is an ancestor of "child"
5545 * @child: the css to be tested.
5546 * @root: the css supporsed to be an ancestor of the child.
5547 *
5548 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
5549 * this function reads css->id, the caller must hold rcu_read_lock().
5550 * But, considering usual usage, the csses should be valid objects after test.
5551 * Assuming that the caller will do some action to the child if this returns
5552 * returns true, the caller must take "child";s reference count.
5553 * If "child" is valid object and this returns true, "root" is valid, too.
5554 */
5555
5556bool css_is_ancestor(struct cgroup_subsys_state *child,
5557 const struct cgroup_subsys_state *root)
5558{
5559 struct css_id *child_id;
5560 struct css_id *root_id;
5561
5562 child_id = rcu_dereference(child->id);
5563 if (!child_id)
5564 return false;
5565 root_id = rcu_dereference(root->id);
5566 if (!root_id)
5567 return false;
5568 if (child_id->depth < root_id->depth)
5569 return false;
5570 if (child_id->stack[root_id->depth] != root_id->id)
5571 return false;
5572 return true;
5573}
5574
5575void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
5576{
5577 struct css_id *id = rcu_dereference_protected(css->id, true);
5578
5579 /* When this is called before css_id initialization, id can be NULL */
5580 if (!id)
5581 return;
5582
5583 BUG_ON(!ss->use_id);
5584
5585 rcu_assign_pointer(id->css, NULL);
5586 rcu_assign_pointer(css->id, NULL);
5587 spin_lock(&ss->id_lock);
5588 idr_remove(&ss->idr, id->id);
5589 spin_unlock(&ss->id_lock);
5590 kfree_rcu(id, rcu_head);
5591}
5592EXPORT_SYMBOL_GPL(free_css_id);
5593
5594/*
5595 * This is called by init or create(). Then, calls to this function are
5596 * always serialized (By cgroup_mutex() at create()).
5597 */
5598
5599static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5600{
5601 struct css_id *newid;
5602 int ret, size;
5603
5604 BUG_ON(!ss->use_id);
5605
5606 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
5607 newid = kzalloc(size, GFP_KERNEL);
5608 if (!newid)
5609 return ERR_PTR(-ENOMEM);
5610
5611 idr_preload(GFP_KERNEL);
5612 spin_lock(&ss->id_lock);
5613 /* Don't use 0. allocates an ID of 1-65535 */
5614 ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
5615 spin_unlock(&ss->id_lock);
5616 idr_preload_end();
5617
5618 /* Returns error when there are no free spaces for new ID.*/
5619 if (ret < 0)
5620 goto err_out;
5621
5622 newid->id = ret;
5623 newid->depth = depth;
5624 return newid;
5625err_out:
5626 kfree(newid);
5627 return ERR_PTR(ret);
5628
5629}
5630
5631static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5632 struct cgroup_subsys_state *rootcss)
5633{
5634 struct css_id *newid;
5635
5636 spin_lock_init(&ss->id_lock);
5637 idr_init(&ss->idr);
5638
5639 newid = get_new_cssid(ss, 0);
5640 if (IS_ERR(newid))
5641 return PTR_ERR(newid);
5642
5643 newid->stack[0] = newid->id;
5644 RCU_INIT_POINTER(newid->css, rootcss);
5645 RCU_INIT_POINTER(rootcss->id, newid);
5646 return 0;
5647}
5648
5649static int alloc_css_id(struct cgroup_subsys_state *child_css)
5650{
5651 struct cgroup_subsys_state *parent_css = css_parent(child_css);
5652 struct css_id *child_id, *parent_id;
5653 int i, depth;
5654
5655 parent_id = rcu_dereference_protected(parent_css->id, true);
5656 depth = parent_id->depth + 1;
5657
5658 child_id = get_new_cssid(child_css->ss, depth);
5659 if (IS_ERR(child_id))
5660 return PTR_ERR(child_id);
5661
5662 for (i = 0; i < depth; i++)
5663 child_id->stack[i] = parent_id->stack[i];
5664 child_id->stack[depth] = child_id->id;
5665 /*
5666 * child_id->css pointer will be set after this cgroup is available
5667 * see cgroup_populate_dir()
5668 */
5669 rcu_assign_pointer(child_css->id, child_id);
5670
5671 return 0;
5672}
5673
5674/**
5675 * css_lookup - lookup css by id
5676 * @ss: cgroup subsys to be looked into.
5677 * @id: the id
5678 *
5679 * Returns pointer to cgroup_subsys_state if there is valid one with id.
5680 * NULL if not. Should be called under rcu_read_lock()
5681 */
5682struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5683{
5684 struct css_id *cssid = NULL;
5685
5686 BUG_ON(!ss->use_id);
5687 cssid = idr_find(&ss->idr, id);
5688
5689 if (unlikely(!cssid))
5690 return NULL;
5691
5692 return rcu_dereference(cssid->css);
5693}
5694EXPORT_SYMBOL_GPL(css_lookup);
5695
5696/** 5486/**
5697 * css_from_dir - get corresponding css from the dentry of a cgroup dir 5487 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5698 * @dentry: directory dentry of interest 5488 * @dentry: directory dentry of interest
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 859c8dfd78a1..e5f3917aa05b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
  * instead of preempt_schedule() to exit user context if needed before
  * calling the scheduler.
  */
-void __sched notrace preempt_schedule_context(void)
+asmlinkage void __sched notrace preempt_schedule_context(void)
 {
 	enum ctx_state prev_ctx;
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d7f07a2da5a6..deff2e693766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -306,8 +306,28 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 			    __func__, cpu);
 		goto out_release;
 	}
+
+	/*
+	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+	 * and RCU users of this state to go away such that all new such users
+	 * will observe it.
+	 *
+	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+	 * not imply sync_sched(), so explicitly call both.
+	 *
+	 * Do sync before park smpboot threads to take care the rcu boost case.
+	 */
+#ifdef CONFIG_PREEMPT
+	synchronize_sched();
+#endif
+	synchronize_rcu();
+
 	smpboot_park_threads(cpu);
 
+	/*
+	 * So now all preempt/rcu users must observe !cpu_active().
+	 */
+
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
@@ -420,11 +440,6 @@ int cpu_up(unsigned int cpu)
 {
 	int err = 0;
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-	int nid;
-	pg_data_t *pgdat;
-#endif
-
 	if (!cpu_possible(cpu)) {
 		printk(KERN_ERR "can't online cpu %d because it is not "
 			"configured as may-hotadd at boot time\n", cpu);
@@ -435,27 +450,9 @@ int cpu_up(unsigned int cpu)
 		return -EINVAL;
 	}
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-	nid = cpu_to_node(cpu);
-	if (!node_online(nid)) {
-		err = mem_online_node(nid);
-		if (err)
-			return err;
-	}
-
-	pgdat = NODE_DATA(nid);
-	if (!pgdat) {
-		printk(KERN_ERR
-			"Can't online cpu %d due to NULL pgdat\n", cpu);
-		return -ENOMEM;
-	}
-
-	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
-		mutex_lock(&zonelists_mutex);
-		build_all_zonelists(NULL, NULL);
-		mutex_unlock(&zonelists_mutex);
-	}
-#endif
+	err = try_online_node(cpu_to_node(cpu));
+	if (err)
+		return err;
 
 	cpu_maps_update_begin();
 
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb5..988573a9a387 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
 	rcu_idle_enter();
 	trace_cpu_idle_rcuidle(0, smp_processor_id());
 	local_irq_enable();
-	while (!need_resched())
+	while (!tif_need_resched())
 		cpu_relax();
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 	rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
 		if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
 			cpu_idle_poll();
 		} else {
-			current_clr_polling();
-			if (!need_resched()) {
+			if (!current_clr_polling_and_test()) {
 				stop_critical_timings();
 				rcu_idle_enter();
 				arch_cpu_idle();
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
 			} else {
 				local_irq_enable();
 			}
-			current_set_polling();
+			__current_set_polling();
 		}
 		arch_cpu_idle_exit();
+		/*
+		 * We need to test and propagate the TIF_NEED_RESCHED
+		 * bit here because we might not have send the
+		 * reschedule IPI to idle tasks.
+		 */
+		if (tif_need_resched())
+			set_preempt_need_resched();
 	}
 	tick_nohz_idle_exit();
 	schedule_preempt_disabled();
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
 	 */
 	boot_init_stack_canary();
 #endif
-	current_set_polling();
+	__current_set_polling();
 	arch_cpu_idle_prepare();
 	cpu_idle_loop();
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6bf981e13c43..4772034b4b17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 	need_loop = task_has_mempolicy(tsk) ||
 			!nodes_intersects(*newmems, tsk->mems_allowed);
 
-	if (need_loop)
+	if (need_loop) {
+		local_irq_disable();
 		write_seqcount_begin(&tsk->mems_allowed_seq);
+	}
 
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
@@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
 	tsk->mems_allowed = *newmems;
 
-	if (need_loop)
+	if (need_loop) {
 		write_seqcount_end(&tsk->mems_allowed_seq);
+		local_irq_enable();
+	}
 
 	task_unlock(tsk);
 }
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0506d447aed2..7d2f35e5df2f 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -575,8 +575,12 @@ return_normal:
 	raw_spin_lock(&dbg_slave_lock);
 
 #ifdef CONFIG_SMP
+	/* If send_ready set, slaves are already waiting */
+	if (ks->send_ready)
+		atomic_set(ks->send_ready, 1);
+
 	/* Signal the other CPUs to enter kgdb_wait() */
-	if ((!kgdb_single_step) && kgdb_do_roundup)
+	else if ((!kgdb_single_step) && kgdb_do_roundup)
 		kgdb_roundup_cpus(flags);
 #endif
 
@@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
 	if (arch_kgdb_ops.enable_nmi)
 		arch_kgdb_ops.enable_nmi(0);
 
+	memset(ks, 0, sizeof(struct kgdb_state));
 	ks->cpu			= raw_smp_processor_id();
 	ks->ex_vector		= evector;
 	ks->signo		= signo;
 	ks->err_code		= ecode;
-	ks->kgdb_usethreadid	= 0;
 	ks->linux_regs		= regs;
 
 	if (kgdb_reenter_check(ks))
@@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs)
 	return 1;
 }
 
+int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready)
+{
+#ifdef CONFIG_SMP
+	if (!kgdb_io_ready(0) || !send_ready)
+		return 1;
+
+	if (kgdb_info[cpu].enter_kgdb == 0) {
+		struct kgdb_state kgdb_var;
+		struct kgdb_state *ks = &kgdb_var;
+
+		memset(ks, 0, sizeof(struct kgdb_state));
+		ks->cpu			= cpu;
+		ks->ex_vector		= trapnr;
+		ks->signo		= SIGTRAP;
+		ks->err_code		= KGDB_KDB_REASON_SYSTEM_NMI;
+		ks->linux_regs		= regs;
+		ks->send_ready		= send_ready;
+		kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+		return 0;
+	}
+#endif
+	return 1;
+}
+
 static void kgdb_console_write(struct console *co, const char *s,
 			       unsigned count)
 {
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 2235967e78b0..572aa4f5677c 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -26,6 +26,7 @@ struct kgdb_state {
 	unsigned long		threadid;
 	long			kgdb_usethreadid;
 	struct pt_regs		*linux_regs;
+	atomic_t		*send_ready;
 };
 
 /* Exception state values */
@@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks);
 extern int kdb_parse(const char *cmdstr);
 extern int kdb_common_init_state(struct kgdb_state *ks);
 extern int kdb_common_deinit_state(void);
+#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI
 #else /* ! CONFIG_KGDB_KDB */
 static inline int kdb_stub(struct kgdb_state *ks)
 {
 	return DBG_PASS_EVENT;
 }
+#define KGDB_KDB_REASON_SYSTEM_NMI 0
 #endif /* CONFIG_KGDB_KDB */
 
 #endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 328d18ef31e4..8859ca34dcfe 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks)
 	if (atomic_read(&kgdb_setting_breakpoint))
 		reason = KDB_REASON_KEYBOARD;
 
-	if (in_nmi())
+	if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
+		reason = KDB_REASON_SYSTEM_NMI;
+
+	else if (in_nmi())
 		reason = KDB_REASON_NMI;
 
 	for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 00eb8f7fbf41..0b097c8a1e50 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
 			   instruction_pointer(regs));
 		kdb_dumpregs(regs);
 		break;
+	case KDB_REASON_SYSTEM_NMI:
+		kdb_printf("due to System NonMaskable Interrupt\n");
+		break;
 	case KDB_REASON_NMI:
 		kdb_printf("due to NonMaskable Interrupt @ "
 			   kdb_machreg_fmt "\n",
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index d473988c1d0b..54996b71e66d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -108,12 +108,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	struct timespec ts;
 	cputime_t utime, stime, stimescaled, utimescaled;
 
-	/* Though tsk->delays accessed later, early exit avoids
-	 * unnecessary returning of other data
-	 */
-	if (!tsk->delays)
-		goto done;
-
 	tmp = (s64)d->cpu_run_real_total;
 	task_cputime(tsk, &utime, &stime);
 	cputime_to_timespec(utime + stime, &ts);
@@ -158,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	d->freepages_count += tsk->delays->freepages_count;
 	spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
-done:
 	return 0;
 }
 
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index ff915efef66d..e556751d15d9 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -1,23 +1,19 @@
 #include <linux/elf.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
-
-#include <asm/elf.h>
-
+#include <linux/binfmts.h>
 
 Elf_Half __weak elf_core_extra_phdrs(void)
 {
 	return 0;
 }
 
-int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
-				      unsigned long limit)
+int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
 {
 	return 1;
 }
 
-int __weak elf_core_write_extra_data(struct file *file, size_t *size,
-				     unsigned long limit)
+int __weak elf_core_write_extra_data(struct coredump_params *cprm)
 {
 	return 1;
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 953c14348375..f5744010a8d2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
175static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 175static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
176static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; 176static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
177 177
178static atomic_t perf_sample_allowed_ns __read_mostly = 178static int perf_sample_allowed_ns __read_mostly =
179 ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); 179 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
180 180
181void update_perf_cpu_limits(void) 181void update_perf_cpu_limits(void)
182{ 182{
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
184 184
185 tmp *= sysctl_perf_cpu_time_max_percent; 185 tmp *= sysctl_perf_cpu_time_max_percent;
186 do_div(tmp, 100); 186 do_div(tmp, 100);
187 atomic_set(&perf_sample_allowed_ns, tmp); 187 ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
188} 188}
189 189
190static int perf_rotate_context(struct perf_cpu_context *cpuctx); 190static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
193 void __user *buffer, size_t *lenp, 193 void __user *buffer, size_t *lenp,
194 loff_t *ppos) 194 loff_t *ppos)
195{ 195{
196 int ret = proc_dointvec(table, write, buffer, lenp, ppos); 196 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
197 197
198 if (ret || !write) 198 if (ret || !write)
199 return ret; 199 return ret;
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
228 * we detect that events are taking too long. 228 * we detect that events are taking too long.
229 */ 229 */
230#define NR_ACCUMULATED_SAMPLES 128 230#define NR_ACCUMULATED_SAMPLES 128
231DEFINE_PER_CPU(u64, running_sample_length); 231static DEFINE_PER_CPU(u64, running_sample_length);
232 232
233void perf_sample_event_took(u64 sample_len_ns) 233void perf_sample_event_took(u64 sample_len_ns)
234{ 234{
235 u64 avg_local_sample_len; 235 u64 avg_local_sample_len;
236 u64 local_samples_len; 236 u64 local_samples_len;
237 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
237 238
238 if (atomic_read(&perf_sample_allowed_ns) == 0) 239 if (allowed_ns == 0)
239 return; 240 return;
240 241
241 /* decay the counter by 1 average sample */ 242 /* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
251 */ 252 */
252 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; 253 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
253 254
254 if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) 255 if (avg_local_sample_len <= allowed_ns)
255 return; 256 return;
256 257
257 if (max_samples_per_tick <= 1) 258 if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
262 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 263 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
263 264
264 printk_ratelimited(KERN_WARNING 265 printk_ratelimited(KERN_WARNING
265 "perf samples too long (%lld > %d), lowering " 266 "perf samples too long (%lld > %lld), lowering "
266 "kernel.perf_event_max_sample_rate to %d\n", 267 "kernel.perf_event_max_sample_rate to %d\n",
267 avg_local_sample_len, 268 avg_local_sample_len, allowed_ns,
268 atomic_read(&perf_sample_allowed_ns),
269 sysctl_perf_event_sample_rate); 269 sysctl_perf_event_sample_rate);
270 270
271 update_perf_cpu_limits(); 271 update_perf_cpu_limits();
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
899 put_ctx(ctx->parent_ctx); 899 put_ctx(ctx->parent_ctx);
900 ctx->parent_ctx = NULL; 900 ctx->parent_ctx = NULL;
901 } 901 }
902 ctx->generation++;
902} 903}
903 904
904static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 905static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1136 ctx->nr_events++; 1137 ctx->nr_events++;
1137 if (event->attr.inherit_stat) 1138 if (event->attr.inherit_stat)
1138 ctx->nr_stat++; 1139 ctx->nr_stat++;
1140
1141 ctx->generation++;
1139} 1142}
1140 1143
1141/* 1144/*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
1201 if (sample_type & PERF_SAMPLE_DATA_SRC) 1204 if (sample_type & PERF_SAMPLE_DATA_SRC)
1202 size += sizeof(data->data_src.val); 1205 size += sizeof(data->data_src.val);
1203 1206
1207 if (sample_type & PERF_SAMPLE_TRANSACTION)
1208 size += sizeof(data->txn);
1209
1204 event->header_size = size; 1210 event->header_size = size;
1205} 1211}
1206 1212
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1310 */ 1316 */
1311 if (event->state > PERF_EVENT_STATE_OFF) 1317 if (event->state > PERF_EVENT_STATE_OFF)
1312 event->state = PERF_EVENT_STATE_OFF; 1318 event->state = PERF_EVENT_STATE_OFF;
1319
1320 ctx->generation++;
1313} 1321}
1314 1322
1315static void perf_group_detach(struct perf_event *event) 1323static void perf_group_detach(struct perf_event *event)
@@ -1388,6 +1396,8 @@ event_sched_out(struct perf_event *event,
1388 if (event->state != PERF_EVENT_STATE_ACTIVE) 1396 if (event->state != PERF_EVENT_STATE_ACTIVE)
1389 return; 1397 return;
1390 1398
1399 perf_pmu_disable(event->pmu);
1400
1391 event->state = PERF_EVENT_STATE_INACTIVE; 1401 event->state = PERF_EVENT_STATE_INACTIVE;
1392 if (event->pending_disable) { 1402 if (event->pending_disable) {
1393 event->pending_disable = 0; 1403 event->pending_disable = 0;
@@ -1404,6 +1414,8 @@ event_sched_out(struct perf_event *event,
1404 ctx->nr_freq--; 1414 ctx->nr_freq--;
1405 if (event->attr.exclusive || !cpuctx->active_oncpu) 1415 if (event->attr.exclusive || !cpuctx->active_oncpu)
1406 cpuctx->exclusive = 0; 1416 cpuctx->exclusive = 0;
1417
1418 perf_pmu_enable(event->pmu);
1407} 1419}
1408 1420
1409static void 1421static void
@@ -1644,6 +1656,7 @@ event_sched_in(struct perf_event *event,
1644 struct perf_event_context *ctx) 1656 struct perf_event_context *ctx)
1645{ 1657{
1646 u64 tstamp = perf_event_time(event); 1658 u64 tstamp = perf_event_time(event);
1659 int ret = 0;
1647 1660
1648 if (event->state <= PERF_EVENT_STATE_OFF) 1661 if (event->state <= PERF_EVENT_STATE_OFF)
1649 return 0; 1662 return 0;
@@ -1666,10 +1679,13 @@ event_sched_in(struct perf_event *event,
1666 */ 1679 */
1667 smp_wmb(); 1680 smp_wmb();
1668 1681
1682 perf_pmu_disable(event->pmu);
1683
1669 if (event->pmu->add(event, PERF_EF_START)) { 1684 if (event->pmu->add(event, PERF_EF_START)) {
1670 event->state = PERF_EVENT_STATE_INACTIVE; 1685 event->state = PERF_EVENT_STATE_INACTIVE;
1671 event->oncpu = -1; 1686 event->oncpu = -1;
1672 return -EAGAIN; 1687 ret = -EAGAIN;
1688 goto out;
1673 } 1689 }
1674 1690
1675 event->tstamp_running += tstamp - event->tstamp_stopped; 1691 event->tstamp_running += tstamp - event->tstamp_stopped;
@@ -1685,7 +1701,10 @@ event_sched_in(struct perf_event *event,
1685 if (event->attr.exclusive) 1701 if (event->attr.exclusive)
1686 cpuctx->exclusive = 1; 1702 cpuctx->exclusive = 1;
1687 1703
1688 return 0; 1704out:
1705 perf_pmu_enable(event->pmu);
1706
1707 return ret;
1689} 1708}
1690 1709
1691static int 1710static int
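
The event_sched_out()/event_sched_in() hunks above bracket the PMU-specific work with perf_pmu_disable()/perf_pmu_enable() and route the failure path through a single exit label. A generic sketch of that bracket-and-single-exit shape; the device_* helpers are invented stand-ins, not perf APIs.

#include <stdio.h>

static void device_disable(void) { printf("disable\n"); }
static void device_enable(void)  { printf("enable\n"); }
static int  device_add(int fail) { return fail ? -1 : 0; }

/*
 * Keep the disable/enable pair balanced on every path by funnelling
 * errors through one label instead of returning early.
 */
static int sched_in(int fail)
{
        int ret = 0;

        device_disable();

        if (device_add(fail)) {
                ret = -1;
                goto out;
        }

        /* ... bookkeeping that must stay inside the bracket ... */

out:
        device_enable();
        return ret;
}

int main(void)
{
        printf("ok path:   %d\n", sched_in(0));
        printf("fail path: %d\n", sched_in(1));
        return 0;
}
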
@@ -2146,22 +2165,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2146} 2165}
2147 2166
2148/* 2167/*
2149 * Test whether two contexts are equivalent, i.e. whether they 2168 * Test whether two contexts are equivalent, i.e. whether they have both been
2150 * have both been cloned from the same version of the same context 2169 * cloned from the same version of the same context.
2151 * and they both have the same number of enabled events. 2170 *
2152 * If the number of enabled events is the same, then the set 2171 * Equivalence is measured using a generation number in the context that is
2153 * of enabled events should be the same, because these are both 2172 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2154 * inherited contexts, therefore we can't access individual events 2173 * and list_del_event().
2155 * in them directly with an fd; we can only enable/disable all
2156 * events via prctl, or enable/disable all events in a family
2157 * via ioctl, which will have the same effect on both contexts.
2158 */ 2174 */
2159static int context_equiv(struct perf_event_context *ctx1, 2175static int context_equiv(struct perf_event_context *ctx1,
2160 struct perf_event_context *ctx2) 2176 struct perf_event_context *ctx2)
2161{ 2177{
2162 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx 2178 /* Pinning disables the swap optimization */
2163 && ctx1->parent_gen == ctx2->parent_gen 2179 if (ctx1->pin_count || ctx2->pin_count)
2164 && !ctx1->pin_count && !ctx2->pin_count; 2180 return 0;
2181
2182 /* If ctx1 is the parent of ctx2 */
2183 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2184 return 1;
2185
2186 /* If ctx2 is the parent of ctx1 */
2187 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2188 return 1;
2189
2190 /*
2191 * If ctx1 and ctx2 have the same parent; we flatten the parent
2192 * hierarchy, see perf_event_init_context().
2193 */
2194 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2195 ctx1->parent_gen == ctx2->parent_gen)
2196 return 1;
2197
2198 /* Unmatched */
2199 return 0;
2165} 2200}
2166 2201
2167static void __perf_event_sync_stat(struct perf_event *event, 2202static void __perf_event_sync_stat(struct perf_event *event,
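
A standalone sketch of the generation-number test the rewritten context_equiv() performs (pin_count handling omitted). The struct below is a simplified stand-in, not the kernel's perf_event_context.

#include <stdbool.h>
#include <stdio.h>

struct ctx {
        struct ctx *parent;             /* context this was cloned from */
        unsigned long long gen;         /* bumped on every modification */
        unsigned long long parent_gen;  /* parent->gen captured at clone time */
};

/* A clone is only interchangeable if nothing changed since it was cloned. */
static bool ctx_equiv(struct ctx *c1, struct ctx *c2)
{
        if (c1 == c2->parent && c1->gen == c2->parent_gen)
                return true;            /* c1 is c2's unmodified parent */
        if (c1->parent == c2 && c1->parent_gen == c2->gen)
                return true;            /* c2 is c1's unmodified parent */
        if (c1->parent && c1->parent == c2->parent &&
            c1->parent_gen == c2->parent_gen)
                return true;            /* siblings cloned from the same state */
        return false;
}

int main(void)
{
        struct ctx parent = { .gen = 5 };
        struct ctx child  = { .parent = &parent, .parent_gen = 5 };

        printf("%d\n", ctx_equiv(&parent, &child)); /* 1: equivalent */
        parent.gen++;                               /* parent modified */
        printf("%d\n", ctx_equiv(&parent, &child)); /* 0: not clones any more */
        return 0;
}
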
@@ -2210,9 +2245,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
2210 perf_event_update_userpage(next_event); 2245 perf_event_update_userpage(next_event);
2211} 2246}
2212 2247
2213#define list_next_entry(pos, member) \
2214 list_entry(pos->member.next, typeof(*pos), member)
2215
2216static void perf_event_sync_stat(struct perf_event_context *ctx, 2248static void perf_event_sync_stat(struct perf_event_context *ctx,
2217 struct perf_event_context *next_ctx) 2249 struct perf_event_context *next_ctx)
2218{ 2250{
@@ -2244,7 +2276,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2244{ 2276{
2245 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 2277 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2246 struct perf_event_context *next_ctx; 2278 struct perf_event_context *next_ctx;
2247 struct perf_event_context *parent; 2279 struct perf_event_context *parent, *next_parent;
2248 struct perf_cpu_context *cpuctx; 2280 struct perf_cpu_context *cpuctx;
2249 int do_switch = 1; 2281 int do_switch = 1;
2250 2282
@@ -2256,10 +2288,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2256 return; 2288 return;
2257 2289
2258 rcu_read_lock(); 2290 rcu_read_lock();
2259 parent = rcu_dereference(ctx->parent_ctx);
2260 next_ctx = next->perf_event_ctxp[ctxn]; 2291 next_ctx = next->perf_event_ctxp[ctxn];
2261 if (parent && next_ctx && 2292 if (!next_ctx)
2262 rcu_dereference(next_ctx->parent_ctx) == parent) { 2293 goto unlock;
2294
2295 parent = rcu_dereference(ctx->parent_ctx);
2296 next_parent = rcu_dereference(next_ctx->parent_ctx);
2297
 2298 /* If neither context has a parent context, they cannot be clones. */
2299 if (!parent && !next_parent)
2300 goto unlock;
2301
2302 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2263 /* 2303 /*
2264 * Looks like the two contexts are clones, so we might be 2304 * Looks like the two contexts are clones, so we might be
2265 * able to optimize the context switch. We lock both 2305 * able to optimize the context switch. We lock both
@@ -2287,6 +2327,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2287 raw_spin_unlock(&next_ctx->lock); 2327 raw_spin_unlock(&next_ctx->lock);
2288 raw_spin_unlock(&ctx->lock); 2328 raw_spin_unlock(&ctx->lock);
2289 } 2329 }
2330unlock:
2290 rcu_read_unlock(); 2331 rcu_read_unlock();
2291 2332
2292 if (do_switch) { 2333 if (do_switch) {
@@ -2713,6 +2754,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2713 if (!event_filter_match(event)) 2754 if (!event_filter_match(event))
2714 continue; 2755 continue;
2715 2756
2757 perf_pmu_disable(event->pmu);
2758
2716 hwc = &event->hw; 2759 hwc = &event->hw;
2717 2760
2718 if (hwc->interrupts == MAX_INTERRUPTS) { 2761 if (hwc->interrupts == MAX_INTERRUPTS) {
@@ -2722,7 +2765,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2722 } 2765 }
2723 2766
2724 if (!event->attr.freq || !event->attr.sample_freq) 2767 if (!event->attr.freq || !event->attr.sample_freq)
2725 continue; 2768 goto next;
2726 2769
2727 /* 2770 /*
2728 * stop the event and update event->count 2771 * stop the event and update event->count
@@ -2744,6 +2787,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2744 perf_adjust_period(event, period, delta, false); 2787 perf_adjust_period(event, period, delta, false);
2745 2788
2746 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); 2789 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2790 next:
2791 perf_pmu_enable(event->pmu);
2747 } 2792 }
2748 2793
2749 perf_pmu_enable(ctx->pmu); 2794 perf_pmu_enable(ctx->pmu);
@@ -4572,6 +4617,9 @@ void perf_output_sample(struct perf_output_handle *handle,
4572 if (sample_type & PERF_SAMPLE_DATA_SRC) 4617 if (sample_type & PERF_SAMPLE_DATA_SRC)
4573 perf_output_put(handle, data->data_src.val); 4618 perf_output_put(handle, data->data_src.val);
4574 4619
4620 if (sample_type & PERF_SAMPLE_TRANSACTION)
4621 perf_output_put(handle, data->txn);
4622
4575 if (!event->attr.watermark) { 4623 if (!event->attr.watermark) {
4576 int wakeup_events = event->attr.wakeup_events; 4624 int wakeup_events = event->attr.wakeup_events;
4577 4625
@@ -5100,27 +5148,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5100 unsigned int size; 5148 unsigned int size;
5101 char tmp[16]; 5149 char tmp[16];
5102 char *buf = NULL; 5150 char *buf = NULL;
5103 const char *name; 5151 char *name;
5104
5105 memset(tmp, 0, sizeof(tmp));
5106 5152
5107 if (file) { 5153 if (file) {
5108 struct inode *inode; 5154 struct inode *inode;
5109 dev_t dev; 5155 dev_t dev;
5156
5157 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5158 if (!buf) {
5159 name = "//enomem";
5160 goto cpy_name;
5161 }
5110 /* 5162 /*
5111 * d_path works from the end of the rb backwards, so we 5163 * d_path() works from the end of the rb backwards, so we
5112 * need to add enough zero bytes after the string to handle 5164 * need to add enough zero bytes after the string to handle
5113 * the 64bit alignment we do later. 5165 * the 64bit alignment we do later.
5114 */ 5166 */
5115 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); 5167 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5116 if (!buf) {
5117 name = strncpy(tmp, "//enomem", sizeof(tmp));
5118 goto got_name;
5119 }
5120 name = d_path(&file->f_path, buf, PATH_MAX);
5121 if (IS_ERR(name)) { 5168 if (IS_ERR(name)) {
5122 name = strncpy(tmp, "//toolong", sizeof(tmp)); 5169 name = "//toolong";
5123 goto got_name; 5170 goto cpy_name;
5124 } 5171 }
5125 inode = file_inode(vma->vm_file); 5172 inode = file_inode(vma->vm_file);
5126 dev = inode->i_sb->s_dev; 5173 dev = inode->i_sb->s_dev;
@@ -5128,34 +5175,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5128 gen = inode->i_generation; 5175 gen = inode->i_generation;
5129 maj = MAJOR(dev); 5176 maj = MAJOR(dev);
5130 min = MINOR(dev); 5177 min = MINOR(dev);
5131 5178 goto got_name;
5132 } else { 5179 } else {
5133 if (arch_vma_name(mmap_event->vma)) { 5180 name = (char *)arch_vma_name(vma);
5134 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 5181 if (name)
5135 sizeof(tmp) - 1); 5182 goto cpy_name;
5136 tmp[sizeof(tmp) - 1] = '\0';
5137 goto got_name;
5138 }
5139 5183
5140 if (!vma->vm_mm) { 5184 if (vma->vm_start <= vma->vm_mm->start_brk &&
5141 name = strncpy(tmp, "[vdso]", sizeof(tmp));
5142 goto got_name;
5143 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
5144 vma->vm_end >= vma->vm_mm->brk) { 5185 vma->vm_end >= vma->vm_mm->brk) {
5145 name = strncpy(tmp, "[heap]", sizeof(tmp)); 5186 name = "[heap]";
5146 goto got_name; 5187 goto cpy_name;
5147 } else if (vma->vm_start <= vma->vm_mm->start_stack && 5188 }
5189 if (vma->vm_start <= vma->vm_mm->start_stack &&
5148 vma->vm_end >= vma->vm_mm->start_stack) { 5190 vma->vm_end >= vma->vm_mm->start_stack) {
5149 name = strncpy(tmp, "[stack]", sizeof(tmp)); 5191 name = "[stack]";
5150 goto got_name; 5192 goto cpy_name;
5151 } 5193 }
5152 5194
5153 name = strncpy(tmp, "//anon", sizeof(tmp)); 5195 name = "//anon";
5154 goto got_name; 5196 goto cpy_name;
5155 } 5197 }
5156 5198
5199cpy_name:
5200 strlcpy(tmp, name, sizeof(tmp));
5201 name = tmp;
5157got_name: 5202got_name:
5158 size = ALIGN(strlen(name)+1, sizeof(u64)); 5203 /*
5204 * Since our buffer works in 8 byte units we need to align our string
5205 * size to a multiple of 8. However, we must guarantee the tail end is
5206 * zero'd out to avoid leaking random bits to userspace.
5207 */
5208 size = strlen(name)+1;
5209 while (!IS_ALIGNED(size, sizeof(u64)))
5210 name[size++] = '\0';
5159 5211
5160 mmap_event->file_name = name; 5212 mmap_event->file_name = name;
5161 mmap_event->file_size = size; 5213 mmap_event->file_size = size;
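
The got_name hunk pads the name to a multiple of 8 bytes with explicit zero bytes instead of rounding up over uninitialised memory. A small sketch of that padding step; is_aligned() reproduces the power-of-two test done by the kernel's IS_ALIGNED(), and the buffer contents are made up.

#include <stdio.h>
#include <string.h>

static int is_aligned(size_t x, size_t a)
{
        return (x & (a - 1)) == 0;      /* 'a' must be a power of two */
}

int main(void)
{
        char buf[32] = "libfoo.so";     /* assumed name, with room to pad */
        size_t size = strlen(buf) + 1;  /* include the terminating NUL */

        /* Pad with zero bytes so nothing past the NUL leaks out. */
        while (!is_aligned(size, sizeof(unsigned long long)))
                buf[size++] = '\0';

        printf("padded size: %zu\n", size);     /* 16 for "libfoo.so" */
        return 0;
}
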
@@ -5643,11 +5695,6 @@ static void swevent_hlist_put(struct perf_event *event)
5643{ 5695{
5644 int cpu; 5696 int cpu;
5645 5697
5646 if (event->cpu != -1) {
5647 swevent_hlist_put_cpu(event, event->cpu);
5648 return;
5649 }
5650
5651 for_each_possible_cpu(cpu) 5698 for_each_possible_cpu(cpu)
5652 swevent_hlist_put_cpu(event, cpu); 5699 swevent_hlist_put_cpu(event, cpu);
5653} 5700}
@@ -5681,9 +5728,6 @@ static int swevent_hlist_get(struct perf_event *event)
5681 int err; 5728 int err;
5682 int cpu, failed_cpu; 5729 int cpu, failed_cpu;
5683 5730
5684 if (event->cpu != -1)
5685 return swevent_hlist_get_cpu(event, event->cpu);
5686
5687 get_online_cpus(); 5731 get_online_cpus();
5688 for_each_possible_cpu(cpu) { 5732 for_each_possible_cpu(cpu) {
5689 err = swevent_hlist_get_cpu(event, cpu); 5733 err = swevent_hlist_get_cpu(event, cpu);
@@ -6292,6 +6336,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
6292 6336
6293 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 6337 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
6294} 6338}
6339static DEVICE_ATTR_RO(type);
6295 6340
6296static ssize_t 6341static ssize_t
6297perf_event_mux_interval_ms_show(struct device *dev, 6342perf_event_mux_interval_ms_show(struct device *dev,
@@ -6336,17 +6381,19 @@ perf_event_mux_interval_ms_store(struct device *dev,
6336 6381
6337 return count; 6382 return count;
6338} 6383}
6384static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
6339 6385
6340static struct device_attribute pmu_dev_attrs[] = { 6386static struct attribute *pmu_dev_attrs[] = {
6341 __ATTR_RO(type), 6387 &dev_attr_type.attr,
6342 __ATTR_RW(perf_event_mux_interval_ms), 6388 &dev_attr_perf_event_mux_interval_ms.attr,
6343 __ATTR_NULL, 6389 NULL,
6344}; 6390};
6391ATTRIBUTE_GROUPS(pmu_dev);
6345 6392
6346static int pmu_bus_running; 6393static int pmu_bus_running;
6347static struct bus_type pmu_bus = { 6394static struct bus_type pmu_bus = {
6348 .name = "event_source", 6395 .name = "event_source",
6349 .dev_attrs = pmu_dev_attrs, 6396 .dev_groups = pmu_dev_groups,
6350}; 6397};
6351 6398
6352static void pmu_dev_release(struct device *dev) 6399static void pmu_dev_release(struct device *dev)
@@ -7126,7 +7173,6 @@ SYSCALL_DEFINE5(perf_event_open,
7126 } 7173 }
7127 7174
7128 perf_install_in_context(ctx, event, event->cpu); 7175 perf_install_in_context(ctx, event, event->cpu);
7129 ++ctx->generation;
7130 perf_unpin_context(ctx); 7176 perf_unpin_context(ctx);
7131 mutex_unlock(&ctx->mutex); 7177 mutex_unlock(&ctx->mutex);
7132 7178
@@ -7209,7 +7255,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7209 WARN_ON_ONCE(ctx->parent_ctx); 7255 WARN_ON_ONCE(ctx->parent_ctx);
7210 mutex_lock(&ctx->mutex); 7256 mutex_lock(&ctx->mutex);
7211 perf_install_in_context(ctx, event, cpu); 7257 perf_install_in_context(ctx, event, cpu);
7212 ++ctx->generation;
7213 perf_unpin_context(ctx); 7258 perf_unpin_context(ctx);
7214 mutex_unlock(&ctx->mutex); 7259 mutex_unlock(&ctx->mutex);
7215 7260
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca6599723be5..569b218782ad 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
82} 82}
83 83
84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ 84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
85static inline unsigned int \ 85static inline unsigned long \
86func_name(struct perf_output_handle *handle, \ 86func_name(struct perf_output_handle *handle, \
87 const void *buf, unsigned int len) \ 87 const void *buf, unsigned long len) \
88{ \ 88{ \
89 unsigned long size, written; \ 89 unsigned long size, written; \
90 \ 90 \
91 do { \ 91 do { \
92 size = min_t(unsigned long, handle->size, len); \ 92 size = min(handle->size, len); \
93 \
94 written = memcpy_func(handle->addr, buf, size); \ 93 written = memcpy_func(handle->addr, buf, size); \
94 written = size - written; \
95 \ 95 \
96 len -= written; \ 96 len -= written; \
97 handle->addr += written; \ 97 handle->addr += written; \
@@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \
110 return len; \ 110 return len; \
111} 111}
112 112
113static inline int memcpy_common(void *dst, const void *src, size_t n) 113static inline unsigned long
114memcpy_common(void *dst, const void *src, unsigned long n)
114{ 115{
115 memcpy(dst, src, n); 116 memcpy(dst, src, n);
116 return n; 117 return 0;
117} 118}
118 119
119DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) 120DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
120 121
121#define MEMCPY_SKIP(dst, src, n) (n) 122static inline unsigned long
123memcpy_skip(void *dst, const void *src, unsigned long n)
124{
125 return 0;
126}
122 127
123DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) 128DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
124 129
125#ifndef arch_perf_out_copy_user 130#ifndef arch_perf_out_copy_user
126#define arch_perf_out_copy_user __copy_from_user_inatomic 131#define arch_perf_out_copy_user arch_perf_out_copy_user
132
133static inline unsigned long
134arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
135{
136 unsigned long ret;
137
138 pagefault_disable();
139 ret = __copy_from_user_inatomic(dst, src, n);
140 pagefault_enable();
141
142 return ret;
143}
127#endif 144#endif
128 145
129DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) 146DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
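
The internal.h changes make every copy helper follow the copy_from_user() convention: return the number of bytes that could not be copied, 0 on success. A plain userspace sketch of the loop shape DEFINE_OUTPUT_COPY() expands to under that convention; the names are illustrative and the chunking is simulated with a fixed window size.

#include <stdio.h>
#include <string.h>

/* Returns how many bytes could NOT be copied (always 0 here). */
static unsigned long copy_chunk(void *dst, const void *src, unsigned long n)
{
        memcpy(dst, src, n);
        return 0;
}

/* Copy 'len' bytes through windows of at most 'chunk' bytes. */
static unsigned long output_copy(char *dst, const char *src,
                                 unsigned long len, unsigned long chunk)
{
        while (len) {
                unsigned long size = len < chunk ? len : chunk;
                unsigned long left = copy_chunk(dst, src, size);
                unsigned long written = size - left;

                if (!written)
                        break;          /* no progress: report what is left */

                len -= written;
                dst += written;
                src += written;
        }
        return len;                     /* 0 means everything was copied */
}

int main(void)
{
        char src[64] = "ring buffer payload";
        char dst[64] = { 0 };

        printf("left: %lu, dst: %s\n",
               output_copy(dst, src, sizeof(src), 8), dst);
        return 0;
}
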
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 9c2ddfbf4525..e8b168af135b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
12#include <linux/perf_event.h> 12#include <linux/perf_event.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/circ_buf.h>
15 16
16#include "internal.h" 17#include "internal.h"
17 18
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long sz = perf_data_size(rb);
22 unsigned long mask = sz - 1;
23
24 /*
25 * check if user-writable
26 * overwrite : over-write its own tail
27 * !overwrite: buffer possibly drops events.
28 */
29 if (rb->overwrite)
30 return true;
31
32 /*
33 * verify that payload is not bigger than buffer
34 * otherwise masking logic may fail to detect
35 * the "not enough space" condition
36 */
37 if ((head - offset) > sz)
38 return false;
39
40 offset = (offset - tail) & mask;
41 head = (head - tail) & mask;
42
43 if ((int)(head - offset) < 0)
44 return false;
45
46 return true;
47}
48
49static void perf_output_wakeup(struct perf_output_handle *handle) 19static void perf_output_wakeup(struct perf_output_handle *handle)
50{ 20{
51 atomic_set(&handle->rb->poll, POLL_IN); 21 atomic_set(&handle->rb->poll, POLL_IN);
@@ -115,8 +85,8 @@ again:
115 rb->user_page->data_head = head; 85 rb->user_page->data_head = head;
116 86
117 /* 87 /*
118 * Now check if we missed an update, rely on the (compiler) 88 * Now check if we missed an update -- rely on previous implied
119 * barrier in atomic_dec_and_test() to re-read rb->head. 89 * compiler barriers to force a re-read.
120 */ 90 */
121 if (unlikely(head != local_read(&rb->head))) { 91 if (unlikely(head != local_read(&rb->head))) {
122 local_inc(&rb->nest); 92 local_inc(&rb->nest);
@@ -135,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle,
135{ 105{
136 struct ring_buffer *rb; 106 struct ring_buffer *rb;
137 unsigned long tail, offset, head; 107 unsigned long tail, offset, head;
138 int have_lost; 108 int have_lost, page_shift;
139 struct perf_sample_data sample_data;
140 struct { 109 struct {
141 struct perf_event_header header; 110 struct perf_event_header header;
142 u64 id; 111 u64 id;
@@ -151,57 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle,
151 event = event->parent; 120 event = event->parent;
152 121
153 rb = rcu_dereference(event->rb); 122 rb = rcu_dereference(event->rb);
154 if (!rb) 123 if (unlikely(!rb))
155 goto out; 124 goto out;
156 125
157 handle->rb = rb; 126 if (unlikely(!rb->nr_pages))
158 handle->event = event;
159
160 if (!rb->nr_pages)
161 goto out; 127 goto out;
162 128
129 handle->rb = rb;
130 handle->event = event;
131
163 have_lost = local_read(&rb->lost); 132 have_lost = local_read(&rb->lost);
164 if (have_lost) { 133 if (unlikely(have_lost)) {
165 lost_event.header.size = sizeof(lost_event); 134 size += sizeof(lost_event);
166 perf_event_header__init_id(&lost_event.header, &sample_data, 135 if (event->attr.sample_id_all)
167 event); 136 size += event->id_header_size;
168 size += lost_event.header.size;
169 } 137 }
170 138
171 perf_output_get_handle(handle); 139 perf_output_get_handle(handle);
172 140
173 do { 141 do {
174 /*
175 * Userspace could choose to issue a mb() before updating the
176 * tail pointer. So that all reads will be completed before the
177 * write is issued.
178 *
179 * See perf_output_put_handle().
180 */
181 tail = ACCESS_ONCE(rb->user_page->data_tail); 142 tail = ACCESS_ONCE(rb->user_page->data_tail);
182 smp_mb();
183 offset = head = local_read(&rb->head); 143 offset = head = local_read(&rb->head);
184 head += size; 144 if (!rb->overwrite &&
185 if (unlikely(!perf_output_space(rb, tail, offset, head))) 145 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
186 goto fail; 146 goto fail;
147 head += size;
187 } while (local_cmpxchg(&rb->head, offset, head) != offset); 148 } while (local_cmpxchg(&rb->head, offset, head) != offset);
188 149
189 if (head - local_read(&rb->wakeup) > rb->watermark) 150 /*
151 * Separate the userpage->tail read from the data stores below.
152 * Matches the MB userspace SHOULD issue after reading the data
153 * and before storing the new tail position.
154 *
155 * See perf_output_put_handle().
156 */
157 smp_mb();
158
159 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
190 local_add(rb->watermark, &rb->wakeup); 160 local_add(rb->watermark, &rb->wakeup);
191 161
192 handle->page = offset >> (PAGE_SHIFT + page_order(rb)); 162 page_shift = PAGE_SHIFT + page_order(rb);
193 handle->page &= rb->nr_pages - 1;
194 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
195 handle->addr = rb->data_pages[handle->page];
196 handle->addr += handle->size;
197 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
198 163
199 if (have_lost) { 164 handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
165 offset &= (1UL << page_shift) - 1;
166 handle->addr = rb->data_pages[handle->page] + offset;
167 handle->size = (1UL << page_shift) - offset;
168
169 if (unlikely(have_lost)) {
170 struct perf_sample_data sample_data;
171
172 lost_event.header.size = sizeof(lost_event);
200 lost_event.header.type = PERF_RECORD_LOST; 173 lost_event.header.type = PERF_RECORD_LOST;
201 lost_event.header.misc = 0; 174 lost_event.header.misc = 0;
202 lost_event.id = event->id; 175 lost_event.id = event->id;
203 lost_event.lost = local_xchg(&rb->lost, 0); 176 lost_event.lost = local_xchg(&rb->lost, 0);
204 177
178 perf_event_header__init_id(&lost_event.header,
179 &sample_data, event);
205 perf_output_put(handle, lost_event); 180 perf_output_put(handle, lost_event);
206 perf_event__output_id_sample(event, handle, &sample_data); 181 perf_event__output_id_sample(event, handle, &sample_data);
207 } 182 }
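
The rewritten perf_output_begin() delegates the free-space check to the circ_buf helpers. A minimal sketch of that power-of-two ring arithmetic; the two macros are written to mirror include/linux/circ_buf.h, but verify against the header rather than treating this as authoritative.

#include <stdio.h>

/* Occupancy and free space of a power-of-two sized ring. */
#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

int main(void)
{
        unsigned long size = 16;                /* must be a power of two */
        unsigned long head = 0, tail = 0;       /* producer / consumer */
        unsigned long want = 6;                 /* bytes to reserve */

        if (CIRC_SPACE(head, tail, size) >= want)
                head += want;                   /* space available: reserve it */

        printf("used %lu, free %lu\n",
               CIRC_CNT(head, tail, size),
               CIRC_SPACE(head, tail, size));   /* used 6, free 9 */
        return 0;
}
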
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ad8e1bdca70e..24b7d6ca871b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,6 +35,7 @@
35#include <linux/kdebug.h> /* notifier mechanism */ 35#include <linux/kdebug.h> /* notifier mechanism */
36#include "../../mm/internal.h" /* munlock_vma_page */ 36#include "../../mm/internal.h" /* munlock_vma_page */
37#include <linux/percpu-rwsem.h> 37#include <linux/percpu-rwsem.h>
38#include <linux/task_work.h>
38 39
39#include <linux/uprobes.h> 40#include <linux/uprobes.h>
40 41
@@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
244 * the architecture. If an arch has variable length instruction and the 245 * the architecture. If an arch has variable length instruction and the
245 * breakpoint instruction is not of the smallest length instruction 246 * breakpoint instruction is not of the smallest length instruction
246 * supported by that architecture then we need to modify is_trap_at_addr and 247 * supported by that architecture then we need to modify is_trap_at_addr and
247 * write_opcode accordingly. This would never be a problem for archs that 248 * uprobe_write_opcode accordingly. This would never be a problem for archs
248 * have fixed length instructions. 249 * that have fixed length instructions.
249 */ 250 */
250 251
251/* 252/*
252 * write_opcode - write the opcode at a given virtual address. 253 * uprobe_write_opcode - write the opcode at a given virtual address.
253 * @mm: the probed process address space. 254 * @mm: the probed process address space.
254 * @vaddr: the virtual address to store the opcode. 255 * @vaddr: the virtual address to store the opcode.
255 * @opcode: opcode to be written at @vaddr. 256 * @opcode: opcode to be written at @vaddr.
@@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
260 * For mm @mm, write the opcode at @vaddr. 261 * For mm @mm, write the opcode at @vaddr.
261 * Return 0 (success) or a negative errno. 262 * Return 0 (success) or a negative errno.
262 */ 263 */
263static int write_opcode(struct mm_struct *mm, unsigned long vaddr, 264int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
264 uprobe_opcode_t opcode) 265 uprobe_opcode_t opcode)
265{ 266{
266 struct page *old_page, *new_page; 267 struct page *old_page, *new_page;
@@ -314,7 +315,7 @@ put_old:
314 */ 315 */
315int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 316int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
316{ 317{
317 return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); 318 return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
318} 319}
319 320
320/** 321/**
@@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
329int __weak 330int __weak
330set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 331set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
331{ 332{
332 return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); 333 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
333} 334}
334 335
335static int match_uprobe(struct uprobe *l, struct uprobe *r) 336static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
503 return ret; 504 return ret;
504} 505}
505 506
506static int 507static int __copy_insn(struct address_space *mapping, struct file *filp,
507__copy_insn(struct address_space *mapping, struct file *filp, char *insn, 508 void *insn, int nbytes, loff_t offset)
508 unsigned long nbytes, loff_t offset)
509{ 509{
510 struct page *page; 510 struct page *page;
511 511
@@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
527 527
528static int copy_insn(struct uprobe *uprobe, struct file *filp) 528static int copy_insn(struct uprobe *uprobe, struct file *filp)
529{ 529{
530 struct address_space *mapping; 530 struct address_space *mapping = uprobe->inode->i_mapping;
531 unsigned long nbytes; 531 loff_t offs = uprobe->offset;
532 int bytes; 532 void *insn = uprobe->arch.insn;
533 533 int size = MAX_UINSN_BYTES;
534 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); 534 int len, err = -EIO;
535 mapping = uprobe->inode->i_mapping;
536 535
537 /* Instruction at end of binary; copy only available bytes */ 536 /* Copy only available bytes, -EIO if nothing was read */
538 if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) 537 do {
539 bytes = uprobe->inode->i_size - uprobe->offset; 538 if (offs >= i_size_read(uprobe->inode))
540 else 539 break;
541 bytes = MAX_UINSN_BYTES;
542 540
543 /* Instruction at the page-boundary; copy bytes in second page */ 541 len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
544 if (nbytes < bytes) { 542 err = __copy_insn(mapping, filp, insn, len, offs);
545 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
546 bytes - nbytes, uprobe->offset + nbytes);
547 if (err) 543 if (err)
548 return err; 544 break;
549 bytes = nbytes; 545
550 } 546 insn += len;
551 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); 547 offs += len;
548 size -= len;
549 } while (size);
550
551 return err;
552} 552}
553 553
554static int prepare_uprobe(struct uprobe *uprobe, struct file *file, 554static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
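
The reworked copy_insn() above copies the instruction bytes in chunks that never cross a page boundary. A hedged sketch of just that chunking arithmetic, with the actual copy replaced by a printf; the offsets and sizes are arbitrary examples.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Split a copy of 'size' bytes at offset 'offs' at every page boundary. */
static void copy_in_page_chunks(unsigned long long offs, unsigned long size)
{
        while (size) {
                unsigned long in_page = PAGE_SIZE - (offs & ~PAGE_MASK);
                unsigned long len = size < in_page ? size : in_page;

                printf("copy %lu bytes at offset %llu\n", len, offs);

                offs += len;
                size -= len;
        }
}

int main(void)
{
        /* 16 instruction bytes that straddle the boundary at 4096. */
        copy_in_page_chunks(4090, 16);
        return 0;
}
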
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
576 if (ret) 576 if (ret)
577 goto out; 577 goto out;
578 578
579 /* write_opcode() assumes we don't cross page boundary */ 579 /* uprobe_write_opcode() assumes we don't cross page boundary */
580 BUG_ON((uprobe->offset & ~PAGE_MASK) + 580 BUG_ON((uprobe->offset & ~PAGE_MASK) +
581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); 581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
582 582
@@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1096} 1096}
1097 1097
1098/* Slot allocation for XOL */ 1098/* Slot allocation for XOL */
1099static int xol_add_vma(struct xol_area *area) 1099static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1100{ 1100{
1101 struct mm_struct *mm = current->mm;
1102 int ret = -EALREADY; 1101 int ret = -EALREADY;
1103 1102
1104 down_write(&mm->mmap_sem); 1103 down_write(&mm->mmap_sem);
1105 if (mm->uprobes_state.xol_area) 1104 if (mm->uprobes_state.xol_area)
1106 goto fail; 1105 goto fail;
1107 1106
1108 ret = -ENOMEM; 1107 if (!area->vaddr) {
1109 /* Try to map as high as possible, this is only a hint. */ 1108 /* Try to map as high as possible, this is only a hint. */
1110 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1109 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1111 if (area->vaddr & ~PAGE_MASK) { 1110 PAGE_SIZE, 0, 0);
1112 ret = area->vaddr; 1111 if (area->vaddr & ~PAGE_MASK) {
1113 goto fail; 1112 ret = area->vaddr;
1113 goto fail;
1114 }
1114 } 1115 }
1115 1116
1116 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, 1117 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area)
1120 1121
1121 smp_wmb(); /* pairs with get_xol_area() */ 1122 smp_wmb(); /* pairs with get_xol_area() */
1122 mm->uprobes_state.xol_area = area; 1123 mm->uprobes_state.xol_area = area;
1123 ret = 0;
1124 fail: 1124 fail:
1125 up_write(&mm->mmap_sem); 1125 up_write(&mm->mmap_sem);
1126 1126
1127 return ret; 1127 return ret;
1128} 1128}
1129 1129
1130/* 1130static struct xol_area *__create_xol_area(unsigned long vaddr)
1131 * get_xol_area - Allocate process's xol_area if necessary.
1132 * This area will be used for storing instructions for execution out of line.
1133 *
1134 * Returns the allocated area or NULL.
1135 */
1136static struct xol_area *get_xol_area(void)
1137{ 1131{
1138 struct mm_struct *mm = current->mm; 1132 struct mm_struct *mm = current->mm;
1139 struct xol_area *area;
1140 uprobe_opcode_t insn = UPROBE_SWBP_INSN; 1133 uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1134 struct xol_area *area;
1141 1135
1142 area = mm->uprobes_state.xol_area; 1136 area = kmalloc(sizeof(*area), GFP_KERNEL);
1143 if (area)
1144 goto ret;
1145
1146 area = kzalloc(sizeof(*area), GFP_KERNEL);
1147 if (unlikely(!area)) 1137 if (unlikely(!area))
1148 goto out; 1138 goto out;
1149 1139
@@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void)
1155 if (!area->page) 1145 if (!area->page)
1156 goto free_bitmap; 1146 goto free_bitmap;
1157 1147
1158 /* allocate first slot of task's xol_area for the return probes */ 1148 area->vaddr = vaddr;
1149 init_waitqueue_head(&area->wq);
1150 /* Reserve the 1st slot for get_trampoline_vaddr() */
1159 set_bit(0, area->bitmap); 1151 set_bit(0, area->bitmap);
1160 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1161 atomic_set(&area->slot_count, 1); 1152 atomic_set(&area->slot_count, 1);
1162 init_waitqueue_head(&area->wq); 1153 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1163 1154
1164 if (!xol_add_vma(area)) 1155 if (!xol_add_vma(mm, area))
1165 return area; 1156 return area;
1166 1157
1167 __free_page(area->page); 1158 __free_page(area->page);
@@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void)
1170 free_area: 1161 free_area:
1171 kfree(area); 1162 kfree(area);
1172 out: 1163 out:
1164 return NULL;
1165}
1166
1167/*
1168 * get_xol_area - Allocate process's xol_area if necessary.
1169 * This area will be used for storing instructions for execution out of line.
1170 *
1171 * Returns the allocated area or NULL.
1172 */
1173static struct xol_area *get_xol_area(void)
1174{
1175 struct mm_struct *mm = current->mm;
1176 struct xol_area *area;
1177
1178 if (!mm->uprobes_state.xol_area)
1179 __create_xol_area(0);
1180
1173 area = mm->uprobes_state.xol_area; 1181 area = mm->uprobes_state.xol_area;
1174 ret: 1182 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1175 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1176 return area; 1183 return area;
1177} 1184}
1178 1185
@@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1256 return 0; 1263 return 0;
1257 1264
1258 /* Initialize the slot */ 1265 /* Initialize the slot */
1259 copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); 1266 copy_to_page(area->page, xol_vaddr,
1267 uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1260 /* 1268 /*
1261 * We probably need flush_icache_user_range() but it needs vma. 1269 * We probably need flush_icache_user_range() but it needs vma.
1262 * This should work on supported architectures too. 1270 * This should work on supported architectures too.
@@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t)
1345} 1353}
1346 1354
1347/* 1355/*
1348 * Called in context of a new clone/fork from copy_process.
1349 */
1350void uprobe_copy_process(struct task_struct *t)
1351{
1352 t->utask = NULL;
1353}
1354
1355/*
1356 * Allocate a uprobe_task object for the task if necessary. 1356
1357 * Called when the thread hits a breakpoint. 1357 * Called when the thread hits a breakpoint.
1358 * 1358 *
@@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void)
1367 return current->utask; 1367 return current->utask;
1368} 1368}
1369 1369
1370static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
1371{
1372 struct uprobe_task *n_utask;
1373 struct return_instance **p, *o, *n;
1374
1375 n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1376 if (!n_utask)
1377 return -ENOMEM;
1378 t->utask = n_utask;
1379
1380 p = &n_utask->return_instances;
1381 for (o = o_utask->return_instances; o; o = o->next) {
1382 n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1383 if (!n)
1384 return -ENOMEM;
1385
1386 *n = *o;
1387 atomic_inc(&n->uprobe->ref);
1388 n->next = NULL;
1389
1390 *p = n;
1391 p = &n->next;
1392 n_utask->depth++;
1393 }
1394
1395 return 0;
1396}
1397
1398static void uprobe_warn(struct task_struct *t, const char *msg)
1399{
1400 pr_warn("uprobe: %s:%d failed to %s\n",
1401 current->comm, current->pid, msg);
1402}
1403
1404static void dup_xol_work(struct callback_head *work)
1405{
1406 kfree(work);
1407
1408 if (current->flags & PF_EXITING)
1409 return;
1410
1411 if (!__create_xol_area(current->utask->vaddr))
1412 uprobe_warn(current, "dup xol area");
1413}
1414
1415/*
1416 * Called in context of a new clone/fork from copy_process.
1417 */
1418void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1419{
1420 struct uprobe_task *utask = current->utask;
1421 struct mm_struct *mm = current->mm;
1422 struct callback_head *work;
1423 struct xol_area *area;
1424
1425 t->utask = NULL;
1426
1427 if (!utask || !utask->return_instances)
1428 return;
1429
1430 if (mm == t->mm && !(flags & CLONE_VFORK))
1431 return;
1432
1433 if (dup_utask(t, utask))
1434 return uprobe_warn(t, "dup ret instances");
1435
1436 /* The task can fork() after dup_xol_work() fails */
1437 area = mm->uprobes_state.xol_area;
1438 if (!area)
1439 return uprobe_warn(t, "dup xol area");
1440
1441 if (mm == t->mm)
1442 return;
1443
1444 /* TODO: move it into the union in uprobe_task */
1445 work = kmalloc(sizeof(*work), GFP_KERNEL);
1446 if (!work)
1447 return uprobe_warn(t, "dup xol area");
1448
1449 t->utask->vaddr = area->vaddr;
1450 init_task_work(work, dup_xol_work);
1451 task_work_add(t, work, true);
1452}
1453
1370/* 1454/*
1371 * Current area->vaddr notion assumes the trampoline address is always 1455
1372 * equal to area->vaddr. 1456
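
dup_utask() in the hunk above copies the return_instance chain with a pointer-to-pointer tail, so the copy keeps its order without special-casing the first node. A generic sketch of that list-duplication pattern (plain C; error unwinding and freeing omitted for brevity).

#include <stdio.h>
#include <stdlib.h>

struct node {
        int val;
        struct node *next;
};

/* Duplicate a singly linked list in one pass, preserving order. */
static struct node *dup_list(const struct node *src)
{
        struct node *head = NULL;
        struct node **tail = &head;     /* where the next copy gets linked */

        for (const struct node *o = src; o; o = o->next) {
                struct node *n = malloc(sizeof(*n));
                if (!n)
                        break;          /* a real caller would unwind here */

                n->val = o->val;
                n->next = NULL;

                *tail = n;              /* link behind the previous copy */
                tail = &n->next;
        }
        return head;
}

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

        for (struct node *n = dup_list(&a); n; n = n->next)
                printf("%d ", n->val);  /* prints: 1 2 3 */
        printf("\n");
        return 0;
}
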
@@ -1857,9 +1941,4 @@ static int __init init_uprobes(void)
1857 1941
1858 return register_die_notifier(&uprobe_exception_nb); 1942 return register_die_notifier(&uprobe_exception_nb);
1859} 1943}
1860module_init(init_uprobes); 1944__initcall(init_uprobes);
1861
1862static void __exit exit_uprobes(void)
1863{
1864}
1865module_exit(exit_uprobes);
diff --git a/kernel/extable.c b/kernel/extable.c
index 832cb28105bb..763faf037ec1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
61static inline int init_kernel_text(unsigned long addr) 61static inline int init_kernel_text(unsigned long addr)
62{ 62{
63 if (addr >= (unsigned long)_sinittext && 63 if (addr >= (unsigned long)_sinittext &&
64 addr <= (unsigned long)_einittext) 64 addr < (unsigned long)_einittext)
65 return 1; 65 return 1;
66 return 0; 66 return 0;
67} 67}
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
69int core_kernel_text(unsigned long addr) 69int core_kernel_text(unsigned long addr)
70{ 70{
71 if (addr >= (unsigned long)_stext && 71 if (addr >= (unsigned long)_stext &&
72 addr <= (unsigned long)_etext) 72 addr < (unsigned long)_etext)
73 return 1; 73 return 1;
74 74
75 if (system_state == SYSTEM_BOOTING && 75 if (system_state == SYSTEM_BOOTING &&
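
Both extable.c hunks switch from '<=' to '<' because _etext and _einittext label the first byte past their sections, so the valid range is half-open. A tiny sketch of that check with made-up addresses.

#include <stdio.h>

/* Section [start, end): 'end' is the first address past the section. */
static int in_section(unsigned long addr, unsigned long start, unsigned long end)
{
        return addr >= start && addr < end;     /* '<', not '<=' */
}

int main(void)
{
        unsigned long start = 0x1000, end = 0x2000;     /* assumed layout */

        printf("%d\n", in_section(0x1fff, start, end)); /* 1: last byte */
        printf("%d\n", in_section(0x2000, start, end)); /* 0: one past the end */
        return 0;
}
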
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6bd..dfa736c98d17 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -532,11 +532,12 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
532 mm->flags = (current->mm) ? 532 mm->flags = (current->mm) ?
533 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 533 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
534 mm->core_state = NULL; 534 mm->core_state = NULL;
535 mm->nr_ptes = 0; 535 atomic_long_set(&mm->nr_ptes, 0);
536 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 536 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
537 spin_lock_init(&mm->page_table_lock); 537 spin_lock_init(&mm->page_table_lock);
538 mm_init_aio(mm); 538 mm_init_aio(mm);
539 mm_init_owner(mm, p); 539 mm_init_owner(mm, p);
540 clear_tlb_flush_pending(mm);
540 541
541 if (likely(!mm_alloc_pgd(mm))) { 542 if (likely(!mm_alloc_pgd(mm))) {
542 mm->def_flags = 0; 543 mm->def_flags = 0;
@@ -560,7 +561,7 @@ static void check_mm(struct mm_struct *mm)
560 "mm:%p idx:%d val:%ld\n", mm, i, x); 561 "mm:%p idx:%d val:%ld\n", mm, i, x);
561 } 562 }
562 563
563#ifdef CONFIG_TRANSPARENT_HUGEPAGE 564#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
564 VM_BUG_ON(mm->pmd_huge_pte); 565 VM_BUG_ON(mm->pmd_huge_pte);
565#endif 566#endif
566} 567}
@@ -814,12 +815,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
814 memcpy(mm, oldmm, sizeof(*mm)); 815 memcpy(mm, oldmm, sizeof(*mm));
815 mm_init_cpumask(mm); 816 mm_init_cpumask(mm);
816 817
817#ifdef CONFIG_TRANSPARENT_HUGEPAGE 818#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
818 mm->pmd_huge_pte = NULL; 819 mm->pmd_huge_pte = NULL;
819#endif 820#endif
820#ifdef CONFIG_NUMA_BALANCING
821 mm->first_nid = NUMA_PTE_SCAN_INIT;
822#endif
823 if (!mm_init(mm, tsk)) 821 if (!mm_init(mm, tsk))
824 goto fail_nomem; 822 goto fail_nomem;
825 823
@@ -1174,7 +1172,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1174 * do not allow it to share a thread group or signal handlers or 1172 * do not allow it to share a thread group or signal handlers or
1175 * parent with the forking task. 1173 * parent with the forking task.
1176 */ 1174 */
1177 if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { 1175 if (clone_flags & CLONE_SIGHAND) {
1178 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || 1176 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1179 (task_active_pid_ns(current) != 1177 (task_active_pid_ns(current) !=
1180 current->nsproxy->pid_ns_for_children)) 1178 current->nsproxy->pid_ns_for_children))
@@ -1313,7 +1311,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1313#endif 1311#endif
1314 1312
1315 /* Perform scheduler related setup. Assign this task to a CPU. */ 1313 /* Perform scheduler related setup. Assign this task to a CPU. */
1316 sched_fork(p); 1314 sched_fork(clone_flags, p);
1317 1315
1318 retval = perf_event_init_task(p); 1316 retval = perf_event_init_task(p);
1319 if (retval) 1317 if (retval)
@@ -1373,7 +1371,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1373 INIT_LIST_HEAD(&p->pi_state_list); 1371 INIT_LIST_HEAD(&p->pi_state_list);
1374 p->pi_state_cache = NULL; 1372 p->pi_state_cache = NULL;
1375#endif 1373#endif
1376 uprobe_copy_process(p);
1377 /* 1374 /*
1378 * sigaltstack should be cleared when sharing the same VM 1375 * sigaltstack should be cleared when sharing the same VM
1379 */ 1376 */
@@ -1490,6 +1487,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1490 perf_event_fork(p); 1487 perf_event_fork(p);
1491 1488
1492 trace_task_newtask(p, clone_flags); 1489 trace_task_newtask(p, clone_flags);
1490 uprobe_copy_process(p, clone_flags);
1493 1491
1494 return p; 1492 return p;
1495 1493
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b462fa197517..aa6a8aadb911 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt);
19bool pm_freezing; 19bool pm_freezing;
20bool pm_nosig_freezing; 20bool pm_nosig_freezing;
21 21
22/*
23 * Temporary export for the deadlock workaround in ata_scsi_hotplug().
24 * Remove once the hack becomes unnecessary.
25 */
26EXPORT_SYMBOL_GPL(pm_freezing);
27
22/* protects freezing and frozen transitions */ 28/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock); 29static DEFINE_SPINLOCK(freezer_lock);
24 30
diff --git a/kernel/futex.c b/kernel/futex.c
index c3a1a55a5214..f6ff0191ecf7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -66,7 +66,7 @@
66 66
67#include <asm/futex.h> 67#include <asm/futex.h>
68 68
69#include "rtmutex_common.h" 69#include "locking/rtmutex_common.h"
70 70
71int __read_mostly futex_cmpxchg_enabled; 71int __read_mostly futex_cmpxchg_enabled;
72 72
@@ -251,6 +251,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
251 return -EINVAL; 251 return -EINVAL;
252 address -= key->both.offset; 252 address -= key->both.offset;
253 253
254 if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
255 return -EFAULT;
256
254 /* 257 /*
255 * PROCESS_PRIVATE futexes are fast. 258 * PROCESS_PRIVATE futexes are fast.
256 * As the mm cannot disappear under us and the 'key' only needs 259 * As the mm cannot disappear under us and the 'key' only needs
@@ -259,8 +262,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
259 * but access_ok() should be faster than find_vma() 262 * but access_ok() should be faster than find_vma()
260 */ 263 */
261 if (!fshared) { 264 if (!fshared) {
262 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
263 return -EFAULT;
264 key->private.mm = mm; 265 key->private.mm = mm;
265 key->private.address = address; 266 key->private.address = address;
266 get_futex_key_refs(key); 267 get_futex_key_refs(key);
@@ -288,7 +289,7 @@ again:
288 put_page(page); 289 put_page(page);
289 /* serialize against __split_huge_page_splitting() */ 290 /* serialize against __split_huge_page_splitting() */
290 local_irq_disable(); 291 local_irq_disable();
291 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { 292 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
292 page_head = compound_head(page); 293 page_head = compound_head(page);
293 /* 294 /*
294 * page_head is valid pointer but we must pin 295 * page_head is valid pointer but we must pin
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index d4da55d1fb65..d04ce8ac4399 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -46,4 +46,34 @@ config GCOV_PROFILE_ALL
46 larger and run slower. Also be sure to exclude files from profiling 46 larger and run slower. Also be sure to exclude files from profiling
47 which are not linked to the kernel image to prevent linker errors. 47 which are not linked to the kernel image to prevent linker errors.
48 48
49choice
50 prompt "Specify GCOV format"
51 depends on GCOV_KERNEL
52 default GCOV_FORMAT_AUTODETECT
53 ---help---
54 The gcov format is usually determined by the GCC version, but there are
55 exceptions where format changes are integrated in lower-version GCCs.
56 In such a case use this option to adjust the format used in the kernel
57 accordingly.
58
59 If unsure, choose "Autodetect".
60
61config GCOV_FORMAT_AUTODETECT
62 bool "Autodetect"
63 ---help---
64 Select this option to use the format that corresponds to your GCC
65 version.
66
67config GCOV_FORMAT_3_4
68 bool "GCC 3.4 format"
69 ---help---
70 Select this option to use the format defined by GCC 3.4.
71
72config GCOV_FORMAT_4_7
73 bool "GCC 4.7 format"
74 ---help---
75 Select this option to use the format defined by GCC 4.7.
76
77endchoice
78
49endmenu 79endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index e97ca59e2520..52aa7e8de927 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,33 @@
1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2 2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o 3# if-lt
4# Usage VAR := $(call if-lt, $(a), $(b))
5# Returns 1 if (a < b)
6if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
7
8ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
9 cc-ver := 0304
10else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
11 cc-ver := 0407
12else
13# Use cc-version if available, otherwise set 0
14#
15# scripts/Kbuild.include, which contains cc-version function, is not included
16# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
17# Meaning cc-ver is empty, causing the if-lt test to fail with
18# "/bin/sh: line 0: [: -lt: unary operator expected" error message.
19# This has no effect on the clean phase, but the error message could be
20# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
21# is not available. We could probably move if-lt into Kbuild.include (so it is
22# likewise undefined during clean), or include Kbuild.include from
23# scripts/Makefile.clean. But the workaround below seems least invasive.
24 cc-ver := $(if $(call cc-version),$(call cc-version),0)
25endif
26
27obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
28
29ifeq ($(call if-lt, $(cc-ver), 0407),1)
30 obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
31else
32 obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
33endif
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9b22d03cc581..f45b75b713c0 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -20,7 +20,6 @@
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include "gcov.h" 21#include "gcov.h"
22 22
23static struct gcov_info *gcov_info_head;
24static int gcov_events_enabled; 23static int gcov_events_enabled;
25static DEFINE_MUTEX(gcov_lock); 24static DEFINE_MUTEX(gcov_lock);
26 25
@@ -34,7 +33,7 @@ void __gcov_init(struct gcov_info *info)
34 33
35 mutex_lock(&gcov_lock); 34 mutex_lock(&gcov_lock);
36 if (gcov_version == 0) { 35 if (gcov_version == 0) {
37 gcov_version = info->version; 36 gcov_version = gcov_info_version(info);
38 /* 37 /*
39 * Printing gcc's version magic may prove useful for debugging 38 * Printing gcc's version magic may prove useful for debugging
40 * incompatibility reports. 39 * incompatibility reports.
@@ -45,8 +44,7 @@ void __gcov_init(struct gcov_info *info)
45 * Add new profiling data structure to list and inform event 44 * Add new profiling data structure to list and inform event
46 * listener. 45 * listener.
47 */ 46 */
48 info->next = gcov_info_head; 47 gcov_info_link(info);
49 gcov_info_head = info;
50 if (gcov_events_enabled) 48 if (gcov_events_enabled)
51 gcov_event(GCOV_ADD, info); 49 gcov_event(GCOV_ADD, info);
52 mutex_unlock(&gcov_lock); 50 mutex_unlock(&gcov_lock);
@@ -81,6 +79,12 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
81} 79}
82EXPORT_SYMBOL(__gcov_merge_delta); 80EXPORT_SYMBOL(__gcov_merge_delta);
83 81
82void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
83{
84 /* Unused. */
85}
86EXPORT_SYMBOL(__gcov_merge_ior);
87
84/** 88/**
85 * gcov_enable_events - enable event reporting through gcov_event() 89 * gcov_enable_events - enable event reporting through gcov_event()
86 * 90 *
@@ -91,13 +95,15 @@ EXPORT_SYMBOL(__gcov_merge_delta);
91 */ 95 */
92void gcov_enable_events(void) 96void gcov_enable_events(void)
93{ 97{
94 struct gcov_info *info; 98 struct gcov_info *info = NULL;
95 99
96 mutex_lock(&gcov_lock); 100 mutex_lock(&gcov_lock);
97 gcov_events_enabled = 1; 101 gcov_events_enabled = 1;
102
98 /* Perform event callback for previously registered entries. */ 103 /* Perform event callback for previously registered entries. */
99 for (info = gcov_info_head; info; info = info->next) 104 while ((info = gcov_info_next(info)))
100 gcov_event(GCOV_ADD, info); 105 gcov_event(GCOV_ADD, info);
106
101 mutex_unlock(&gcov_lock); 107 mutex_unlock(&gcov_lock);
102} 108}
103 109
@@ -112,25 +118,23 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
112 void *data) 118 void *data)
113{ 119{
114 struct module *mod = data; 120 struct module *mod = data;
115 struct gcov_info *info; 121 struct gcov_info *info = NULL;
116 struct gcov_info *prev; 122 struct gcov_info *prev = NULL;
117 123
118 if (event != MODULE_STATE_GOING) 124 if (event != MODULE_STATE_GOING)
119 return NOTIFY_OK; 125 return NOTIFY_OK;
120 mutex_lock(&gcov_lock); 126 mutex_lock(&gcov_lock);
121 prev = NULL; 127
122 /* Remove entries located in module from linked list. */ 128 /* Remove entries located in module from linked list. */
123 for (info = gcov_info_head; info; info = info->next) { 129 while ((info = gcov_info_next(info))) {
124 if (within(info, mod->module_core, mod->core_size)) { 130 if (within(info, mod->module_core, mod->core_size)) {
125 if (prev) 131 gcov_info_unlink(prev, info);
126 prev->next = info->next;
127 else
128 gcov_info_head = info->next;
129 if (gcov_events_enabled) 132 if (gcov_events_enabled)
130 gcov_event(GCOV_REMOVE, info); 133 gcov_event(GCOV_REMOVE, info);
131 } else 134 } else
132 prev = info; 135 prev = info;
133 } 136 }
137
134 mutex_unlock(&gcov_lock); 138 mutex_unlock(&gcov_lock);
135 139
136 return NOTIFY_OK; 140 return NOTIFY_OK;
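
base.c now walks the gcov list through accessors: the loop starts from NULL and gcov_info_next() yields the head, while gcov_info_unlink() takes the predecessor. A hedged sketch of that iterate-and-unlink shape over a hypothetical list type (not the gcov structures).

#include <stdio.h>
#include <stddef.h>

struct item {
        int id;
        struct item *next;
};

static struct item c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
static struct item *head = &a;

/* Passing NULL yields the first entry, mirroring the NULL-started loop. */
static struct item *item_next(struct item *it)
{
        return it ? it->next : head;
}

/* Unlink 'it' given its predecessor (NULL when 'it' is the head). */
static void item_unlink(struct item *prev, struct item *it)
{
        if (prev)
                prev->next = it->next;
        else
                head = it->next;
}

int main(void)
{
        struct item *it = NULL, *prev = NULL;

        while ((it = item_next(it))) {
                if (it->id == 2)
                        item_unlink(prev, it);  /* drop matching entries */
                else
                        prev = it;
        }

        for (it = NULL; (it = item_next(it)); )
                printf("%d ", it->id);          /* prints: 1 3 */
        printf("\n");
        return 0;
}
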
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 7a7d2ee96d42..15ff01a76379 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -75,7 +75,7 @@ static int __init gcov_persist_setup(char *str)
75 unsigned long val; 75 unsigned long val;
76 76
77 if (kstrtoul(str, 0, &val)) { 77 if (kstrtoul(str, 0, &val)) {
78 pr_warning("invalid gcov_persist parameter '%s'\n", str); 78 pr_warn("invalid gcov_persist parameter '%s'\n", str);
79 return 0; 79 return 0;
80 } 80 }
81 gcov_persist = val; 81 gcov_persist = val;
@@ -242,7 +242,7 @@ static struct gcov_node *get_node_by_name(const char *name)
242 242
243 list_for_each_entry(node, &all_head, all) { 243 list_for_each_entry(node, &all_head, all) {
244 info = get_node_info(node); 244 info = get_node_info(node);
245 if (info && (strcmp(info->filename, name) == 0)) 245 if (info && (strcmp(gcov_info_filename(info), name) == 0))
246 return node; 246 return node;
247 } 247 }
248 248
@@ -279,7 +279,7 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
279 seq = file->private_data; 279 seq = file->private_data;
280 info = gcov_iter_get_info(seq->private); 280 info = gcov_iter_get_info(seq->private);
281 mutex_lock(&node_lock); 281 mutex_lock(&node_lock);
282 node = get_node_by_name(info->filename); 282 node = get_node_by_name(gcov_info_filename(info));
283 if (node) { 283 if (node) {
284 /* Reset counts or remove node for unloaded modules. */ 284 /* Reset counts or remove node for unloaded modules. */
285 if (node->num_loaded == 0) 285 if (node->num_loaded == 0)
@@ -365,7 +365,7 @@ static const char *deskew(const char *basename)
365 */ 365 */
366static void add_links(struct gcov_node *node, struct dentry *parent) 366static void add_links(struct gcov_node *node, struct dentry *parent)
367{ 367{
368 char *basename; 368 const char *basename;
369 char *target; 369 char *target;
370 int num; 370 int num;
371 int i; 371 int i;
@@ -376,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
376 if (!node->links) 376 if (!node->links)
377 return; 377 return;
378 for (i = 0; i < num; i++) { 378 for (i = 0; i < num; i++) {
379 target = get_link_target(get_node_info(node)->filename, 379 target = get_link_target(
380 &gcov_link[i]); 380 gcov_info_filename(get_node_info(node)),
381 &gcov_link[i]);
381 if (!target) 382 if (!target)
382 goto out_err; 383 goto out_err;
383 basename = strrchr(target, '/'); 384 basename = kbasename(target);
384 if (!basename) 385 if (basename == target)
385 goto out_err; 386 goto out_err;
386 basename++;
387 node->links[i] = debugfs_create_symlink(deskew(basename), 387 node->links[i] = debugfs_create_symlink(deskew(basename),
388 parent, target); 388 parent, target);
389 if (!node->links[i]) 389 if (!node->links[i])
@@ -450,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
450 } else 450 } else
451 node->dentry = debugfs_create_dir(node->name, parent->dentry); 451 node->dentry = debugfs_create_dir(node->name, parent->dentry);
452 if (!node->dentry) { 452 if (!node->dentry) {
453 pr_warning("could not create file\n"); 453 pr_warn("could not create file\n");
454 kfree(node); 454 kfree(node);
455 return NULL; 455 return NULL;
456 } 456 }
@@ -463,7 +463,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
463 463
464err_nomem: 464err_nomem:
465 kfree(node); 465 kfree(node);
466 pr_warning("out of memory\n"); 466 pr_warn("out of memory\n");
467 return NULL; 467 return NULL;
468} 468}
469 469
@@ -576,7 +576,7 @@ static void add_node(struct gcov_info *info)
576 struct gcov_node *parent; 576 struct gcov_node *parent;
577 struct gcov_node *node; 577 struct gcov_node *node;
578 578
579 filename = kstrdup(info->filename, GFP_KERNEL); 579 filename = kstrdup(gcov_info_filename(info), GFP_KERNEL);
580 if (!filename) 580 if (!filename)
581 return; 581 return;
582 parent = &root_node; 582 parent = &root_node;
@@ -630,8 +630,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
630 */ 630 */
631 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); 631 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
632 if (!loaded_info) { 632 if (!loaded_info) {
633 pr_warning("could not add '%s' (out of memory)\n", 633 pr_warn("could not add '%s' (out of memory)\n",
634 info->filename); 634 gcov_info_filename(info));
635 return; 635 return;
636 } 636 }
637 memcpy(loaded_info, node->loaded_info, 637 memcpy(loaded_info, node->loaded_info,
@@ -644,8 +644,9 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
644 * data set replaces the copy of the last one. 644 * data set replaces the copy of the last one.
645 */ 645 */
646 if (!gcov_info_is_compatible(node->unloaded_info, info)) { 646 if (!gcov_info_is_compatible(node->unloaded_info, info)) {
647 pr_warning("discarding saved data for %s " 647 pr_warn("discarding saved data for %s "
648 "(incompatible version)\n", info->filename); 648 "(incompatible version)\n",
649 gcov_info_filename(info));
649 gcov_info_free(node->unloaded_info); 650 gcov_info_free(node->unloaded_info);
650 node->unloaded_info = NULL; 651 node->unloaded_info = NULL;
651 } 652 }
@@ -655,8 +656,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
655 * The initial one takes precedence. 656 * The initial one takes precedence.
656 */ 657 */
657 if (!gcov_info_is_compatible(node->loaded_info[0], info)) { 658 if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
658 pr_warning("could not add '%s' (incompatible " 659 pr_warn("could not add '%s' (incompatible "
659 "version)\n", info->filename); 660 "version)\n", gcov_info_filename(info));
660 kfree(loaded_info); 661 kfree(loaded_info);
661 return; 662 return;
662 } 663 }
@@ -691,8 +692,9 @@ static void save_info(struct gcov_node *node, struct gcov_info *info)
691 else { 692 else {
692 node->unloaded_info = gcov_info_dup(info); 693 node->unloaded_info = gcov_info_dup(info);
693 if (!node->unloaded_info) { 694 if (!node->unloaded_info) {
694 pr_warning("could not save data for '%s' " 695 pr_warn("could not save data for '%s' "
695 "(out of memory)\n", info->filename); 696 "(out of memory)\n",
697 gcov_info_filename(info));
696 } 698 }
697 } 699 }
698} 700}
@@ -707,8 +709,8 @@ static void remove_info(struct gcov_node *node, struct gcov_info *info)
707 709
708 i = get_info_index(node, info); 710 i = get_info_index(node, info);
709 if (i < 0) { 711 if (i < 0) {
710 pr_warning("could not remove '%s' (not found)\n", 712 pr_warn("could not remove '%s' (not found)\n",
711 info->filename); 713 gcov_info_filename(info));
712 return; 714 return;
713 } 715 }
714 if (gcov_persist) 716 if (gcov_persist)
@@ -735,7 +737,7 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
735 struct gcov_node *node; 737 struct gcov_node *node;
736 738
737 mutex_lock(&node_lock); 739 mutex_lock(&node_lock);
738 node = get_node_by_name(info->filename); 740 node = get_node_by_name(gcov_info_filename(info));
739 switch (action) { 741 switch (action) {
740 case GCOV_ADD: 742 case GCOV_ADD:
741 if (node) 743 if (node)
@@ -747,8 +749,8 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
747 if (node) 749 if (node)
748 remove_info(node, info); 750 remove_info(node, info);
749 else { 751 else {
750 pr_warning("could not remove '%s' (not found)\n", 752 pr_warn("could not remove '%s' (not found)\n",
751 info->filename); 753 gcov_info_filename(info));
752 } 754 }
753 break; 755 break;
754 } 756 }
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index ae5bb4260033..27bc88a35013 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -21,6 +21,121 @@
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include "gcov.h" 22#include "gcov.h"
23 23
24#define GCOV_COUNTERS 5
25
26static struct gcov_info *gcov_info_head;
27
28/**
29 * struct gcov_fn_info - profiling meta data per function
30 * @ident: object file-unique function identifier
31 * @checksum: function checksum
32 * @n_ctrs: number of values per counter type belonging to this function
33 *
34 * This data is generated by gcc during compilation and doesn't change
35 * at run-time.
36 */
37struct gcov_fn_info {
38 unsigned int ident;
39 unsigned int checksum;
40 unsigned int n_ctrs[0];
41};
42
43/**
44 * struct gcov_ctr_info - profiling data per counter type
45 * @num: number of counter values for this type
46 * @values: array of counter values for this type
47 * @merge: merge function for counter values of this type (unused)
48 *
49 * This data is generated by gcc during compilation and doesn't change
50 * at run-time with the exception of the values array.
51 */
52struct gcov_ctr_info {
53 unsigned int num;
54 gcov_type *values;
55 void (*merge)(gcov_type *, unsigned int);
56};
57
58/**
59 * struct gcov_info - profiling data per object file
60 * @version: gcov version magic indicating the gcc version used for compilation
61 * @next: list head for a singly-linked list
62 * @stamp: time stamp
63 * @filename: name of the associated gcov data file
64 * @n_functions: number of instrumented functions
65 * @functions: function data
66 * @ctr_mask: mask specifying which counter types are active
67 * @counts: counter data per counter type
68 *
69 * This data is generated by gcc during compilation and doesn't change
70 * at run-time with the exception of the next pointer.
71 */
72struct gcov_info {
73 unsigned int version;
74 struct gcov_info *next;
75 unsigned int stamp;
76 const char *filename;
77 unsigned int n_functions;
78 const struct gcov_fn_info *functions;
79 unsigned int ctr_mask;
80 struct gcov_ctr_info counts[0];
81};
82
83/**
84 * gcov_info_filename - return info filename
85 * @info: profiling data set
86 */
87const char *gcov_info_filename(struct gcov_info *info)
88{
89 return info->filename;
90}
91
92/**
93 * gcov_info_version - return info version
94 * @info: profiling data set
95 */
96unsigned int gcov_info_version(struct gcov_info *info)
97{
98 return info->version;
99}
100
101/**
102 * gcov_info_next - return next profiling data set
103 * @info: profiling data set
104 *
105 * Returns next gcov_info following @info or first gcov_info in the chain if
106 * @info is %NULL.
107 */
108struct gcov_info *gcov_info_next(struct gcov_info *info)
109{
110 if (!info)
111 return gcov_info_head;
112
113 return info->next;
114}
115
116/**
117 * gcov_info_link - link/add profiling data set to the list
118 * @info: profiling data set
119 */
120void gcov_info_link(struct gcov_info *info)
121{
122 info->next = gcov_info_head;
123 gcov_info_head = info;
124}
125
126/**
127 * gcov_info_unlink - unlink/remove profiling data set from the list
128 * @prev: previous profiling data set
129 * @info: profiling data set
130 */
131void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
132{
133 if (prev)
134 prev->next = info->next;
135 else
136 gcov_info_head = info->next;
137}
138
24/* Symbolic links to be created for each profiling data file. */ 139/* Symbolic links to be created for each profiling data file. */
25const struct gcov_link gcov_link[] = { 140const struct gcov_link gcov_link[] = {
26 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ 141 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
new file mode 100644
index 000000000000..2c6e4631c814
--- /dev/null
+++ b/kernel/gcov/gcc_4_7.c
@@ -0,0 +1,560 @@
1/*
2 * This code provides functions to handle gcc's profiling data format
3 * introduced with gcc 4.7.
4 *
5 * This file is based heavily on gcc_3_4.c file.
6 *
7 * For a better understanding, refer to gcc source:
8 * gcc/gcov-io.h
9 * libgcc/libgcov.c
10 *
11 * Uses gcc-internal data definitions.
12 */
13
14#include <linux/errno.h>
15#include <linux/slab.h>
16#include <linux/string.h>
17#include <linux/seq_file.h>
18#include <linux/vmalloc.h>
19#include "gcov.h"
20
21#define GCOV_COUNTERS 8
22#define GCOV_TAG_FUNCTION_LENGTH 3
23
24static struct gcov_info *gcov_info_head;
25
26/**
27 * struct gcov_ctr_info - information about counters for a single function
28 * @num: number of counter values for this type
29 * @values: array of counter values for this type
30 *
31 * This data is generated by gcc during compilation and doesn't change
32 * at run-time with the exception of the values array.
33 */
34struct gcov_ctr_info {
35 unsigned int num;
36 gcov_type *values;
37};
38
39/**
40 * struct gcov_fn_info - profiling meta data per function
41 * @key: comdat key
42 * @ident: unique ident of function
 43 * @lineno_checksum: function lineno checksum
44 * @cfg_checksum: function cfg checksum
45 * @ctrs: instrumented counters
46 *
47 * This data is generated by gcc during compilation and doesn't change
48 * at run-time.
49 *
50 * Information about a single function. This uses the trailing array
51 * idiom. The number of counters is determined from the merge pointer
52 * array in gcov_info. The key is used to detect which of a set of
53 * comdat functions was selected -- it points to the gcov_info object
54 * of the object file containing the selected comdat function.
55 */
56struct gcov_fn_info {
57 const struct gcov_info *key;
58 unsigned int ident;
59 unsigned int lineno_checksum;
60 unsigned int cfg_checksum;
61 struct gcov_ctr_info ctrs[0];
62};
63
64/**
65 * struct gcov_info - profiling data per object file
66 * @version: gcov version magic indicating the gcc version used for compilation
67 * @next: list head for a singly-linked list
68 * @stamp: uniquifying time stamp
69 * @filename: name of the associated gcov data file
70 * @merge: merge functions (null for unused counter type)
71 * @n_functions: number of instrumented functions
72 * @functions: pointer to pointers to function information
73 *
74 * This data is generated by gcc during compilation and doesn't change
75 * at run-time with the exception of the next pointer.
76 */
77struct gcov_info {
78 unsigned int version;
79 struct gcov_info *next;
80 unsigned int stamp;
81 const char *filename;
82 void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
83 unsigned int n_functions;
84 struct gcov_fn_info **functions;
85};
86
87/**
88 * gcov_info_filename - return info filename
89 * @info: profiling data set
90 */
91const char *gcov_info_filename(struct gcov_info *info)
92{
93 return info->filename;
94}
95
96/**
97 * gcov_info_version - return info version
98 * @info: profiling data set
99 */
100unsigned int gcov_info_version(struct gcov_info *info)
101{
102 return info->version;
103}
104
105/**
106 * gcov_info_next - return next profiling data set
107 * @info: profiling data set
108 *
109 * Returns next gcov_info following @info or first gcov_info in the chain if
110 * @info is %NULL.
111 */
112struct gcov_info *gcov_info_next(struct gcov_info *info)
113{
114 if (!info)
115 return gcov_info_head;
116
117 return info->next;
118}
119
120/**
121 * gcov_info_link - link/add profiling data set to the list
122 * @info: profiling data set
123 */
124void gcov_info_link(struct gcov_info *info)
125{
126 info->next = gcov_info_head;
127 gcov_info_head = info;
128}
129
130/**
131 * gcov_info_unlink - unlink/remove profiling data set from the list
132 * @prev: previous profiling data set
133 * @info: profiling data set
134 */
135void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
136{
137 if (prev)
138 prev->next = info->next;
139 else
140 gcov_info_head = info->next;
141}
142
143/* Symbolic links to be created for each profiling data file. */
144const struct gcov_link gcov_link[] = {
145 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
146 { 0, NULL},
147};
148
149/*
150 * Determine whether a counter is active. Doesn't change at run-time.
151 */
152static int counter_active(struct gcov_info *info, unsigned int type)
153{
154 return info->merge[type] ? 1 : 0;
155}
156
157/* Determine number of active counters. Based on gcc magic. */
158static unsigned int num_counter_active(struct gcov_info *info)
159{
160 unsigned int i;
161 unsigned int result = 0;
162
163 for (i = 0; i < GCOV_COUNTERS; i++) {
164 if (counter_active(info, i))
165 result++;
166 }
167 return result;
168}
169
170/**
171 * gcov_info_reset - reset profiling data to zero
172 * @info: profiling data set
173 */
174void gcov_info_reset(struct gcov_info *info)
175{
176 struct gcov_ctr_info *ci_ptr;
177 unsigned int fi_idx;
178 unsigned int ct_idx;
179
180 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
181 ci_ptr = info->functions[fi_idx]->ctrs;
182
183 for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
184 if (!counter_active(info, ct_idx))
185 continue;
186
187 memset(ci_ptr->values, 0,
188 sizeof(gcov_type) * ci_ptr->num);
189 ci_ptr++;
190 }
191 }
192}
193
194/**
195 * gcov_info_is_compatible - check if profiling data can be added
196 * @info1: first profiling data set
197 * @info2: second profiling data set
198 *
199 * Returns non-zero if profiling data can be added, zero otherwise.
200 */
201int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
202{
203 return (info1->stamp == info2->stamp);
204}
205
206/**
207 * gcov_info_add - add up profiling data
208 * @dest: profiling data set to which data is added
209 * @source: profiling data set which is added
210 *
211 * Adds profiling counts of @source to @dest.
212 */
213void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
214{
215 struct gcov_ctr_info *dci_ptr;
216 struct gcov_ctr_info *sci_ptr;
217 unsigned int fi_idx;
218 unsigned int ct_idx;
219 unsigned int val_idx;
220
221 for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) {
222 dci_ptr = dst->functions[fi_idx]->ctrs;
223 sci_ptr = src->functions[fi_idx]->ctrs;
224
225 for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
226 if (!counter_active(src, ct_idx))
227 continue;
228
229 for (val_idx = 0; val_idx < sci_ptr->num; val_idx++)
230 dci_ptr->values[val_idx] +=
231 sci_ptr->values[val_idx];
232
233 dci_ptr++;
234 sci_ptr++;
235 }
236 }
237}
238
239/**
240 * gcov_info_dup - duplicate profiling data set
241 * @info: profiling data set to duplicate
242 *
243 * Return newly allocated duplicate on success, %NULL on error.
244 */
245struct gcov_info *gcov_info_dup(struct gcov_info *info)
246{
247 struct gcov_info *dup;
248 struct gcov_ctr_info *dci_ptr; /* dst counter info */
249 struct gcov_ctr_info *sci_ptr; /* src counter info */
250 unsigned int active;
251 unsigned int fi_idx; /* function info idx */
252 unsigned int ct_idx; /* counter type idx */
253 size_t fi_size; /* function info size */
254 size_t cv_size; /* counter values size */
255
256 dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
257 if (!dup)
258 return NULL;
259
260 dup->next = NULL;
261 dup->filename = NULL;
262 dup->functions = NULL;
263
264 dup->filename = kstrdup(info->filename, GFP_KERNEL);
265 if (!dup->filename)
266 goto err_free;
267
268 dup->functions = kcalloc(info->n_functions,
269 sizeof(struct gcov_fn_info *), GFP_KERNEL);
270 if (!dup->functions)
271 goto err_free;
272
273 active = num_counter_active(info);
274 fi_size = sizeof(struct gcov_fn_info);
275 fi_size += sizeof(struct gcov_ctr_info) * active;
276
277 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
278 dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL);
279 if (!dup->functions[fi_idx])
280 goto err_free;
281
282 *(dup->functions[fi_idx]) = *(info->functions[fi_idx]);
283
284 sci_ptr = info->functions[fi_idx]->ctrs;
285 dci_ptr = dup->functions[fi_idx]->ctrs;
286
287 for (ct_idx = 0; ct_idx < active; ct_idx++) {
288
289 cv_size = sizeof(gcov_type) * sci_ptr->num;
290
291 dci_ptr->values = vmalloc(cv_size);
292
293 if (!dci_ptr->values)
294 goto err_free;
295
296 dci_ptr->num = sci_ptr->num;
297 memcpy(dci_ptr->values, sci_ptr->values, cv_size);
298
299 sci_ptr++;
300 dci_ptr++;
301 }
302 }
303
304 return dup;
305err_free:
306 gcov_info_free(dup);
307 return NULL;
308}
309
310/**
311 * gcov_info_free - release memory for profiling data set duplicate
312 * @info: profiling data set duplicate to free
313 */
314void gcov_info_free(struct gcov_info *info)
315{
316 unsigned int active;
317 unsigned int fi_idx;
318 unsigned int ct_idx;
319 struct gcov_ctr_info *ci_ptr;
320
321 if (!info->functions)
322 goto free_info;
323
324 active = num_counter_active(info);
325
326 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
327 if (!info->functions[fi_idx])
328 continue;
329
330 ci_ptr = info->functions[fi_idx]->ctrs;
331
332 for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
333 vfree(ci_ptr->values);
334
335 kfree(info->functions[fi_idx]);
336 }
337
338free_info:
339 kfree(info->functions);
340 kfree(info->filename);
341 kfree(info);
342}
343
344#define ITER_STRIDE PAGE_SIZE
345
346/**
347 * struct gcov_iterator - specifies current file position in logical records
348 * @info: associated profiling data
349 * @buffer: buffer containing file data
350 * @size: size of buffer
351 * @pos: current position in file
352 */
353struct gcov_iterator {
354 struct gcov_info *info;
355 void *buffer;
356 size_t size;
357 loff_t pos;
358};
359
360/**
361 * store_gcov_u32 - store 32 bit number in gcov format to buffer
362 * @buffer: target buffer or NULL
363 * @off: offset into the buffer
364 * @v: value to be stored
365 *
366 * Number format defined by gcc: numbers are recorded in the 32 bit
367 * unsigned binary form of the endianness of the machine generating the
368 * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
369 * store anything.
370 */
371static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
372{
373 u32 *data;
374
375 if (buffer) {
376 data = buffer + off;
377 *data = v;
378 }
379
380 return sizeof(*data);
381}
382
383/**
384 * store_gcov_u64 - store 64 bit number in gcov format to buffer
385 * @buffer: target buffer or NULL
386 * @off: offset into the buffer
387 * @v: value to be stored
388 *
389 * Number format defined by gcc: numbers are recorded in the 32 bit
390 * unsigned binary form of the endianness of the machine generating the
391 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
392 * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
393 * anything.
394 */
395static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
396{
397 u32 *data;
398
399 if (buffer) {
400 data = buffer + off;
401
402 data[0] = (v & 0xffffffffUL);
403 data[1] = (v >> 32);
404 }
405
406 return sizeof(*data) * 2;
407}
408
409/**
410 * convert_to_gcda - convert profiling data set to gcda file format
411 * @buffer: the buffer to store file data or %NULL if no data should be stored
412 * @info: profiling data set to be converted
413 *
414 * Returns the number of bytes that were/would have been stored into the buffer.
415 */
416static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
417{
418 struct gcov_fn_info *fi_ptr;
419 struct gcov_ctr_info *ci_ptr;
420 unsigned int fi_idx;
421 unsigned int ct_idx;
422 unsigned int cv_idx;
423 size_t pos = 0;
424
425 /* File header. */
426 pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
427 pos += store_gcov_u32(buffer, pos, info->version);
428 pos += store_gcov_u32(buffer, pos, info->stamp);
429
430 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
431 fi_ptr = info->functions[fi_idx];
432
433 /* Function record. */
434 pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
435 pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
436 pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
437 pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
438 pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
439
440 ci_ptr = fi_ptr->ctrs;
441
442 for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
443 if (!counter_active(info, ct_idx))
444 continue;
445
446 /* Counter record. */
447 pos += store_gcov_u32(buffer, pos,
448 GCOV_TAG_FOR_COUNTER(ct_idx));
449 pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
450
451 for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
452 pos += store_gcov_u64(buffer, pos,
453 ci_ptr->values[cv_idx]);
454 }
455
456 ci_ptr++;
457 }
458 }
459
460 return pos;
461}
462
463/**
464 * gcov_iter_new - allocate and initialize profiling data iterator
465 * @info: profiling data set to be iterated
466 *
467 * Return file iterator on success, %NULL otherwise.
468 */
469struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
470{
471 struct gcov_iterator *iter;
472
473 iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
474 if (!iter)
475 goto err_free;
476
477 iter->info = info;
478 /* Dry-run to get the actual buffer size. */
479 iter->size = convert_to_gcda(NULL, info);
480 iter->buffer = vmalloc(iter->size);
481 if (!iter->buffer)
482 goto err_free;
483
484 convert_to_gcda(iter->buffer, info);
485
486 return iter;
487
488err_free:
489 kfree(iter);
490 return NULL;
491}
492
493
494/**
 495 * gcov_iter_free - release iterator data
496 * @iter: file iterator
497 */
498void gcov_iter_free(struct gcov_iterator *iter)
499{
500 vfree(iter->buffer);
501 kfree(iter);
502}
503
504/**
505 * gcov_iter_get_info - return profiling data set for given file iterator
506 * @iter: file iterator
507 */
508struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
509{
510 return iter->info;
511}
512
513/**
514 * gcov_iter_start - reset file iterator to starting position
515 * @iter: file iterator
516 */
517void gcov_iter_start(struct gcov_iterator *iter)
518{
519 iter->pos = 0;
520}
521
522/**
523 * gcov_iter_next - advance file iterator to next logical record
524 * @iter: file iterator
525 *
526 * Return zero if new position is valid, non-zero if iterator has reached end.
527 */
528int gcov_iter_next(struct gcov_iterator *iter)
529{
530 if (iter->pos < iter->size)
531 iter->pos += ITER_STRIDE;
532
533 if (iter->pos >= iter->size)
534 return -EINVAL;
535
536 return 0;
537}
538
539/**
540 * gcov_iter_write - write data for current pos to seq_file
541 * @iter: file iterator
542 * @seq: seq_file handle
543 *
544 * Return zero on success, non-zero otherwise.
545 */
546int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
547{
548 size_t len;
549
550 if (iter->pos >= iter->size)
551 return -EINVAL;
552
553 len = ITER_STRIDE;
554 if (iter->pos + len > iter->size)
555 len = iter->size - iter->pos;
556
557 seq_write(seq, iter->buffer + iter->pos, len);
558
559 return 0;
560}
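gcov_iter_new() above relies on a two-pass idiom: convert_to_gcda() is first called with a NULL buffer so that the store_gcov_u32()/store_gcov_u64() helpers only count bytes, and is then called again to fill the allocated buffer. A hedged sketch of that idiom, assuming it sits in this same file next to the static convert_to_gcda(); build_gcda_image() is an illustrative name:

#include <linux/vmalloc.h>

/* Dry-run to size the .gcda image, then convert for real. */
static void *build_gcda_image(struct gcov_info *info, size_t *sizep)
{
	size_t size = convert_to_gcda(NULL, info);	/* pass 1: size only */
	void *buf = vmalloc(size);

	if (buf)
		convert_to_gcda(buf, info);		/* pass 2: fill buffer */
	*sizep = size;
	return buf;
}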
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 060073ebf7a6..92c8e22a29ed 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -21,7 +21,6 @@
21 * gcc and need to be kept as close to the original definition as possible to 21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible. 22 * remain compatible.
23 */ 23 */
24#define GCOV_COUNTERS 5
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) 24#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) 25#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) 26#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
@@ -34,60 +33,18 @@ typedef long gcov_type;
34typedef long long gcov_type; 33typedef long long gcov_type;
35#endif 34#endif
36 35
 37/** 36/* Opaque gcov_info. The gcov structures can change between gcc versions (for
 38 * struct gcov_fn_info - profiling meta data per function 37 * example in gcc 4.7), so the full definition cannot live here; it belongs in the
 39 * @ident: object file-unique function identifier 38 * gcc-specific implementation. Generic code therefore must not access the members
 40 * @checksum: function checksum 39 * directly and has to use the interface below. */
41 * @n_ctrs: number of values per counter type belonging to this function 40struct gcov_info;
42 *
43 * This data is generated by gcc during compilation and doesn't change
44 * at run-time.
45 */
46struct gcov_fn_info {
47 unsigned int ident;
48 unsigned int checksum;
49 unsigned int n_ctrs[0];
50};
51
52/**
53 * struct gcov_ctr_info - profiling data per counter type
54 * @num: number of counter values for this type
55 * @values: array of counter values for this type
56 * @merge: merge function for counter values of this type (unused)
57 *
58 * This data is generated by gcc during compilation and doesn't change
59 * at run-time with the exception of the values array.
60 */
61struct gcov_ctr_info {
62 unsigned int num;
63 gcov_type *values;
64 void (*merge)(gcov_type *, unsigned int);
65};
66 41
67/** 42/* Interface to access gcov_info data */
68 * struct gcov_info - profiling data per object file 43const char *gcov_info_filename(struct gcov_info *info);
69 * @version: gcov version magic indicating the gcc version used for compilation 44unsigned int gcov_info_version(struct gcov_info *info);
70 * @next: list head for a singly-linked list 45struct gcov_info *gcov_info_next(struct gcov_info *info);
71 * @stamp: time stamp 46void gcov_info_link(struct gcov_info *info);
72 * @filename: name of the associated gcov data file 47void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
73 * @n_functions: number of instrumented functions
74 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active
76 * @counts: counter data per counter type
77 *
78 * This data is generated by gcc during compilation and doesn't change
79 * at run-time with the exception of the next pointer.
80 */
81struct gcov_info {
82 unsigned int version;
83 struct gcov_info *next;
84 unsigned int stamp;
85 const char *filename;
86 unsigned int n_functions;
87 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask;
89 struct gcov_ctr_info counts[0];
90};
91 48
92/* Base interface. */ 49/* Base interface. */
93enum gcov_action { 50enum gcov_action {
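With struct gcov_info opaque, generic gcov code is expected to reach the per-object-file data only through the accessors declared above. A hedged sketch of the resulting pattern (the function name and pr_info() output are illustrative, and locking of the list is omitted):

#include <linux/printk.h>
#include "gcov.h"

/* Walk every registered profiling data set via the opaque interface. */
static void dump_gcov_filenames(void)
{
	struct gcov_info *info;

	for (info = gcov_info_next(NULL); info; info = gcov_info_next(info))
		pr_info("gcov: %s (version %#x)\n",
			gcov_info_filename(info), gcov_info_version(info));
}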
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 3e97fb126e6b..9328b80eaf14 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -16,11 +16,12 @@
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18#include <linux/utsname.h> 18#include <linux/utsname.h>
19#include <trace/events/sched.h>
19 20
20/* 21/*
21 * The number of tasks checked: 22 * The number of tasks checked:
22 */ 23 */
23unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; 24int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
24 25
25/* 26/*
26 * Limit number of tasks checked in a batch. 27 * Limit number of tasks checked in a batch.
@@ -92,6 +93,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
92 t->last_switch_count = switch_count; 93 t->last_switch_count = switch_count;
93 return; 94 return;
94 } 95 }
96
97 trace_sched_process_hang(t);
98
95 if (!sysctl_hung_task_warnings) 99 if (!sysctl_hung_task_warnings)
96 return; 100 return;
97 sysctl_hung_task_warnings--; 101 sysctl_hung_task_warnings--;
@@ -203,6 +207,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
203 return ret; 207 return ret;
204} 208}
205 209
210static atomic_t reset_hung_task = ATOMIC_INIT(0);
211
212void reset_hung_task_detector(void)
213{
214 atomic_set(&reset_hung_task, 1);
215}
216EXPORT_SYMBOL_GPL(reset_hung_task_detector);
217
206/* 218/*
207 * kthread which checks for tasks stuck in D state 219 * kthread which checks for tasks stuck in D state
208 */ 220 */
@@ -216,6 +228,9 @@ static int watchdog(void *dummy)
216 while (schedule_timeout_interruptible(timeout_jiffies(timeout))) 228 while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
217 timeout = sysctl_hung_task_timeout_secs; 229 timeout = sysctl_hung_task_timeout_secs;
218 230
231 if (atomic_xchg(&reset_hung_task, 0))
232 continue;
233
219 check_hung_uninterruptible_tasks(timeout); 234 check_hung_uninterruptible_tasks(timeout);
220 } 235 }
221 236
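The new reset_hung_task_detector() export lets code that legitimately keeps tasks in D state for a long time ask the watchdog to skip its next scan (see the atomic_xchg() added to watchdog() above). A hedged usage sketch; the caller is hypothetical and the declaration is assumed to sit next to the other hung-task knobs:

#include <linux/sched/sysctl.h>	/* assumed location of the declaration */

static void hypothetical_long_blocking_op(void)
{
	/* ... work that may hold tasks in TASK_UNINTERRUPTIBLE ... */

	/* Expected stall: make the hung-task watchdog skip one cycle. */
	reset_hung_task_detector();
}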
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a3bb14fbe5c6..dc04c166c54d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -214,7 +214,7 @@ void irq_enable(struct irq_desc *desc)
214} 214}
215 215
216/** 216/**
217 * irq_disable - Mark interupt disabled 217 * irq_disable - Mark interrupt disabled
218 * @desc: irq descriptor which should be disabled 218 * @desc: irq descriptor which should be disabled
219 * 219 *
220 * If the chip does not implement the irq_disable callback, we 220 * If the chip does not implement the irq_disable callback, we
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 706724e9835d..cf68bb36fe58 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -465,27 +465,26 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
465} 465}
466EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 466EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
467 467
468unsigned int irq_create_of_mapping(struct device_node *controller, 468unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
469 const u32 *intspec, unsigned int intsize)
470{ 469{
471 struct irq_domain *domain; 470 struct irq_domain *domain;
472 irq_hw_number_t hwirq; 471 irq_hw_number_t hwirq;
473 unsigned int type = IRQ_TYPE_NONE; 472 unsigned int type = IRQ_TYPE_NONE;
474 unsigned int virq; 473 unsigned int virq;
475 474
476 domain = controller ? irq_find_host(controller) : irq_default_domain; 475 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
477 if (!domain) { 476 if (!domain) {
478 pr_warn("no irq domain found for %s !\n", 477 pr_warn("no irq domain found for %s !\n",
479 of_node_full_name(controller)); 478 of_node_full_name(irq_data->np));
480 return 0; 479 return 0;
481 } 480 }
482 481
483 /* If domain has no translation, then we assume interrupt line */ 482 /* If domain has no translation, then we assume interrupt line */
484 if (domain->ops->xlate == NULL) 483 if (domain->ops->xlate == NULL)
485 hwirq = intspec[0]; 484 hwirq = irq_data->args[0];
486 else { 485 else {
487 if (domain->ops->xlate(domain, controller, intspec, intsize, 486 if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
488 &hwirq, &type)) 487 irq_data->args_count, &hwirq, &type))
489 return 0; 488 return 0;
490 } 489 }
491 490
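irq_create_of_mapping() now takes one struct of_phandle_args carrying the controller node and the interrupt specifier together, instead of a node plus a raw cell array. A hedged sketch of the new calling convention (the hand-built specifier is illustrative; real callers normally get the structure from the DT interrupt-parsing helpers):

#include <linux/of.h>
#include <linux/irqdomain.h>	/* assumed home of irq_create_of_mapping() */

static unsigned int map_example(struct device_node *ctrl)
{
	struct of_phandle_args oirq = {
		.np         = ctrl,	/* interrupt controller node */
		.args_count = 1,	/* one specifier cell */
		.args       = { 7 },	/* hwirq 7, made up for the sketch */
	};

	return irq_create_of_mapping(&oirq);
}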
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 514bcfd855a8..481a13c43b17 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -786,7 +786,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
786} 786}
787 787
788/* 788/*
789 * Interrupts explicitely requested as threaded interupts want to be 789 * Interrupts explicitly requested as threaded interrupts want to be
790 * preemtible - many of them need to sleep and wait for slow busses to 790 * preemtible - many of them need to sleep and wait for slow busses to
791 * complete. 791 * complete.
792 */ 792 */
@@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
956 goto out_mput; 956 goto out_mput;
957 } 957 }
958 958
959 sched_setscheduler(t, SCHED_FIFO, &param); 959 sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
960 960
961 /* 961 /*
962 * We keep the reference to the task struct even if 962 * We keep the reference to the task struct even if
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cb228bf21760..abcd6ca86cb7 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -50,7 +50,7 @@ static void resume_irqs(bool want_early)
50 bool is_early = desc->action && 50 bool is_early = desc->action &&
51 desc->action->flags & IRQF_EARLY_RESUME; 51 desc->action->flags & IRQF_EARLY_RESUME;
52 52
53 if (is_early != want_early) 53 if (!is_early && want_early)
54 continue; 54 continue;
55 55
56 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 1162f1030f18..3320b84cc60f 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -14,6 +14,7 @@ enum {
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, 15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, 16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17 _IRQ_IS_POLLED = IRQ_IS_POLLED,
17 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 18 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
18}; 19};
19 20
@@ -26,6 +27,7 @@ enum {
26#define IRQ_NOAUTOEN GOT_YOU_MORON 27#define IRQ_NOAUTOEN GOT_YOU_MORON
27#define IRQ_NESTED_THREAD GOT_YOU_MORON 28#define IRQ_NESTED_THREAD GOT_YOU_MORON
28#define IRQ_PER_CPU_DEVID GOT_YOU_MORON 29#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
30#define IRQ_IS_POLLED GOT_YOU_MORON
29#undef IRQF_MODIFY_MASK 31#undef IRQF_MODIFY_MASK
30#define IRQF_MODIFY_MASK GOT_YOU_MORON 32#define IRQF_MODIFY_MASK GOT_YOU_MORON
31 33
@@ -147,3 +149,8 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
147{ 149{
148 return desc->status_use_accessors & _IRQ_NESTED_THREAD; 150 return desc->status_use_accessors & _IRQ_NESTED_THREAD;
149} 151}
152
153static inline bool irq_settings_is_polled(struct irq_desc *desc)
154{
155 return desc->status_use_accessors & _IRQ_IS_POLLED;
156}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 7b5f012bde9d..a1d8cc63b56e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -67,8 +67,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
67 67
68 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
69 69
70 /* PER_CPU and nested thread interrupts are never polled */ 70 /*
 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) 71 * PER_CPU, nested thread interrupts and interrupts explicitly
72 * marked polled are excluded from polling.
73 */
74 if (irq_settings_is_per_cpu(desc) ||
75 irq_settings_is_nested_thread(desc) ||
76 irq_settings_is_polled(desc))
72 goto out; 77 goto out;
73 78
74 /* 79 /*
@@ -268,7 +273,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
268void note_interrupt(unsigned int irq, struct irq_desc *desc, 273void note_interrupt(unsigned int irq, struct irq_desc *desc,
269 irqreturn_t action_ret) 274 irqreturn_t action_ret)
270{ 275{
271 if (desc->istate & IRQS_POLL_INPROGRESS) 276 if (desc->istate & IRQS_POLL_INPROGRESS ||
277 irq_settings_is_polled(desc))
272 return; 278 return;
273 279
274 /* we get here again via the threaded handler */ 280 /* we get here again via the threaded handler */
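IRQ_IS_POLLED lets a driver that deliberately polls a possibly silent interrupt line keep both the misrouted-IRQ poll in try_one_irq() and the spurious accounting in note_interrupt() away from it. A hedged sketch of how a driver would mark such a line (irq_set_status_flags() is the assumed setter; the device name is made up):

#include <linux/irq.h>
#include <linux/interrupt.h>

static int setup_polled_irq(unsigned int irq, irq_handler_t handler, void *dev)
{
	/* Tell the core this line is driven by polling, not by the device. */
	irq_set_status_flags(irq, IRQ_IS_POLLED);
	return request_irq(irq, handler, 0, "polled-dev", dev);
}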
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 297a9247a3b3..9019f15deab2 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -58,6 +58,7 @@ static void jump_label_update(struct static_key *key, int enable);
58 58
59void static_key_slow_inc(struct static_key *key) 59void static_key_slow_inc(struct static_key *key)
60{ 60{
61 STATIC_KEY_CHECK_USE();
61 if (atomic_inc_not_zero(&key->enabled)) 62 if (atomic_inc_not_zero(&key->enabled))
62 return; 63 return;
63 64
@@ -103,12 +104,14 @@ static void jump_label_update_timeout(struct work_struct *work)
103 104
104void static_key_slow_dec(struct static_key *key) 105void static_key_slow_dec(struct static_key *key)
105{ 106{
107 STATIC_KEY_CHECK_USE();
106 __static_key_slow_dec(key, 0, NULL); 108 __static_key_slow_dec(key, 0, NULL);
107} 109}
108EXPORT_SYMBOL_GPL(static_key_slow_dec); 110EXPORT_SYMBOL_GPL(static_key_slow_dec);
109 111
110void static_key_slow_dec_deferred(struct static_key_deferred *key) 112void static_key_slow_dec_deferred(struct static_key_deferred *key)
111{ 113{
114 STATIC_KEY_CHECK_USE();
112 __static_key_slow_dec(&key->key, key->timeout, &key->work); 115 __static_key_slow_dec(&key->key, key->timeout, &key->work);
113} 116}
114EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); 117EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
@@ -116,6 +119,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
116void jump_label_rate_limit(struct static_key_deferred *key, 119void jump_label_rate_limit(struct static_key_deferred *key,
117 unsigned long rl) 120 unsigned long rl)
118{ 121{
122 STATIC_KEY_CHECK_USE();
119 key->timeout = rl; 123 key->timeout = rl;
120 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); 124 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
121} 125}
@@ -212,6 +216,7 @@ void __init jump_label_init(void)
212 key->next = NULL; 216 key->next = NULL;
213#endif 217#endif
214 } 218 }
219 static_key_initialized = true;
215 jump_label_unlock(); 220 jump_label_unlock();
216} 221}
217 222
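STATIC_KEY_CHECK_USE() flags static-key operations issued before jump_label_init() has run. For context, a minimal sketch of the static key API whose entry points gain the check; the key and function names below are made up:

#include <linux/jump_label.h>
#include <linux/printk.h>

static struct static_key example_key = STATIC_KEY_INIT_FALSE;

static void example_fast_path(void)
{
	/* Compiled as a no-op that is live-patched to a jump when enabled. */
	if (static_key_false(&example_key))
		pr_debug("optional slow path enabled\n");
}

static void example_enable(void)
{
	/* Must not run before jump_label_init(); that is exactly what
	 * STATIC_KEY_CHECK_USE() now warns about. */
	static_key_slow_inc(&example_key);
}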
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2a74f307c5ec..9c970167e402 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
47size_t vmcoreinfo_size; 47size_t vmcoreinfo_size;
48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
49 49
50/* Flag to indicate we are going to kexec a new kernel */
51bool kexec_in_progress = false;
52
50/* Location of the reserved area for the crash kernel */ 53/* Location of the reserved area for the crash kernel */
51struct resource crashk_res = { 54struct resource crashk_res = {
52 .name = "Crash kernel", 55 .name = "Crash kernel",
@@ -921,7 +924,7 @@ static int kimage_load_segment(struct kimage *image,
921 * reinitialize them. 924 * reinitialize them.
922 * 925 *
923 * - A machine specific part that includes the syscall number 926 * - A machine specific part that includes the syscall number
 924 * and the copies the image to it's final destination. And 927 * and then copies the image to its final destination. And
925 * jumps into the image at entry. 928 * jumps into the image at entry.
926 * 929 *
927 * kexec does not sync, or unmount filesystems so if you need 930 * kexec does not sync, or unmount filesystems so if you need
@@ -1675,7 +1678,9 @@ int kernel_kexec(void)
1675 } else 1678 } else
1676#endif 1679#endif
1677 { 1680 {
1681 kexec_in_progress = true;
1678 kernel_restart_prepare(NULL); 1682 kernel_restart_prepare(NULL);
1683 migrate_to_reboot_cpu();
1679 printk(KERN_EMERG "Starting new kernel\n"); 1684 printk(KERN_EMERG "Starting new kernel\n");
1680 machine_shutdown(); 1685 machine_shutdown();
1681 } 1686 }
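kexec_in_progress gives other subsystems a cheap way to notice that a kexec reboot has already started (it is set just before migrate_to_reboot_cpu() and machine_shutdown()). A hedged sketch of the kind of check a consumer might add; the function is hypothetical and the declaration is assumed to live in <linux/kexec.h>:

#include <linux/kexec.h>

static void hypothetical_teardown(void)
{
	/* Skip slow or hardware-touching cleanup while the new kernel
	 * is already being booted. */
	if (kexec_in_progress)
		return;

	/* ... normal teardown path ... */
}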
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a0d367a49122..ceeadfcabb76 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2066,7 +2066,7 @@ static int __init init_kprobes(void)
2066{ 2066{
2067 int i, err = 0; 2067 int i, err = 0;
2068 unsigned long offset = 0, size = 0; 2068 unsigned long offset = 0, size = 0;
2069 char *modname, namebuf[128]; 2069 char *modname, namebuf[KSYM_NAME_LEN];
2070 const char *symbol_name; 2070 const char *symbol_name;
2071 void *addr; 2071 void *addr;
2072 struct kprobe_blackpoint *kb; 2072 struct kprobe_blackpoint *kb;
@@ -2192,7 +2192,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2192 const char *sym = NULL; 2192 const char *sym = NULL;
2193 unsigned int i = *(loff_t *) v; 2193 unsigned int i = *(loff_t *) v;
2194 unsigned long offset = 0; 2194 unsigned long offset = 0;
2195 char *modname, namebuf[128]; 2195 char *modname, namebuf[KSYM_NAME_LEN];
2196 2196
2197 head = &kprobe_table[i]; 2197 head = &kprobe_table[i];
2198 preempt_disable(); 2198 preempt_disable();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 760e86df8c20..b5ae3ee860a9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -33,7 +33,7 @@ struct kthread_create_info
33 33
34 /* Result passed back to kthread_create() from kthreadd. */ 34 /* Result passed back to kthread_create() from kthreadd. */
35 struct task_struct *result; 35 struct task_struct *result;
36 struct completion done; 36 struct completion *done;
37 37
38 struct list_head list; 38 struct list_head list;
39}; 39};
@@ -178,6 +178,7 @@ static int kthread(void *_create)
178 struct kthread_create_info *create = _create; 178 struct kthread_create_info *create = _create;
179 int (*threadfn)(void *data) = create->threadfn; 179 int (*threadfn)(void *data) = create->threadfn;
180 void *data = create->data; 180 void *data = create->data;
181 struct completion *done;
181 struct kthread self; 182 struct kthread self;
182 int ret; 183 int ret;
183 184
@@ -187,10 +188,16 @@ static int kthread(void *_create)
187 init_completion(&self.parked); 188 init_completion(&self.parked);
188 current->vfork_done = &self.exited; 189 current->vfork_done = &self.exited;
189 190
191 /* If user was SIGKILLed, I release the structure. */
192 done = xchg(&create->done, NULL);
193 if (!done) {
194 kfree(create);
195 do_exit(-EINTR);
196 }
190 /* OK, tell user we're spawned, wait for stop or wakeup */ 197 /* OK, tell user we're spawned, wait for stop or wakeup */
191 __set_current_state(TASK_UNINTERRUPTIBLE); 198 __set_current_state(TASK_UNINTERRUPTIBLE);
192 create->result = current; 199 create->result = current;
193 complete(&create->done); 200 complete(done);
194 schedule(); 201 schedule();
195 202
196 ret = -EINTR; 203 ret = -EINTR;
@@ -223,8 +230,15 @@ static void create_kthread(struct kthread_create_info *create)
223 /* We want our own signal handler (we take no signals by default). */ 230 /* We want our own signal handler (we take no signals by default). */
224 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 231 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
225 if (pid < 0) { 232 if (pid < 0) {
233 /* If user was SIGKILLed, I release the structure. */
234 struct completion *done = xchg(&create->done, NULL);
235
236 if (!done) {
237 kfree(create);
238 return;
239 }
226 create->result = ERR_PTR(pid); 240 create->result = ERR_PTR(pid);
227 complete(&create->done); 241 complete(done);
228 } 242 }
229} 243}
230 244
@@ -255,36 +269,59 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
255 const char namefmt[], 269 const char namefmt[],
256 ...) 270 ...)
257{ 271{
258 struct kthread_create_info create; 272 DECLARE_COMPLETION_ONSTACK(done);
259 273 struct task_struct *task;
260 create.threadfn = threadfn; 274 struct kthread_create_info *create = kmalloc(sizeof(*create),
261 create.data = data; 275 GFP_KERNEL);
262 create.node = node; 276
263 init_completion(&create.done); 277 if (!create)
278 return ERR_PTR(-ENOMEM);
279 create->threadfn = threadfn;
280 create->data = data;
281 create->node = node;
282 create->done = &done;
264 283
265 spin_lock(&kthread_create_lock); 284 spin_lock(&kthread_create_lock);
266 list_add_tail(&create.list, &kthread_create_list); 285 list_add_tail(&create->list, &kthread_create_list);
267 spin_unlock(&kthread_create_lock); 286 spin_unlock(&kthread_create_lock);
268 287
269 wake_up_process(kthreadd_task); 288 wake_up_process(kthreadd_task);
270 wait_for_completion(&create.done); 289 /*
271 290 * Wait for completion in killable state, for I might be chosen by
272 if (!IS_ERR(create.result)) { 291 * the OOM killer while kthreadd is trying to allocate memory for
292 * new kernel thread.
293 */
294 if (unlikely(wait_for_completion_killable(&done))) {
295 /*
296 * If I was SIGKILLed before kthreadd (or new kernel thread)
297 * calls complete(), leave the cleanup of this structure to
298 * that thread.
299 */
300 if (xchg(&create->done, NULL))
301 return ERR_PTR(-ENOMEM);
302 /*
303 * kthreadd (or new kernel thread) will call complete()
304 * shortly.
305 */
306 wait_for_completion(&done);
307 }
308 task = create->result;
309 if (!IS_ERR(task)) {
273 static const struct sched_param param = { .sched_priority = 0 }; 310 static const struct sched_param param = { .sched_priority = 0 };
274 va_list args; 311 va_list args;
275 312
276 va_start(args, namefmt); 313 va_start(args, namefmt);
277 vsnprintf(create.result->comm, sizeof(create.result->comm), 314 vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
278 namefmt, args);
279 va_end(args); 315 va_end(args);
280 /* 316 /*
281 * root may have changed our (kthreadd's) priority or CPU mask. 317 * root may have changed our (kthreadd's) priority or CPU mask.
282 * The kernel thread should not inherit these properties. 318 * The kernel thread should not inherit these properties.
283 */ 319 */
284 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); 320 sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
285 set_cpus_allowed_ptr(create.result, cpu_all_mask); 321 set_cpus_allowed_ptr(task, cpu_all_mask);
286 } 322 }
287 return create.result; 323 kfree(create);
324 return task;
288} 325}
289EXPORT_SYMBOL(kthread_create_on_node); 326EXPORT_SYMBOL(kthread_create_on_node);
290 327
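The kthread_create_on_node() rework closes the "requester SIGKILLed while kthreadd still holds the request" race with an ownership handoff: each side does xchg() on the completion pointer, and whoever receives NULL knows the other side already claimed the structure. A generic sketch of that handoff under assumed names (struct req_sketch and finish_req_sketch() are illustrative, not kernel API):

#include <linux/atomic.h>
#include <linux/completion.h>
#include <linux/slab.h>

struct req_sketch {
	struct completion *done;	/* NULLed once, by whichever side wins */
	/* ... payload ... */
};

/* Producer side: either wake the waiter or inherit the cleanup. */
static void finish_req_sketch(struct req_sketch *req)
{
	struct completion *done = xchg(&req->done, NULL);

	if (done)
		complete(done);		/* waiter is still around */
	else
		kfree(req);		/* waiter gave up; we now own req */
}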
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
new file mode 100644
index 000000000000..baab8e5e7f66
--- /dev/null
+++ b/kernel/locking/Makefile
@@ -0,0 +1,25 @@
1
2obj-y += mutex.o semaphore.o rwsem.o lglock.o
3
4ifdef CONFIG_FUNCTION_TRACER
5CFLAGS_REMOVE_lockdep.o = -pg
6CFLAGS_REMOVE_lockdep_proc.o = -pg
7CFLAGS_REMOVE_mutex-debug.o = -pg
8CFLAGS_REMOVE_rtmutex-debug.o = -pg
9endif
10
11obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
12obj-$(CONFIG_LOCKDEP) += lockdep.o
13ifeq ($(CONFIG_PROC_FS),y)
14obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
15endif
16obj-$(CONFIG_SMP) += spinlock.o
17obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
18obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
19obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
20obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
21obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
22obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
23obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
24obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
25obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
diff --git a/kernel/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf004..86ae2aebf004 100644
--- a/kernel/lglock.c
+++ b/kernel/locking/lglock.c
diff --git a/kernel/lockdep.c b/kernel/locking/lockdep.c
index e16c45b9ee77..576ba756a32d 100644
--- a/kernel/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1232,7 +1232,7 @@ static int noop_count(struct lock_list *entry, void *data)
1232 return 0; 1232 return 0;
1233} 1233}
1234 1234
1235unsigned long __lockdep_count_forward_deps(struct lock_list *this) 1235static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
1236{ 1236{
1237 unsigned long count = 0; 1237 unsigned long count = 0;
1238 struct lock_list *uninitialized_var(target_entry); 1238 struct lock_list *uninitialized_var(target_entry);
@@ -1258,7 +1258,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
1258 return ret; 1258 return ret;
1259} 1259}
1260 1260
1261unsigned long __lockdep_count_backward_deps(struct lock_list *this) 1261static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
1262{ 1262{
1263 unsigned long count = 0; 1263 unsigned long count = 0;
1264 struct lock_list *uninitialized_var(target_entry); 1264 struct lock_list *uninitialized_var(target_entry);
@@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4225 !rcu_lockdep_current_cpu_online() 4225 !rcu_lockdep_current_cpu_online()
4226 ? "RCU used illegally from offline CPU!\n" 4226 ? "RCU used illegally from offline CPU!\n"
4227 : rcu_is_cpu_idle() 4227 : !rcu_is_watching()
4228 ? "RCU used illegally from idle CPU!\n" 4228 ? "RCU used illegally from idle CPU!\n"
4229 : "", 4229 : "",
4230 rcu_scheduler_active, debug_locks); 4230 rcu_scheduler_active, debug_locks);
@@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4247 * So complain bitterly if someone does call rcu_read_lock(), 4247 * So complain bitterly if someone does call rcu_read_lock(),
4248 * rcu_read_lock_bh() and so on from extended quiescent states. 4248 * rcu_read_lock_bh() and so on from extended quiescent states.
4249 */ 4249 */
4250 if (rcu_is_cpu_idle()) 4250 if (!rcu_is_watching())
4251 printk("RCU used illegally from extended quiescent state!\n"); 4251 printk("RCU used illegally from extended quiescent state!\n");
4252 4252
4253 lockdep_print_held_locks(curr); 4253 lockdep_print_held_locks(curr);
diff --git a/kernel/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 4f560cfedc8f..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
diff --git a/kernel/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index b2c71c5873e4..ef43ac4bafb5 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -421,6 +421,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
421 seq_time(m, lt->min); 421 seq_time(m, lt->min);
422 seq_time(m, lt->max); 422 seq_time(m, lt->max);
423 seq_time(m, lt->total); 423 seq_time(m, lt->total);
424 seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
424} 425}
425 426
426static void seq_stats(struct seq_file *m, struct lock_stat_data *data) 427static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
@@ -518,20 +519,20 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
518 } 519 }
519 if (i) { 520 if (i) {
520 seq_puts(m, "\n"); 521 seq_puts(m, "\n");
521 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 522 seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1));
522 seq_puts(m, "\n"); 523 seq_puts(m, "\n");
523 } 524 }
524} 525}
525 526
526static void seq_header(struct seq_file *m) 527static void seq_header(struct seq_file *m)
527{ 528{
528 seq_printf(m, "lock_stat version 0.3\n"); 529 seq_puts(m, "lock_stat version 0.4\n");
529 530
530 if (unlikely(!debug_locks)) 531 if (unlikely(!debug_locks))
531 seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); 532 seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
532 533
533 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 534 seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
534 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 535 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s "
535 "%14s %14s\n", 536 "%14s %14s\n",
536 "class name", 537 "class name",
537 "con-bounces", 538 "con-bounces",
@@ -539,12 +540,14 @@ static void seq_header(struct seq_file *m)
539 "waittime-min", 540 "waittime-min",
540 "waittime-max", 541 "waittime-max",
541 "waittime-total", 542 "waittime-total",
543 "waittime-avg",
542 "acq-bounces", 544 "acq-bounces",
543 "acquisitions", 545 "acquisitions",
544 "holdtime-min", 546 "holdtime-min",
545 "holdtime-max", 547 "holdtime-max",
546 "holdtime-total"); 548 "holdtime-total",
547 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 549 "holdtime-avg");
550 seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
548 seq_printf(m, "\n"); 551 seq_printf(m, "\n");
549} 552}
550 553
diff --git a/kernel/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc2b84c..995b0cc2b84c 100644
--- a/kernel/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
diff --git a/kernel/mutex-debug.c b/kernel/locking/mutex-debug.c
index 7e3443fe1f48..7e3443fe1f48 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
diff --git a/kernel/mutex-debug.h b/kernel/locking/mutex-debug.h
index 0799fd3e4cfa..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
diff --git a/kernel/mutex.c b/kernel/locking/mutex.c
index d24105b1b794..4dd6e4c219de 100644
--- a/kernel/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/mutex.c 2 * kernel/locking/mutex.c
3 * 3 *
4 * Mutexes: blocking mutual exclusion locks 4 * Mutexes: blocking mutual exclusion locks
5 * 5 *
diff --git a/kernel/mutex.h b/kernel/locking/mutex.h
index 4115fbf83b12..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/locking/mutex.h
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
new file mode 100644
index 000000000000..652a8ee8efe9
--- /dev/null
+++ b/kernel/locking/percpu-rwsem.c
@@ -0,0 +1,165 @@
1#include <linux/atomic.h>
2#include <linux/rwsem.h>
3#include <linux/percpu.h>
4#include <linux/wait.h>
5#include <linux/lockdep.h>
6#include <linux/percpu-rwsem.h>
7#include <linux/rcupdate.h>
8#include <linux/sched.h>
9#include <linux/errno.h>
10
11int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
12 const char *name, struct lock_class_key *rwsem_key)
13{
14 brw->fast_read_ctr = alloc_percpu(int);
15 if (unlikely(!brw->fast_read_ctr))
16 return -ENOMEM;
17
18 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
19 __init_rwsem(&brw->rw_sem, name, rwsem_key);
20 atomic_set(&brw->write_ctr, 0);
21 atomic_set(&brw->slow_read_ctr, 0);
22 init_waitqueue_head(&brw->write_waitq);
23 return 0;
24}
25
26void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
27{
28 free_percpu(brw->fast_read_ctr);
29 brw->fast_read_ctr = NULL; /* catch use after free bugs */
30}
31
32/*
33 * This is the fast-path for down_read/up_read, it only needs to ensure
34 * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
35 * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
36 * serialize with the preempt-disabled section below.
37 *
38 * The nontrivial part is that we should guarantee acquire/release semantics
39 * in case when
40 *
41 * R_W: down_write() comes after up_read(), the writer should see all
42 * changes done by the reader
43 * or
44 * W_R: down_read() comes after up_write(), the reader should see all
45 * changes done by the writer
46 *
47 * If this helper fails the callers rely on the normal rw_semaphore and
48 * atomic_dec_and_test(), so in this case we have the necessary barriers.
49 *
50 * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
51 * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
52 * reader inside the critical section. See the comments in down_write and
53 * up_write below.
54 */
55static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
56{
57 bool success = false;
58
59 preempt_disable();
60 if (likely(!atomic_read(&brw->write_ctr))) {
61 __this_cpu_add(*brw->fast_read_ctr, val);
62 success = true;
63 }
64 preempt_enable();
65
66 return success;
67}
68
69/*
70 * Like the normal down_read() this is not recursive, the writer can
71 * come after the first percpu_down_read() and create the deadlock.
72 *
73 * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
74 * percpu_up_read() does rwsem_release(). This pairs with the usage
75 * of ->rw_sem in percpu_down/up_write().
76 */
77void percpu_down_read(struct percpu_rw_semaphore *brw)
78{
79 might_sleep();
80 if (likely(update_fast_ctr(brw, +1))) {
81 rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
82 return;
83 }
84
85 down_read(&brw->rw_sem);
86 atomic_inc(&brw->slow_read_ctr);
87 /* avoid up_read()->rwsem_release() */
88 __up_read(&brw->rw_sem);
89}
90
91void percpu_up_read(struct percpu_rw_semaphore *brw)
92{
93 rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
94
95 if (likely(update_fast_ctr(brw, -1)))
96 return;
97
98 /* false-positive is possible but harmless */
99 if (atomic_dec_and_test(&brw->slow_read_ctr))
100 wake_up_all(&brw->write_waitq);
101}
102
103static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
104{
105 unsigned int sum = 0;
106 int cpu;
107
108 for_each_possible_cpu(cpu) {
109 sum += per_cpu(*brw->fast_read_ctr, cpu);
110 per_cpu(*brw->fast_read_ctr, cpu) = 0;
111 }
112
113 return sum;
114}
115
116/*
117 * A writer increments ->write_ctr to force the readers to switch to the
118 * slow mode, note the atomic_read() check in update_fast_ctr().
119 *
120 * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
121 * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
122 * counter it represents the number of active readers.
123 *
124 * Finally the writer takes ->rw_sem for writing and blocks the new readers,
125 * then waits until the slow counter becomes zero.
126 */
127void percpu_down_write(struct percpu_rw_semaphore *brw)
128{
129 /* tell update_fast_ctr() there is a pending writer */
130 atomic_inc(&brw->write_ctr);
131 /*
132 * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
133 * so that update_fast_ctr() can't succeed.
134 *
135 * 2. Ensures we see the result of every previous this_cpu_add() in
136 * update_fast_ctr().
137 *
138 * 3. Ensures that if any reader has exited its critical section via
139 * fast-path, it executes a full memory barrier before we return.
140 * See R_W case in the comment above update_fast_ctr().
141 */
142 synchronize_sched_expedited();
143
144 /* exclude other writers, and block the new readers completely */
145 down_write(&brw->rw_sem);
146
147 /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
148 atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
149
150 /* wait for all readers to complete their percpu_up_read() */
151 wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
152}
153
154void percpu_up_write(struct percpu_rw_semaphore *brw)
155{
156 /* release the lock, but the readers can't use the fast-path */
157 up_write(&brw->rw_sem);
158 /*
159 * Insert the barrier before the next fast-path in down_read,
160 * see W_R case in the comment above update_fast_ctr().
161 */
162 synchronize_sched_expedited();
163 /* the last writer unblocks update_fast_ctr() */
164 atomic_dec(&brw->write_ctr);
165}
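For completeness, a hedged sketch of how the new per-cpu rwsem is consumed: frequent readers pay only the per-cpu fast path above, while the rare writer pays two synchronize_sched_expedited() calls. percpu_init_rwsem() is assumed to be the usual init wrapper around __percpu_init_rwsem() from <linux/percpu-rwsem.h>; the example functions are illustrative:

#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore example_sem;

static int example_init(void)
{
	return percpu_init_rwsem(&example_sem);
}

static void example_reader(void)
{
	percpu_down_read(&example_sem);
	/* read side: per-cpu counter fast path in the common case */
	percpu_up_read(&example_sem);
}

static void example_writer(void)
{
	percpu_down_write(&example_sem);
	/* exclusive section: all readers have drained */
	percpu_up_write(&example_sem);
}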
diff --git a/kernel/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 13b243a323fa..13b243a323fa 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
diff --git a/kernel/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d78..14193d596d78 100644
--- a/kernel/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
diff --git a/kernel/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
index 1d96dd0d93c1..1d96dd0d93c1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/locking/rtmutex-tester.c
diff --git a/kernel/rtmutex.c b/kernel/locking/rtmutex.c
index 0dd6aec1cb6a..0dd6aec1cb6a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/locking/rtmutex.c
diff --git a/kernel/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421d..a1a1dd06421d 100644
--- a/kernel/rtmutex.h
+++ b/kernel/locking/rtmutex.h
diff --git a/kernel/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 53a66c85261b..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
new file mode 100644
index 000000000000..9be8a9144978
--- /dev/null
+++ b/kernel/locking/rwsem-spinlock.c
@@ -0,0 +1,296 @@
1/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
2 * generic spinlock implementation
3 *
4 * Copyright (c) 2001 David Howells (dhowells@redhat.com).
5 * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
6 * - Derived also from comments by Linus
7 */
8#include <linux/rwsem.h>
9#include <linux/sched.h>
10#include <linux/export.h>
11
12enum rwsem_waiter_type {
13 RWSEM_WAITING_FOR_WRITE,
14 RWSEM_WAITING_FOR_READ
15};
16
17struct rwsem_waiter {
18 struct list_head list;
19 struct task_struct *task;
20 enum rwsem_waiter_type type;
21};
22
23int rwsem_is_locked(struct rw_semaphore *sem)
24{
25 int ret = 1;
26 unsigned long flags;
27
28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
29 ret = (sem->activity != 0);
30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
31 }
32 return ret;
33}
34EXPORT_SYMBOL(rwsem_is_locked);
35
36/*
37 * initialise the semaphore
38 */
39void __init_rwsem(struct rw_semaphore *sem, const char *name,
40 struct lock_class_key *key)
41{
42#ifdef CONFIG_DEBUG_LOCK_ALLOC
43 /*
44 * Make sure we are not reinitializing a held semaphore:
45 */
46 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
47 lockdep_init_map(&sem->dep_map, name, key, 0);
48#endif
49 sem->activity = 0;
50 raw_spin_lock_init(&sem->wait_lock);
51 INIT_LIST_HEAD(&sem->wait_list);
52}
53EXPORT_SYMBOL(__init_rwsem);
54
55/*
56 * handle the lock release when processes blocked on it that can now run
57 * - if we come here, then:
58 * - the 'active count' _reached_ zero
59 * - the 'waiting count' is non-zero
60 * - the spinlock must be held by the caller
61 * - woken process blocks are discarded from the list after having task zeroed
62 * - writers are only woken if wakewrite is non-zero
63 */
64static inline struct rw_semaphore *
65__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
66{
67 struct rwsem_waiter *waiter;
68 struct task_struct *tsk;
69 int woken;
70
71 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
72
73 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
74 if (wakewrite)
75 /* Wake up a writer. Note that we do not grant it the
76 * lock - it will have to acquire it when it runs. */
77 wake_up_process(waiter->task);
78 goto out;
79 }
80
81 /* grant an infinite number of read locks to the front of the queue */
82 woken = 0;
83 do {
84 struct list_head *next = waiter->list.next;
85
86 list_del(&waiter->list);
87 tsk = waiter->task;
88 smp_mb();
89 waiter->task = NULL;
90 wake_up_process(tsk);
91 put_task_struct(tsk);
92 woken++;
93 if (next == &sem->wait_list)
94 break;
95 waiter = list_entry(next, struct rwsem_waiter, list);
96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
97
98 sem->activity += woken;
99
100 out:
101 return sem;
102}
103
104/*
105 * wake a single writer
106 */
107static inline struct rw_semaphore *
108__rwsem_wake_one_writer(struct rw_semaphore *sem)
109{
110 struct rwsem_waiter *waiter;
111
112 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
113 wake_up_process(waiter->task);
114
115 return sem;
116}
117
118/*
119 * get a read lock on the semaphore
120 */
121void __sched __down_read(struct rw_semaphore *sem)
122{
123 struct rwsem_waiter waiter;
124 struct task_struct *tsk;
125 unsigned long flags;
126
127 raw_spin_lock_irqsave(&sem->wait_lock, flags);
128
129 if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
130 /* granted */
131 sem->activity++;
132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
133 goto out;
134 }
135
136 tsk = current;
137 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
138
139 /* set up my own style of waitqueue */
140 waiter.task = tsk;
141 waiter.type = RWSEM_WAITING_FOR_READ;
142 get_task_struct(tsk);
143
144 list_add_tail(&waiter.list, &sem->wait_list);
145
146 /* we don't need to touch the semaphore struct anymore */
147 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
148
149 /* wait to be given the lock */
150 for (;;) {
151 if (!waiter.task)
152 break;
153 schedule();
154 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155 }
156
157 tsk->state = TASK_RUNNING;
158 out:
159 ;
160}
161
162/*
163 * trylock for reading -- returns 1 if successful, 0 if contention
164 */
165int __down_read_trylock(struct rw_semaphore *sem)
166{
167 unsigned long flags;
168 int ret = 0;
169
170
171 raw_spin_lock_irqsave(&sem->wait_lock, flags);
172
173 if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
174 /* granted */
175 sem->activity++;
176 ret = 1;
177 }
178
179 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
180
181 return ret;
182}
183
184/*
185 * get a write lock on the semaphore
186 */
187void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
188{
189 struct rwsem_waiter waiter;
190 struct task_struct *tsk;
191 unsigned long flags;
192
193 raw_spin_lock_irqsave(&sem->wait_lock, flags);
194
195 /* set up my own style of waitqueue */
196 tsk = current;
197 waiter.task = tsk;
198 waiter.type = RWSEM_WAITING_FOR_WRITE;
199 list_add_tail(&waiter.list, &sem->wait_list);
200
201 /* wait for someone to release the lock */
202 for (;;) {
203 /*
204 * This is the key to supporting write lock stealing: it allows a
205 * task already on a CPU to take the lock soon, rather than putting
206 * itself to sleep and waiting for the system (or another task at
207 * the head of the wait list) to wake it up.
208 */
209 if (sem->activity == 0)
210 break;
211 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
213 schedule();
214 raw_spin_lock_irqsave(&sem->wait_lock, flags);
215 }
216 /* got the lock */
217 sem->activity = -1;
218 list_del(&waiter.list);
219
220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
221}
222
223void __sched __down_write(struct rw_semaphore *sem)
224{
225 __down_write_nested(sem, 0);
226}
227
228/*
229 * trylock for writing -- returns 1 if successful, 0 if contention
230 */
231int __down_write_trylock(struct rw_semaphore *sem)
232{
233 unsigned long flags;
234 int ret = 0;
235
236 raw_spin_lock_irqsave(&sem->wait_lock, flags);
237
238 if (sem->activity == 0) {
239 /* got the lock */
240 sem->activity = -1;
241 ret = 1;
242 }
243
244 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
245
246 return ret;
247}
248
249/*
250 * release a read lock on the semaphore
251 */
252void __up_read(struct rw_semaphore *sem)
253{
254 unsigned long flags;
255
256 raw_spin_lock_irqsave(&sem->wait_lock, flags);
257
258 if (--sem->activity == 0 && !list_empty(&sem->wait_list))
259 sem = __rwsem_wake_one_writer(sem);
260
261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
262}
263
264/*
265 * release a write lock on the semaphore
266 */
267void __up_write(struct rw_semaphore *sem)
268{
269 unsigned long flags;
270
271 raw_spin_lock_irqsave(&sem->wait_lock, flags);
272
273 sem->activity = 0;
274 if (!list_empty(&sem->wait_list))
275 sem = __rwsem_do_wake(sem, 1);
276
277 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
278}
279
280/*
281 * downgrade a write lock into a read lock
282 * - just wake up any readers at the front of the queue
283 */
284void __downgrade_write(struct rw_semaphore *sem)
285{
286 unsigned long flags;
287
288 raw_spin_lock_irqsave(&sem->wait_lock, flags);
289
290 sem->activity = 1;
291 if (!list_empty(&sem->wait_list))
292 sem = __rwsem_do_wake(sem, 0);
293
294 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
295}
296
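This file only supplies the slow paths for the spinlock-based rw_semaphore; callers use the public wrappers from kernel/locking/rwsem.c (down_read()/up_read(), down_write()/up_write(), downgrade_write()). A brief sketch of that public API, with comments mapping each call to the ->activity values used above; the cfg_* names are illustrative.

#include <linux/rwsem.h>

static DECLARE_RWSEM(cfg_sem);
static int cfg_value;

int cfg_get(void)
{
	int v;

	down_read(&cfg_sem);		/* spinlock variant: sem->activity++ */
	v = cfg_value;
	up_read(&cfg_sem);		/* last reader out wakes one queued writer */
	return v;
}

void cfg_set_and_publish(int v)
{
	down_write(&cfg_sem);		/* sem->activity = -1: exclusive */
	cfg_value = v;
	downgrade_write(&cfg_sem);	/* sem->activity = 1, wakes only queued readers */
	/* readers may now run concurrently with anything done past this point */
	up_read(&cfg_sem);
}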
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
new file mode 100644
index 000000000000..19c5fa95e0b4
--- /dev/null
+++ b/kernel/locking/rwsem-xadd.c
@@ -0,0 +1,293 @@
1/* rwsem.c: R/W semaphores: contention handling functions
2 *
3 * Written by David Howells (dhowells@redhat.com).
4 * Derived from arch/i386/kernel/semaphore.c
5 *
6 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
7 * and Michel Lespinasse <walken@google.com>
8 */
9#include <linux/rwsem.h>
10#include <linux/sched.h>
11#include <linux/init.h>
12#include <linux/export.h>
13
14/*
15 * Initialize an rwsem:
16 */
17void __init_rwsem(struct rw_semaphore *sem, const char *name,
18 struct lock_class_key *key)
19{
20#ifdef CONFIG_DEBUG_LOCK_ALLOC
21 /*
22 * Make sure we are not reinitializing a held semaphore:
23 */
24 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
25 lockdep_init_map(&sem->dep_map, name, key, 0);
26#endif
27 sem->count = RWSEM_UNLOCKED_VALUE;
28 raw_spin_lock_init(&sem->wait_lock);
29 INIT_LIST_HEAD(&sem->wait_list);
30}
31
32EXPORT_SYMBOL(__init_rwsem);
33
34enum rwsem_waiter_type {
35 RWSEM_WAITING_FOR_WRITE,
36 RWSEM_WAITING_FOR_READ
37};
38
39struct rwsem_waiter {
40 struct list_head list;
41 struct task_struct *task;
42 enum rwsem_waiter_type type;
43};
44
45enum rwsem_wake_type {
46 RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
47 RWSEM_WAKE_READERS, /* Wake readers only */
48 RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
49};
50
51/*
52 * handle the lock release when there are processes blocked on it that can now run
53 * - if we come here from up_xxxx(), then:
54 * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
55 * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
56 * - there must be someone on the queue
57 * - the spinlock must be held by the caller
58 * - woken process blocks are discarded from the list after having task zeroed
59 * - writers are only woken if wake_type is RWSEM_WAKE_ANY (i.e. not downgrading)
60 */
61static struct rw_semaphore *
62__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
63{
64 struct rwsem_waiter *waiter;
65 struct task_struct *tsk;
66 struct list_head *next;
67 long oldcount, woken, loop, adjustment;
68
69 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
70 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
71 if (wake_type == RWSEM_WAKE_ANY)
72 /* Wake writer at the front of the queue, but do not
73 * grant it the lock yet as we want other writers
74 * to be able to steal it. Readers, on the other hand,
75 * will block as they will notice the queued writer.
76 */
77 wake_up_process(waiter->task);
78 goto out;
79 }
80
81 /* Writers might steal the lock before we grant it to the next reader.
82 * We prefer to do the first reader grant before counting readers
83 * so we can bail out early if a writer stole the lock.
84 */
85 adjustment = 0;
86 if (wake_type != RWSEM_WAKE_READ_OWNED) {
87 adjustment = RWSEM_ACTIVE_READ_BIAS;
88 try_reader_grant:
89 oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
90 if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
91 /* A writer stole the lock. Undo our reader grant. */
92 if (rwsem_atomic_update(-adjustment, sem) &
93 RWSEM_ACTIVE_MASK)
94 goto out;
95 /* Last active locker left. Retry waking readers. */
96 goto try_reader_grant;
97 }
98 }
99
100 /* Grant an infinite number of read locks to the readers at the front
101 * of the queue. Note we increment the 'active part' of the count by
102 * the number of readers before waking any processes up.
103 */
104 woken = 0;
105 do {
106 woken++;
107
108 if (waiter->list.next == &sem->wait_list)
109 break;
110
111 waiter = list_entry(waiter->list.next,
112 struct rwsem_waiter, list);
113
114 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
115
116 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
117 if (waiter->type != RWSEM_WAITING_FOR_WRITE)
118 /* hit end of list above */
119 adjustment -= RWSEM_WAITING_BIAS;
120
121 if (adjustment)
122 rwsem_atomic_add(adjustment, sem);
123
124 next = sem->wait_list.next;
125 loop = woken;
126 do {
127 waiter = list_entry(next, struct rwsem_waiter, list);
128 next = waiter->list.next;
129 tsk = waiter->task;
130 smp_mb();
131 waiter->task = NULL;
132 wake_up_process(tsk);
133 put_task_struct(tsk);
134 } while (--loop);
135
136 sem->wait_list.next = next;
137 next->prev = &sem->wait_list;
138
139 out:
140 return sem;
141}
142
143/*
144 * wait for the read lock to be granted
145 */
146struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
147{
148 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
149 struct rwsem_waiter waiter;
150 struct task_struct *tsk = current;
151
152 /* set up my own style of waitqueue */
153 waiter.task = tsk;
154 waiter.type = RWSEM_WAITING_FOR_READ;
155 get_task_struct(tsk);
156
157 raw_spin_lock_irq(&sem->wait_lock);
158 if (list_empty(&sem->wait_list))
159 adjustment += RWSEM_WAITING_BIAS;
160 list_add_tail(&waiter.list, &sem->wait_list);
161
162 /* we're now waiting on the lock, but no longer actively locking */
163 count = rwsem_atomic_update(adjustment, sem);
164
165 /* If there are no active locks, wake the front queued process(es).
166 *
167 * If there are no writers and we are first in the queue,
168 * wake our own waiter to join the existing active readers !
169 */
170 if (count == RWSEM_WAITING_BIAS ||
171 (count > RWSEM_WAITING_BIAS &&
172 adjustment != -RWSEM_ACTIVE_READ_BIAS))
173 sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
174
175 raw_spin_unlock_irq(&sem->wait_lock);
176
177 /* wait to be given the lock */
178 while (true) {
179 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
180 if (!waiter.task)
181 break;
182 schedule();
183 }
184
185 tsk->state = TASK_RUNNING;
186
187 return sem;
188}
189
190/*
191 * wait until we successfully acquire the write lock
192 */
193struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
194{
195 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
196 struct rwsem_waiter waiter;
197 struct task_struct *tsk = current;
198
199 /* set up my own style of waitqueue */
200 waiter.task = tsk;
201 waiter.type = RWSEM_WAITING_FOR_WRITE;
202
203 raw_spin_lock_irq(&sem->wait_lock);
204 if (list_empty(&sem->wait_list))
205 adjustment += RWSEM_WAITING_BIAS;
206 list_add_tail(&waiter.list, &sem->wait_list);
207
208 /* we're now waiting on the lock, but no longer actively locking */
209 count = rwsem_atomic_update(adjustment, sem);
210
211 /* If there were already threads queued before us and there are no
212 * active writers, the lock must be read owned; so we try to wake
213 * any read locks that were queued ahead of us. */
214 if (count > RWSEM_WAITING_BIAS &&
215 adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
216 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
217
218 /* wait until we successfully acquire the lock */
219 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
220 while (true) {
221 if (!(count & RWSEM_ACTIVE_MASK)) {
222 /* Try acquiring the write lock. */
223 count = RWSEM_ACTIVE_WRITE_BIAS;
224 if (!list_is_singular(&sem->wait_list))
225 count += RWSEM_WAITING_BIAS;
226
227 if (sem->count == RWSEM_WAITING_BIAS &&
228 cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
229 RWSEM_WAITING_BIAS)
230 break;
231 }
232
233 raw_spin_unlock_irq(&sem->wait_lock);
234
235 /* Block until there are no active lockers. */
236 do {
237 schedule();
238 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
239 } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
240
241 raw_spin_lock_irq(&sem->wait_lock);
242 }
243
244 list_del(&waiter.list);
245 raw_spin_unlock_irq(&sem->wait_lock);
246 tsk->state = TASK_RUNNING;
247
248 return sem;
249}
250
251/*
252 * handle waking up a waiter on the semaphore
253 * - up_read/up_write has decremented the active part of count if we come here
254 */
255struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
256{
257 unsigned long flags;
258
259 raw_spin_lock_irqsave(&sem->wait_lock, flags);
260
261 /* do nothing if list empty */
262 if (!list_empty(&sem->wait_list))
263 sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
264
265 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
266
267 return sem;
268}
269
270/*
271 * downgrade a write lock into a read lock
272 * - caller incremented waiting part of count and discovered it still negative
273 * - just wake up any readers at the front of the queue
274 */
275struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
276{
277 unsigned long flags;
278
279 raw_spin_lock_irqsave(&sem->wait_lock, flags);
280
281 /* do nothing if list empty */
282 if (!list_empty(&sem->wait_list))
283 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
284
285 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
286
287 return sem;
288}
289
290EXPORT_SYMBOL(rwsem_down_read_failed);
291EXPORT_SYMBOL(rwsem_down_write_failed);
292EXPORT_SYMBOL(rwsem_wake);
293EXPORT_SYMBOL(rwsem_downgrade_wake);
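The xadd implementation packs the active and waiting lockers into a single count word; the masks quoted in the comment above correspond to the generic 32-bit layout, and 64-bit kernels use the same scheme with a wider active mask. The standalone snippet below is an illustration only, using bias constants matching that generic 32-bit layout (the exact values are architecture-specific), and prints a few representative count states.

/* Illustration only: generic 32-bit rwsem count layout assumed by the
 * "&0x0000ffff" / "&0xffff0000" comment above. */
#include <stdio.h>

#define RWSEM_UNLOCKED_VALUE	0x00000000L
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_ACTIVE_MASK	0x0000ffffL
#define RWSEM_WAITING_BIAS	(-0x00010000L)
#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	long count;

	count = RWSEM_UNLOCKED_VALUE + 2 * RWSEM_ACTIVE_READ_BIAS;
	printf("two readers:            %#010lx\n", count & 0xffffffffL);

	count = RWSEM_ACTIVE_WRITE_BIAS;		/* one writer, nobody queued */
	printf("one writer:             %#010lx\n", count & 0xffffffffL);

	count += RWSEM_WAITING_BIAS;			/* a waiter queues behind it */
	printf("writer + queued waiter: %#010lx\n", count & 0xffffffffL);

	/* rwsem_down_write_failed() only tries to steal the lock once the
	 * active part is zero, i.e. (count & RWSEM_ACTIVE_MASK) == 0 */
	printf("active part:            %#06lx\n", count & RWSEM_ACTIVE_MASK);
	return 0;
}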
diff --git a/kernel/rwsem.c b/kernel/locking/rwsem.c
index cfff1435bdfb..cfff1435bdfb 100644
--- a/kernel/rwsem.c
+++ b/kernel/locking/rwsem.c
diff --git a/kernel/semaphore.c b/kernel/locking/semaphore.c
index 6815171a4fff..6815171a4fff 100644
--- a/kernel/semaphore.c
+++ b/kernel/locking/semaphore.c
diff --git a/kernel/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..4b082b5cac9e 100644
--- a/kernel/spinlock.c
+++ b/kernel/locking/spinlock.c
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
new file mode 100644
index 000000000000..0374a596cffa
--- /dev/null
+++ b/kernel/locking/spinlock_debug.c
@@ -0,0 +1,302 @@
1/*
2 * Copyright 2005, Red Hat, Inc., Ingo Molnar
3 * Released under the General Public License (GPL).
4 *
5 * This file contains the spinlock/rwlock implementations for
6 * DEBUG_SPINLOCK.
7 */
8
9#include <linux/spinlock.h>
10#include <linux/nmi.h>
11#include <linux/interrupt.h>
12#include <linux/debug_locks.h>
13#include <linux/delay.h>
14#include <linux/export.h>
15
16void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
17 struct lock_class_key *key)
18{
19#ifdef CONFIG_DEBUG_LOCK_ALLOC
20 /*
21 * Make sure we are not reinitializing a held lock:
22 */
23 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
24 lockdep_init_map(&lock->dep_map, name, key, 0);
25#endif
26 lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
27 lock->magic = SPINLOCK_MAGIC;
28 lock->owner = SPINLOCK_OWNER_INIT;
29 lock->owner_cpu = -1;
30}
31
32EXPORT_SYMBOL(__raw_spin_lock_init);
33
34void __rwlock_init(rwlock_t *lock, const char *name,
35 struct lock_class_key *key)
36{
37#ifdef CONFIG_DEBUG_LOCK_ALLOC
38 /*
39 * Make sure we are not reinitializing a held lock:
40 */
41 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
42 lockdep_init_map(&lock->dep_map, name, key, 0);
43#endif
44 lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED;
45 lock->magic = RWLOCK_MAGIC;
46 lock->owner = SPINLOCK_OWNER_INIT;
47 lock->owner_cpu = -1;
48}
49
50EXPORT_SYMBOL(__rwlock_init);
51
52static void spin_dump(raw_spinlock_t *lock, const char *msg)
53{
54 struct task_struct *owner = NULL;
55
56 if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
57 owner = lock->owner;
58 printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
59 msg, raw_smp_processor_id(),
60 current->comm, task_pid_nr(current));
61 printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
62 ".owner_cpu: %d\n",
63 lock, lock->magic,
64 owner ? owner->comm : "<none>",
65 owner ? task_pid_nr(owner) : -1,
66 lock->owner_cpu);
67 dump_stack();
68}
69
70static void spin_bug(raw_spinlock_t *lock, const char *msg)
71{
72 if (!debug_locks_off())
73 return;
74
75 spin_dump(lock, msg);
76}
77
78#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg)
79
80static inline void
81debug_spin_lock_before(raw_spinlock_t *lock)
82{
83 SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
84 SPIN_BUG_ON(lock->owner == current, lock, "recursion");
85 SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
86 lock, "cpu recursion");
87}
88
89static inline void debug_spin_lock_after(raw_spinlock_t *lock)
90{
91 lock->owner_cpu = raw_smp_processor_id();
92 lock->owner = current;
93}
94
95static inline void debug_spin_unlock(raw_spinlock_t *lock)
96{
97 SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
98 SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked");
99 SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
100 SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
101 lock, "wrong CPU");
102 lock->owner = SPINLOCK_OWNER_INIT;
103 lock->owner_cpu = -1;
104}
105
106static void __spin_lock_debug(raw_spinlock_t *lock)
107{
108 u64 i;
109 u64 loops = loops_per_jiffy * HZ;
110
111 for (i = 0; i < loops; i++) {
112 if (arch_spin_trylock(&lock->raw_lock))
113 return;
114 __delay(1);
115 }
116 /* lockup suspected: */
117 spin_dump(lock, "lockup suspected");
118#ifdef CONFIG_SMP
119 trigger_all_cpu_backtrace();
120#endif
121
122 /*
123 * The trylock above was causing a livelock. Give the lower level arch
124 * specific lock code a chance to acquire the lock. We have already
125 * printed a warning/backtrace at this point. The non-debug arch
126 * specific code might actually succeed in acquiring the lock. If it is
127 * not successful, the end-result is the same - there is no forward
128 * progress.
129 */
130 arch_spin_lock(&lock->raw_lock);
131}
132
133void do_raw_spin_lock(raw_spinlock_t *lock)
134{
135 debug_spin_lock_before(lock);
136 if (unlikely(!arch_spin_trylock(&lock->raw_lock)))
137 __spin_lock_debug(lock);
138 debug_spin_lock_after(lock);
139}
140
141int do_raw_spin_trylock(raw_spinlock_t *lock)
142{
143 int ret = arch_spin_trylock(&lock->raw_lock);
144
145 if (ret)
146 debug_spin_lock_after(lock);
147#ifndef CONFIG_SMP
148 /*
149 * Must not happen on UP:
150 */
151 SPIN_BUG_ON(!ret, lock, "trylock failure on UP");
152#endif
153 return ret;
154}
155
156void do_raw_spin_unlock(raw_spinlock_t *lock)
157{
158 debug_spin_unlock(lock);
159 arch_spin_unlock(&lock->raw_lock);
160}
161
162static void rwlock_bug(rwlock_t *lock, const char *msg)
163{
164 if (!debug_locks_off())
165 return;
166
167 printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n",
168 msg, raw_smp_processor_id(), current->comm,
169 task_pid_nr(current), lock);
170 dump_stack();
171}
172
173#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
174
175#if 0 /* __write_lock_debug() can lock up - maybe this can too? */
176static void __read_lock_debug(rwlock_t *lock)
177{
178 u64 i;
179 u64 loops = loops_per_jiffy * HZ;
180 int print_once = 1;
181
182 for (;;) {
183 for (i = 0; i < loops; i++) {
184 if (arch_read_trylock(&lock->raw_lock))
185 return;
186 __delay(1);
187 }
188 /* lockup suspected: */
189 if (print_once) {
190 print_once = 0;
191 printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, "
192 "%s/%d, %p\n",
193 raw_smp_processor_id(), current->comm,
194 current->pid, lock);
195 dump_stack();
196 }
197 }
198}
199#endif
200
201void do_raw_read_lock(rwlock_t *lock)
202{
203 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
204 arch_read_lock(&lock->raw_lock);
205}
206
207int do_raw_read_trylock(rwlock_t *lock)
208{
209 int ret = arch_read_trylock(&lock->raw_lock);
210
211#ifndef CONFIG_SMP
212 /*
213 * Must not happen on UP:
214 */
215 RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
216#endif
217 return ret;
218}
219
220void do_raw_read_unlock(rwlock_t *lock)
221{
222 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
223 arch_read_unlock(&lock->raw_lock);
224}
225
226static inline void debug_write_lock_before(rwlock_t *lock)
227{
228 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
229 RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
230 RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
231 lock, "cpu recursion");
232}
233
234static inline void debug_write_lock_after(rwlock_t *lock)
235{
236 lock->owner_cpu = raw_smp_processor_id();
237 lock->owner = current;
238}
239
240static inline void debug_write_unlock(rwlock_t *lock)
241{
242 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
243 RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
244 RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
245 lock, "wrong CPU");
246 lock->owner = SPINLOCK_OWNER_INIT;
247 lock->owner_cpu = -1;
248}
249
250#if 0 /* This can cause lockups */
251static void __write_lock_debug(rwlock_t *lock)
252{
253 u64 i;
254 u64 loops = loops_per_jiffy * HZ;
255 int print_once = 1;
256
257 for (;;) {
258 for (i = 0; i < loops; i++) {
259 if (arch_write_trylock(&lock->raw_lock))
260 return;
261 __delay(1);
262 }
263 /* lockup suspected: */
264 if (print_once) {
265 print_once = 0;
266 printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, "
267 "%s/%d, %p\n",
268 raw_smp_processor_id(), current->comm,
269 current->pid, lock);
270 dump_stack();
271 }
272 }
273}
274#endif
275
276void do_raw_write_lock(rwlock_t *lock)
277{
278 debug_write_lock_before(lock);
279 arch_write_lock(&lock->raw_lock);
280 debug_write_lock_after(lock);
281}
282
283int do_raw_write_trylock(rwlock_t *lock)
284{
285 int ret = arch_write_trylock(&lock->raw_lock);
286
287 if (ret)
288 debug_write_lock_after(lock);
289#ifndef CONFIG_SMP
290 /*
291 * Must not happen on UP:
292 */
293 RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
294#endif
295 return ret;
296}
297
298void do_raw_write_unlock(rwlock_t *lock)
299{
300 debug_write_unlock(lock);
301 arch_write_unlock(&lock->raw_lock);
302}
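The debug checks above come down to tracking an owner task and owner CPU next to the raw lock and complaining on recursion, double unlock, or unlock from the wrong context. A rough userspace analogue of that idea using pthreads follows; it is an illustration, not the kernel code, and all names in it are made up.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct dbg_lock {
	pthread_mutex_t raw;
	pthread_t owner;
	int owned;
};

static void dbg_lock_init(struct dbg_lock *l)
{
	pthread_mutex_init(&l->raw, NULL);
	l->owned = 0;
}

static void dbg_lock(struct dbg_lock *l)
{
	/* "recursion": the calling thread already owns the lock */
	assert(!(l->owned && pthread_equal(l->owner, pthread_self())));
	pthread_mutex_lock(&l->raw);
	l->owner = pthread_self();
	l->owned = 1;
}

static void dbg_unlock(struct dbg_lock *l)
{
	assert(l->owned);					/* "already unlocked" */
	assert(pthread_equal(l->owner, pthread_self()));	/* "wrong owner"     */
	l->owned = 0;
	pthread_mutex_unlock(&l->raw);
}

int main(void)
{
	struct dbg_lock l;

	dbg_lock_init(&l);
	dbg_lock(&l);
	dbg_unlock(&l);
	puts("ok");
	return 0;
}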
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
deleted file mode 100644
index 4a9a86d12c8b..000000000000
--- a/kernel/modsign_certificate.S
+++ /dev/null
@@ -1,12 +0,0 @@
1#include <linux/export.h>
2
3#define GLOBAL(name) \
4 .globl VMLINUX_SYMBOL(name); \
5 VMLINUX_SYMBOL(name):
6
7 .section ".init.data","aw"
8
9GLOBAL(modsign_certificate_list)
10 .incbin "signing_key.x509"
11 .incbin "extra_certificates"
12GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
deleted file mode 100644
index 7cbd4507a7e6..000000000000
--- a/kernel/modsign_pubkey.c
+++ /dev/null
@@ -1,104 +0,0 @@
1/* Public keys for module signature verification
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/cred.h>
15#include <linux/err.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19struct key *modsign_keyring;
20
21extern __initconst const u8 modsign_certificate_list[];
22extern __initconst const u8 modsign_certificate_list_end[];
23
24/*
25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
26 * if modsign.pub changes.
27 */
28static __initconst const char annoy_ccache[] = __TIME__ "foo";
29
30/*
31 * Load the compiled-in keys
32 */
33static __init int module_verify_init(void)
34{
35 pr_notice("Initialise module verification\n");
36
37 modsign_keyring = keyring_alloc(".module_sign",
38 KUIDT_INIT(0), KGIDT_INIT(0),
39 current_cred(),
40 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
41 KEY_USR_VIEW | KEY_USR_READ),
42 KEY_ALLOC_NOT_IN_QUOTA, NULL);
43 if (IS_ERR(modsign_keyring))
44 panic("Can't allocate module signing keyring\n");
45
46 return 0;
47}
48
49/*
50 * Must be initialised before we try and load the keys into the keyring.
51 */
52device_initcall(module_verify_init);
53
54/*
55 * Load the compiled-in keys
56 */
57static __init int load_module_signing_keys(void)
58{
59 key_ref_t key;
60 const u8 *p, *end;
61 size_t plen;
62
63 pr_notice("Loading module verification certificates\n");
64
65 end = modsign_certificate_list_end;
66 p = modsign_certificate_list;
67 while (p < end) {
68 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
69 * than 256 bytes in size.
70 */
71 if (end - p < 4)
72 goto dodgy_cert;
73 if (p[0] != 0x30 &&
74 p[1] != 0x82)
75 goto dodgy_cert;
76 plen = (p[2] << 8) | p[3];
77 plen += 4;
78 if (plen > end - p)
79 goto dodgy_cert;
80
81 key = key_create_or_update(make_key_ref(modsign_keyring, 1),
82 "asymmetric",
83 NULL,
84 p,
85 plen,
86 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
87 KEY_USR_VIEW,
88 KEY_ALLOC_NOT_IN_QUOTA);
89 if (IS_ERR(key))
90 pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
91 PTR_ERR(key));
92 else
93 pr_notice("MODSIGN: Loaded cert '%s'\n",
94 key_ref_to_ptr(key)->description);
95 p += plen;
96 }
97
98 return 0;
99
100dodgy_cert:
101 pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
102 return 0;
103}
104late_initcall(load_module_signing_keys);
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
index 24f9247b7d02..915e123a430f 100644
--- a/kernel/module-internal.h
+++ b/kernel/module-internal.h
@@ -9,6 +9,4 @@
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12extern struct key *modsign_keyring;
13
14extern int mod_verify_sig(const void *mod, unsigned long *_modlen); 12extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index dc582749fa13..f5a3b1e8ec51 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -378,23 +378,21 @@ static bool check_symbol(const struct symsearch *syms,
378 if (syms->licence == GPL_ONLY) 378 if (syms->licence == GPL_ONLY)
379 return false; 379 return false;
380 if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { 380 if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
381 printk(KERN_WARNING "Symbol %s is being used " 381 pr_warn("Symbol %s is being used by a non-GPL module, "
382 "by a non-GPL module, which will not " 382 "which will not be allowed in the future\n",
383 "be allowed in the future\n", fsa->name); 383 fsa->name);
384 } 384 }
385 } 385 }
386 386
387#ifdef CONFIG_UNUSED_SYMBOLS 387#ifdef CONFIG_UNUSED_SYMBOLS
388 if (syms->unused && fsa->warn) { 388 if (syms->unused && fsa->warn) {
389 printk(KERN_WARNING "Symbol %s is marked as UNUSED, " 389 pr_warn("Symbol %s is marked as UNUSED, however this module is "
390 "however this module is using it.\n", fsa->name); 390 "using it.\n", fsa->name);
391 printk(KERN_WARNING 391 pr_warn("This symbol will go away in the future.\n");
392 "This symbol will go away in the future.\n"); 392 pr_warn("Please evalute if this is the right api to use and if "
393 printk(KERN_WARNING 393 "it really is, submit a report the linux kernel "
394 "Please evalute if this is the right api to use and if " 394 "mailinglist together with submitting your code for "
395 "it really is, submit a report the linux kernel " 395 "inclusion.\n");
396 "mailinglist together with submitting your code for "
397 "inclusion.\n");
398 } 396 }
399#endif 397#endif
400 398
@@ -492,16 +490,15 @@ static int percpu_modalloc(struct module *mod, struct load_info *info)
492 return 0; 490 return 0;
493 491
494 if (align > PAGE_SIZE) { 492 if (align > PAGE_SIZE) {
495 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 493 pr_warn("%s: per-cpu alignment %li > %li\n",
496 mod->name, align, PAGE_SIZE); 494 mod->name, align, PAGE_SIZE);
497 align = PAGE_SIZE; 495 align = PAGE_SIZE;
498 } 496 }
499 497
500 mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); 498 mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
501 if (!mod->percpu) { 499 if (!mod->percpu) {
502 printk(KERN_WARNING 500 pr_warn("%s: Could not allocate %lu bytes percpu data\n",
503 "%s: Could not allocate %lu bytes percpu data\n", 501 mod->name, (unsigned long)pcpusec->sh_size);
504 mod->name, (unsigned long)pcpusec->sh_size);
505 return -ENOMEM; 502 return -ENOMEM;
506 } 503 }
507 mod->percpu_size = pcpusec->sh_size; 504 mod->percpu_size = pcpusec->sh_size;
@@ -644,8 +641,6 @@ static int module_unload_init(struct module *mod)
644 641
645 /* Hold reference count during initialization. */ 642 /* Hold reference count during initialization. */
646 __this_cpu_write(mod->refptr->incs, 1); 643 __this_cpu_write(mod->refptr->incs, 1);
647 /* Backwards compatibility macros put refcount during init. */
648 mod->waiter = current;
649 644
650 return 0; 645 return 0;
651} 646}
@@ -679,7 +674,7 @@ static int add_module_usage(struct module *a, struct module *b)
679 pr_debug("Allocating new usage for %s.\n", a->name); 674 pr_debug("Allocating new usage for %s.\n", a->name);
680 use = kmalloc(sizeof(*use), GFP_ATOMIC); 675 use = kmalloc(sizeof(*use), GFP_ATOMIC);
681 if (!use) { 676 if (!use) {
682 printk(KERN_WARNING "%s: out of memory loading\n", a->name); 677 pr_warn("%s: out of memory loading\n", a->name);
683 return -ENOMEM; 678 return -ENOMEM;
684 } 679 }
685 680
@@ -771,16 +766,9 @@ static int __try_stop_module(void *_sref)
771 766
772static int try_stop_module(struct module *mod, int flags, int *forced) 767static int try_stop_module(struct module *mod, int flags, int *forced)
773{ 768{
774 if (flags & O_NONBLOCK) { 769 struct stopref sref = { mod, flags, forced };
775 struct stopref sref = { mod, flags, forced };
776 770
777 return stop_machine(__try_stop_module, &sref, NULL); 771 return stop_machine(__try_stop_module, &sref, NULL);
778 } else {
779 /* We don't need to stop the machine for this. */
780 mod->state = MODULE_STATE_GOING;
781 synchronize_sched();
782 return 0;
783 }
784} 772}
785 773
786unsigned long module_refcount(struct module *mod) 774unsigned long module_refcount(struct module *mod)
@@ -813,21 +801,6 @@ EXPORT_SYMBOL(module_refcount);
813/* This exists whether we can unload or not */ 801/* This exists whether we can unload or not */
814static void free_module(struct module *mod); 802static void free_module(struct module *mod);
815 803
816static void wait_for_zero_refcount(struct module *mod)
817{
818 /* Since we might sleep for some time, release the mutex first */
819 mutex_unlock(&module_mutex);
820 for (;;) {
821 pr_debug("Looking at refcount...\n");
822 set_current_state(TASK_UNINTERRUPTIBLE);
823 if (module_refcount(mod) == 0)
824 break;
825 schedule();
826 }
827 current->state = TASK_RUNNING;
828 mutex_lock(&module_mutex);
829}
830
831SYSCALL_DEFINE2(delete_module, const char __user *, name_user, 804SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
832 unsigned int, flags) 805 unsigned int, flags)
833{ 806{
@@ -842,6 +815,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
842 return -EFAULT; 815 return -EFAULT;
843 name[MODULE_NAME_LEN-1] = '\0'; 816 name[MODULE_NAME_LEN-1] = '\0';
844 817
818 if (!(flags & O_NONBLOCK)) {
819 printk(KERN_WARNING
820 "waiting module removal not supported: please upgrade");
821 }
822
845 if (mutex_lock_interruptible(&module_mutex) != 0) 823 if (mutex_lock_interruptible(&module_mutex) != 0)
846 return -EINTR; 824 return -EINTR;
847 825
@@ -859,8 +837,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
859 837
860 /* Doing init or already dying? */ 838 /* Doing init or already dying? */
861 if (mod->state != MODULE_STATE_LIVE) { 839 if (mod->state != MODULE_STATE_LIVE) {
862 /* FIXME: if (force), slam module count and wake up 840 /* FIXME: if (force), slam module count damn the torpedoes */
863 waiter --RR */
864 pr_debug("%s already dying\n", mod->name); 841 pr_debug("%s already dying\n", mod->name);
865 ret = -EBUSY; 842 ret = -EBUSY;
866 goto out; 843 goto out;
@@ -876,18 +853,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
876 } 853 }
877 } 854 }
878 855
879 /* Set this up before setting mod->state */
880 mod->waiter = current;
881
882 /* Stop the machine so refcounts can't move and disable module. */ 856 /* Stop the machine so refcounts can't move and disable module. */
883 ret = try_stop_module(mod, flags, &forced); 857 ret = try_stop_module(mod, flags, &forced);
884 if (ret != 0) 858 if (ret != 0)
885 goto out; 859 goto out;
886 860
887 /* Never wait if forced. */
888 if (!forced && module_refcount(mod) != 0)
889 wait_for_zero_refcount(mod);
890
891 mutex_unlock(&module_mutex); 861 mutex_unlock(&module_mutex);
892 /* Final destruction now no one is using it. */ 862 /* Final destruction now no one is using it. */
893 if (mod->exit != NULL) 863 if (mod->exit != NULL)
@@ -1005,9 +975,6 @@ void module_put(struct module *module)
1005 __this_cpu_inc(module->refptr->decs); 975 __this_cpu_inc(module->refptr->decs);
1006 976
1007 trace_module_put(module, _RET_IP_); 977 trace_module_put(module, _RET_IP_);
1008 /* Maybe they're waiting for us to drop reference? */
1009 if (unlikely(!module_is_live(module)))
1010 wake_up_process(module->waiter);
1011 preempt_enable(); 978 preempt_enable();
1012 } 979 }
1013} 980}
@@ -1145,8 +1112,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
1145{ 1112{
1146#ifdef CONFIG_MODULE_FORCE_LOAD 1113#ifdef CONFIG_MODULE_FORCE_LOAD
1147 if (!test_taint(TAINT_FORCED_MODULE)) 1114 if (!test_taint(TAINT_FORCED_MODULE))
1148 printk(KERN_WARNING "%s: %s: kernel tainted.\n", 1115 pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
1149 mod->name, reason);
1150 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); 1116 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
1151 return 0; 1117 return 0;
1152#else 1118#else
@@ -1199,8 +1165,7 @@ static int check_version(Elf_Shdr *sechdrs,
1199 goto bad_version; 1165 goto bad_version;
1200 } 1166 }
1201 1167
1202 printk(KERN_WARNING "%s: no symbol version for %s\n", 1168 pr_warn("%s: no symbol version for %s\n", mod->name, symname);
1203 mod->name, symname);
1204 return 0; 1169 return 0;
1205 1170
1206bad_version: 1171bad_version:
@@ -1309,8 +1274,8 @@ resolve_symbol_wait(struct module *mod,
1309 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) 1274 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
1310 || PTR_ERR(ksym) != -EBUSY, 1275 || PTR_ERR(ksym) != -EBUSY,
1311 30 * HZ) <= 0) { 1276 30 * HZ) <= 0) {
1312 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", 1277 pr_warn("%s: gave up waiting for init of module %s.\n",
1313 mod->name, owner); 1278 mod->name, owner);
1314 } 1279 }
1315 return ksym; 1280 return ksym;
1316} 1281}
@@ -1626,15 +1591,14 @@ static int mod_sysfs_init(struct module *mod)
1626 struct kobject *kobj; 1591 struct kobject *kobj;
1627 1592
1628 if (!module_sysfs_initialized) { 1593 if (!module_sysfs_initialized) {
1629 printk(KERN_ERR "%s: module sysfs not initialized\n", 1594 pr_err("%s: module sysfs not initialized\n", mod->name);
1630 mod->name);
1631 err = -EINVAL; 1595 err = -EINVAL;
1632 goto out; 1596 goto out;
1633 } 1597 }
1634 1598
1635 kobj = kset_find_obj(module_kset, mod->name); 1599 kobj = kset_find_obj(module_kset, mod->name);
1636 if (kobj) { 1600 if (kobj) {
1637 printk(KERN_ERR "%s: module is already loaded\n", mod->name); 1601 pr_err("%s: module is already loaded\n", mod->name);
1638 kobject_put(kobj); 1602 kobject_put(kobj);
1639 err = -EINVAL; 1603 err = -EINVAL;
1640 goto out; 1604 goto out;
@@ -1961,8 +1925,7 @@ static int verify_export_symbols(struct module *mod)
1961 for (i = 0; i < ARRAY_SIZE(arr); i++) { 1925 for (i = 0; i < ARRAY_SIZE(arr); i++) {
1962 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { 1926 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
1963 if (find_symbol(s->name, &owner, NULL, true, false)) { 1927 if (find_symbol(s->name, &owner, NULL, true, false)) {
1964 printk(KERN_ERR 1928 pr_err("%s: exports duplicate symbol %s"
1965 "%s: exports duplicate symbol %s"
1966 " (owned by %s)\n", 1929 " (owned by %s)\n",
1967 mod->name, s->name, module_name(owner)); 1930 mod->name, s->name, module_name(owner));
1968 return -ENOEXEC; 1931 return -ENOEXEC;
@@ -2013,8 +1976,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
2013 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1976 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
2014 break; 1977 break;
2015 1978
2016 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", 1979 pr_warn("%s: Unknown symbol %s (err %li)\n",
2017 mod->name, name, PTR_ERR(ksym)); 1980 mod->name, name, PTR_ERR(ksym));
2018 ret = PTR_ERR(ksym) ?: -ENOENT; 1981 ret = PTR_ERR(ksym) ?: -ENOENT;
2019 break; 1982 break;
2020 1983
@@ -2168,8 +2131,8 @@ static void set_license(struct module *mod, const char *license)
2168 2131
2169 if (!license_is_gpl_compatible(license)) { 2132 if (!license_is_gpl_compatible(license)) {
2170 if (!test_taint(TAINT_PROPRIETARY_MODULE)) 2133 if (!test_taint(TAINT_PROPRIETARY_MODULE))
2171 printk(KERN_WARNING "%s: module license '%s' taints " 2134 pr_warn("%s: module license '%s' taints kernel.\n",
2172 "kernel.\n", mod->name, license); 2135 mod->name, license);
2173 add_taint_module(mod, TAINT_PROPRIETARY_MODULE, 2136 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2174 LOCKDEP_NOW_UNRELIABLE); 2137 LOCKDEP_NOW_UNRELIABLE);
2175 } 2138 }
@@ -2405,8 +2368,8 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
2405 return; 2368 return;
2406#ifdef CONFIG_DYNAMIC_DEBUG 2369#ifdef CONFIG_DYNAMIC_DEBUG
2407 if (ddebug_add_module(debug, num, debug->modname)) 2370 if (ddebug_add_module(debug, num, debug->modname))
2408 printk(KERN_ERR "dynamic debug error adding module: %s\n", 2371 pr_err("dynamic debug error adding module: %s\n",
2409 debug->modname); 2372 debug->modname);
2410#endif 2373#endif
2411} 2374}
2412 2375
@@ -2619,8 +2582,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
2619 Elf_Shdr *shdr = &info->sechdrs[i]; 2582 Elf_Shdr *shdr = &info->sechdrs[i];
2620 if (shdr->sh_type != SHT_NOBITS 2583 if (shdr->sh_type != SHT_NOBITS
2621 && info->len < shdr->sh_offset + shdr->sh_size) { 2584 && info->len < shdr->sh_offset + shdr->sh_size) {
2622 printk(KERN_ERR "Module len %lu truncated\n", 2585 pr_err("Module len %lu truncated\n", info->len);
2623 info->len);
2624 return -ENOEXEC; 2586 return -ENOEXEC;
2625 } 2587 }
2626 2588
@@ -2682,15 +2644,14 @@ static struct module *setup_load_info(struct load_info *info, int flags)
2682 2644
2683 info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); 2645 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2684 if (!info->index.mod) { 2646 if (!info->index.mod) {
2685 printk(KERN_WARNING "No module found in object\n"); 2647 pr_warn("No module found in object\n");
2686 return ERR_PTR(-ENOEXEC); 2648 return ERR_PTR(-ENOEXEC);
2687 } 2649 }
2688 /* This is temporary: point mod into copy of data. */ 2650 /* This is temporary: point mod into copy of data. */
2689 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2651 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2690 2652
2691 if (info->index.sym == 0) { 2653 if (info->index.sym == 0) {
2692 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 2654 pr_warn("%s: module has no symbols (stripped?)\n", mod->name);
2693 mod->name);
2694 return ERR_PTR(-ENOEXEC); 2655 return ERR_PTR(-ENOEXEC);
2695 } 2656 }
2696 2657
@@ -2717,7 +2678,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2717 if (err) 2678 if (err)
2718 return err; 2679 return err;
2719 } else if (!same_magic(modmagic, vermagic, info->index.vers)) { 2680 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2720 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 2681 pr_err("%s: version magic '%s' should be '%s'\n",
2721 mod->name, modmagic, vermagic); 2682 mod->name, modmagic, vermagic);
2722 return -ENOEXEC; 2683 return -ENOEXEC;
2723 } 2684 }
@@ -2727,9 +2688,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2727 2688
2728 if (get_modinfo(info, "staging")) { 2689 if (get_modinfo(info, "staging")) {
2729 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); 2690 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
2730 printk(KERN_WARNING "%s: module is from the staging directory," 2691 pr_warn("%s: module is from the staging directory, the quality "
2731 " the quality is unknown, you have been warned.\n", 2692 "is unknown, you have been warned.\n", mod->name);
2732 mod->name);
2733 } 2693 }
2734 2694
2735 /* Set up license info based on the info section */ 2695 /* Set up license info based on the info section */
@@ -2738,7 +2698,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2738 return 0; 2698 return 0;
2739} 2699}
2740 2700
2741static void find_module_sections(struct module *mod, struct load_info *info) 2701static int find_module_sections(struct module *mod, struct load_info *info)
2742{ 2702{
2743 mod->kp = section_objs(info, "__param", 2703 mod->kp = section_objs(info, "__param",
2744 sizeof(*mod->kp), &mod->num_kp); 2704 sizeof(*mod->kp), &mod->num_kp);
@@ -2768,6 +2728,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2768#ifdef CONFIG_CONSTRUCTORS 2728#ifdef CONFIG_CONSTRUCTORS
2769 mod->ctors = section_objs(info, ".ctors", 2729 mod->ctors = section_objs(info, ".ctors",
2770 sizeof(*mod->ctors), &mod->num_ctors); 2730 sizeof(*mod->ctors), &mod->num_ctors);
2731 if (!mod->ctors)
2732 mod->ctors = section_objs(info, ".init_array",
2733 sizeof(*mod->ctors), &mod->num_ctors);
2734 else if (find_sec(info, ".init_array")) {
2735 /*
2736 * This shouldn't happen with same compiler and binutils
2737 * building all parts of the module.
2738 */
2739 printk(KERN_WARNING "%s: has both .ctors and .init_array.\n",
2740 mod->name);
2741 return -EINVAL;
2742 }
2771#endif 2743#endif
2772 2744
2773#ifdef CONFIG_TRACEPOINTS 2745#ifdef CONFIG_TRACEPOINTS
@@ -2801,11 +2773,12 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2801 sizeof(*mod->extable), &mod->num_exentries); 2773 sizeof(*mod->extable), &mod->num_exentries);
2802 2774
2803 if (section_addr(info, "__obsparm")) 2775 if (section_addr(info, "__obsparm"))
2804 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2776 pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
2805 mod->name);
2806 2777
2807 info->debug = section_objs(info, "__verbose", 2778 info->debug = section_objs(info, "__verbose",
2808 sizeof(*info->debug), &info->num_debug); 2779 sizeof(*info->debug), &info->num_debug);
2780
2781 return 0;
2809} 2782}
2810 2783
2811static int move_module(struct module *mod, struct load_info *info) 2784static int move_module(struct module *mod, struct load_info *info)
@@ -3078,11 +3051,10 @@ static int do_init_module(struct module *mod)
3078 return ret; 3051 return ret;
3079 } 3052 }
3080 if (ret > 0) { 3053 if (ret > 0) {
3081 printk(KERN_WARNING 3054 pr_warn("%s: '%s'->init suspiciously returned %d, it should "
3082"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" 3055 "follow 0/-E convention\n"
3083"%s: loading module anyway...\n", 3056 "%s: loading module anyway...\n",
3084 __func__, mod->name, ret, 3057 __func__, mod->name, ret, __func__);
3085 __func__);
3086 dump_stack(); 3058 dump_stack();
3087 } 3059 }
3088 3060
@@ -3205,10 +3177,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname)
3205{ 3177{
3206 /* Check for magic 'dyndbg' arg */ 3178 /* Check for magic 'dyndbg' arg */
3207 int ret = ddebug_dyndbg_module_param_cb(param, val, modname); 3179 int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
3208 if (ret != 0) { 3180 if (ret != 0)
3209 printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", 3181 pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
3210 modname, param);
3211 }
3212 return 0; 3182 return 0;
3213} 3183}
3214 3184
@@ -3243,10 +3213,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
3243#ifdef CONFIG_MODULE_SIG 3213#ifdef CONFIG_MODULE_SIG
3244 mod->sig_ok = info->sig_ok; 3214 mod->sig_ok = info->sig_ok;
3245 if (!mod->sig_ok) { 3215 if (!mod->sig_ok) {
3246 printk_once(KERN_NOTICE 3216 pr_notice_once("%s: module verification failed: signature "
3247 "%s: module verification failed: signature and/or" 3217 "and/or required key missing - tainting "
3248 " required key missing - tainting kernel\n", 3218 "kernel\n", mod->name);
3249 mod->name);
3250 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); 3219 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
3251 } 3220 }
3252#endif 3221#endif
@@ -3263,7 +3232,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
3263 3232
3264 /* Now we've got everything in the final locations, we can 3233 /* Now we've got everything in the final locations, we can
3265 * find optional sections. */ 3234 * find optional sections. */
3266 find_module_sections(mod, info); 3235 err = find_module_sections(mod, info);
3236 if (err)
3237 goto free_unload;
3267 3238
3268 err = check_module_license_and_versions(mod); 3239 err = check_module_license_and_versions(mod);
3269 if (err) 3240 if (err)
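Note the user-visible side of the delete_module() change above: wait_for_zero_refcount() is gone and a warning is printed when O_NONBLOCK is not passed, so removal no longer blocks waiting for the refcount to drop. A small userspace sketch of how a caller invokes the syscall after this change; error handling is minimal and illustrative, and callers are expected to handle errors such as EWOULDBLOCK or EBUSY themselves.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int remove_module(const char *name)
{
	/* glibc provides no wrapper for delete_module(); call it directly */
	if (syscall(SYS_delete_module, name, O_NONBLOCK) == 0)
		return 0;
	fprintf(stderr, "delete_module(%s): %s\n", name, strerror(errno));
	return -1;
}

int main(int argc, char **argv)
{
	return argc > 1 ? remove_module(argv[1]) : 1;
}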
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index f2970bddc5ea..be5b8fac4bd0 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -14,6 +14,7 @@
14#include <crypto/public_key.h> 14#include <crypto/public_key.h>
15#include <crypto/hash.h> 15#include <crypto/hash.h>
16#include <keys/asymmetric-type.h> 16#include <keys/asymmetric-type.h>
17#include <keys/system_keyring.h>
17#include "module-internal.h" 18#include "module-internal.h"
18 19
19/* 20/*
@@ -28,7 +29,7 @@
28 */ 29 */
29struct module_signature { 30struct module_signature {
30 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ 31 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
31 u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ 32 u8 hash; /* Digest algorithm [enum hash_algo] */
32 u8 id_type; /* Key identifier type [enum pkey_id_type] */ 33 u8 id_type; /* Key identifier type [enum pkey_id_type] */
33 u8 signer_len; /* Length of signer's name */ 34 u8 signer_len; /* Length of signer's name */
34 u8 key_id_len; /* Length of key identifier */ 35 u8 key_id_len; /* Length of key identifier */
@@ -39,7 +40,7 @@ struct module_signature {
39/* 40/*
40 * Digest the module contents. 41 * Digest the module contents.
41 */ 42 */
42static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, 43static struct public_key_signature *mod_make_digest(enum hash_algo hash,
43 const void *mod, 44 const void *mod,
44 unsigned long modlen) 45 unsigned long modlen)
45{ 46{
@@ -54,7 +55,7 @@ static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
54 /* Allocate the hashing algorithm we're going to need and find out how 55 /* Allocate the hashing algorithm we're going to need and find out how
55 * big the hash operational data will be. 56 * big the hash operational data will be.
56 */ 57 */
57 tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); 58 tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
58 if (IS_ERR(tfm)) 59 if (IS_ERR(tfm))
59 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); 60 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
60 61
@@ -157,7 +158,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
157 158
158 pr_debug("Look up: \"%s\"\n", id); 159 pr_debug("Look up: \"%s\"\n", id);
159 160
160 key = keyring_search(make_key_ref(modsign_keyring, 1), 161 key = keyring_search(make_key_ref(system_trusted_keyring, 1),
161 &key_type_asymmetric, id); 162 &key_type_asymmetric, id);
162 if (IS_ERR(key)) 163 if (IS_ERR(key))
163 pr_warn("Request for unknown module key '%s' err %ld\n", 164 pr_warn("Request for unknown module key '%s' err %ld\n",
@@ -217,7 +218,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
217 return -ENOPKG; 218 return -ENOPKG;
218 219
219 if (ms.hash >= PKEY_HASH__LAST || 220 if (ms.hash >= PKEY_HASH__LAST ||
220 !pkey_hash_algo[ms.hash]) 221 !hash_algo_name[ms.hash])
221 return -ENOPKG; 222 return -ENOPKG;
222 223
223 key = request_asymmetric_key(sig, ms.signer_len, 224 key = request_asymmetric_key(sig, ms.signer_len,
diff --git a/kernel/padata.c b/kernel/padata.c
index 07af2c95dcfe..2abd25d79cc8 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
46 46
47static int padata_cpu_hash(struct parallel_data *pd) 47static int padata_cpu_hash(struct parallel_data *pd)
48{ 48{
49 unsigned int seq_nr;
49 int cpu_index; 50 int cpu_index;
50 51
51 /* 52 /*
@@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd)
53 * seq_nr mod. number of cpus in use. 54 * seq_nr mod. number of cpus in use.
54 */ 55 */
55 56
56 spin_lock(&pd->seq_lock); 57 seq_nr = atomic_inc_return(&pd->seq_nr);
57 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); 58 cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
58 pd->seq_nr++;
59 spin_unlock(&pd->seq_lock);
60 59
61 return padata_index_to_cpu(pd, cpu_index); 60 return padata_index_to_cpu(pd, cpu_index);
62} 61}
@@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
429 padata_init_pqueues(pd); 428 padata_init_pqueues(pd);
430 padata_init_squeues(pd); 429 padata_init_squeues(pd);
431 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 430 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
432 pd->seq_nr = 0; 431 atomic_set(&pd->seq_nr, -1);
433 atomic_set(&pd->reorder_objects, 0); 432 atomic_set(&pd->reorder_objects, 0);
434 atomic_set(&pd->refcnt, 0); 433 atomic_set(&pd->refcnt, 0);
435 pd->pinst = pinst; 434 pd->pinst = pinst;
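The padata hunk above replaces a spinlock-protected sequence counter with atomic_inc_return() for round-robin CPU selection, with the counter initialised to -1 so the first job hashes to index 0. A minimal userspace sketch of the same idea with C11 atomics; the names are illustrative.

#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq = ATOMIC_VAR_INIT(UINT_MAX);	/* mirrors atomic_set(&pd->seq_nr, -1) */

static unsigned int pick_slot(unsigned int nr_slots)
{
	/* fetch_add returns the old value, so "+ 1" mirrors atomic_inc_return() */
	unsigned int seq_nr = atomic_fetch_add(&seq, 1) + 1;

	return seq_nr % nr_slots;	/* no lock needed; wrap-around is harmless */
}

int main(void)
{
	for (int i = 0; i < 8; i++)
		printf("%u ", pick_slot(3));
	putchar('\n');
	return 0;
}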
diff --git a/kernel/panic.c b/kernel/panic.c
index b6c482ccc5db..c00b4ceb39e8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -233,7 +233,7 @@ static const struct tnt tnts[] = {
233 */ 233 */
234const char *print_tainted(void) 234const char *print_tainted(void)
235{ 235{
236 static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; 236 static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")];
237 237
238 if (tainted_mask) { 238 if (tainted_mask) {
239 char *s; 239 char *s;
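The panic.c change works because sizeof on a string literal already counts the terminating NUL, so ARRAY_SIZE(tnts) + sizeof("Tainted: ") leaves exactly one byte for the trailing '\0' after the taint flag characters; the old "+ 1" only wasted a byte. A two-line check:

#include <stdio.h>
#include <string.h>

int main(void)
{
	printf("strlen(\"Tainted: \") = %zu\n", strlen("Tainted: "));	/* 9 */
	printf("sizeof(\"Tainted: \") = %zu\n", sizeof("Tainted: "));	/* 10 */
	return 0;
}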
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 42086551a24a..06c62de9c711 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -132,6 +132,12 @@ out:
132 return ERR_PTR(err); 132 return ERR_PTR(err);
133} 133}
134 134
135static void delayed_free_pidns(struct rcu_head *p)
136{
137 kmem_cache_free(pid_ns_cachep,
138 container_of(p, struct pid_namespace, rcu));
139}
140
135static void destroy_pid_namespace(struct pid_namespace *ns) 141static void destroy_pid_namespace(struct pid_namespace *ns)
136{ 142{
137 int i; 143 int i;
@@ -140,7 +146,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
140 for (i = 0; i < PIDMAP_ENTRIES; i++) 146 for (i = 0; i < PIDMAP_ENTRIES; i++)
141 kfree(ns->pidmap[i].page); 147 kfree(ns->pidmap[i].page);
142 put_user_ns(ns->user_ns); 148 put_user_ns(ns->user_ns);
143 kmem_cache_free(pid_ns_cachep, ns); 149 call_rcu(&ns->rcu, delayed_free_pidns);
144} 150}
145 151
146struct pid_namespace *copy_pid_ns(unsigned long flags, 152struct pid_namespace *copy_pid_ns(unsigned long flags,
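The pid_namespace fix defers the actual free until after an RCU grace period via call_rcu(). The pattern in isolation, with illustrative names (a plain kfree() stands in here for the kmem_cache_free() used above):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;
};

static void foo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu));
}

static void foo_release(struct foo *f)
{
	/* readers that already hold an RCU reference may keep using *f
	 * until the grace period ends; only then does foo_free_rcu() run */
	call_rcu(&f->rcu, foo_free_rcu);
}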
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index d444c4e834f4..2fac9cc79b3d 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -178,6 +178,22 @@ config PM_SLEEP_DEBUG
178 def_bool y 178 def_bool y
179 depends on PM_DEBUG && PM_SLEEP 179 depends on PM_DEBUG && PM_SLEEP
180 180
181config DPM_WATCHDOG
182 bool "Device suspend/resume watchdog"
183 depends on PM_DEBUG && PSTORE
184 ---help---
185 Sets up a watchdog timer to capture drivers that are
186 locked up attempting to suspend/resume a device.
187 A detected lockup causes system panic with message
188 captured in pstore device for inspection in subsequent
189 boot session.
190
191config DPM_WATCHDOG_TIMEOUT
192 int "Watchdog timeout in seconds"
193 range 1 120
194 default 12
195 depends on DPM_WATCHDOG
196
181config PM_TRACE 197config PM_TRACE
182 bool 198 bool
183 help 199 help
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 463aa6736751..eacb8bd8cab4 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -81,6 +81,7 @@ void pm_vt_switch_unregister(struct device *dev)
81 list_for_each_entry(tmp, &pm_vt_switch_list, head) { 81 list_for_each_entry(tmp, &pm_vt_switch_list, head) {
82 if (tmp->dev == dev) { 82 if (tmp->dev == dev) {
83 list_del(&tmp->head); 83 list_del(&tmp->head);
84 kfree(tmp);
84 break; 85 break;
85 } 86 }
86 } 87 }
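The console.c hunk plugs a leak: pm_vt_switch_unregister() unlinked the matching entry but never freed it. Reduced to the bare pattern, with illustrative types:

#include <linux/list.h>
#include <linux/slab.h>

struct entry {
	struct list_head head;
	void *key;
};

static LIST_HEAD(entries);

static void unregister_entry(void *key)
{
	struct entry *tmp;

	list_for_each_entry(tmp, &entries, head) {
		if (tmp->key == key) {
			list_del(&tmp->head);
			kfree(tmp);	/* this was the missing step */
			break;
		}
	}
}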
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index a394297f8b2f..8dff9b48075a 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -558,30 +558,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
558 if (count == sizeof(s32)) { 558 if (count == sizeof(s32)) {
559 if (copy_from_user(&value, buf, sizeof(s32))) 559 if (copy_from_user(&value, buf, sizeof(s32)))
560 return -EFAULT; 560 return -EFAULT;
561 } else if (count <= 11) { /* ASCII perhaps? */ 561 } else {
562 char ascii_value[11];
563 unsigned long int ulval;
564 int ret; 562 int ret;
565 563
566 if (copy_from_user(ascii_value, buf, count)) 564 ret = kstrtos32_from_user(buf, count, 16, &value);
567 return -EFAULT; 565 if (ret)
568 566 return ret;
569 if (count > 10) {
570 if (ascii_value[10] == '\n')
571 ascii_value[10] = '\0';
572 else
573 return -EINVAL;
574 } else {
575 ascii_value[count] = '\0';
576 }
577 ret = kstrtoul(ascii_value, 16, &ulval);
578 if (ret) {
579 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
580 return -EINVAL;
581 }
582 value = (s32)lower_32_bits(ulval);
583 } else {
584 return -EINVAL;
585 } 567 }
586 568
587 req = filp->private_data; 569 req = filp->private_data;
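The qos.c rewrite replaces the hand-rolled ASCII handling with kstrtos32_from_user(), which copies from user space and parses the number in one call (base 16 here, matching the old kstrtoul(..., 16, ...)). A hedged sketch of a write handler built around the same helper; pm_foo_write is an invented name:

    #include <linux/kernel.h>
    #include <linux/fs.h>
    #include <linux/uaccess.h>

    static ssize_t pm_foo_write(struct file *filp, const char __user *buf,
                                size_t count, loff_t *f_pos)
    {
            s32 value;
            int ret;

            /* Copies at most 'count' bytes from user space, tolerates a
             * trailing newline and parses the value as hexadecimal. */
            ret = kstrtos32_from_user(buf, count, 16, &value);
            if (ret)
                    return ret;

            /* ... apply 'value' ... */
            return count;
    }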
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 98c3b34a4cff..b38109e204af 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -792,7 +792,8 @@ void free_basic_memory_bitmaps(void)
792{ 792{
793 struct memory_bitmap *bm1, *bm2; 793 struct memory_bitmap *bm1, *bm2;
794 794
795 BUG_ON(!(forbidden_pages_map && free_pages_map)); 795 if (WARN_ON(!(forbidden_pages_map && free_pages_map)))
796 return;
796 797
797 bm1 = forbidden_pages_map; 798 bm1 = forbidden_pages_map;
798 bm2 = free_pages_map; 799 bm2 = free_pages_map;
@@ -1402,7 +1403,11 @@ int hibernate_preallocate_memory(void)
1402 * highmem and non-highmem zones separately. 1403 * highmem and non-highmem zones separately.
1403 */ 1404 */
1404 pages_highmem = preallocate_image_highmem(highmem / 2); 1405 pages_highmem = preallocate_image_highmem(highmem / 2);
1405 alloc = (count - max_size) - pages_highmem; 1406 alloc = count - max_size;
1407 if (alloc > pages_highmem)
1408 alloc -= pages_highmem;
1409 else
1410 alloc = 0;
1406 pages = preallocate_image_memory(alloc, avail_normal); 1411 pages = preallocate_image_memory(alloc, avail_normal);
1407 if (pages < alloc) { 1412 if (pages < alloc) {
1408 /* We have exhausted non-highmem pages, try highmem. */ 1413 /* We have exhausted non-highmem pages, try highmem. */
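The hibernate_preallocate_memory() hunk guards an unsigned subtraction: when the highmem pass already covered everything beyond max_size, the old expression (count - max_size) - pages_highmem wrapped around to a huge value. Rough arithmetic as a comment, with made-up numbers:

    /*
     * Suppose count - max_size == 100 pages still have to be claimed and
     * preallocate_image_highmem() already took pages_highmem == 120.
     *
     *   old: alloc = (count - max_size) - pages_highmem = 100 - 120,
     *        which wraps around for an unsigned long;
     *   new: alloc = count - max_size (100), and since 100 <= 120 the
     *        else branch sets alloc = 0, so nothing more is requested.
     */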
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 957f06164ad1..98d357584cd6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -36,9 +36,9 @@ static struct snapshot_data {
36 struct snapshot_handle handle; 36 struct snapshot_handle handle;
37 int swap; 37 int swap;
38 int mode; 38 int mode;
39 char frozen; 39 bool frozen;
40 char ready; 40 bool ready;
41 char platform_support; 41 bool platform_support;
42 bool free_bitmaps; 42 bool free_bitmaps;
43} snapshot_state; 43} snapshot_state;
44 44
@@ -70,6 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
70 data->swap = swsusp_resume_device ? 70 data->swap = swsusp_resume_device ?
71 swap_type_of(swsusp_resume_device, 0, NULL) : -1; 71 swap_type_of(swsusp_resume_device, 0, NULL) : -1;
72 data->mode = O_RDONLY; 72 data->mode = O_RDONLY;
73 data->free_bitmaps = false;
73 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 74 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
74 if (error) 75 if (error)
75 pm_notifier_call_chain(PM_POST_HIBERNATION); 76 pm_notifier_call_chain(PM_POST_HIBERNATION);
@@ -93,9 +94,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
93 if (error) 94 if (error)
94 atomic_inc(&snapshot_device_available); 95 atomic_inc(&snapshot_device_available);
95 96
96 data->frozen = 0; 97 data->frozen = false;
97 data->ready = 0; 98 data->ready = false;
98 data->platform_support = 0; 99 data->platform_support = false;
99 100
100 Unlock: 101 Unlock:
101 unlock_system_sleep(); 102 unlock_system_sleep();
@@ -229,7 +230,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
229 if (error) 230 if (error)
230 thaw_processes(); 231 thaw_processes();
231 else 232 else
232 data->frozen = 1; 233 data->frozen = true;
233 234
234 break; 235 break;
235 236
@@ -240,7 +241,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
240 free_basic_memory_bitmaps(); 241 free_basic_memory_bitmaps();
241 data->free_bitmaps = false; 242 data->free_bitmaps = false;
242 thaw_processes(); 243 thaw_processes();
243 data->frozen = 0; 244 data->frozen = false;
244 break; 245 break;
245 246
246 case SNAPSHOT_CREATE_IMAGE: 247 case SNAPSHOT_CREATE_IMAGE:
@@ -270,7 +271,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
270 case SNAPSHOT_FREE: 271 case SNAPSHOT_FREE:
271 swsusp_free(); 272 swsusp_free();
272 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 273 memset(&data->handle, 0, sizeof(struct snapshot_handle));
273 data->ready = 0; 274 data->ready = false;
274 /* 275 /*
275 * It is necessary to thaw kernel threads here, because 276 * It is necessary to thaw kernel threads here, because
276 * SNAPSHOT_CREATE_IMAGE may be invoked directly after 277 * SNAPSHOT_CREATE_IMAGE may be invoked directly after
@@ -334,7 +335,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
334 * PM_HIBERNATION_PREPARE 335 * PM_HIBERNATION_PREPARE
335 */ 336 */
336 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 337 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
337 data->ready = 0; 338 data->ready = false;
338 break; 339 break;
339 340
340 case SNAPSHOT_PLATFORM_SUPPORT: 341 case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b4e8500afdb3..be7c86bae576 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -705,9 +705,9 @@ const struct file_operations kmsg_fops = {
705 705
706#ifdef CONFIG_KEXEC 706#ifdef CONFIG_KEXEC
707/* 707/*
708 * This appends the listed symbols to /proc/vmcoreinfo 708 * This appends the listed symbols to /proc/vmcore
709 * 709 *
710 * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to 710 * /proc/vmcore is used by various utilities, like crash and makedumpfile to
711 * obtain access to symbols that are otherwise very difficult to locate. These 711 * obtain access to symbols that are otherwise very difficult to locate. These
712 * symbols are specifically used so that utilities can access and extract the 712 * symbols are specifically used so that utilities can access and extract the
713 * dmesg log from a vmcore file after a crash. 713 * dmesg log from a vmcore file after a crash.
@@ -791,7 +791,7 @@ static bool __read_mostly ignore_loglevel;
791static int __init ignore_loglevel_setup(char *str) 791static int __init ignore_loglevel_setup(char *str)
792{ 792{
793 ignore_loglevel = 1; 793 ignore_loglevel = 1;
794 printk(KERN_INFO "debug: ignoring loglevel setting.\n"); 794 pr_info("debug: ignoring loglevel setting.\n");
795 795
796 return 0; 796 return 0;
797} 797}
@@ -820,9 +820,9 @@ static int __init boot_delay_setup(char *str)
820 pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " 820 pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
821 "HZ: %d, loops_per_msec: %llu\n", 821 "HZ: %d, loops_per_msec: %llu\n",
822 boot_delay, preset_lpj, lpj, HZ, loops_per_msec); 822 boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
823 return 1; 823 return 0;
824} 824}
825__setup("boot_delay=", boot_delay_setup); 825early_param("boot_delay", boot_delay_setup);
826 826
827static void boot_delay_msec(int level) 827static void boot_delay_msec(int level)
828{ 828{
@@ -2193,7 +2193,7 @@ static int __read_mostly keep_bootcon;
2193static int __init keep_bootcon_setup(char *str) 2193static int __init keep_bootcon_setup(char *str)
2194{ 2194{
2195 keep_bootcon = 1; 2195 keep_bootcon = 1;
2196 printk(KERN_INFO "debug: skip boot console de-registration.\n"); 2196 pr_info("debug: skip boot console de-registration.\n");
2197 2197
2198 return 0; 2198 return 0;
2199} 2199}
@@ -2241,7 +2241,7 @@ void register_console(struct console *newcon)
2241 /* find the last or real console */ 2241 /* find the last or real console */
2242 for_each_console(bcon) { 2242 for_each_console(bcon) {
2243 if (!(bcon->flags & CON_BOOT)) { 2243 if (!(bcon->flags & CON_BOOT)) {
2244 printk(KERN_INFO "Too late to register bootconsole %s%d\n", 2244 pr_info("Too late to register bootconsole %s%d\n",
2245 newcon->name, newcon->index); 2245 newcon->name, newcon->index);
2246 return; 2246 return;
2247 } 2247 }
@@ -2358,21 +2358,18 @@ void register_console(struct console *newcon)
2358 * users know there might be something in the kernel's log buffer that 2358 * users know there might be something in the kernel's log buffer that
2359 * went to the bootconsole (that they do not see on the real console) 2359 * went to the bootconsole (that they do not see on the real console)
2360 */ 2360 */
2361 pr_info("%sconsole [%s%d] enabled\n",
2362 (newcon->flags & CON_BOOT) ? "boot" : "" ,
2363 newcon->name, newcon->index);
2361 if (bcon && 2364 if (bcon &&
2362 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && 2365 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
2363 !keep_bootcon) { 2366 !keep_bootcon) {
2364 /* we need to iterate through twice, to make sure we print 2367 /* We need to iterate through all boot consoles, to make
2365 * everything out, before we unregister the console(s) 2368 * sure we print everything out, before we unregister them.
2366 */ 2369 */
2367 printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
2368 newcon->name, newcon->index);
2369 for_each_console(bcon) 2370 for_each_console(bcon)
2370 if (bcon->flags & CON_BOOT) 2371 if (bcon->flags & CON_BOOT)
2371 unregister_console(bcon); 2372 unregister_console(bcon);
2372 } else {
2373 printk(KERN_INFO "%sconsole [%s%d] enabled\n",
2374 (newcon->flags & CON_BOOT) ? "boot" : "" ,
2375 newcon->name, newcon->index);
2376 } 2373 }
2377} 2374}
2378EXPORT_SYMBOL(register_console); 2375EXPORT_SYMBOL(register_console);
@@ -2382,6 +2379,10 @@ int unregister_console(struct console *console)
2382 struct console *a, *b; 2379 struct console *a, *b;
2383 int res; 2380 int res;
2384 2381
2382 pr_info("%sconsole [%s%d] disabled\n",
2383 (console->flags & CON_BOOT) ? "boot" : "" ,
2384 console->name, console->index);
2385
2385 res = _braille_unregister_console(console); 2386 res = _braille_unregister_console(console);
2386 if (res) 2387 if (res)
2387 return res; 2388 return res;
@@ -2421,8 +2422,6 @@ static int __init printk_late_init(void)
2421 2422
2422 for_each_console(con) { 2423 for_each_console(con) {
2423 if (!keep_bootcon && con->flags & CON_BOOT) { 2424 if (!keep_bootcon && con->flags & CON_BOOT) {
2424 printk(KERN_INFO "turn off boot console %s%d\n",
2425 con->name, con->index);
2426 unregister_console(con); 2425 unregister_console(con);
2427 } 2426 }
2428 } 2427 }
@@ -2449,7 +2448,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
2449 2448
2450 if (pending & PRINTK_PENDING_SCHED) { 2449 if (pending & PRINTK_PENDING_SCHED) {
2451 char *buf = __get_cpu_var(printk_sched_buf); 2450 char *buf = __get_cpu_var(printk_sched_buf);
2452 printk(KERN_WARNING "[sched_delayed] %s", buf); 2451 pr_warn("[sched_delayed] %s", buf);
2453 } 2452 }
2454 2453
2455 if (pending & PRINTK_PENDING_WAKEUP) 2454 if (pending & PRINTK_PENDING_WAKEUP)
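Several printk.c hunks above are mechanical conversions from printk(KERN_INFO ...) / printk(KERN_WARNING ...) to the pr_info() / pr_warn() wrappers; the wrappers are shorter and pick up a per-file pr_fmt() prefix when one is defined. A small sketch (the "myfile: " prefix is an assumption for illustration, not something printk.c defines):

    #define pr_fmt(fmt) "myfile: " fmt      /* must precede the includes */

    #include <linux/printk.h>

    static void example(void)
    {
            printk(KERN_INFO "old style\n");        /* no automatic prefix */
            pr_info("new style\n");                 /* logs "myfile: new style" */
            pr_warn("odd state\n");                 /* KERN_WARNING level */
    }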
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dd562e9aa2c8..1f4bcb3cc21c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -257,7 +257,8 @@ ok:
257 if (task->mm) 257 if (task->mm)
258 dumpable = get_dumpable(task->mm); 258 dumpable = get_dumpable(task->mm);
259 rcu_read_lock(); 259 rcu_read_lock();
260 if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { 260 if (dumpable != SUID_DUMP_USER &&
261 !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
261 rcu_read_unlock(); 262 rcu_read_unlock();
262 return -EPERM; 263 return -EPERM;
263 } 264 }
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
new file mode 100644
index 000000000000..01e9ec37a3e3
--- /dev/null
+++ b/kernel/rcu/Makefile
@@ -0,0 +1,6 @@
1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o
3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
6obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h
index 77131966c4ad..7859a0a3951e 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -122,4 +122,11 @@ int rcu_jiffies_till_stall_check(void);
122 122
123#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 123#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
124 124
125/*
126 * Strings used in tracepoints need to be exported via the
127 * tracing system such that tools like perf and trace-cmd can
128 * translate the string address pointers to actual text.
129 */
130#define TPS(x) tracepoint_string(x)
131
125#endif /* __LINUX_RCU_H */ 132#endif /* __LINUX_RCU_H */
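The TPS() macro added to rcu.h wraps tracepoint_string(), which stores the literal in a dedicated section and exports it so perf and trace-cmd can turn the pointer recorded in the ring buffer back into text; the tiny.c and tree.c hunks below then pass TPS("...") into the rcu tracepoints. A fragmentary sketch of that usage (example_idle_enter is an invented wrapper):

    #include <linux/ftrace_event.h>         /* tracepoint_string(), as of v3.13 */
    #include <trace/events/rcu.h>

    #define TPS(x)  tracepoint_string(x)

    static void example_idle_enter(long long oldval, long long newval)
    {
            /* Only the string's address reaches the trace buffer; the
             * exported text lets tools resolve it back to "Start". */
            trace_rcu_dyntick(TPS("Start"), oldval, newval);
    }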
diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c
index 01d5ccb8bfe3..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/rcu/srcu.c
diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c
index 9ed6075dc562..1254f312d024 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,6 +35,7 @@
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/prefetch.h> 37#include <linux/prefetch.h>
38#include <linux/ftrace_event.h>
38 39
39#ifdef CONFIG_RCU_TRACE 40#ifdef CONFIG_RCU_TRACE
40#include <trace/events/rcu.h> 41#include <trace/events/rcu.h>
@@ -42,7 +43,7 @@
42 43
43#include "rcu.h" 44#include "rcu.h"
44 45
45/* Forward declarations for rcutiny_plugin.h. */ 46/* Forward declarations for tiny_plugin.h. */
46struct rcu_ctrlblk; 47struct rcu_ctrlblk;
47static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static void rcu_process_callbacks(struct softirq_action *unused); 49static void rcu_process_callbacks(struct softirq_action *unused);
@@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head,
52 53
53static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 54static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
54 55
55#include "rcutiny_plugin.h" 56#include "tiny_plugin.h"
56 57
57/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
58static void rcu_idle_enter_common(long long newval) 59static void rcu_idle_enter_common(long long newval)
59{ 60{
60 if (newval) { 61 if (newval) {
61 RCU_TRACE(trace_rcu_dyntick("--=", 62 RCU_TRACE(trace_rcu_dyntick(TPS("--="),
62 rcu_dynticks_nesting, newval)); 63 rcu_dynticks_nesting, newval));
63 rcu_dynticks_nesting = newval; 64 rcu_dynticks_nesting = newval;
64 return; 65 return;
65 } 66 }
66 RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); 67 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
68 rcu_dynticks_nesting, newval));
67 if (!is_idle_task(current)) { 69 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id()); 70 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
69 71
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", 72 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
71 rcu_dynticks_nesting, newval)); 73 rcu_dynticks_nesting, newval));
72 ftrace_dump(DUMP_ALL); 74 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 75 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit);
120static void rcu_idle_exit_common(long long oldval) 122static void rcu_idle_exit_common(long long oldval)
121{ 123{
122 if (oldval) { 124 if (oldval) {
123 RCU_TRACE(trace_rcu_dyntick("++=", 125 RCU_TRACE(trace_rcu_dyntick(TPS("++="),
124 oldval, rcu_dynticks_nesting)); 126 oldval, rcu_dynticks_nesting));
125 return; 127 return;
126 } 128 }
127 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); 129 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
128 if (!is_idle_task(current)) { 130 if (!is_idle_task(current)) {
129 struct task_struct *idle = idle_task(smp_processor_id()); 131 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
130 132
131 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", 133 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
132 oldval, rcu_dynticks_nesting)); 134 oldval, rcu_dynticks_nesting));
133 ftrace_dump(DUMP_ALL); 135 ftrace_dump(DUMP_ALL);
134 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 136 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -174,18 +176,18 @@ void rcu_irq_enter(void)
174} 176}
175EXPORT_SYMBOL_GPL(rcu_irq_enter); 177EXPORT_SYMBOL_GPL(rcu_irq_enter);
176 178
177#ifdef CONFIG_DEBUG_LOCK_ALLOC 179#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
178 180
179/* 181/*
180 * Test whether RCU thinks that the current CPU is idle. 182 * Test whether RCU thinks that the current CPU is idle.
181 */ 183 */
182int rcu_is_cpu_idle(void) 184bool notrace __rcu_is_watching(void)
183{ 185{
184 return !rcu_dynticks_nesting; 186 return rcu_dynticks_nesting;
185} 187}
186EXPORT_SYMBOL(rcu_is_cpu_idle); 188EXPORT_SYMBOL(__rcu_is_watching);
187 189
188#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 190#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
189 191
190/* 192/*
191 * Test whether the current CPU was interrupted from idle. Nested 193 * Test whether the current CPU was interrupted from idle. Nested
@@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
273 if (&rcp->rcucblist == rcp->donetail) { 275 if (&rcp->rcucblist == rcp->donetail) {
274 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); 276 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
275 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, 277 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
276 ACCESS_ONCE(rcp->rcucblist), 278 !!ACCESS_ONCE(rcp->rcucblist),
277 need_resched(), 279 need_resched(),
278 is_idle_task(current), 280 is_idle_task(current),
279 false)); 281 false));
@@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
304 RCU_TRACE(cb_count++); 306 RCU_TRACE(cb_count++);
305 } 307 }
306 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 308 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
307 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), 309 RCU_TRACE(trace_rcu_batch_end(rcp->name,
310 cb_count, 0, need_resched(),
308 is_idle_task(current), 311 is_idle_task(current),
309 false)); 312 false));
310} 313}
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 280d06cae352..280d06cae352 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c
index be63101c6175..3929cd451511 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcu/torture.c
@@ -52,6 +52,12 @@
52MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 54
55MODULE_ALIAS("rcutorture");
56#ifdef MODULE_PARAM_PREFIX
57#undef MODULE_PARAM_PREFIX
58#endif
59#define MODULE_PARAM_PREFIX "rcutorture."
60
55static int fqs_duration; 61static int fqs_duration;
56module_param(fqs_duration, int, 0444); 62module_param(fqs_duration, int, 0444);
57MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); 63MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
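The MODULE_ALIAS / MODULE_PARAM_PREFIX block added here (and, further down, to tree.c and update.c) pins the parameter namespace across the file moves: for built-in code, module_param() normally prefixes parameters with KBUILD_MODNAME, which is derived from the object name, so renaming rcutorture.c to rcu/torture.c would otherwise have renamed every boot parameter. Minimal sketch of the idiom (my_param is hypothetical):

    #include <linux/module.h>
    #include <linux/moduleparam.h>

    #ifdef MODULE_PARAM_PREFIX
    #undef MODULE_PARAM_PREFIX
    #endif
    #define MODULE_PARAM_PREFIX "rcutorture."

    static int my_param;
    module_param(my_param, int, 0444);
    /* Settable as rcutorture.my_param=1 on the kernel command line,
     * regardless of what the source file happens to be called. */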
diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c
index 32618b3fe4e6..dd081987a8ec 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcu/tree.c
@@ -41,6 +41,7 @@
41#include <linux/export.h> 41#include <linux/export.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/module.h>
44#include <linux/percpu.h> 45#include <linux/percpu.h>
45#include <linux/notifier.h> 46#include <linux/notifier.h>
46#include <linux/cpu.h> 47#include <linux/cpu.h>
@@ -56,17 +57,16 @@
56#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
57#include <linux/suspend.h> 58#include <linux/suspend.h>
58 59
59#include "rcutree.h" 60#include "tree.h"
60#include <trace/events/rcu.h> 61#include <trace/events/rcu.h>
61 62
62#include "rcu.h" 63#include "rcu.h"
63 64
64/* 65MODULE_ALIAS("rcutree");
65 * Strings used in tracepoints need to be exported via the 66#ifdef MODULE_PARAM_PREFIX
66 * tracing system such that tools like perf and trace-cmd can 67#undef MODULE_PARAM_PREFIX
67 * translate the string address pointers to actual text. 68#endif
68 */ 69#define MODULE_PARAM_PREFIX "rcutree."
69#define TPS(x) tracepoint_string(x)
70 70
71/* Data structures. */ 71/* Data structures. */
72 72
@@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu)
222} 222}
223EXPORT_SYMBOL_GPL(rcu_note_context_switch); 223EXPORT_SYMBOL_GPL(rcu_note_context_switch);
224 224
225DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 225static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
227 .dynticks = ATOMIC_INIT(1), 227 .dynticks = ATOMIC_INIT(1),
228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
@@ -371,7 +371,8 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
371{ 371{
372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
373 if (!user && !is_idle_task(current)) { 373 if (!user && !is_idle_task(current)) {
374 struct task_struct *idle = idle_task(smp_processor_id()); 374 struct task_struct *idle __maybe_unused =
375 idle_task(smp_processor_id());
375 376
376 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); 377 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
377 ftrace_dump(DUMP_ORIG); 378 ftrace_dump(DUMP_ORIG);
@@ -407,7 +408,7 @@ static void rcu_eqs_enter(bool user)
407 long long oldval; 408 long long oldval;
408 struct rcu_dynticks *rdtp; 409 struct rcu_dynticks *rdtp;
409 410
410 rdtp = &__get_cpu_var(rcu_dynticks); 411 rdtp = this_cpu_ptr(&rcu_dynticks);
411 oldval = rdtp->dynticks_nesting; 412 oldval = rdtp->dynticks_nesting;
412 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 413 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
413 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 414 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
@@ -435,7 +436,7 @@ void rcu_idle_enter(void)
435 436
436 local_irq_save(flags); 437 local_irq_save(flags);
437 rcu_eqs_enter(false); 438 rcu_eqs_enter(false);
438 rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); 439 rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
439 local_irq_restore(flags); 440 local_irq_restore(flags);
440} 441}
441EXPORT_SYMBOL_GPL(rcu_idle_enter); 442EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -478,7 +479,7 @@ void rcu_irq_exit(void)
478 struct rcu_dynticks *rdtp; 479 struct rcu_dynticks *rdtp;
479 480
480 local_irq_save(flags); 481 local_irq_save(flags);
481 rdtp = &__get_cpu_var(rcu_dynticks); 482 rdtp = this_cpu_ptr(&rcu_dynticks);
482 oldval = rdtp->dynticks_nesting; 483 oldval = rdtp->dynticks_nesting;
483 rdtp->dynticks_nesting--; 484 rdtp->dynticks_nesting--;
484 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 485 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
@@ -508,7 +509,8 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
508 rcu_cleanup_after_idle(smp_processor_id()); 509 rcu_cleanup_after_idle(smp_processor_id());
509 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 510 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
510 if (!user && !is_idle_task(current)) { 511 if (!user && !is_idle_task(current)) {
511 struct task_struct *idle = idle_task(smp_processor_id()); 512 struct task_struct *idle __maybe_unused =
513 idle_task(smp_processor_id());
512 514
513 trace_rcu_dyntick(TPS("Error on exit: not idle task"), 515 trace_rcu_dyntick(TPS("Error on exit: not idle task"),
514 oldval, rdtp->dynticks_nesting); 516 oldval, rdtp->dynticks_nesting);
@@ -528,7 +530,7 @@ static void rcu_eqs_exit(bool user)
528 struct rcu_dynticks *rdtp; 530 struct rcu_dynticks *rdtp;
529 long long oldval; 531 long long oldval;
530 532
531 rdtp = &__get_cpu_var(rcu_dynticks); 533 rdtp = this_cpu_ptr(&rcu_dynticks);
532 oldval = rdtp->dynticks_nesting; 534 oldval = rdtp->dynticks_nesting;
533 WARN_ON_ONCE(oldval < 0); 535 WARN_ON_ONCE(oldval < 0);
534 if (oldval & DYNTICK_TASK_NEST_MASK) 536 if (oldval & DYNTICK_TASK_NEST_MASK)
@@ -555,7 +557,7 @@ void rcu_idle_exit(void)
555 557
556 local_irq_save(flags); 558 local_irq_save(flags);
557 rcu_eqs_exit(false); 559 rcu_eqs_exit(false);
558 rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); 560 rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
559 local_irq_restore(flags); 561 local_irq_restore(flags);
560} 562}
561EXPORT_SYMBOL_GPL(rcu_idle_exit); 563EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -599,7 +601,7 @@ void rcu_irq_enter(void)
599 long long oldval; 601 long long oldval;
600 602
601 local_irq_save(flags); 603 local_irq_save(flags);
602 rdtp = &__get_cpu_var(rcu_dynticks); 604 rdtp = this_cpu_ptr(&rcu_dynticks);
603 oldval = rdtp->dynticks_nesting; 605 oldval = rdtp->dynticks_nesting;
604 rdtp->dynticks_nesting++; 606 rdtp->dynticks_nesting++;
605 WARN_ON_ONCE(rdtp->dynticks_nesting == 0); 607 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
@@ -620,7 +622,7 @@ void rcu_irq_enter(void)
620 */ 622 */
621void rcu_nmi_enter(void) 623void rcu_nmi_enter(void)
622{ 624{
623 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 625 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
624 626
625 if (rdtp->dynticks_nmi_nesting == 0 && 627 if (rdtp->dynticks_nmi_nesting == 0 &&
626 (atomic_read(&rdtp->dynticks) & 0x1)) 628 (atomic_read(&rdtp->dynticks) & 0x1))
@@ -642,7 +644,7 @@ void rcu_nmi_enter(void)
642 */ 644 */
643void rcu_nmi_exit(void) 645void rcu_nmi_exit(void)
644{ 646{
645 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 647 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
646 648
647 if (rdtp->dynticks_nmi_nesting == 0 || 649 if (rdtp->dynticks_nmi_nesting == 0 ||
648 --rdtp->dynticks_nmi_nesting != 0) 650 --rdtp->dynticks_nmi_nesting != 0)
@@ -655,21 +657,34 @@ void rcu_nmi_exit(void)
655} 657}
656 658
657/** 659/**
658 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 660 * __rcu_is_watching - are RCU read-side critical sections safe?
661 *
662 * Return true if RCU is watching the running CPU, which means that
663 * this CPU can safely enter RCU read-side critical sections. Unlike
664 * rcu_is_watching(), the caller of __rcu_is_watching() must have at
665 * least disabled preemption.
666 */
667bool notrace __rcu_is_watching(void)
668{
669 return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
670}
671
672/**
673 * rcu_is_watching - see if RCU thinks that the current CPU is idle
659 * 674 *
660 * If the current CPU is in its idle loop and is neither in an interrupt 675 * If the current CPU is in its idle loop and is neither in an interrupt
661 * or NMI handler, return true. 676 * or NMI handler, return true.
662 */ 677 */
663int rcu_is_cpu_idle(void) 678bool notrace rcu_is_watching(void)
664{ 679{
665 int ret; 680 int ret;
666 681
667 preempt_disable(); 682 preempt_disable();
668 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; 683 ret = __rcu_is_watching();
669 preempt_enable(); 684 preempt_enable();
670 return ret; 685 return ret;
671} 686}
672EXPORT_SYMBOL(rcu_is_cpu_idle); 687EXPORT_SYMBOL_GPL(rcu_is_watching);
673 688
674#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 689#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
675 690
@@ -703,7 +718,7 @@ bool rcu_lockdep_current_cpu_online(void)
703 if (in_nmi()) 718 if (in_nmi())
704 return 1; 719 return 1;
705 preempt_disable(); 720 preempt_disable();
706 rdp = &__get_cpu_var(rcu_sched_data); 721 rdp = this_cpu_ptr(&rcu_sched_data);
707 rnp = rdp->mynode; 722 rnp = rdp->mynode;
708 ret = (rdp->grpmask & rnp->qsmaskinit) || 723 ret = (rdp->grpmask & rnp->qsmaskinit) ||
709 !rcu_scheduler_fully_active; 724 !rcu_scheduler_fully_active;
@@ -723,7 +738,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
723 */ 738 */
724static int rcu_is_cpu_rrupt_from_idle(void) 739static int rcu_is_cpu_rrupt_from_idle(void)
725{ 740{
726 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 741 return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
727} 742}
728 743
729/* 744/*
@@ -802,8 +817,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
802 817
803static void record_gp_stall_check_time(struct rcu_state *rsp) 818static void record_gp_stall_check_time(struct rcu_state *rsp)
804{ 819{
805 rsp->gp_start = jiffies; 820 unsigned long j = ACCESS_ONCE(jiffies);
806 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 821
822 rsp->gp_start = j;
823 smp_wmb(); /* Record start time before stall time. */
824 rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();
807} 825}
808 826
809/* 827/*
@@ -898,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
898 force_quiescent_state(rsp); /* Kick them all. */ 916 force_quiescent_state(rsp); /* Kick them all. */
899} 917}
900 918
919/*
920 * This function really isn't for public consumption, but RCU is special in
921 * that context switches can allow the state machine to make progress.
922 */
923extern void resched_cpu(int cpu);
924
901static void print_cpu_stall(struct rcu_state *rsp) 925static void print_cpu_stall(struct rcu_state *rsp)
902{ 926{
903 int cpu; 927 int cpu;
@@ -927,22 +951,60 @@ static void print_cpu_stall(struct rcu_state *rsp)
927 3 * rcu_jiffies_till_stall_check() + 3; 951 3 * rcu_jiffies_till_stall_check() + 3;
928 raw_spin_unlock_irqrestore(&rnp->lock, flags); 952 raw_spin_unlock_irqrestore(&rnp->lock, flags);
929 953
930 set_need_resched(); /* kick ourselves to get things going. */ 954 /*
955 * Attempt to revive the RCU machinery by forcing a context switch.
956 *
957 * A context switch would normally allow the RCU state machine to make
958 * progress and it could be we're stuck in kernel space without context
959 * switches for an entirely unreasonable amount of time.
960 */
961 resched_cpu(smp_processor_id());
931} 962}
932 963
933static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 964static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
934{ 965{
966 unsigned long completed;
967 unsigned long gpnum;
968 unsigned long gps;
935 unsigned long j; 969 unsigned long j;
936 unsigned long js; 970 unsigned long js;
937 struct rcu_node *rnp; 971 struct rcu_node *rnp;
938 972
939 if (rcu_cpu_stall_suppress) 973 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
940 return; 974 return;
941 j = ACCESS_ONCE(jiffies); 975 j = ACCESS_ONCE(jiffies);
976
977 /*
978 * Lots of memory barriers to reject false positives.
979 *
980 * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
981 * then rsp->gp_start, and finally rsp->completed. These values
982 * are updated in the opposite order with memory barriers (or
983 * equivalent) during grace-period initialization and cleanup.
984 * Now, a false positive can occur if we get an new value of
985 * rsp->gp_start and a old value of rsp->jiffies_stall. But given
986 * the memory barriers, the only way that this can happen is if one
987 * grace period ends and another starts between these two fetches.
988 * Detect this by comparing rsp->completed with the previous fetch
989 * from rsp->gpnum.
990 *
991 * Given this check, comparisons of jiffies, rsp->jiffies_stall,
992 * and rsp->gp_start suffice to forestall false positives.
993 */
994 gpnum = ACCESS_ONCE(rsp->gpnum);
995 smp_rmb(); /* Pick up ->gpnum first... */
942 js = ACCESS_ONCE(rsp->jiffies_stall); 996 js = ACCESS_ONCE(rsp->jiffies_stall);
997 smp_rmb(); /* ...then ->jiffies_stall before the rest... */
998 gps = ACCESS_ONCE(rsp->gp_start);
999 smp_rmb(); /* ...and finally ->gp_start before ->completed. */
1000 completed = ACCESS_ONCE(rsp->completed);
1001 if (ULONG_CMP_GE(completed, gpnum) ||
1002 ULONG_CMP_LT(j, js) ||
1003 ULONG_CMP_GE(gps, js))
1004 return; /* No stall or GP completed since entering function. */
943 rnp = rdp->mynode; 1005 rnp = rdp->mynode;
944 if (rcu_gp_in_progress(rsp) && 1006 if (rcu_gp_in_progress(rsp) &&
945 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { 1007 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
946 1008
947 /* We haven't checked in, so go dump stack. */ 1009 /* We haven't checked in, so go dump stack. */
948 print_cpu_stall(rsp); 1010 print_cpu_stall(rsp);
@@ -1297,7 +1359,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1297} 1359}
1298 1360
1299/* 1361/*
1300 * Initialize a new grace period. 1362 * Initialize a new grace period. Return 0 if no grace period required.
1301 */ 1363 */
1302static int rcu_gp_init(struct rcu_state *rsp) 1364static int rcu_gp_init(struct rcu_state *rsp)
1303{ 1365{
@@ -1306,18 +1368,27 @@ static int rcu_gp_init(struct rcu_state *rsp)
1306 1368
1307 rcu_bind_gp_kthread(); 1369 rcu_bind_gp_kthread();
1308 raw_spin_lock_irq(&rnp->lock); 1370 raw_spin_lock_irq(&rnp->lock);
1371 if (rsp->gp_flags == 0) {
1372 /* Spurious wakeup, tell caller to go back to sleep. */
1373 raw_spin_unlock_irq(&rnp->lock);
1374 return 0;
1375 }
1309 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1376 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1310 1377
1311 if (rcu_gp_in_progress(rsp)) { 1378 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
1312 /* Grace period already in progress, don't start another. */ 1379 /*
1380 * Grace period already in progress, don't start another.
1381 * Not supposed to be able to happen.
1382 */
1313 raw_spin_unlock_irq(&rnp->lock); 1383 raw_spin_unlock_irq(&rnp->lock);
1314 return 0; 1384 return 0;
1315 } 1385 }
1316 1386
1317 /* Advance to a new grace period and initialize state. */ 1387 /* Advance to a new grace period and initialize state. */
1388 record_gp_stall_check_time(rsp);
1389 smp_wmb(); /* Record GP times before starting GP. */
1318 rsp->gpnum++; 1390 rsp->gpnum++;
1319 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1391 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1320 record_gp_stall_check_time(rsp);
1321 raw_spin_unlock_irq(&rnp->lock); 1392 raw_spin_unlock_irq(&rnp->lock);
1322 1393
1323 /* Exclude any concurrent CPU-hotplug operations. */ 1394 /* Exclude any concurrent CPU-hotplug operations. */
@@ -1366,7 +1437,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1366/* 1437/*
1367 * Do one round of quiescent-state forcing. 1438 * Do one round of quiescent-state forcing.
1368 */ 1439 */
1369int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1440static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1370{ 1441{
1371 int fqs_state = fqs_state_in; 1442 int fqs_state = fqs_state_in;
1372 bool isidle = false; 1443 bool isidle = false;
@@ -1451,8 +1522,12 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1451 rsp->fqs_state = RCU_GP_IDLE; 1522 rsp->fqs_state = RCU_GP_IDLE;
1452 rdp = this_cpu_ptr(rsp->rda); 1523 rdp = this_cpu_ptr(rsp->rda);
1453 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1524 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1454 if (cpu_needs_another_gp(rsp, rdp)) 1525 if (cpu_needs_another_gp(rsp, rdp)) {
1455 rsp->gp_flags = 1; 1526 rsp->gp_flags = RCU_GP_FLAG_INIT;
1527 trace_rcu_grace_period(rsp->name,
1528 ACCESS_ONCE(rsp->gpnum),
1529 TPS("newreq"));
1530 }
1456 raw_spin_unlock_irq(&rnp->lock); 1531 raw_spin_unlock_irq(&rnp->lock);
1457} 1532}
1458 1533
@@ -1462,6 +1537,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1462static int __noreturn rcu_gp_kthread(void *arg) 1537static int __noreturn rcu_gp_kthread(void *arg)
1463{ 1538{
1464 int fqs_state; 1539 int fqs_state;
1540 int gf;
1465 unsigned long j; 1541 unsigned long j;
1466 int ret; 1542 int ret;
1467 struct rcu_state *rsp = arg; 1543 struct rcu_state *rsp = arg;
@@ -1471,14 +1547,19 @@ static int __noreturn rcu_gp_kthread(void *arg)
1471 1547
1472 /* Handle grace-period start. */ 1548 /* Handle grace-period start. */
1473 for (;;) { 1549 for (;;) {
1550 trace_rcu_grace_period(rsp->name,
1551 ACCESS_ONCE(rsp->gpnum),
1552 TPS("reqwait"));
1474 wait_event_interruptible(rsp->gp_wq, 1553 wait_event_interruptible(rsp->gp_wq,
1475 rsp->gp_flags & 1554 ACCESS_ONCE(rsp->gp_flags) &
1476 RCU_GP_FLAG_INIT); 1555 RCU_GP_FLAG_INIT);
1477 if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && 1556 if (rcu_gp_init(rsp))
1478 rcu_gp_init(rsp))
1479 break; 1557 break;
1480 cond_resched(); 1558 cond_resched();
1481 flush_signals(current); 1559 flush_signals(current);
1560 trace_rcu_grace_period(rsp->name,
1561 ACCESS_ONCE(rsp->gpnum),
1562 TPS("reqwaitsig"));
1482 } 1563 }
1483 1564
1484 /* Handle quiescent-state forcing. */ 1565 /* Handle quiescent-state forcing. */
@@ -1488,10 +1569,16 @@ static int __noreturn rcu_gp_kthread(void *arg)
1488 j = HZ; 1569 j = HZ;
1489 jiffies_till_first_fqs = HZ; 1570 jiffies_till_first_fqs = HZ;
1490 } 1571 }
1572 ret = 0;
1491 for (;;) { 1573 for (;;) {
1492 rsp->jiffies_force_qs = jiffies + j; 1574 if (!ret)
1575 rsp->jiffies_force_qs = jiffies + j;
1576 trace_rcu_grace_period(rsp->name,
1577 ACCESS_ONCE(rsp->gpnum),
1578 TPS("fqswait"));
1493 ret = wait_event_interruptible_timeout(rsp->gp_wq, 1579 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1494 (rsp->gp_flags & RCU_GP_FLAG_FQS) || 1580 ((gf = ACCESS_ONCE(rsp->gp_flags)) &
1581 RCU_GP_FLAG_FQS) ||
1495 (!ACCESS_ONCE(rnp->qsmask) && 1582 (!ACCESS_ONCE(rnp->qsmask) &&
1496 !rcu_preempt_blocked_readers_cgp(rnp)), 1583 !rcu_preempt_blocked_readers_cgp(rnp)),
1497 j); 1584 j);
@@ -1500,13 +1587,23 @@ static int __noreturn rcu_gp_kthread(void *arg)
1500 !rcu_preempt_blocked_readers_cgp(rnp)) 1587 !rcu_preempt_blocked_readers_cgp(rnp))
1501 break; 1588 break;
1502 /* If time for quiescent-state forcing, do it. */ 1589 /* If time for quiescent-state forcing, do it. */
1503 if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { 1590 if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
1591 (gf & RCU_GP_FLAG_FQS)) {
1592 trace_rcu_grace_period(rsp->name,
1593 ACCESS_ONCE(rsp->gpnum),
1594 TPS("fqsstart"));
1504 fqs_state = rcu_gp_fqs(rsp, fqs_state); 1595 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1596 trace_rcu_grace_period(rsp->name,
1597 ACCESS_ONCE(rsp->gpnum),
1598 TPS("fqsend"));
1505 cond_resched(); 1599 cond_resched();
1506 } else { 1600 } else {
1507 /* Deal with stray signal. */ 1601 /* Deal with stray signal. */
1508 cond_resched(); 1602 cond_resched();
1509 flush_signals(current); 1603 flush_signals(current);
1604 trace_rcu_grace_period(rsp->name,
1605 ACCESS_ONCE(rsp->gpnum),
1606 TPS("fqswaitsig"));
1510 } 1607 }
1511 j = jiffies_till_next_fqs; 1608 j = jiffies_till_next_fqs;
1512 if (j > HZ) { 1609 if (j > HZ) {
@@ -1554,6 +1651,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1554 return; 1651 return;
1555 } 1652 }
1556 rsp->gp_flags = RCU_GP_FLAG_INIT; 1653 rsp->gp_flags = RCU_GP_FLAG_INIT;
1654 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
1655 TPS("newreq"));
1557 1656
1558 /* 1657 /*
1559 * We can't do wakeups while holding the rnp->lock, as that 1658 * We can't do wakeups while holding the rnp->lock, as that
@@ -2255,7 +2354,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2255 * If called from an extended quiescent state, invoke the RCU 2354 * If called from an extended quiescent state, invoke the RCU
2256 * core in order to force a re-evaluation of RCU's idleness. 2355 * core in order to force a re-evaluation of RCU's idleness.
2257 */ 2356 */
2258 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) 2357 if (!rcu_is_watching() && cpu_online(smp_processor_id()))
2259 invoke_rcu_core(); 2358 invoke_rcu_core();
2260 2359
2261 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2360 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2725,10 +2824,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2725 2824
2726 for_each_rcu_flavor(rsp) { 2825 for_each_rcu_flavor(rsp) {
2727 rdp = per_cpu_ptr(rsp->rda, cpu); 2826 rdp = per_cpu_ptr(rsp->rda, cpu);
2728 if (rdp->qlen != rdp->qlen_lazy) 2827 if (!rdp->nxtlist)
2828 continue;
2829 hc = true;
2830 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
2729 al = false; 2831 al = false;
2730 if (rdp->nxtlist) 2832 break;
2731 hc = true; 2833 }
2732 } 2834 }
2733 if (all_lazy) 2835 if (all_lazy)
2734 *all_lazy = al; 2836 *all_lazy = al;
@@ -3216,7 +3318,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3216 3318
3217/* 3319/*
3218 * Compute the rcu_node tree geometry from kernel parameters. This cannot 3320 * Compute the rcu_node tree geometry from kernel parameters. This cannot
3219 * replace the definitions in rcutree.h because those are needed to size 3321 * replace the definitions in tree.h because those are needed to size
3220 * the ->node array in the rcu_state structure. 3322 * the ->node array in the rcu_state structure.
3221 */ 3323 */
3222static void __init rcu_init_geometry(void) 3324static void __init rcu_init_geometry(void)
@@ -3295,8 +3397,8 @@ void __init rcu_init(void)
3295 3397
3296 rcu_bootup_announce(); 3398 rcu_bootup_announce();
3297 rcu_init_geometry(); 3399 rcu_init_geometry();
3298 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3299 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3400 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3401 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3300 __rcu_init_preempt(); 3402 __rcu_init_preempt();
3301 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3403 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3302 3404
@@ -3311,4 +3413,4 @@ void __init rcu_init(void)
3311 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3413 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3312} 3414}
3313 3415
3314#include "rcutree_plugin.h" 3416#include "tree_plugin.h"
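A large share of the tree.c hunks above are mechanical: &__get_cpu_var(var) becomes this_cpu_ptr(&var) and direct reads become __this_cpu_read(), part of the tree-wide retirement of the __get_cpu_var() interface, while rcu_is_cpu_idle() is renamed (with inverted sense in tiny.c) into __rcu_is_watching()/rcu_is_watching(). A sketch of the newer per-CPU spelling, using a made-up per-CPU variable:

    #include <linux/percpu.h>
    #include <linux/preempt.h>

    struct my_stats {
            unsigned long events;
    };
    static DEFINE_PER_CPU(struct my_stats, my_counters);

    static void count_event(void)
    {
            struct my_stats *s;

            preempt_disable();                      /* stay on this CPU */
            s = this_cpu_ptr(&my_counters);         /* was &__get_cpu_var(my_counters) */
            s->events++;
            preempt_enable();
    }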
diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h
index 5f97eab602cd..52be957c9fe2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcu/tree.h
@@ -104,6 +104,8 @@ struct rcu_dynticks {
104 /* idle-period nonlazy_posted snapshot. */ 104 /* idle-period nonlazy_posted snapshot. */
105 unsigned long last_accelerate; 105 unsigned long last_accelerate;
106 /* Last jiffy CBs were accelerated. */ 106 /* Last jiffy CBs were accelerated. */
107 unsigned long last_advance_all;
108 /* Last jiffy CBs were all advanced. */
107 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 109 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
108#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 110#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
109}; 111};
diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h
index 130c97b027f2..08a765232432 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -28,7 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include "time/tick-internal.h" 31#include "../time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
@@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void)
96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ 96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
97#ifdef CONFIG_RCU_NOCB_CPU_ALL 97#ifdef CONFIG_RCU_NOCB_CPU_ALL
98 pr_info("\tOffload RCU callbacks from all CPUs\n"); 98 pr_info("\tOffload RCU callbacks from all CPUs\n");
99 cpumask_setall(rcu_nocb_mask); 99 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ 100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ 101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
102 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
103 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
104 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
105 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
106 rcu_nocb_mask);
107 }
103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 108 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
104 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); 109 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
105 if (rcu_nocb_poll) 110 if (rcu_nocb_poll)
@@ -660,7 +665,7 @@ static void rcu_preempt_check_callbacks(int cpu)
660 665
661static void rcu_preempt_do_callbacks(void) 666static void rcu_preempt_do_callbacks(void)
662{ 667{
663 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); 668 rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
664} 669}
665 670
666#endif /* #ifdef CONFIG_RCU_BOOST */ 671#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -1128,7 +1133,7 @@ void exit_rcu(void)
1128 1133
1129#ifdef CONFIG_RCU_BOOST 1134#ifdef CONFIG_RCU_BOOST
1130 1135
1131#include "rtmutex_common.h" 1136#include "../locking/rtmutex_common.h"
1132 1137
1133#ifdef CONFIG_RCU_TRACE 1138#ifdef CONFIG_RCU_TRACE
1134 1139
@@ -1332,7 +1337,7 @@ static void invoke_rcu_callbacks_kthread(void)
1332 */ 1337 */
1333static bool rcu_is_callbacks_kthread(void) 1338static bool rcu_is_callbacks_kthread(void)
1334{ 1339{
1335 return __get_cpu_var(rcu_cpu_kthread_task) == current; 1340 return __this_cpu_read(rcu_cpu_kthread_task) == current;
1336} 1341}
1337 1342
1338#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1382,8 +1387,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1382 1387
1383static void rcu_kthread_do_work(void) 1388static void rcu_kthread_do_work(void)
1384{ 1389{
1385 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1390 rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
1386 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1391 rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
1387 rcu_preempt_do_callbacks(); 1392 rcu_preempt_do_callbacks();
1388} 1393}
1389 1394
@@ -1402,7 +1407,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu)
1402 1407
1403static int rcu_cpu_kthread_should_run(unsigned int cpu) 1408static int rcu_cpu_kthread_should_run(unsigned int cpu)
1404{ 1409{
1405 return __get_cpu_var(rcu_cpu_has_work); 1410 return __this_cpu_read(rcu_cpu_has_work);
1406} 1411}
1407 1412
1408/* 1413/*
@@ -1412,8 +1417,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)
1412 */ 1417 */
1413static void rcu_cpu_kthread(unsigned int cpu) 1418static void rcu_cpu_kthread(unsigned int cpu)
1414{ 1419{
1415 unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); 1420 unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
1416 char work, *workp = &__get_cpu_var(rcu_cpu_has_work); 1421 char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
1417 int spincnt; 1422 int spincnt;
1418 1423
1419 for (spincnt = 0; spincnt < 10; spincnt++) { 1424 for (spincnt = 0; spincnt < 10; spincnt++) {
@@ -1627,20 +1632,26 @@ module_param(rcu_idle_gp_delay, int, 0644);
1627static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; 1632static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1628module_param(rcu_idle_lazy_gp_delay, int, 0644); 1633module_param(rcu_idle_lazy_gp_delay, int, 0644);
1629 1634
1630extern int tick_nohz_enabled; 1635extern int tick_nohz_active;
1631 1636
1632/* 1637/*
1633 * Try to advance callbacks for all flavors of RCU on the current CPU. 1638 * Try to advance callbacks for all flavors of RCU on the current CPU, but
1634 * Afterwards, if there are any callbacks ready for immediate invocation, 1639 * only if it has been awhile since the last time we did so. Afterwards,
1635 * return true. 1640 * if there are any callbacks ready for immediate invocation, return true.
1636 */ 1641 */
1637static bool rcu_try_advance_all_cbs(void) 1642static bool rcu_try_advance_all_cbs(void)
1638{ 1643{
1639 bool cbs_ready = false; 1644 bool cbs_ready = false;
1640 struct rcu_data *rdp; 1645 struct rcu_data *rdp;
1646 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1641 struct rcu_node *rnp; 1647 struct rcu_node *rnp;
1642 struct rcu_state *rsp; 1648 struct rcu_state *rsp;
1643 1649
1650 /* Exit early if we advanced recently. */
1651 if (jiffies == rdtp->last_advance_all)
1652 return 0;
1653 rdtp->last_advance_all = jiffies;
1654
1644 for_each_rcu_flavor(rsp) { 1655 for_each_rcu_flavor(rsp) {
1645 rdp = this_cpu_ptr(rsp->rda); 1656 rdp = this_cpu_ptr(rsp->rda);
1646 rnp = rdp->mynode; 1657 rnp = rdp->mynode;
@@ -1718,7 +1729,7 @@ static void rcu_prepare_for_idle(int cpu)
1718 int tne; 1729 int tne;
1719 1730
1720 /* Handle nohz enablement switches conservatively. */ 1731 /* Handle nohz enablement switches conservatively. */
1721 tne = ACCESS_ONCE(tick_nohz_enabled); 1732 tne = ACCESS_ONCE(tick_nohz_active);
1722 if (tne != rdtp->tick_nohz_enabled_snap) { 1733 if (tne != rdtp->tick_nohz_enabled_snap) {
1723 if (rcu_cpu_has_callbacks(cpu, NULL)) 1734 if (rcu_cpu_has_callbacks(cpu, NULL))
1724 invoke_rcu_core(); /* force nohz to see update. */ 1735 invoke_rcu_core(); /* force nohz to see update. */
@@ -1739,6 +1750,8 @@ static void rcu_prepare_for_idle(int cpu)
1739 */ 1750 */
1740 if (rdtp->all_lazy && 1751 if (rdtp->all_lazy &&
1741 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { 1752 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1753 rdtp->all_lazy = false;
1754 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1742 invoke_rcu_core(); 1755 invoke_rcu_core();
1743 return; 1756 return;
1744 } 1757 }
@@ -1768,17 +1781,11 @@ static void rcu_prepare_for_idle(int cpu)
1768 */ 1781 */
1769static void rcu_cleanup_after_idle(int cpu) 1782static void rcu_cleanup_after_idle(int cpu)
1770{ 1783{
1771 struct rcu_data *rdp;
1772 struct rcu_state *rsp;
1773 1784
1774 if (rcu_is_nocb_cpu(cpu)) 1785 if (rcu_is_nocb_cpu(cpu))
1775 return; 1786 return;
1776 rcu_try_advance_all_cbs(); 1787 if (rcu_try_advance_all_cbs())
1777 for_each_rcu_flavor(rsp) { 1788 invoke_rcu_core();
1778 rdp = per_cpu_ptr(rsp->rda, cpu);
1779 if (cpu_has_callbacks_ready_to_invoke(rdp))
1780 invoke_rcu_core();
1781 }
1782} 1789}
1783 1790
1784/* 1791/*
@@ -2108,15 +2115,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2108 2115
2109 /* If we are not being polled and there is a kthread, awaken it ... */ 2116 /* If we are not being polled and there is a kthread, awaken it ... */
2110 t = ACCESS_ONCE(rdp->nocb_kthread); 2117 t = ACCESS_ONCE(rdp->nocb_kthread);
2111 if (rcu_nocb_poll | !t) 2118 if (rcu_nocb_poll || !t) {
2119 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2120 TPS("WakeNotPoll"));
2112 return; 2121 return;
2122 }
2113 len = atomic_long_read(&rdp->nocb_q_count); 2123 len = atomic_long_read(&rdp->nocb_q_count);
2114 if (old_rhpp == &rdp->nocb_head) { 2124 if (old_rhpp == &rdp->nocb_head) {
2115 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ 2125 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2116 rdp->qlen_last_fqs_check = 0; 2126 rdp->qlen_last_fqs_check = 0;
2127 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
2117 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2118 wake_up_process(t); /* ... or if many callbacks queued. */ 2129 wake_up_process(t); /* ... or if many callbacks queued. */
2119 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2130 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2131 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2132 } else {
2133 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2120 } 2134 }
2121 return; 2135 return;
2122} 2136}
@@ -2140,10 +2154,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2140 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2154 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2141 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2155 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2142 (unsigned long)rhp->func, 2156 (unsigned long)rhp->func,
2143 rdp->qlen_lazy, rdp->qlen); 2157 -atomic_long_read(&rdp->nocb_q_count_lazy),
2158 -atomic_long_read(&rdp->nocb_q_count));
2144 else 2159 else
2145 trace_rcu_callback(rdp->rsp->name, rhp, 2160 trace_rcu_callback(rdp->rsp->name, rhp,
2146 rdp->qlen_lazy, rdp->qlen); 2161 -atomic_long_read(&rdp->nocb_q_count_lazy),
2162 -atomic_long_read(&rdp->nocb_q_count));
2147 return 1; 2163 return 1;
2148} 2164}
2149 2165
@@ -2221,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2221static int rcu_nocb_kthread(void *arg) 2237static int rcu_nocb_kthread(void *arg)
2222{ 2238{
2223 int c, cl; 2239 int c, cl;
2240 bool firsttime = 1;
2224 struct rcu_head *list; 2241 struct rcu_head *list;
2225 struct rcu_head *next; 2242 struct rcu_head *next;
2226 struct rcu_head **tail; 2243 struct rcu_head **tail;
@@ -2229,14 +2246,27 @@ static int rcu_nocb_kthread(void *arg)
2229 /* Each pass through this loop invokes one batch of callbacks */ 2246 /* Each pass through this loop invokes one batch of callbacks */
2230 for (;;) { 2247 for (;;) {
2231 /* If not polling, wait for next batch of callbacks. */ 2248 /* If not polling, wait for next batch of callbacks. */
2232 if (!rcu_nocb_poll) 2249 if (!rcu_nocb_poll) {
2250 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2251 TPS("Sleep"));
2233 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2252 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2253 } else if (firsttime) {
2254 firsttime = 0;
2255 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2256 TPS("Poll"));
2257 }
2234 list = ACCESS_ONCE(rdp->nocb_head); 2258 list = ACCESS_ONCE(rdp->nocb_head);
2235 if (!list) { 2259 if (!list) {
2260 if (!rcu_nocb_poll)
2261 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2262 TPS("WokeEmpty"));
2236 schedule_timeout_interruptible(1); 2263 schedule_timeout_interruptible(1);
2237 flush_signals(current); 2264 flush_signals(current);
2238 continue; 2265 continue;
2239 } 2266 }
2267 firsttime = 1;
2268 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2269 TPS("WokeNonEmpty"));
2240 2270
2241 /* 2271 /*
2242 * Extract queued callbacks, update counts, and wait 2272 * Extract queued callbacks, update counts, and wait
@@ -2257,7 +2287,11 @@ static int rcu_nocb_kthread(void *arg)
2257 next = list->next; 2287 next = list->next;
2258 /* Wait for enqueuing to complete, if needed. */ 2288 /* Wait for enqueuing to complete, if needed. */
2259 while (next == NULL && &list->next != tail) { 2289 while (next == NULL && &list->next != tail) {
2290 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2291 TPS("WaitQueue"));
2260 schedule_timeout_interruptible(1); 2292 schedule_timeout_interruptible(1);
2293 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2294 TPS("WokeQueue"));
2261 next = list->next; 2295 next = list->next;
2262 } 2296 }
2263 debug_rcu_head_unqueue(list); 2297 debug_rcu_head_unqueue(list);
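The no-CBs kthread hunks above mostly add trace_rcu_nocb_wake() calls around an otherwise standard kthread loop: sleep on a waitqueue via wait_event_interruptible() until callbacks are queued, drain them, repeat. A stripped-down sketch of that loop shape (all names here are invented; the real loop is rcu_nocb_kthread()):

    #include <linux/kthread.h>
    #include <linux/wait.h>
    #include <linux/sched.h>
    #include <linux/compiler.h>

    struct my_work { struct my_work *next; };

    static DECLARE_WAIT_QUEUE_HEAD(work_wq);
    static struct my_work *work_head;       /* published by producers */

    static int my_worker(void *unused)
    {
            while (!kthread_should_stop()) {
                    /* Sleep until a producer sets work_head or we are asked
                     * to stop; a signal also wakes us, hence the flush. */
                    wait_event_interruptible(work_wq,
                                             ACCESS_ONCE(work_head) ||
                                             kthread_should_stop());
                    flush_signals(current);
                    /* ... detach work_head and invoke the callbacks ... */
            }
            return 0;
    }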
diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c
index cf6c17412932..3596797b7e46 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -44,7 +44,7 @@
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "tree.h"
48 48
49static int r_open(struct inode *inode, struct file *file, 49static int r_open(struct inode *inode, struct file *file,
50 const struct seq_operations *op) 50 const struct seq_operations *op)
diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c
index b02a339836b4..6cb3dff89e2b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcu/update.c
@@ -53,6 +53,12 @@
53 53
54#include "rcu.h" 54#include "rcu.h"
55 55
56MODULE_ALIAS("rcupdate");
57#ifdef MODULE_PARAM_PREFIX
58#undef MODULE_PARAM_PREFIX
59#endif
60#define MODULE_PARAM_PREFIX "rcupdate."
61
56module_param(rcu_expedited, int, 0); 62module_param(rcu_expedited, int, 0);
57 63
58#ifdef CONFIG_PREEMPT_RCU 64#ifdef CONFIG_PREEMPT_RCU
@@ -148,7 +154,7 @@ int rcu_read_lock_bh_held(void)
148{ 154{
149 if (!debug_lockdep_rcu_enabled()) 155 if (!debug_lockdep_rcu_enabled())
150 return 1; 156 return 1;
151 if (rcu_is_cpu_idle()) 157 if (!rcu_is_watching())
152 return 0; 158 return 0;
153 if (!rcu_lockdep_current_cpu_online()) 159 if (!rcu_lockdep_current_cpu_online())
154 return 0; 160 return 0;
@@ -298,7 +304,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
298#endif 304#endif
299 305
300int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 306int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
301int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 307static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
302 308
303module_param(rcu_cpu_stall_suppress, int, 0644); 309module_param(rcu_cpu_stall_suppress, int, 0644);
304module_param(rcu_cpu_stall_timeout, int, 0644); 310module_param(rcu_cpu_stall_timeout, int, 0644);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index f813b3474646..662c83fc16b7 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb)
104} 104}
105EXPORT_SYMBOL(unregister_reboot_notifier); 105EXPORT_SYMBOL(unregister_reboot_notifier);
106 106
107static void migrate_to_reboot_cpu(void) 107void migrate_to_reboot_cpu(void)
108{ 108{
109 /* The boot cpu is always logical cpu 0 */ 109 /* The boot cpu is always logical cpu 0 */
110 int cpu = reboot_cpu; 110 int cpu = reboot_cpu;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 54adcf35f495..7b621409cf15 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-y += wait.o completion.o
15obj-$(CONFIG_SMP) += cpupri.o 16obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 17obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 18obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
new file mode 100644
index 000000000000..a63f4dc27909
--- /dev/null
+++ b/kernel/sched/completion.c
@@ -0,0 +1,299 @@
1/*
2 * Generic wait-for-completion handler;
3 *
4 * It differs from semaphores in that the default case is the opposite:
5 * wait_for_completion() blocks by default whereas a semaphore does not. The
6 * interface also makes it easy to 'complete' multiple waiting threads,
7 * something which isn't entirely natural for semaphores.
8 *
9 * But more importantly, the primitive documents the usage. Semaphores would
10 * typically be used for exclusion which gives rise to priority inversion.
11 * Waiting for completion is typically a synchronization point, but not an exclusion point.
12 */
13
14#include <linux/sched.h>
15#include <linux/completion.h>
16
17/**
18 * complete: - signals a single thread waiting on this completion
19 * @x: holds the state of this particular completion
20 *
21 * This will wake up a single thread waiting on this completion. Threads will be
22 * awakened in the same order in which they were queued.
23 *
24 * See also complete_all(), wait_for_completion() and related routines.
25 *
26 * It may be assumed that this function implies a write memory barrier before
27 * changing the task state if and only if any tasks are woken up.
28 */
29void complete(struct completion *x)
30{
31 unsigned long flags;
32
33 spin_lock_irqsave(&x->wait.lock, flags);
34 x->done++;
35 __wake_up_locked(&x->wait, TASK_NORMAL, 1);
36 spin_unlock_irqrestore(&x->wait.lock, flags);
37}
38EXPORT_SYMBOL(complete);
39
40/**
41 * complete_all: - signals all threads waiting on this completion
42 * @x: holds the state of this particular completion
43 *
44 * This will wake up all threads waiting on this particular completion event.
45 *
46 * It may be assumed that this function implies a write memory barrier before
47 * changing the task state if and only if any tasks are woken up.
48 */
49void complete_all(struct completion *x)
50{
51 unsigned long flags;
52
53 spin_lock_irqsave(&x->wait.lock, flags);
54 x->done += UINT_MAX/2;
55 __wake_up_locked(&x->wait, TASK_NORMAL, 0);
56 spin_unlock_irqrestore(&x->wait.lock, flags);
57}
58EXPORT_SYMBOL(complete_all);
59
60static inline long __sched
61do_wait_for_common(struct completion *x,
62 long (*action)(long), long timeout, int state)
63{
64 if (!x->done) {
65 DECLARE_WAITQUEUE(wait, current);
66
67 __add_wait_queue_tail_exclusive(&x->wait, &wait);
68 do {
69 if (signal_pending_state(state, current)) {
70 timeout = -ERESTARTSYS;
71 break;
72 }
73 __set_current_state(state);
74 spin_unlock_irq(&x->wait.lock);
75 timeout = action(timeout);
76 spin_lock_irq(&x->wait.lock);
77 } while (!x->done && timeout);
78 __remove_wait_queue(&x->wait, &wait);
79 if (!x->done)
80 return timeout;
81 }
82 x->done--;
83 return timeout ?: 1;
84}
85
86static inline long __sched
87__wait_for_common(struct completion *x,
88 long (*action)(long), long timeout, int state)
89{
90 might_sleep();
91
92 spin_lock_irq(&x->wait.lock);
93 timeout = do_wait_for_common(x, action, timeout, state);
94 spin_unlock_irq(&x->wait.lock);
95 return timeout;
96}
97
98static long __sched
99wait_for_common(struct completion *x, long timeout, int state)
100{
101 return __wait_for_common(x, schedule_timeout, timeout, state);
102}
103
104static long __sched
105wait_for_common_io(struct completion *x, long timeout, int state)
106{
107 return __wait_for_common(x, io_schedule_timeout, timeout, state);
108}
109
110/**
111 * wait_for_completion: - waits for completion of a task
112 * @x: holds the state of this particular completion
113 *
114 * This waits to be signaled for completion of a specific task. It is NOT
115 * interruptible and there is no timeout.
116 *
117 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
118 * and interrupt capability. Also see complete().
119 */
120void __sched wait_for_completion(struct completion *x)
121{
122 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
123}
124EXPORT_SYMBOL(wait_for_completion);
125
126/**
127 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
128 * @x: holds the state of this particular completion
129 * @timeout: timeout value in jiffies
130 *
131 * This waits for either a completion of a specific task to be signaled or for a
132 * specified timeout to expire. The timeout is in jiffies. It is not
133 * interruptible.
134 *
135 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
136 * till timeout) if completed.
137 */
138unsigned long __sched
139wait_for_completion_timeout(struct completion *x, unsigned long timeout)
140{
141 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
142}
143EXPORT_SYMBOL(wait_for_completion_timeout);
144
145/**
146 * wait_for_completion_io: - waits for completion of a task
147 * @x: holds the state of this particular completion
148 *
149 * This waits to be signaled for completion of a specific task. It is NOT
150 * interruptible and there is no timeout. The caller is accounted as waiting
151 * for IO.
152 */
153void __sched wait_for_completion_io(struct completion *x)
154{
155 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
156}
157EXPORT_SYMBOL(wait_for_completion_io);
158
159/**
160 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
161 * @x: holds the state of this particular completion
162 * @timeout: timeout value in jiffies
163 *
164 * This waits for either a completion of a specific task to be signaled or for a
165 * specified timeout to expire. The timeout is in jiffies. It is not
166 * interruptible. The caller is accounted as waiting for IO.
167 *
168 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
169 * till timeout) if completed.
170 */
171unsigned long __sched
172wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
173{
174 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
175}
176EXPORT_SYMBOL(wait_for_completion_io_timeout);
177
178/**
179 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
180 * @x: holds the state of this particular completion
181 *
182 * This waits for completion of a specific task to be signaled. It is
183 * interruptible.
184 *
185 * Return: -ERESTARTSYS if interrupted, 0 if completed.
186 */
187int __sched wait_for_completion_interruptible(struct completion *x)
188{
189 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
190 if (t == -ERESTARTSYS)
191 return t;
192 return 0;
193}
194EXPORT_SYMBOL(wait_for_completion_interruptible);
195
196/**
197 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
198 * @x: holds the state of this particular completion
199 * @timeout: timeout value in jiffies
200 *
201 * This waits for either a completion of a specific task to be signaled or for a
202 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
203 *
204 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
205 * or number of jiffies left till timeout) if completed.
206 */
207long __sched
208wait_for_completion_interruptible_timeout(struct completion *x,
209 unsigned long timeout)
210{
211 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
212}
213EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
214
215/**
216 * wait_for_completion_killable: - waits for completion of a task (killable)
217 * @x: holds the state of this particular completion
218 *
219 * This waits to be signaled for completion of a specific task. It can be
220 * interrupted by a kill signal.
221 *
222 * Return: -ERESTARTSYS if interrupted, 0 if completed.
223 */
224int __sched wait_for_completion_killable(struct completion *x)
225{
226 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
227 if (t == -ERESTARTSYS)
228 return t;
229 return 0;
230}
231EXPORT_SYMBOL(wait_for_completion_killable);
232
233/**
234 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
235 * @x: holds the state of this particular completion
236 * @timeout: timeout value in jiffies
237 *
238 * This waits for either a completion of a specific task to be
239 * signaled or for a specified timeout to expire. It can be
240 * interrupted by a kill signal. The timeout is in jiffies.
241 *
242 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
243 * or number of jiffies left till timeout) if completed.
244 */
245long __sched
246wait_for_completion_killable_timeout(struct completion *x,
247 unsigned long timeout)
248{
249 return wait_for_common(x, timeout, TASK_KILLABLE);
250}
251EXPORT_SYMBOL(wait_for_completion_killable_timeout);
252
253/**
254 * try_wait_for_completion - try to decrement a completion without blocking
255 * @x: completion structure
256 *
257 * Return: 0 if a decrement cannot be done without blocking
258 * 1 if a decrement succeeded.
259 *
260 * If a completion is being used as a counting completion,
261 * attempt to decrement the counter without blocking. This
262 * enables us to avoid waiting if the resource the completion
263 * is protecting is not available.
264 */
265bool try_wait_for_completion(struct completion *x)
266{
267 unsigned long flags;
268 int ret = 1;
269
270 spin_lock_irqsave(&x->wait.lock, flags);
271 if (!x->done)
272 ret = 0;
273 else
274 x->done--;
275 spin_unlock_irqrestore(&x->wait.lock, flags);
276 return ret;
277}
278EXPORT_SYMBOL(try_wait_for_completion);
279
280/**
281 * completion_done - Test to see if a completion has any waiters
282 * @x: completion structure
283 *
284 * Return: 0 if there are waiters (wait_for_completion() in progress)
285 * 1 if there are no waiters.
286 *
287 */
288bool completion_done(struct completion *x)
289{
290 unsigned long flags;
291 int ret = 1;
292
293 spin_lock_irqsave(&x->wait.lock, flags);
294 if (!x->done)
295 ret = 0;
296 spin_unlock_irqrestore(&x->wait.lock, flags);
297 return ret;
298}
299EXPORT_SYMBOL(completion_done);
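A minimal usage sketch of the interface defined above, assuming a kernel build context; my_worker(), struct my_data, and the "my_worker" thread name are made up for illustration:

#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

struct my_data {
	struct completion done;
	int result;
};

static int my_worker(void *arg)
{
	struct my_data *d = arg;

	d->result = 42;           /* produce something */
	complete(&d->done);       /* wakes exactly one waiter */
	return 0;
}

static int my_wait_example(void)
{
	struct my_data d = { .result = 0 };
	struct task_struct *tsk;

	init_completion(&d.done);
	tsk = kthread_run(my_worker, &d, "my_worker");
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	wait_for_completion(&d.done);   /* uninterruptible, no timeout */
	return d.result;
}

Because the waiter blocks until complete() has signalled under the completion's own lock, the on-stack struct my_data stays valid for the worker's entire access; that stack-friendliness is one of the reasons to prefer a completion over an open-coded wait queue here.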
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ac63c9a995a..a88f4a485c5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
513 * might also involve a cross-CPU call to trigger the scheduler on 513 * might also involve a cross-CPU call to trigger the scheduler on
514 * the target CPU. 514 * the target CPU.
515 */ 515 */
516#ifdef CONFIG_SMP
517void resched_task(struct task_struct *p) 516void resched_task(struct task_struct *p)
518{ 517{
519 int cpu; 518 int cpu;
520 519
521 assert_raw_spin_locked(&task_rq(p)->lock); 520 lockdep_assert_held(&task_rq(p)->lock);
522 521
523 if (test_tsk_need_resched(p)) 522 if (test_tsk_need_resched(p))
524 return; 523 return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
526 set_tsk_need_resched(p); 525 set_tsk_need_resched(p);
527 526
528 cpu = task_cpu(p); 527 cpu = task_cpu(p);
529 if (cpu == smp_processor_id()) 528 if (cpu == smp_processor_id()) {
529 set_preempt_need_resched();
530 return; 530 return;
531 }
531 532
532 /* NEED_RESCHED must be visible before we test polling */ 533 /* NEED_RESCHED must be visible before we test polling */
533 smp_mb(); 534 smp_mb();
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
546 raw_spin_unlock_irqrestore(&rq->lock, flags); 547 raw_spin_unlock_irqrestore(&rq->lock, flags);
547} 548}
548 549
550#ifdef CONFIG_SMP
549#ifdef CONFIG_NO_HZ_COMMON 551#ifdef CONFIG_NO_HZ_COMMON
550/* 552/*
551 * In the semi idle case, use the nearest busy cpu for migrating timers 553 * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
693 } 695 }
694} 696}
695 697
696#else /* !CONFIG_SMP */
697void resched_task(struct task_struct *p)
698{
699 assert_raw_spin_locked(&task_rq(p)->lock);
700 set_tsk_need_resched(p);
701}
702#endif /* CONFIG_SMP */ 698#endif /* CONFIG_SMP */
703 699
704#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 700#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
767static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 763static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
768{ 764{
769 update_rq_clock(rq); 765 update_rq_clock(rq);
770 sched_info_queued(p); 766 sched_info_queued(rq, p);
771 p->sched_class->enqueue_task(rq, p, flags); 767 p->sched_class->enqueue_task(rq, p, flags);
772} 768}
773 769
774static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 770static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
775{ 771{
776 update_rq_clock(rq); 772 update_rq_clock(rq);
777 sched_info_dequeued(p); 773 sched_info_dequeued(rq, p);
778 p->sched_class->dequeue_task(rq, p, flags); 774 p->sched_class->dequeue_task(rq, p, flags);
779} 775}
780 776
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
987 * ttwu() will sort out the placement. 983 * ttwu() will sort out the placement.
988 */ 984 */
989 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 985 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
990 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 986 !(task_preempt_count(p) & PREEMPT_ACTIVE));
991 987
992#ifdef CONFIG_LOCKDEP 988#ifdef CONFIG_LOCKDEP
993 /* 989 /*
@@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1017 __set_task_cpu(p, new_cpu); 1013 __set_task_cpu(p, new_cpu);
1018} 1014}
1019 1015
1016static void __migrate_swap_task(struct task_struct *p, int cpu)
1017{
1018 if (p->on_rq) {
1019 struct rq *src_rq, *dst_rq;
1020
1021 src_rq = task_rq(p);
1022 dst_rq = cpu_rq(cpu);
1023
1024 deactivate_task(src_rq, p, 0);
1025 set_task_cpu(p, cpu);
1026 activate_task(dst_rq, p, 0);
1027 check_preempt_curr(dst_rq, p, 0);
1028 } else {
1029 /*
1030 * Task isn't running anymore; make it appear like we migrated
1031 * it before it went to sleep. This means on wakeup we make the
1032 * previous cpu our targer instead of where it really is.
1033 */
1034 p->wake_cpu = cpu;
1035 }
1036}
1037
1038struct migration_swap_arg {
1039 struct task_struct *src_task, *dst_task;
1040 int src_cpu, dst_cpu;
1041};
1042
1043static int migrate_swap_stop(void *data)
1044{
1045 struct migration_swap_arg *arg = data;
1046 struct rq *src_rq, *dst_rq;
1047 int ret = -EAGAIN;
1048
1049 src_rq = cpu_rq(arg->src_cpu);
1050 dst_rq = cpu_rq(arg->dst_cpu);
1051
1052 double_raw_lock(&arg->src_task->pi_lock,
1053 &arg->dst_task->pi_lock);
1054 double_rq_lock(src_rq, dst_rq);
1055 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1056 goto unlock;
1057
1058 if (task_cpu(arg->src_task) != arg->src_cpu)
1059 goto unlock;
1060
1061 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1062 goto unlock;
1063
1064 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1065 goto unlock;
1066
1067 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1068 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1069
1070 ret = 0;
1071
1072unlock:
1073 double_rq_unlock(src_rq, dst_rq);
1074 raw_spin_unlock(&arg->dst_task->pi_lock);
1075 raw_spin_unlock(&arg->src_task->pi_lock);
1076
1077 return ret;
1078}
1079
1080/*
1081 * Cross migrate two tasks
1082 */
1083int migrate_swap(struct task_struct *cur, struct task_struct *p)
1084{
1085 struct migration_swap_arg arg;
1086 int ret = -EINVAL;
1087
1088 arg = (struct migration_swap_arg){
1089 .src_task = cur,
1090 .src_cpu = task_cpu(cur),
1091 .dst_task = p,
1092 .dst_cpu = task_cpu(p),
1093 };
1094
1095 if (arg.src_cpu == arg.dst_cpu)
1096 goto out;
1097
1098 /*
1099 * These three tests are all lockless; this is OK since all of them
1100 * will be re-checked with proper locks held further down the line.
1101 */
1102 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1103 goto out;
1104
1105 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1106 goto out;
1107
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out;
1110
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112
1113out:
1114 return ret;
1115}
1116
1020struct migration_arg { 1117struct migration_arg {
1021 struct task_struct *task; 1118 struct task_struct *task;
1022 int dest_cpu; 1119 int dest_cpu;
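migrate_swap() above checks cpu_active() and the affinity masks locklessly only as an early-out; migrate_swap_stop() then repeats the same checks once the runqueue and pi locks are actually held, because the lockless answers may be stale by that point. A standalone sketch of that check-then-revalidate pattern, using a pthread mutex and two fake "cpu" fields in place of the runqueue locks (nothing here is kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int src_cpu = 0, dst_cpu = 1;

static int try_swap(int want_src, int want_dst)
{
	/* Lockless pre-checks: may race, used only to avoid useless locking. */
	if (atomic_load(&src_cpu) != want_src || atomic_load(&dst_cpu) != want_dst)
		return -1;

	pthread_mutex_lock(&lock);
	/* Re-validate under the lock; the state may have changed meanwhile. */
	if (src_cpu != want_src || dst_cpu != want_dst) {
		pthread_mutex_unlock(&lock);
		return -1;              /* -EAGAIN in the kernel version */
	}
	src_cpu = want_dst;
	dst_cpu = want_src;
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	printf("swap %s\n", try_swap(0, 1) == 0 ? "succeeded" : "raced, retry");
	return 0;
}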
@@ -1236,9 +1333,9 @@ out:
1236 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1333 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1237 */ 1334 */
1238static inline 1335static inline
1239int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1336int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1240{ 1337{
1241 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1338 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1242 1339
1243 /* 1340 /*
1244 * In order not to call set_task_cpu() on a blocking task we need 1341 * In order not to call set_task_cpu() on a blocking task we need
@@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1330 1427
1331 if (rq->idle_stamp) { 1428 if (rq->idle_stamp) {
1332 u64 delta = rq_clock(rq) - rq->idle_stamp; 1429 u64 delta = rq_clock(rq) - rq->idle_stamp;
1333 u64 max = 2*sysctl_sched_migration_cost; 1430 u64 max = 2*rq->max_idle_balance_cost;
1334 1431
1335 if (delta > max) 1432 update_avg(&rq->avg_idle, delta);
1433
1434 if (rq->avg_idle > max)
1336 rq->avg_idle = max; 1435 rq->avg_idle = max;
1337 else 1436
1338 update_avg(&rq->avg_idle, delta);
1339 rq->idle_stamp = 0; 1437 rq->idle_stamp = 0;
1340 } 1438 }
1341#endif 1439#endif
@@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void)
1396 1494
1397void scheduler_ipi(void) 1495void scheduler_ipi(void)
1398{ 1496{
1497 /*
1498 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI.
1501 */
1502 if (tif_need_resched())
1503 set_preempt_need_resched();
1504
1399 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1400 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
1401 && !got_nohz_idle_kick()) 1507 && !got_nohz_idle_kick())
@@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1513 if (p->sched_class->task_waking) 1619 if (p->sched_class->task_waking)
1514 p->sched_class->task_waking(p); 1620 p->sched_class->task_waking(p);
1515 1621
1516 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1622 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1517 if (task_cpu(p) != cpu) { 1623 if (task_cpu(p) != cpu) {
1518 wake_flags |= WF_MIGRATED; 1624 wake_flags |= WF_MIGRATED;
1519 set_task_cpu(p, cpu); 1625 set_task_cpu(p, cpu);
@@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1595 * 1701 *
1596 * __sched_fork() is basic setup used by init_idle() too: 1702 * __sched_fork() is basic setup used by init_idle() too:
1597 */ 1703 */
1598static void __sched_fork(struct task_struct *p) 1704static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1599{ 1705{
1600 p->on_rq = 0; 1706 p->on_rq = 0;
1601 1707
@@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)
1619 1725
1620#ifdef CONFIG_NUMA_BALANCING 1726#ifdef CONFIG_NUMA_BALANCING
1621 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1727 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1622 p->mm->numa_next_scan = jiffies; 1728 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1623 p->mm->numa_next_reset = jiffies;
1624 p->mm->numa_scan_seq = 0; 1729 p->mm->numa_scan_seq = 0;
1625 } 1730 }
1626 1731
1732 if (clone_flags & CLONE_VM)
1733 p->numa_preferred_nid = current->numa_preferred_nid;
1734 else
1735 p->numa_preferred_nid = -1;
1736
1627 p->node_stamp = 0ULL; 1737 p->node_stamp = 0ULL;
1628 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1738 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1629 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1630 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1739 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1631 p->numa_work.next = &p->numa_work; 1740 p->numa_work.next = &p->numa_work;
1741 p->numa_faults = NULL;
1742 p->numa_faults_buffer = NULL;
1743
1744 INIT_LIST_HEAD(&p->numa_entry);
1745 p->numa_group = NULL;
1632#endif /* CONFIG_NUMA_BALANCING */ 1746#endif /* CONFIG_NUMA_BALANCING */
1633} 1747}
1634 1748
@@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
1654/* 1768/*
1655 * fork()/clone()-time setup: 1769 * fork()/clone()-time setup:
1656 */ 1770 */
1657void sched_fork(struct task_struct *p) 1771void sched_fork(unsigned long clone_flags, struct task_struct *p)
1658{ 1772{
1659 unsigned long flags; 1773 unsigned long flags;
1660 int cpu = get_cpu(); 1774 int cpu = get_cpu();
1661 1775
1662 __sched_fork(p); 1776 __sched_fork(clone_flags, p);
1663 /* 1777 /*
1664 * We mark the process as running here. This guarantees that 1778 * We mark the process as running here. This guarantees that
1665 * nobody will actually run it, and a signal or other external 1779 * nobody will actually run it, and a signal or other external
@@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p)
1717#if defined(CONFIG_SMP) 1831#if defined(CONFIG_SMP)
1718 p->on_cpu = 0; 1832 p->on_cpu = 0;
1719#endif 1833#endif
1720#ifdef CONFIG_PREEMPT_COUNT 1834 init_task_preempt_count(p);
1721 /* Want to start with kernel preemption disabled. */
1722 task_thread_info(p)->preempt_count = 1;
1723#endif
1724#ifdef CONFIG_SMP 1835#ifdef CONFIG_SMP
1725 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1836 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1726#endif 1837#endif
@@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)
1747 * - cpus_allowed can change in the fork path 1858 * - cpus_allowed can change in the fork path
1748 * - any previously selected cpu might disappear through hotplug 1859 * - any previously selected cpu might disappear through hotplug
1749 */ 1860 */
1750 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1861 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
1751#endif 1862#endif
1752 1863
1753 /* Initialize new task's runnable average */ 1864 /* Initialize new task's runnable average */
@@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1838 struct task_struct *next) 1949 struct task_struct *next)
1839{ 1950{
1840 trace_sched_switch(prev, next); 1951 trace_sched_switch(prev, next);
1841 sched_info_switch(prev, next); 1952 sched_info_switch(rq, prev, next);
1842 perf_event_task_sched_out(prev, next); 1953 perf_event_task_sched_out(prev, next);
1843 fire_sched_out_preempt_notifiers(prev, next); 1954 fire_sched_out_preempt_notifiers(prev, next);
1844 prepare_lock_switch(rq, next); 1955 prepare_lock_switch(rq, next);
@@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1890 if (mm) 2001 if (mm)
1891 mmdrop(mm); 2002 mmdrop(mm);
1892 if (unlikely(prev_state == TASK_DEAD)) { 2003 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev);
2005
1893 /* 2006 /*
1894 * Remove function-return probe instances associated with this 2007 * Remove function-return probe instances associated with this
1895 * task and put them back on the free list. 2008 * task and put them back on the free list.
@@ -2073,7 +2186,7 @@ void sched_exec(void)
2073 int dest_cpu; 2186 int dest_cpu;
2074 2187
2075 raw_spin_lock_irqsave(&p->pi_lock, flags); 2188 raw_spin_lock_irqsave(&p->pi_lock, flags);
2076 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2189 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2077 if (dest_cpu == smp_processor_id()) 2190 if (dest_cpu == smp_processor_id())
2078 goto unlock; 2191 goto unlock;
2079 2192
@@ -2140,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2140 struct rq *rq; 2253 struct rq *rq;
2141 u64 ns = 0; 2254 u64 ns = 0;
2142 2255
2256#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2257 /*
 2258 * 64-bit doesn't need locks to atomically read a 64-bit value.
 2259 * So we have an optimization chance when the task's delta_exec is 0.
2260 * Reading ->on_cpu is racy, but this is ok.
2261 *
2262 * If we race with it leaving cpu, we'll take a lock. So we're correct.
2263 * If we race with it entering cpu, unaccounted time is 0. This is
2264 * indistinguishable from the read occurring a few cycles earlier.
2265 */
2266 if (!p->on_cpu)
2267 return p->se.sum_exec_runtime;
2268#endif
2269
2143 rq = task_rq_lock(p, &flags); 2270 rq = task_rq_lock(p, &flags);
2144 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2271 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2145 task_rq_unlock(rq, p, &flags); 2272 task_rq_unlock(rq, p, &flags);
@@ -2215,7 +2342,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2215#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2342#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2216 defined(CONFIG_PREEMPT_TRACER)) 2343 defined(CONFIG_PREEMPT_TRACER))
2217 2344
2218void __kprobes add_preempt_count(int val) 2345void __kprobes preempt_count_add(int val)
2219{ 2346{
2220#ifdef CONFIG_DEBUG_PREEMPT 2347#ifdef CONFIG_DEBUG_PREEMPT
2221 /* 2348 /*
@@ -2224,7 +2351,7 @@ void __kprobes add_preempt_count(int val)
2224 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2351 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2225 return; 2352 return;
2226#endif 2353#endif
2227 preempt_count() += val; 2354 __preempt_count_add(val);
2228#ifdef CONFIG_DEBUG_PREEMPT 2355#ifdef CONFIG_DEBUG_PREEMPT
2229 /* 2356 /*
2230 * Spinlock count overflowing soon? 2357 * Spinlock count overflowing soon?
@@ -2235,9 +2362,9 @@ void __kprobes add_preempt_count(int val)
2235 if (preempt_count() == val) 2362 if (preempt_count() == val)
2236 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2363 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2237} 2364}
2238EXPORT_SYMBOL(add_preempt_count); 2365EXPORT_SYMBOL(preempt_count_add);
2239 2366
2240void __kprobes sub_preempt_count(int val) 2367void __kprobes preempt_count_sub(int val)
2241{ 2368{
2242#ifdef CONFIG_DEBUG_PREEMPT 2369#ifdef CONFIG_DEBUG_PREEMPT
2243 /* 2370 /*
@@ -2255,9 +2382,9 @@ void __kprobes sub_preempt_count(int val)
2255 2382
2256 if (preempt_count() == val) 2383 if (preempt_count() == val)
2257 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2384 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2258 preempt_count() -= val; 2385 __preempt_count_sub(val);
2259} 2386}
2260EXPORT_SYMBOL(sub_preempt_count); 2387EXPORT_SYMBOL(preempt_count_sub);
2261 2388
2262#endif 2389#endif
2263 2390
@@ -2430,6 +2557,7 @@ need_resched:
2430 put_prev_task(rq, prev); 2557 put_prev_task(rq, prev);
2431 next = pick_next_task(rq); 2558 next = pick_next_task(rq);
2432 clear_tsk_need_resched(prev); 2559 clear_tsk_need_resched(prev);
2560 clear_preempt_need_resched();
2433 rq->skip_clock_update = 0; 2561 rq->skip_clock_update = 0;
2434 2562
2435 if (likely(prev != next)) { 2563 if (likely(prev != next)) {
@@ -2520,9 +2648,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
2520 return; 2648 return;
2521 2649
2522 do { 2650 do {
2523 add_preempt_count_notrace(PREEMPT_ACTIVE); 2651 __preempt_count_add(PREEMPT_ACTIVE);
2524 __schedule(); 2652 __schedule();
2525 sub_preempt_count_notrace(PREEMPT_ACTIVE); 2653 __preempt_count_sub(PREEMPT_ACTIVE);
2526 2654
2527 /* 2655 /*
2528 * Check again in case we missed a preemption opportunity 2656 * Check again in case we missed a preemption opportunity
@@ -2532,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
2532 } while (need_resched()); 2660 } while (need_resched());
2533} 2661}
2534EXPORT_SYMBOL(preempt_schedule); 2662EXPORT_SYMBOL(preempt_schedule);
2663#endif /* CONFIG_PREEMPT */
2535 2664
2536/* 2665/*
2537 * this is the entry point to schedule() from kernel preemption 2666 * this is the entry point to schedule() from kernel preemption
@@ -2541,20 +2670,19 @@ EXPORT_SYMBOL(preempt_schedule);
2541 */ 2670 */
2542asmlinkage void __sched preempt_schedule_irq(void) 2671asmlinkage void __sched preempt_schedule_irq(void)
2543{ 2672{
2544 struct thread_info *ti = current_thread_info();
2545 enum ctx_state prev_state; 2673 enum ctx_state prev_state;
2546 2674
2547 /* Catch callers which need to be fixed */ 2675 /* Catch callers which need to be fixed */
2548 BUG_ON(ti->preempt_count || !irqs_disabled()); 2676 BUG_ON(preempt_count() || !irqs_disabled());
2549 2677
2550 prev_state = exception_enter(); 2678 prev_state = exception_enter();
2551 2679
2552 do { 2680 do {
2553 add_preempt_count(PREEMPT_ACTIVE); 2681 __preempt_count_add(PREEMPT_ACTIVE);
2554 local_irq_enable(); 2682 local_irq_enable();
2555 __schedule(); 2683 __schedule();
2556 local_irq_disable(); 2684 local_irq_disable();
2557 sub_preempt_count(PREEMPT_ACTIVE); 2685 __preempt_count_sub(PREEMPT_ACTIVE);
2558 2686
2559 /* 2687 /*
2560 * Check again in case we missed a preemption opportunity 2688 * Check again in case we missed a preemption opportunity
@@ -2566,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void)
2566 exception_exit(prev_state); 2694 exception_exit(prev_state);
2567} 2695}
2568 2696
2569#endif /* CONFIG_PREEMPT */
2570
2571int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 2697int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2572 void *key) 2698 void *key)
2573{ 2699{
@@ -2575,393 +2701,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2575} 2701}
2576EXPORT_SYMBOL(default_wake_function); 2702EXPORT_SYMBOL(default_wake_function);
2577 2703
2578/*
2579 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
2580 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
2581 * number) then we wake all the non-exclusive tasks and one exclusive task.
2582 *
2583 * There are circumstances in which we can try to wake a task which has already
2584 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
2585 * zero in this (rare) case, and we handle it by continuing to scan the queue.
2586 */
2587static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2588 int nr_exclusive, int wake_flags, void *key)
2589{
2590 wait_queue_t *curr, *next;
2591
2592 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
2593 unsigned flags = curr->flags;
2594
2595 if (curr->func(curr, mode, wake_flags, key) &&
2596 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
2597 break;
2598 }
2599}
2600
2601/**
2602 * __wake_up - wake up threads blocked on a waitqueue.
2603 * @q: the waitqueue
2604 * @mode: which threads
2605 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2606 * @key: is directly passed to the wakeup function
2607 *
2608 * It may be assumed that this function implies a write memory barrier before
2609 * changing the task state if and only if any tasks are woken up.
2610 */
2611void __wake_up(wait_queue_head_t *q, unsigned int mode,
2612 int nr_exclusive, void *key)
2613{
2614 unsigned long flags;
2615
2616 spin_lock_irqsave(&q->lock, flags);
2617 __wake_up_common(q, mode, nr_exclusive, 0, key);
2618 spin_unlock_irqrestore(&q->lock, flags);
2619}
2620EXPORT_SYMBOL(__wake_up);
2621
2622/*
2623 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2624 */
2625void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
2626{
2627 __wake_up_common(q, mode, nr, 0, NULL);
2628}
2629EXPORT_SYMBOL_GPL(__wake_up_locked);
2630
2631void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
2632{
2633 __wake_up_common(q, mode, 1, 0, key);
2634}
2635EXPORT_SYMBOL_GPL(__wake_up_locked_key);
2636
2637/**
2638 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
2639 * @q: the waitqueue
2640 * @mode: which threads
2641 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2642 * @key: opaque value to be passed to wakeup targets
2643 *
2644 * The sync wakeup differs that the waker knows that it will schedule
2645 * away soon, so while the target thread will be woken up, it will not
2646 * be migrated to another CPU - ie. the two threads are 'synchronized'
2647 * with each other. This can prevent needless bouncing between CPUs.
2648 *
2649 * On UP it can prevent extra preemption.
2650 *
2651 * It may be assumed that this function implies a write memory barrier before
2652 * changing the task state if and only if any tasks are woken up.
2653 */
2654void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
2655 int nr_exclusive, void *key)
2656{
2657 unsigned long flags;
2658 int wake_flags = WF_SYNC;
2659
2660 if (unlikely(!q))
2661 return;
2662
2663 if (unlikely(nr_exclusive != 1))
2664 wake_flags = 0;
2665
2666 spin_lock_irqsave(&q->lock, flags);
2667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
2668 spin_unlock_irqrestore(&q->lock, flags);
2669}
2670EXPORT_SYMBOL_GPL(__wake_up_sync_key);
2671
2672/*
2673 * __wake_up_sync - see __wake_up_sync_key()
2674 */
2675void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2676{
2677 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
2678}
2679EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
2680
2681/**
2682 * complete: - signals a single thread waiting on this completion
2683 * @x: holds the state of this particular completion
2684 *
2685 * This will wake up a single thread waiting on this completion. Threads will be
2686 * awakened in the same order in which they were queued.
2687 *
2688 * See also complete_all(), wait_for_completion() and related routines.
2689 *
2690 * It may be assumed that this function implies a write memory barrier before
2691 * changing the task state if and only if any tasks are woken up.
2692 */
2693void complete(struct completion *x)
2694{
2695 unsigned long flags;
2696
2697 spin_lock_irqsave(&x->wait.lock, flags);
2698 x->done++;
2699 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
2700 spin_unlock_irqrestore(&x->wait.lock, flags);
2701}
2702EXPORT_SYMBOL(complete);
2703
2704/**
2705 * complete_all: - signals all threads waiting on this completion
2706 * @x: holds the state of this particular completion
2707 *
2708 * This will wake up all threads waiting on this particular completion event.
2709 *
2710 * It may be assumed that this function implies a write memory barrier before
2711 * changing the task state if and only if any tasks are woken up.
2712 */
2713void complete_all(struct completion *x)
2714{
2715 unsigned long flags;
2716
2717 spin_lock_irqsave(&x->wait.lock, flags);
2718 x->done += UINT_MAX/2;
2719 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
2720 spin_unlock_irqrestore(&x->wait.lock, flags);
2721}
2722EXPORT_SYMBOL(complete_all);
2723
2724static inline long __sched
2725do_wait_for_common(struct completion *x,
2726 long (*action)(long), long timeout, int state)
2727{
2728 if (!x->done) {
2729 DECLARE_WAITQUEUE(wait, current);
2730
2731 __add_wait_queue_tail_exclusive(&x->wait, &wait);
2732 do {
2733 if (signal_pending_state(state, current)) {
2734 timeout = -ERESTARTSYS;
2735 break;
2736 }
2737 __set_current_state(state);
2738 spin_unlock_irq(&x->wait.lock);
2739 timeout = action(timeout);
2740 spin_lock_irq(&x->wait.lock);
2741 } while (!x->done && timeout);
2742 __remove_wait_queue(&x->wait, &wait);
2743 if (!x->done)
2744 return timeout;
2745 }
2746 x->done--;
2747 return timeout ?: 1;
2748}
2749
2750static inline long __sched
2751__wait_for_common(struct completion *x,
2752 long (*action)(long), long timeout, int state)
2753{
2754 might_sleep();
2755
2756 spin_lock_irq(&x->wait.lock);
2757 timeout = do_wait_for_common(x, action, timeout, state);
2758 spin_unlock_irq(&x->wait.lock);
2759 return timeout;
2760}
2761
2762static long __sched
2763wait_for_common(struct completion *x, long timeout, int state)
2764{
2765 return __wait_for_common(x, schedule_timeout, timeout, state);
2766}
2767
2768static long __sched
2769wait_for_common_io(struct completion *x, long timeout, int state)
2770{
2771 return __wait_for_common(x, io_schedule_timeout, timeout, state);
2772}
2773
2774/**
2775 * wait_for_completion: - waits for completion of a task
2776 * @x: holds the state of this particular completion
2777 *
2778 * This waits to be signaled for completion of a specific task. It is NOT
2779 * interruptible and there is no timeout.
2780 *
2781 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
2782 * and interrupt capability. Also see complete().
2783 */
2784void __sched wait_for_completion(struct completion *x)
2785{
2786 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2787}
2788EXPORT_SYMBOL(wait_for_completion);
2789
2790/**
2791 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
2792 * @x: holds the state of this particular completion
2793 * @timeout: timeout value in jiffies
2794 *
2795 * This waits for either a completion of a specific task to be signaled or for a
2796 * specified timeout to expire. The timeout is in jiffies. It is not
2797 * interruptible.
2798 *
2799 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
2800 * till timeout) if completed.
2801 */
2802unsigned long __sched
2803wait_for_completion_timeout(struct completion *x, unsigned long timeout)
2804{
2805 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
2806}
2807EXPORT_SYMBOL(wait_for_completion_timeout);
2808
2809/**
2810 * wait_for_completion_io: - waits for completion of a task
2811 * @x: holds the state of this particular completion
2812 *
2813 * This waits to be signaled for completion of a specific task. It is NOT
2814 * interruptible and there is no timeout. The caller is accounted as waiting
2815 * for IO.
2816 */
2817void __sched wait_for_completion_io(struct completion *x)
2818{
2819 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2820}
2821EXPORT_SYMBOL(wait_for_completion_io);
2822
2823/**
2824 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
2825 * @x: holds the state of this particular completion
2826 * @timeout: timeout value in jiffies
2827 *
2828 * This waits for either a completion of a specific task to be signaled or for a
2829 * specified timeout to expire. The timeout is in jiffies. It is not
2830 * interruptible. The caller is accounted as waiting for IO.
2831 *
2832 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
2833 * till timeout) if completed.
2834 */
2835unsigned long __sched
2836wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
2837{
2838 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
2839}
2840EXPORT_SYMBOL(wait_for_completion_io_timeout);
2841
2842/**
2843 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
2844 * @x: holds the state of this particular completion
2845 *
2846 * This waits for completion of a specific task to be signaled. It is
2847 * interruptible.
2848 *
2849 * Return: -ERESTARTSYS if interrupted, 0 if completed.
2850 */
2851int __sched wait_for_completion_interruptible(struct completion *x)
2852{
2853 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
2854 if (t == -ERESTARTSYS)
2855 return t;
2856 return 0;
2857}
2858EXPORT_SYMBOL(wait_for_completion_interruptible);
2859
2860/**
2861 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
2862 * @x: holds the state of this particular completion
2863 * @timeout: timeout value in jiffies
2864 *
2865 * This waits for either a completion of a specific task to be signaled or for a
2866 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
2867 *
2868 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
2869 * or number of jiffies left till timeout) if completed.
2870 */
2871long __sched
2872wait_for_completion_interruptible_timeout(struct completion *x,
2873 unsigned long timeout)
2874{
2875 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
2876}
2877EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
2878
2879/**
2880 * wait_for_completion_killable: - waits for completion of a task (killable)
2881 * @x: holds the state of this particular completion
2882 *
2883 * This waits to be signaled for completion of a specific task. It can be
2884 * interrupted by a kill signal.
2885 *
2886 * Return: -ERESTARTSYS if interrupted, 0 if completed.
2887 */
2888int __sched wait_for_completion_killable(struct completion *x)
2889{
2890 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
2891 if (t == -ERESTARTSYS)
2892 return t;
2893 return 0;
2894}
2895EXPORT_SYMBOL(wait_for_completion_killable);
2896
2897/**
2898 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
2899 * @x: holds the state of this particular completion
2900 * @timeout: timeout value in jiffies
2901 *
2902 * This waits for either a completion of a specific task to be
2903 * signaled or for a specified timeout to expire. It can be
2904 * interrupted by a kill signal. The timeout is in jiffies.
2905 *
2906 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
2907 * or number of jiffies left till timeout) if completed.
2908 */
2909long __sched
2910wait_for_completion_killable_timeout(struct completion *x,
2911 unsigned long timeout)
2912{
2913 return wait_for_common(x, timeout, TASK_KILLABLE);
2914}
2915EXPORT_SYMBOL(wait_for_completion_killable_timeout);
2916
2917/**
2918 * try_wait_for_completion - try to decrement a completion without blocking
2919 * @x: completion structure
2920 *
2921 * Return: 0 if a decrement cannot be done without blocking
2922 * 1 if a decrement succeeded.
2923 *
2924 * If a completion is being used as a counting completion,
2925 * attempt to decrement the counter without blocking. This
2926 * enables us to avoid waiting if the resource the completion
2927 * is protecting is not available.
2928 */
2929bool try_wait_for_completion(struct completion *x)
2930{
2931 unsigned long flags;
2932 int ret = 1;
2933
2934 spin_lock_irqsave(&x->wait.lock, flags);
2935 if (!x->done)
2936 ret = 0;
2937 else
2938 x->done--;
2939 spin_unlock_irqrestore(&x->wait.lock, flags);
2940 return ret;
2941}
2942EXPORT_SYMBOL(try_wait_for_completion);
2943
2944/**
2945 * completion_done - Test to see if a completion has any waiters
2946 * @x: completion structure
2947 *
2948 * Return: 0 if there are waiters (wait_for_completion() in progress)
2949 * 1 if there are no waiters.
2950 *
2951 */
2952bool completion_done(struct completion *x)
2953{
2954 unsigned long flags;
2955 int ret = 1;
2956
2957 spin_lock_irqsave(&x->wait.lock, flags);
2958 if (!x->done)
2959 ret = 0;
2960 spin_unlock_irqrestore(&x->wait.lock, flags);
2961 return ret;
2962}
2963EXPORT_SYMBOL(completion_done);
2964
2965static long __sched 2704static long __sched
2966sleep_on_common(wait_queue_head_t *q, int state, long timeout) 2705sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2967{ 2706{
@@ -3598,13 +3337,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3598 struct task_struct *p; 3337 struct task_struct *p;
3599 int retval; 3338 int retval;
3600 3339
3601 get_online_cpus();
3602 rcu_read_lock(); 3340 rcu_read_lock();
3603 3341
3604 p = find_process_by_pid(pid); 3342 p = find_process_by_pid(pid);
3605 if (!p) { 3343 if (!p) {
3606 rcu_read_unlock(); 3344 rcu_read_unlock();
3607 put_online_cpus();
3608 return -ESRCH; 3345 return -ESRCH;
3609 } 3346 }
3610 3347
@@ -3661,7 +3398,6 @@ out_free_cpus_allowed:
3661 free_cpumask_var(cpus_allowed); 3398 free_cpumask_var(cpus_allowed);
3662out_put_task: 3399out_put_task:
3663 put_task_struct(p); 3400 put_task_struct(p);
3664 put_online_cpus();
3665 return retval; 3401 return retval;
3666} 3402}
3667 3403
@@ -3706,7 +3442,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3706 unsigned long flags; 3442 unsigned long flags;
3707 int retval; 3443 int retval;
3708 3444
3709 get_online_cpus();
3710 rcu_read_lock(); 3445 rcu_read_lock();
3711 3446
3712 retval = -ESRCH; 3447 retval = -ESRCH;
@@ -3719,12 +3454,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3719 goto out_unlock; 3454 goto out_unlock;
3720 3455
3721 raw_spin_lock_irqsave(&p->pi_lock, flags); 3456 raw_spin_lock_irqsave(&p->pi_lock, flags);
3722 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 3457 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3723 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3458 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3724 3459
3725out_unlock: 3460out_unlock:
3726 rcu_read_unlock(); 3461 rcu_read_unlock();
3727 put_online_cpus();
3728 3462
3729 return retval; 3463 return retval;
3730} 3464}
@@ -3794,16 +3528,11 @@ SYSCALL_DEFINE0(sched_yield)
3794 return 0; 3528 return 0;
3795} 3529}
3796 3530
3797static inline int should_resched(void)
3798{
3799 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
3800}
3801
3802static void __cond_resched(void) 3531static void __cond_resched(void)
3803{ 3532{
3804 add_preempt_count(PREEMPT_ACTIVE); 3533 __preempt_count_add(PREEMPT_ACTIVE);
3805 __schedule(); 3534 __schedule();
3806 sub_preempt_count(PREEMPT_ACTIVE); 3535 __preempt_count_sub(PREEMPT_ACTIVE);
3807} 3536}
3808 3537
3809int __sched _cond_resched(void) 3538int __sched _cond_resched(void)
@@ -4186,7 +3915,7 @@ void init_idle(struct task_struct *idle, int cpu)
4186 3915
4187 raw_spin_lock_irqsave(&rq->lock, flags); 3916 raw_spin_lock_irqsave(&rq->lock, flags);
4188 3917
4189 __sched_fork(idle); 3918 __sched_fork(0, idle);
4190 idle->state = TASK_RUNNING; 3919 idle->state = TASK_RUNNING;
4191 idle->se.exec_start = sched_clock(); 3920 idle->se.exec_start = sched_clock();
4192 3921
@@ -4212,7 +3941,7 @@ void init_idle(struct task_struct *idle, int cpu)
4212 raw_spin_unlock_irqrestore(&rq->lock, flags); 3941 raw_spin_unlock_irqrestore(&rq->lock, flags);
4213 3942
4214 /* Set the preempt count _outside_ the spinlocks! */ 3943 /* Set the preempt count _outside_ the spinlocks! */
4215 task_thread_info(idle)->preempt_count = 0; 3944 init_idle_preempt_count(idle, cpu);
4216 3945
4217 /* 3946 /*
4218 * The idle tasks have their own, simple scheduling class: 3947 * The idle tasks have their own, simple scheduling class:
@@ -4346,6 +4075,53 @@ fail:
4346 return ret; 4075 return ret;
4347} 4076}
4348 4077
4078#ifdef CONFIG_NUMA_BALANCING
4079/* Migrate current task p to target_cpu */
4080int migrate_task_to(struct task_struct *p, int target_cpu)
4081{
4082 struct migration_arg arg = { p, target_cpu };
4083 int curr_cpu = task_cpu(p);
4084
4085 if (curr_cpu == target_cpu)
4086 return 0;
4087
4088 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4089 return -EINVAL;
4090
4091 /* TODO: This is not properly updating schedstats */
4092
4093 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4094}
4095
4096/*
4097 * Requeue a task on a given node and accurately track the number of NUMA
4098 * tasks on the runqueues
4099 */
4100void sched_setnuma(struct task_struct *p, int nid)
4101{
4102 struct rq *rq;
4103 unsigned long flags;
4104 bool on_rq, running;
4105
4106 rq = task_rq_lock(p, &flags);
4107 on_rq = p->on_rq;
4108 running = task_current(rq, p);
4109
4110 if (on_rq)
4111 dequeue_task(rq, p, 0);
4112 if (running)
4113 p->sched_class->put_prev_task(rq, p);
4114
4115 p->numa_preferred_nid = nid;
4116
4117 if (running)
4118 p->sched_class->set_curr_task(rq);
4119 if (on_rq)
4120 enqueue_task(rq, p, 0);
4121 task_rq_unlock(rq, p, &flags);
4122}
4123#endif
4124
4349/* 4125/*
4350 * migration_cpu_stop - this will be executed by a highprio stopper thread 4126 * migration_cpu_stop - this will be executed by a highprio stopper thread
4351 * and performs thread migration by bumping thread off CPU then 4127 * and performs thread migration by bumping thread off CPU then
@@ -4985,7 +4761,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
4985 cpumask_clear_cpu(rq->cpu, old_rd->span); 4761 cpumask_clear_cpu(rq->cpu, old_rd->span);
4986 4762
4987 /* 4763 /*
4988 * If we dont want to free the old_rt yet then 4764 * If we dont want to free the old_rd yet then
4989 * set old_rd to NULL to skip the freeing later 4765 * set old_rd to NULL to skip the freeing later
4990 * in this function: 4766 * in this function:
4991 */ 4767 */
@@ -5119,10 +4895,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5119DEFINE_PER_CPU(struct sched_domain *, sd_llc); 4895DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5120DEFINE_PER_CPU(int, sd_llc_size); 4896DEFINE_PER_CPU(int, sd_llc_size);
5121DEFINE_PER_CPU(int, sd_llc_id); 4897DEFINE_PER_CPU(int, sd_llc_id);
4898DEFINE_PER_CPU(struct sched_domain *, sd_numa);
4899DEFINE_PER_CPU(struct sched_domain *, sd_busy);
4900DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5122 4901
5123static void update_top_cache_domain(int cpu) 4902static void update_top_cache_domain(int cpu)
5124{ 4903{
5125 struct sched_domain *sd; 4904 struct sched_domain *sd;
4905 struct sched_domain *busy_sd = NULL;
5126 int id = cpu; 4906 int id = cpu;
5127 int size = 1; 4907 int size = 1;
5128 4908
@@ -5130,11 +4910,19 @@ static void update_top_cache_domain(int cpu)
5130 if (sd) { 4910 if (sd) {
5131 id = cpumask_first(sched_domain_span(sd)); 4911 id = cpumask_first(sched_domain_span(sd));
5132 size = cpumask_weight(sched_domain_span(sd)); 4912 size = cpumask_weight(sched_domain_span(sd));
4913 busy_sd = sd->parent; /* sd_busy */
5133 } 4914 }
4915 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
5134 4916
5135 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 4917 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5136 per_cpu(sd_llc_size, cpu) = size; 4918 per_cpu(sd_llc_size, cpu) = size;
5137 per_cpu(sd_llc_id, cpu) = id; 4919 per_cpu(sd_llc_id, cpu) = id;
4920
4921 sd = lowest_flag_domain(cpu, SD_NUMA);
4922 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
4923
4924 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
4925 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5138} 4926}
5139 4927
5140/* 4928/*
@@ -5325,6 +5113,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5325 * die on a /0 trap. 5113 * die on a /0 trap.
5326 */ 5114 */
5327 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5115 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5116 sg->sgp->power_orig = sg->sgp->power;
5328 5117
5329 /* 5118 /*
5330 * Make sure the first group of this domain contains the 5119 * Make sure the first group of this domain contains the
@@ -5654,6 +5443,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5654 | 0*SD_SHARE_PKG_RESOURCES 5443 | 0*SD_SHARE_PKG_RESOURCES
5655 | 1*SD_SERIALIZE 5444 | 1*SD_SERIALIZE
5656 | 0*SD_PREFER_SIBLING 5445 | 0*SD_PREFER_SIBLING
5446 | 1*SD_NUMA
5657 | sd_local_flags(level) 5447 | sd_local_flags(level)
5658 , 5448 ,
5659 .last_balance = jiffies, 5449 .last_balance = jiffies,
@@ -6335,14 +6125,17 @@ void __init sched_init_smp(void)
6335 6125
6336 sched_init_numa(); 6126 sched_init_numa();
6337 6127
6338 get_online_cpus(); 6128 /*
6129 * There's no userspace yet to cause hotplug operations; hence all the
6130 * cpu masks are stable and all blatant races in the below code cannot
6131 * happen.
6132 */
6339 mutex_lock(&sched_domains_mutex); 6133 mutex_lock(&sched_domains_mutex);
6340 init_sched_domains(cpu_active_mask); 6134 init_sched_domains(cpu_active_mask);
6341 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6135 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6342 if (cpumask_empty(non_isolated_cpus)) 6136 if (cpumask_empty(non_isolated_cpus))
6343 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6137 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6344 mutex_unlock(&sched_domains_mutex); 6138 mutex_unlock(&sched_domains_mutex);
6345 put_online_cpus();
6346 6139
6347 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6140 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6348 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6141 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6505,6 +6298,7 @@ void __init sched_init(void)
6505 rq->online = 0; 6298 rq->online = 0;
6506 rq->idle_stamp = 0; 6299 rq->idle_stamp = 0;
6507 rq->avg_idle = 2*sysctl_sched_migration_cost; 6300 rq->avg_idle = 2*sysctl_sched_migration_cost;
6301 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6508 6302
6509 INIT_LIST_HEAD(&rq->cfs_tasks); 6303 INIT_LIST_HEAD(&rq->cfs_tasks);
6510 6304
@@ -7277,7 +7071,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7277 7071
7278 runtime_enabled = quota != RUNTIME_INF; 7072 runtime_enabled = quota != RUNTIME_INF;
7279 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7073 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7280 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7074 /*
7075 * If we need to toggle cfs_bandwidth_used, off->on must occur
7076 * before making related changes, and on->off must occur afterwards
7077 */
7078 if (runtime_enabled && !runtime_was_enabled)
7079 cfs_bandwidth_usage_inc();
7281 raw_spin_lock_irq(&cfs_b->lock); 7080 raw_spin_lock_irq(&cfs_b->lock);
7282 cfs_b->period = ns_to_ktime(period); 7081 cfs_b->period = ns_to_ktime(period);
7283 cfs_b->quota = quota; 7082 cfs_b->quota = quota;
@@ -7303,6 +7102,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7303 unthrottle_cfs_rq(cfs_rq); 7102 unthrottle_cfs_rq(cfs_rq);
7304 raw_spin_unlock_irq(&rq->lock); 7103 raw_spin_unlock_irq(&rq->lock);
7305 } 7104 }
7105 if (runtime_was_enabled && !runtime_enabled)
7106 cfs_bandwidth_usage_dec();
7306out_unlock: 7107out_unlock:
7307 mutex_unlock(&cfs_constraints_mutex); 7108 mutex_unlock(&cfs_constraints_mutex);
7308 7109
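The tg_set_cfs_bandwidth() hunk above replaces the unconditional account_cfs_bandwidth_used() call with an explicit off->on increment before the quota is installed and an on->off decrement after it is torn down. A standalone sketch of why that ordering matters, with an atomic bool standing in for the cfs_bandwidth_used() static key (the names and the simplified gate are assumptions, not the kernel's implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool gate_enabled = false;   /* ~ the static key */
static atomic_int  quota = -1;             /* -1 ~ RUNTIME_INF */

static void set_quota(int new_quota)
{
	bool enabling  = new_quota >= 0 && atomic_load(&quota) < 0;
	bool disabling = new_quota < 0 && atomic_load(&quota) >= 0;

	if (enabling)
		atomic_store(&gate_enabled, true);   /* off->on first */

	atomic_store(&quota, new_quota);             /* install/remove state */

	if (disabling)
		atomic_store(&gate_enabled, false);  /* on->off last */
}

static void hot_path(void)
{
	/* Readers only look at the quota when the gate is up, so with the
	 * ordering above they never act on a quota without the gate. */
	if (atomic_load(&gate_enabled) && atomic_load(&quota) >= 0)
		printf("enforcing quota %d\n", atomic_load(&quota));
	else
		printf("bandwidth disabled\n");
}

int main(void)
{
	hot_path();
	set_quota(100000);
	hot_path();
	set_quota(-1);
	hot_path();
	return 0;
}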
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 196559994f7c..5c34d1817e8f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18#include <linux/mempolicy.h>
18 19
19#include "sched.h" 20#include "sched.h"
20 21
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
137 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 138 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
138 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
139#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
143#endif
140#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
141 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
142#endif 146#endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 read_lock_irqsave(&tasklist_lock, flags); 163 read_lock_irqsave(&tasklist_lock, flags);
160 164
161 do_each_thread(g, p) { 165 do_each_thread(g, p) {
162 if (!p->on_rq || task_cpu(p) != rq_cpu) 166 if (task_cpu(p) != rq_cpu)
163 continue; 167 continue;
164 168
165 print_task(m, rq, p); 169 print_task(m, rq, p);
@@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
225 atomic_read(&cfs_rq->tg->runnable_avg)); 229 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 230#endif
227#endif 231#endif
232#ifdef CONFIG_CFS_BANDWIDTH
233 SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
234 cfs_rq->tg->cfs_bandwidth.timer_active);
235 SEQ_printf(m, " .%-30s: %d\n", "throttled",
236 cfs_rq->throttled);
237 SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
238 cfs_rq->throttle_count);
239#endif
228 240
229#ifdef CONFIG_FAIR_GROUP_SCHED 241#ifdef CONFIG_FAIR_GROUP_SCHED
230 print_cfs_group_stats(m, cpu, cfs_rq->tg); 242 print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)
345 cpu_clk = local_clock(); 357 cpu_clk = local_clock();
346 local_irq_restore(flags); 358 local_irq_restore(flags);
347 359
348 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", 360 SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
349 init_utsname()->release, 361 init_utsname()->release,
350 (int)strcspn(init_utsname()->version, " "), 362 (int)strcspn(init_utsname()->version, " "),
351 init_utsname()->version); 363 init_utsname()->version);
@@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void)
488 500
489__initcall(init_sched_debug_procfs); 501__initcall(init_sched_debug_procfs);
490 502
503#define __P(F) \
504 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
505#define P(F) \
506 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
507#define __PN(F) \
508 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
509#define PN(F) \
510 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
511
512
513static void sched_show_numa(struct task_struct *p, struct seq_file *m)
514{
515#ifdef CONFIG_NUMA_BALANCING
516 struct mempolicy *pol;
517 int node, i;
518
519 if (p->mm)
520 P(mm->numa_scan_seq);
521
522 task_lock(p);
523 pol = p->mempolicy;
524 if (pol && !(pol->flags & MPOL_F_MORON))
525 pol = NULL;
526 mpol_get(pol);
527 task_unlock(p);
528
529 SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
530
531 for_each_online_node(node) {
532 for (i = 0; i < 2; i++) {
533 unsigned long nr_faults = -1;
534 int cpu_current, home_node;
535
536 if (p->numa_faults)
537 nr_faults = p->numa_faults[2*node + i];
538
539 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes));
541
542 home_node = (p->numa_preferred_nid == node);
543
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults);
546 }
547 }
548
549 mpol_put(pol);
550#endif
551}
552
491void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 553void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
492{ 554{
493 unsigned long nr_switches; 555 unsigned long nr_switches;
@@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
591 SEQ_printf(m, "%-45s:%21Ld\n", 653 SEQ_printf(m, "%-45s:%21Ld\n",
592 "clock-delta", (long long)(t1-t0)); 654 "clock-delta", (long long)(t1-t0));
593 } 655 }
656
657 sched_show_numa(p, m);
594} 658}
595 659
596void proc_sched_set_task(struct task_struct *p) 660void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c70201fbc61..e64b0794060e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
178 update_sysctl(); 178 update_sysctl();
179} 179}
180 180
181#if BITS_PER_LONG == 32 181#define WMULT_CONST (~0U)
182# define WMULT_CONST (~0UL)
183#else
184# define WMULT_CONST (1UL << 32)
185#endif
186
187#define WMULT_SHIFT 32 182#define WMULT_SHIFT 32
188 183
189/* 184static void __update_inv_weight(struct load_weight *lw)
190 * Shift right and round: 185{
191 */ 186 unsigned long w;
192#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 187
188 if (likely(lw->inv_weight))
189 return;
190
191 w = scale_load_down(lw->weight);
192
193 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
194 lw->inv_weight = 1;
195 else if (unlikely(!w))
196 lw->inv_weight = WMULT_CONST;
197 else
198 lw->inv_weight = WMULT_CONST / w;
199}
193 200
194/* 201/*
195 * delta *= weight / lw 202 * delta_exec * weight / lw.weight
203 * OR
204 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
205 *
206 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
207 * we're guaranteed shift stays positive because inv_weight is guaranteed to
208 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
209 *
 210 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
211 * weight/lw.weight <= 1, and therefore our shift will also be positive.
196 */ 212 */
197static unsigned long 213static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
198calc_delta_mine(unsigned long delta_exec, unsigned long weight,
199 struct load_weight *lw)
200{ 214{
201 u64 tmp; 215 u64 fact = scale_load_down(weight);
202 216 int shift = WMULT_SHIFT;
203 /*
204 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
205 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
206 * 2^SCHED_LOAD_RESOLUTION.
207 */
208 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
209 tmp = (u64)delta_exec * scale_load_down(weight);
210 else
211 tmp = (u64)delta_exec;
212 217
213 if (!lw->inv_weight) { 218 __update_inv_weight(lw);
214 unsigned long w = scale_load_down(lw->weight);
215 219
216 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 220 if (unlikely(fact >> 32)) {
217 lw->inv_weight = 1; 221 while (fact >> 32) {
218 else if (unlikely(!w)) 222 fact >>= 1;
219 lw->inv_weight = WMULT_CONST; 223 shift--;
220 else 224 }
221 lw->inv_weight = WMULT_CONST / w;
222 } 225 }
223 226
224 /* 227 /* hint to use a 32x32->64 mul */
225 * Check whether we'd overflow the 64-bit multiplication: 228 fact = (u64)(u32)fact * lw->inv_weight;
226 */ 229
227 if (unlikely(tmp > WMULT_CONST)) 230 while (fact >> 32) {
228 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 231 fact >>= 1;
229 WMULT_SHIFT/2); 232 shift--;
230 else 233 }
231 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
232 234
233 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 235 return mul_u64_u32_shr(delta_exec, fact, shift);
234} 236}
235 237
236 238
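A minimal userspace sketch of the fixed-point arithmetic introduced by __calc_delta() above; the helper names (calc_delta_demo, mul_u64_u32_shr_demo) are illustrative, not kernel API, and scale_load_down() plus the 64-bit overflow special case are omitted. The idea is that weight/lw->weight is rewritten as weight * inv_weight with inv_weight ~ 2^32 / lw->weight, and the multiplier is halved (with a matching shift reduction) until it fits in 32 bits, so the final step is one 64x32->64 multiply plus a shift:

#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST (~0U)   /* ~2^32, as in the patch */
#define WMULT_SHIFT 32

/* GCC/Clang 128-bit arithmetic stands in for the kernel's mul_u64_u32_shr(). */
static uint64_t mul_u64_u32_shr_demo(uint64_t a, uint32_t b, int shift)
{
	return (uint64_t)(((unsigned __int128)a * b) >> shift);
}

/* delta_exec * weight / lw_weight, mirroring __calc_delta() in the hunk above. */
static uint64_t calc_delta_demo(uint64_t delta_exec, uint64_t weight, uint64_t lw_weight)
{
	uint32_t inv_weight = lw_weight ? (uint32_t)(WMULT_CONST / lw_weight) : WMULT_CONST;
	uint64_t fact = weight;
	int shift = WMULT_SHIFT;

	while (fact >> 32) {            /* keep the multiplier within 32 bits */
		fact >>= 1;
		shift--;
	}
	fact = (uint64_t)(uint32_t)fact * inv_weight;
	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}
	return mul_u64_u32_shr_demo(delta_exec, (uint32_t)fact, shift);
}

int main(void)
{
	/* 1ms of runtime, entity weight 1024 (NICE_0), runqueue weight 3072. */
	printf("%llu\n", (unsigned long long)calc_delta_demo(1000000, 1024, 3072));
	return 0;                       /* prints roughly 333333 */
}

With these inputs inv_weight is 4294967295/3072 = 1398101 and fact stays below 2^32, so the shift remains 32 and the result is 1000000 * 1024 / 3072, i.e. about 333333 ns of weighted runtime.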
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
443#endif /* CONFIG_FAIR_GROUP_SCHED */ 445#endif /* CONFIG_FAIR_GROUP_SCHED */
444 446
445static __always_inline 447static __always_inline
446void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); 448void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
447 449
448/************************************************************** 450/**************************************************************
449 * Scheduling class tree data structure manipulation methods: 451 * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
612/* 614/*
613 * delta /= w 615 * delta /= w
614 */ 616 */
615static inline unsigned long 617static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
616calc_delta_fair(unsigned long delta, struct sched_entity *se)
617{ 618{
618 if (unlikely(se->load.weight != NICE_0_LOAD)) 619 if (unlikely(se->load.weight != NICE_0_LOAD))
619 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); 620 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
620 621
621 return delta; 622 return delta;
622} 623}
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665 update_load_add(&lw, se->load.weight); 666 update_load_add(&lw, se->load.weight);
666 load = &lw; 667 load = &lw;
667 } 668 }
668 slice = calc_delta_mine(slice, se->load.weight, load); 669 slice = __calc_delta(slice, se->load.weight, load);
669 } 670 }
670 return slice; 671 return slice;
671} 672}
@@ -681,6 +682,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
681} 682}
682 683
683#ifdef CONFIG_SMP 684#ifdef CONFIG_SMP
685static unsigned long task_h_load(struct task_struct *p);
686
684static inline void __update_task_entity_contrib(struct sched_entity *se); 687static inline void __update_task_entity_contrib(struct sched_entity *se);
685 688
686/* Give new task start runnable values to heavy its load in infant time */ 689/* Give new task start runnable values to heavy its load in infant time */
@@ -701,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
701#endif 704#endif
702 705
703/* 706/*
704 * Update the current task's runtime statistics. Skip current tasks that 707 * Update the current task's runtime statistics.
705 * are not in our scheduling class.
706 */ 708 */
707static inline void
708__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
709 unsigned long delta_exec)
710{
711 unsigned long delta_exec_weighted;
712
713 schedstat_set(curr->statistics.exec_max,
714 max((u64)delta_exec, curr->statistics.exec_max));
715
716 curr->sum_exec_runtime += delta_exec;
717 schedstat_add(cfs_rq, exec_clock, delta_exec);
718 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
719
720 curr->vruntime += delta_exec_weighted;
721 update_min_vruntime(cfs_rq);
722}
723
724static void update_curr(struct cfs_rq *cfs_rq) 709static void update_curr(struct cfs_rq *cfs_rq)
725{ 710{
726 struct sched_entity *curr = cfs_rq->curr; 711 struct sched_entity *curr = cfs_rq->curr;
727 u64 now = rq_clock_task(rq_of(cfs_rq)); 712 u64 now = rq_clock_task(rq_of(cfs_rq));
728 unsigned long delta_exec; 713 u64 delta_exec;
729 714
730 if (unlikely(!curr)) 715 if (unlikely(!curr))
731 return; 716 return;
732 717
733 /* 718 delta_exec = now - curr->exec_start;
734 * Get the amount of time the current task was running 719 if (unlikely((s64)delta_exec <= 0))
735 * since the last time we changed load (this cannot
736 * overflow on 32 bits):
737 */
738 delta_exec = (unsigned long)(now - curr->exec_start);
739 if (!delta_exec)
740 return; 720 return;
741 721
742 __update_curr(cfs_rq, curr, delta_exec);
743 curr->exec_start = now; 722 curr->exec_start = now;
744 723
724 schedstat_set(curr->statistics.exec_max,
725 max(delta_exec, curr->statistics.exec_max));
726
727 curr->sum_exec_runtime += delta_exec;
728 schedstat_add(cfs_rq, exec_clock, delta_exec);
729
730 curr->vruntime += calc_delta_fair(delta_exec, curr);
731 update_min_vruntime(cfs_rq);
732
745 if (entity_is_task(curr)) { 733 if (entity_is_task(curr)) {
746 struct task_struct *curtask = task_of(curr); 734 struct task_struct *curtask = task_of(curr);
747 735
@@ -818,11 +806,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
818 806
819#ifdef CONFIG_NUMA_BALANCING 807#ifdef CONFIG_NUMA_BALANCING
820/* 808/*
821 * numa task sample period in ms 809 * Approximate time to scan a full NUMA task in ms. The task scan period is
810 * calculated based on the task's virtual memory size and
811 * numa_balancing_scan_size.
822 */ 812 */
823unsigned int sysctl_numa_balancing_scan_period_min = 100; 813unsigned int sysctl_numa_balancing_scan_period_min = 1000;
824unsigned int sysctl_numa_balancing_scan_period_max = 100*50; 814unsigned int sysctl_numa_balancing_scan_period_max = 60000;
825unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
826 815
827/* Portion of address space to scan in MB */ 816/* Portion of address space to scan in MB */
828unsigned int sysctl_numa_balancing_scan_size = 256; 817unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +819,835 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
830/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 819/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
831unsigned int sysctl_numa_balancing_scan_delay = 1000; 820unsigned int sysctl_numa_balancing_scan_delay = 1000;
832 821
833static void task_numa_placement(struct task_struct *p) 822/*
823 * After skipping a page migration on a shared page, skip N more numa page
824 * migrations unconditionally. This reduces the number of NUMA migrations
825 * in shared memory workloads, and has the effect of pulling tasks towards
826 * where their memory lives, over pulling the memory towards the task.
827 */
828unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829
830static unsigned int task_nr_scan_windows(struct task_struct *p)
831{
832 unsigned long rss = 0;
833 unsigned long nr_scan_pages;
834
835 /*
836 * Calculations based on RSS as non-present and empty pages are skipped
837 * by the PTE scanner and NUMA hinting faults should be trapped based
838 * on resident pages
839 */
840 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
841 rss = get_mm_rss(p->mm);
842 if (!rss)
843 rss = nr_scan_pages;
844
845 rss = round_up(rss, nr_scan_pages);
846 return rss / nr_scan_pages;
847}
848
849/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
850#define MAX_SCAN_WINDOW 2560
851
852static unsigned int task_scan_min(struct task_struct *p)
853{
854 unsigned int scan, floor;
855 unsigned int windows = 1;
856
857 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
858 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
859 floor = 1000 / windows;
860
861 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
862 return max_t(unsigned int, floor, scan);
863}
864
865static unsigned int task_scan_max(struct task_struct *p)
866{
867 unsigned int smin = task_scan_min(p);
868 unsigned int smax;
869
870 /* Watch for min being lower than max due to floor calculations */
871 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
872 return max(smin, smax);
873}
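A rough userspace sketch of the scan-window arithmetic above, assuming 4K pages and the sysctl defaults shown in this hunk (256MB scan size, 1000ms/60000ms min/max periods, 2560MB/sec MAX_SCAN_WINDOW); the function names are illustrative, not kernel API:

#include <stdio.h>

/* One scan window covers sysctl_numa_balancing_scan_size MB of resident pages. */
static unsigned int nr_scan_windows(unsigned long rss_pages)
{
	unsigned long nr_scan_pages = 256UL << (20 - 12);   /* 65536 pages per window */

	if (!rss_pages)
		rss_pages = nr_scan_pages;
	/* round up to whole windows, as task_nr_scan_windows() does */
	rss_pages = (rss_pages + nr_scan_pages - 1) / nr_scan_pages * nr_scan_pages;
	return rss_pages / nr_scan_pages;
}

int main(void)
{
	unsigned long rss_pages = (1UL << 30) >> 12;     /* 1GB of RSS */
	unsigned int windows = nr_scan_windows(rss_pages);
	unsigned int floor = 1000 / (2560 / 256);        /* 100ms rate-limit floor */
	unsigned int smin = 1000 / windows;
	unsigned int smax = 60000 / windows;

	printf("windows=%u min=%ums max=%ums\n",
	       windows, smin > floor ? smin : floor, smax);
	return 0;
}

For a 1GB-RSS task this prints windows=4 min=250ms max=15000ms, i.e. larger address spaces get proportionally longer scan periods so the effective scan rate in MB/sec stays bounded.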
874
875/*
876 * Once a preferred node is selected, the scheduler balancer will prefer moving
877 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
878 * scans. This will give the process the chance to accumulate more faults on
879 * the preferred node but still allow the scheduler to move the task again if
880 * the node's CPUs are overloaded.
881 */
882unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
883
884static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
885{
886 rq->nr_numa_running += (p->numa_preferred_nid != -1);
887 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
888}
889
890static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
891{
892 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
893 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
894}
895
896struct numa_group {
897 atomic_t refcount;
898
899 spinlock_t lock; /* nr_tasks, tasks */
900 int nr_tasks;
901 pid_t gid;
902 struct list_head task_list;
903
904 struct rcu_head rcu;
905 unsigned long total_faults;
906 unsigned long faults[0];
907};
908
909pid_t task_numa_group_id(struct task_struct *p)
910{
911 return p->numa_group ? p->numa_group->gid : 0;
912}
913
914static inline int task_faults_idx(int nid, int priv)
915{
916 return 2 * nid + priv;
917}
918
919static inline unsigned long task_faults(struct task_struct *p, int nid)
920{
921 if (!p->numa_faults)
922 return 0;
923
924 return p->numa_faults[task_faults_idx(nid, 0)] +
925 p->numa_faults[task_faults_idx(nid, 1)];
926}
927
928static inline unsigned long group_faults(struct task_struct *p, int nid)
929{
930 if (!p->numa_group)
931 return 0;
932
933 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
934}
935
936/*
937 * These return the fraction of accesses done by a particular task, or
938 * task group, on a particular numa node. The group weight is given a
939 * larger multiplier, in order to group tasks together that are almost
940 * evenly spread out between numa nodes.
941 */
942static inline unsigned long task_weight(struct task_struct *p, int nid)
943{
944 unsigned long total_faults;
945
946 if (!p->numa_faults)
947 return 0;
948
949 total_faults = p->total_numa_faults;
950
951 if (!total_faults)
952 return 0;
953
954 return 1000 * task_faults(p, nid) / total_faults;
955}
956
957static inline unsigned long group_weight(struct task_struct *p, int nid)
958{
959 if (!p->numa_group || !p->numa_group->total_faults)
960 return 0;
961
962 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
963}
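A hedged numerical illustration of the per-mille weights above (all fault counts invented): a task with 30 decayed faults on node 0 and 10 on node 1 has task_weight(p, 0) = 1000*30/40 = 750 and task_weight(p, 1) = 250; if its numa_group has accumulated 300 of its 1000 total faults on node 0, group_weight(p, 0) = 300. Scaling both to the same 0..1000 range is what lets the placement code below compare task and group improvements on a common scale.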
964
965static unsigned long weighted_cpuload(const int cpu);
966static unsigned long source_load(int cpu, int type);
967static unsigned long target_load(int cpu, int type);
968static unsigned long power_of(int cpu);
969static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
970
971/* Cached statistics for all CPUs within a node */
972struct numa_stats {
973 unsigned long nr_running;
974 unsigned long load;
975
976 /* Total compute capacity of CPUs on a node */
977 unsigned long power;
978
979 /* Approximate capacity in terms of runnable tasks on a node */
980 unsigned long capacity;
981 int has_capacity;
982};
983
984/*
985 * XXX borrowed from update_sg_lb_stats
986 */
987static void update_numa_stats(struct numa_stats *ns, int nid)
988{
989 int cpu, cpus = 0;
990
991 memset(ns, 0, sizeof(*ns));
992 for_each_cpu(cpu, cpumask_of_node(nid)) {
993 struct rq *rq = cpu_rq(cpu);
994
995 ns->nr_running += rq->nr_running;
996 ns->load += weighted_cpuload(cpu);
997 ns->power += power_of(cpu);
998
999 cpus++;
1000 }
1001
1002 /*
1003 * If we raced with hotplug and there are no CPUs left in our mask
1004 * the @ns structure is NULL'ed and task_numa_compare() will
1005 * not find this node attractive.
1006 *
1007 * We'll either bail at !has_capacity, or we'll detect a huge imbalance
1008 * and bail there.
1009 */
1010 if (!cpus)
1011 return;
1012
1013 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1014 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1015 ns->has_capacity = (ns->nr_running < ns->capacity);
1016}
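A rough worked example of the normalization above (illustrative numbers): a node with 4 CPUs, each at the default SCHED_POWER_SCALE of 1024, has ns->power = 4096; with a summed weighted_cpuload() of 2048 the scaled ns->load becomes 2048*1024/4096 = 512, ns->capacity is DIV_ROUND_CLOSEST(4096, 1024) = 4, and ns->has_capacity stays true while fewer than 4 tasks are running on the node.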
1017
1018struct task_numa_env {
1019 struct task_struct *p;
1020
1021 int src_cpu, src_nid;
1022 int dst_cpu, dst_nid;
1023
1024 struct numa_stats src_stats, dst_stats;
1025
1026 int imbalance_pct, idx;
1027
1028 struct task_struct *best_task;
1029 long best_imp;
1030 int best_cpu;
1031};
1032
1033static void task_numa_assign(struct task_numa_env *env,
1034 struct task_struct *p, long imp)
1035{
1036 if (env->best_task)
1037 put_task_struct(env->best_task);
1038 if (p)
1039 get_task_struct(p);
1040
1041 env->best_task = p;
1042 env->best_imp = imp;
1043 env->best_cpu = env->dst_cpu;
1044}
1045
1046/*
1047 * This checks if the overall compute and NUMA accesses of the system would
1048 * be improved if the source tasks was migrated to the target dst_cpu taking
1049 * into account that it might be best if task running on the dst_cpu should
1050 * be exchanged with the source task
1051 */
1052static void task_numa_compare(struct task_numa_env *env,
1053 long taskimp, long groupimp)
1054{
1055 struct rq *src_rq = cpu_rq(env->src_cpu);
1056 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1057 struct task_struct *cur;
1058 long dst_load, src_load;
1059 long load;
1060 long imp = (groupimp > 0) ? groupimp : taskimp;
1061
1062 rcu_read_lock();
1063 cur = ACCESS_ONCE(dst_rq->curr);
1064 if (cur->pid == 0) /* idle */
1065 cur = NULL;
1066
1067 /*
1068 * "imp" is the fault differential for the source task between the
1069 * source and destination node. Calculate the total differential for
1070 * the source task and potential destination task. The more negative
1071 * the value is, the more remote accesses would be expected to
1072 * be incurred if the tasks were swapped.
1073 */
1074 if (cur) {
1075 /* Skip this swap candidate if cannot move to the source cpu */
1076 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1077 goto unlock;
1078
1079 /*
1080 * If dst and source tasks are in the same NUMA group, or not
1081 * in any group then look only at task weights.
1082 */
1083 if (cur->numa_group == env->p->numa_group) {
1084 imp = taskimp + task_weight(cur, env->src_nid) -
1085 task_weight(cur, env->dst_nid);
1086 /*
1087 * Add some hysteresis to prevent swapping the
1088 * tasks within a group over tiny differences.
1089 */
1090 if (cur->numa_group)
1091 imp -= imp/16;
1092 } else {
1093 /*
1094 * Compare the group weights. If a task is all by
1095 * itself (not part of a group), use the task weight
1096 * instead.
1097 */
1098 if (env->p->numa_group)
1099 imp = groupimp;
1100 else
1101 imp = taskimp;
1102
1103 if (cur->numa_group)
1104 imp += group_weight(cur, env->src_nid) -
1105 group_weight(cur, env->dst_nid);
1106 else
1107 imp += task_weight(cur, env->src_nid) -
1108 task_weight(cur, env->dst_nid);
1109 }
1110 }
1111
1112 if (imp < env->best_imp)
1113 goto unlock;
1114
1115 if (!cur) {
1116 /* Is there capacity at our destination? */
1117 if (env->src_stats.has_capacity &&
1118 !env->dst_stats.has_capacity)
1119 goto unlock;
1120
1121 goto balance;
1122 }
1123
1124 /* Balance doesn't matter much if we're running a task per cpu */
1125 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1126 goto assign;
1127
1128 /*
1129 * In the overloaded case, try and keep the load balanced.
1130 */
1131balance:
1132 dst_load = env->dst_stats.load;
1133 src_load = env->src_stats.load;
1134
1135 /* XXX missing power terms */
1136 load = task_h_load(env->p);
1137 dst_load += load;
1138 src_load -= load;
1139
1140 if (cur) {
1141 load = task_h_load(cur);
1142 dst_load -= load;
1143 src_load += load;
1144 }
1145
1146 /* make src_load the smaller */
1147 if (dst_load < src_load)
1148 swap(dst_load, src_load);
1149
1150 if (src_load * env->imbalance_pct < dst_load * 100)
1151 goto unlock;
1152
1153assign:
1154 task_numa_assign(env, cur, imp);
1155unlock:
1156 rcu_read_unlock();
1157}
1158
1159static void task_numa_find_cpu(struct task_numa_env *env,
1160 long taskimp, long groupimp)
834{ 1161{
835 int seq; 1162 int cpu;
1163
1164 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1165 /* Skip this CPU if the source task cannot migrate */
1166 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1167 continue;
1168
1169 env->dst_cpu = cpu;
1170 task_numa_compare(env, taskimp, groupimp);
1171 }
1172}
1173
1174static int task_numa_migrate(struct task_struct *p)
1175{
1176 struct task_numa_env env = {
1177 .p = p,
1178
1179 .src_cpu = task_cpu(p),
1180 .src_nid = task_node(p),
1181
1182 .imbalance_pct = 112,
1183
1184 .best_task = NULL,
1185 .best_imp = 0,
1186 .best_cpu = -1
1187 };
1188 struct sched_domain *sd;
1189 unsigned long taskweight, groupweight;
1190 int nid, ret;
1191 long taskimp, groupimp;
1192
1193 /*
1194 * Pick the lowest SD_NUMA domain, as that would have the smallest
1195 * imbalance and would be the first to start moving tasks about.
1196 *
1197 * And we want to avoid any moving of tasks about, as that would create
1198 * random movement of tasks -- running counter to the numa conditions we're trying
1199 * to satisfy here.
1200 */
1201 rcu_read_lock();
1202 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1203 if (sd)
1204 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1205 rcu_read_unlock();
1206
1207 /*
1208 * Cpusets can break the scheduler domain tree into smaller
1209 * balance domains, some of which do not cross NUMA boundaries.
1210 * Tasks that are "trapped" in such domains cannot be migrated
1211 * elsewhere, so there is no point in (re)trying.
1212 */
1213 if (unlikely(!sd)) {
1214 p->numa_preferred_nid = cpu_to_node(task_cpu(p));
1215 return -EINVAL;
1216 }
1217
1218 taskweight = task_weight(p, env.src_nid);
1219 groupweight = group_weight(p, env.src_nid);
1220 update_numa_stats(&env.src_stats, env.src_nid);
1221 env.dst_nid = p->numa_preferred_nid;
1222 taskimp = task_weight(p, env.dst_nid) - taskweight;
1223 groupimp = group_weight(p, env.dst_nid) - groupweight;
1224 update_numa_stats(&env.dst_stats, env.dst_nid);
1225
1226 /* If the preferred nid has capacity, try to use it. */
1227 if (env.dst_stats.has_capacity)
1228 task_numa_find_cpu(&env, taskimp, groupimp);
1229
1230 /* No space available on the preferred nid. Look elsewhere. */
1231 if (env.best_cpu == -1) {
1232 for_each_online_node(nid) {
1233 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1234 continue;
1235
1236 /* Only consider nodes where both task and groups benefit */
1237 taskimp = task_weight(p, nid) - taskweight;
1238 groupimp = group_weight(p, nid) - groupweight;
1239 if (taskimp < 0 && groupimp < 0)
1240 continue;
1241
1242 env.dst_nid = nid;
1243 update_numa_stats(&env.dst_stats, env.dst_nid);
1244 task_numa_find_cpu(&env, taskimp, groupimp);
1245 }
1246 }
1247
1248 /* No better CPU than the current one was found. */
1249 if (env.best_cpu == -1)
1250 return -EAGAIN;
1251
1252 sched_setnuma(p, env.dst_nid);
1253
1254 /*
1255 * Reset the scan period if the task is being rescheduled on an
1256 * alternative node to recheck if the task is now properly placed.
1257 */
1258 p->numa_scan_period = task_scan_min(p);
1259
1260 if (env.best_task == NULL) {
1261 int ret = migrate_task_to(p, env.best_cpu);
1262 return ret;
1263 }
1264
1265 ret = migrate_swap(p, env.best_task);
1266 put_task_struct(env.best_task);
1267 return ret;
1268}
1269
1270/* Attempt to migrate a task to a CPU on the preferred node. */
1271static void numa_migrate_preferred(struct task_struct *p)
1272{
1273 /* This task has no NUMA fault statistics yet */
1274 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1275 return;
1276
1277 /* Periodically retry migrating the task to the preferred node */
1278 p->numa_migrate_retry = jiffies + HZ;
1279
1280 /* Success if task is already running on preferred CPU */
1281 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
1282 return;
1283
1284 /* Otherwise, try migrate to a CPU on the preferred node */
1285 task_numa_migrate(p);
1286}
1287
1288/*
1289 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1290 * increments. The more local the fault statistics are, the higher the scan
1291 * period will be for the next scan window. If local/remote ratio is below
1292 * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
1293 * scan period will decrease
1294 */
1295#define NUMA_PERIOD_SLOTS 10
1296#define NUMA_PERIOD_THRESHOLD 3
1297
1298/*
1299 * Increase the scan period (slow down scanning) if the majority of
1300 * our memory is already on our local node, or if the majority of
1301 * the page accesses are shared with other processes.
1302 * Otherwise, decrease the scan period.
1303 */
1304static void update_task_scan_period(struct task_struct *p,
1305 unsigned long shared, unsigned long private)
1306{
1307 unsigned int period_slot;
1308 int ratio;
1309 int diff;
1310
1311 unsigned long remote = p->numa_faults_locality[0];
1312 unsigned long local = p->numa_faults_locality[1];
1313
1314 /*
1315 * If there were no recorded hinting faults then either the task is
1316 * completely idle or all activity is in areas that are not of interest
1317 * to automatic numa balancing. Scan slower
1318 */
1319 if (local + shared == 0) {
1320 p->numa_scan_period = min(p->numa_scan_period_max,
1321 p->numa_scan_period << 1);
1322
1323 p->mm->numa_next_scan = jiffies +
1324 msecs_to_jiffies(p->numa_scan_period);
836 1325
837 if (!p->mm) /* for example, ksmd faulting in a user's mm */
838 return; 1326 return;
1327 }
1328
1329 /*
1330 * Prepare to scale scan period relative to the current period.
1331 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1332 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1333 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1334 */
1335 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1336 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1337 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1338 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1339 if (!slot)
1340 slot = 1;
1341 diff = slot * period_slot;
1342 } else {
1343 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1344
1345 /*
1346 * Scale scan rate increases based on sharing. There is an
1347 * inverse relationship between the degree of sharing and
1348 * the adjustment made to the scanning period. Broadly
1349 * speaking the intent is that there is little point
1350 * scanning faster if shared accesses dominate as it may
1351 * simply bounce migrations uselessly
1352 */
1353 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1354 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1355 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1356 }
1357
1358 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1359 task_scan_min(p), task_scan_max(p));
1360 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1361}
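A hedged worked example of the adjustment above (all counts invented): with numa_scan_period = 1000ms, period_slot = DIV_ROUND_UP(1000, 10) = 100ms. If 800 of 1000 hinting faults were local, ratio = 800*10/1000 = 8 >= NUMA_PERIOD_THRESHOLD, so slot = 8-3 = 5 and the period grows by 500ms to 1500ms (clamped to task_scan_max()). If only 100 were local, ratio = 1 and the raw decrease is -(3-1)*100 = -200ms; that decrease is then scaled by the private share of the faults, so with 600 private out of 1000 it becomes -200*6/10 = -120ms and the period drops to 880ms (clamped to task_scan_min()).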
1362
1363static void task_numa_placement(struct task_struct *p)
1364{
1365 int seq, nid, max_nid = -1, max_group_nid = -1;
1366 unsigned long max_faults = 0, max_group_faults = 0;
1367 unsigned long fault_types[2] = { 0, 0 };
1368 spinlock_t *group_lock = NULL;
1369
839 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1370 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
840 if (p->numa_scan_seq == seq) 1371 if (p->numa_scan_seq == seq)
841 return; 1372 return;
842 p->numa_scan_seq = seq; 1373 p->numa_scan_seq = seq;
1374 p->numa_scan_period_max = task_scan_max(p);
1375
1376 /* If the task is part of a group prevent parallel updates to group stats */
1377 if (p->numa_group) {
1378 group_lock = &p->numa_group->lock;
1379 spin_lock(group_lock);
1380 }
1381
1382 /* Find the node with the highest number of faults */
1383 for_each_online_node(nid) {
1384 unsigned long faults = 0, group_faults = 0;
1385 int priv, i;
1386
1387 for (priv = 0; priv < 2; priv++) {
1388 long diff;
1389
1390 i = task_faults_idx(nid, priv);
1391 diff = -p->numa_faults[i];
1392
1393 /* Decay existing window, copy faults since last scan */
1394 p->numa_faults[i] >>= 1;
1395 p->numa_faults[i] += p->numa_faults_buffer[i];
1396 fault_types[priv] += p->numa_faults_buffer[i];
1397 p->numa_faults_buffer[i] = 0;
1398
1399 faults += p->numa_faults[i];
1400 diff += p->numa_faults[i];
1401 p->total_numa_faults += diff;
1402 if (p->numa_group) {
1403 /* safe because we can only change our own group */
1404 p->numa_group->faults[i] += diff;
1405 p->numa_group->total_faults += diff;
1406 group_faults += p->numa_group->faults[i];
1407 }
1408 }
843 1409
844 /* FIXME: Scheduling placement policy hints go here */ 1410 if (faults > max_faults) {
1411 max_faults = faults;
1412 max_nid = nid;
1413 }
1414
1415 if (group_faults > max_group_faults) {
1416 max_group_faults = group_faults;
1417 max_group_nid = nid;
1418 }
1419 }
1420
1421 update_task_scan_period(p, fault_types[0], fault_types[1]);
1422
1423 if (p->numa_group) {
1424 /*
1425 * If the preferred task and group nids are different,
1426 * iterate over the nodes again to find the best place.
1427 */
1428 if (max_nid != max_group_nid) {
1429 unsigned long weight, max_weight = 0;
1430
1431 for_each_online_node(nid) {
1432 weight = task_weight(p, nid) + group_weight(p, nid);
1433 if (weight > max_weight) {
1434 max_weight = weight;
1435 max_nid = nid;
1436 }
1437 }
1438 }
1439
1440 spin_unlock(group_lock);
1441 }
1442
1443 /* Preferred node as the node with the most faults */
1444 if (max_faults && max_nid != p->numa_preferred_nid) {
1445 /* Update the preferred nid and migrate task if possible */
1446 sched_setnuma(p, max_nid);
1447 numa_migrate_preferred(p);
1448 }
1449}
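A hedged worked example of the decay in the loop above (counts invented): if p->numa_faults[i] was 40 and 24 new faults landed in numa_faults_buffer[i] during this scan window, the stored value becomes 40/2 + 24 = 44 and diff = +4 is folded into total_numa_faults and, when a numa_group exists, into the group's faults[] and total_faults. Halving the old count every window turns the per-node statistic into an exponentially decaying average, so the preferred node tracks recent access patterns rather than the whole task lifetime.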
1450
1451static inline int get_numa_group(struct numa_group *grp)
1452{
1453 return atomic_inc_not_zero(&grp->refcount);
1454}
1455
1456static inline void put_numa_group(struct numa_group *grp)
1457{
1458 if (atomic_dec_and_test(&grp->refcount))
1459 kfree_rcu(grp, rcu);
1460}
1461
1462static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1463 int *priv)
1464{
1465 struct numa_group *grp, *my_grp;
1466 struct task_struct *tsk;
1467 bool join = false;
1468 int cpu = cpupid_to_cpu(cpupid);
1469 int i;
1470
1471 if (unlikely(!p->numa_group)) {
1472 unsigned int size = sizeof(struct numa_group) +
1473 2*nr_node_ids*sizeof(unsigned long);
1474
1475 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1476 if (!grp)
1477 return;
1478
1479 atomic_set(&grp->refcount, 1);
1480 spin_lock_init(&grp->lock);
1481 INIT_LIST_HEAD(&grp->task_list);
1482 grp->gid = p->pid;
1483
1484 for (i = 0; i < 2*nr_node_ids; i++)
1485 grp->faults[i] = p->numa_faults[i];
1486
1487 grp->total_faults = p->total_numa_faults;
1488
1489 list_add(&p->numa_entry, &grp->task_list);
1490 grp->nr_tasks++;
1491 rcu_assign_pointer(p->numa_group, grp);
1492 }
1493
1494 rcu_read_lock();
1495 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1496
1497 if (!cpupid_match_pid(tsk, cpupid))
1498 goto no_join;
1499
1500 grp = rcu_dereference(tsk->numa_group);
1501 if (!grp)
1502 goto no_join;
1503
1504 my_grp = p->numa_group;
1505 if (grp == my_grp)
1506 goto no_join;
1507
1508 /*
1509 * Only join the other group if it's bigger; if we're the bigger group,
1510 * the other task will join us.
1511 */
1512 if (my_grp->nr_tasks > grp->nr_tasks)
1513 goto no_join;
1514
1515 /*
1516 * Tie-break on the grp address.
1517 */
1518 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1519 goto no_join;
1520
1521 /* Always join threads in the same process. */
1522 if (tsk->mm == current->mm)
1523 join = true;
1524
1525 /* Simple filter to avoid false positives due to PID collisions */
1526 if (flags & TNF_SHARED)
1527 join = true;
1528
1529 /* Update priv based on whether false sharing was detected */
1530 *priv = !join;
1531
1532 if (join && !get_numa_group(grp))
1533 goto no_join;
1534
1535 rcu_read_unlock();
1536
1537 if (!join)
1538 return;
1539
1540 double_lock(&my_grp->lock, &grp->lock);
1541
1542 for (i = 0; i < 2*nr_node_ids; i++) {
1543 my_grp->faults[i] -= p->numa_faults[i];
1544 grp->faults[i] += p->numa_faults[i];
1545 }
1546 my_grp->total_faults -= p->total_numa_faults;
1547 grp->total_faults += p->total_numa_faults;
1548
1549 list_move(&p->numa_entry, &grp->task_list);
1550 my_grp->nr_tasks--;
1551 grp->nr_tasks++;
1552
1553 spin_unlock(&my_grp->lock);
1554 spin_unlock(&grp->lock);
1555
1556 rcu_assign_pointer(p->numa_group, grp);
1557
1558 put_numa_group(my_grp);
1559 return;
1560
1561no_join:
1562 rcu_read_unlock();
1563 return;
1564}
1565
1566void task_numa_free(struct task_struct *p)
1567{
1568 struct numa_group *grp = p->numa_group;
1569 int i;
1570 void *numa_faults = p->numa_faults;
1571
1572 if (grp) {
1573 spin_lock(&grp->lock);
1574 for (i = 0; i < 2*nr_node_ids; i++)
1575 grp->faults[i] -= p->numa_faults[i];
1576 grp->total_faults -= p->total_numa_faults;
1577
1578 list_del(&p->numa_entry);
1579 grp->nr_tasks--;
1580 spin_unlock(&grp->lock);
1581 rcu_assign_pointer(p->numa_group, NULL);
1582 put_numa_group(grp);
1583 }
1584
1585 p->numa_faults = NULL;
1586 p->numa_faults_buffer = NULL;
1587 kfree(numa_faults);
845} 1588}
846 1589
847/* 1590/*
848 * Got a PROT_NONE fault for a page on @node. 1591 * Got a PROT_NONE fault for a page on @node.
849 */ 1592 */
850void task_numa_fault(int node, int pages, bool migrated) 1593void task_numa_fault(int last_cpupid, int node, int pages, int flags)
851{ 1594{
852 struct task_struct *p = current; 1595 struct task_struct *p = current;
1596 bool migrated = flags & TNF_MIGRATED;
1597 int priv;
853 1598
854 if (!numabalancing_enabled) 1599 if (!numabalancing_enabled)
855 return; 1600 return;
856 1601
857 /* FIXME: Allocate task-specific structure for placement policy here */ 1602 /* for example, ksmd faulting in a user's mm */
1603 if (!p->mm)
1604 return;
1605
1606 /* Do not worry about placement if exiting */
1607 if (p->state == TASK_DEAD)
1608 return;
1609
1610 /* Allocate buffer to track faults on a per-node basis */
1611 if (unlikely(!p->numa_faults)) {
1612 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1613
1614 /* numa_faults and numa_faults_buffer share the allocation */
1615 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1616 if (!p->numa_faults)
1617 return;
1618
1619 BUG_ON(p->numa_faults_buffer);
1620 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1621 p->total_numa_faults = 0;
1622 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1623 }
858 1624
859 /* 1625 /*
860 * If pages are properly placed (did not migrate) then scan slower. 1626 * First accesses are treated as private, otherwise consider accesses
861 * This is reset periodically in case of phase changes 1627 * to be private if the accessing pid has not changed
862 */ 1628 */
863 if (!migrated) 1629 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
864 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, 1630 priv = 1;
865 p->numa_scan_period + jiffies_to_msecs(10)); 1631 } else {
1632 priv = cpupid_match_pid(p, last_cpupid);
1633 if (!priv && !(flags & TNF_NO_GROUP))
1634 task_numa_group(p, last_cpupid, flags, &priv);
1635 }
866 1636
867 task_numa_placement(p); 1637 task_numa_placement(p);
1638
1639 /*
1640 * Retry task to preferred node migration periodically, in case it
1641 * previously failed, or the scheduler moved us.
1642 */
1643 if (time_after(jiffies, p->numa_migrate_retry))
1644 numa_migrate_preferred(p);
1645
1646 if (migrated)
1647 p->numa_pages_migrated += pages;
1648
1649 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1650 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
868} 1651}
869 1652
870static void reset_ptenuma_scan(struct task_struct *p) 1653static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1667,7 @@ void task_numa_work(struct callback_head *work)
884 struct mm_struct *mm = p->mm; 1667 struct mm_struct *mm = p->mm;
885 struct vm_area_struct *vma; 1668 struct vm_area_struct *vma;
886 unsigned long start, end; 1669 unsigned long start, end;
1670 unsigned long nr_pte_updates = 0;
887 long pages; 1671 long pages;
888 1672
889 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 1673 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1684,9 @@ void task_numa_work(struct callback_head *work)
900 if (p->flags & PF_EXITING) 1684 if (p->flags & PF_EXITING)
901 return; 1685 return;
902 1686
903 /* 1687 if (!mm->numa_next_scan) {
904 * We do not care about task placement until a task runs on a node 1688 mm->numa_next_scan = now +
905 * other than the first one used by the address space. This is 1689 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
906 * largely because migrations are driven by what CPU the task
907 * is running on. If it's never scheduled on another node, it'll
908 * not migrate so why bother trapping the fault.
909 */
910 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
911 mm->first_nid = numa_node_id();
912 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
913 /* Are we running on a new node yet? */
914 if (numa_node_id() == mm->first_nid &&
915 !sched_feat_numa(NUMA_FORCE))
916 return;
917
918 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
919 }
920
921 /*
922 * Reset the scan period if enough time has gone by. Objective is that
923 * scanning will be reduced if pages are properly placed. As tasks
924 * can enter different phases this needs to be re-examined. Lacking
925 * proper tracking of reference behaviour, this blunt hammer is used.
926 */
927 migrate = mm->numa_next_reset;
928 if (time_after(now, migrate)) {
929 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
930 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
931 xchg(&mm->numa_next_reset, next_scan);
932 } 1690 }
933 1691
934 /* 1692 /*
@@ -938,20 +1696,20 @@ void task_numa_work(struct callback_head *work)
938 if (time_before(now, migrate)) 1696 if (time_before(now, migrate))
939 return; 1697 return;
940 1698
941 if (p->numa_scan_period == 0) 1699 if (p->numa_scan_period == 0) {
942 p->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1700 p->numa_scan_period_max = task_scan_max(p);
1701 p->numa_scan_period = task_scan_min(p);
1702 }
943 1703
944 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 1704 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
945 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) 1705 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
946 return; 1706 return;
947 1707
948 /* 1708 /*
949 * Do not set pte_numa if the current running node is rate-limited. 1709 * Delay this task enough that another task of this mm will likely win
950 * This loses statistics on the fault but if we are unwilling to 1710 * the next time around.
951 * migrate to this node, it is less likely we can do useful work
952 */ 1711 */
953 if (migrate_ratelimited(numa_node_id())) 1712 p->node_stamp += 2 * TICK_NSEC;
954 return;
955 1713
956 start = mm->numa_scan_offset; 1714 start = mm->numa_scan_offset;
957 pages = sysctl_numa_balancing_scan_size; 1715 pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1725,39 @@ void task_numa_work(struct callback_head *work)
967 vma = mm->mmap; 1725 vma = mm->mmap;
968 } 1726 }
969 for (; vma; vma = vma->vm_next) { 1727 for (; vma; vma = vma->vm_next) {
970 if (!vma_migratable(vma)) 1728 if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
971 continue; 1729 continue;
972 1730
973 /* Skip small VMAs. They are not likely to be of relevance */ 1731 /*
974 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 1732 * Shared library pages mapped by multiple processes are not
1733 * migrated as it is expected they are cache replicated. Avoid
1734 * hinting faults in read-only file-backed mappings or the vdso
1735 * as migrating the pages will be of marginal benefit.
1736 */
1737 if (!vma->vm_mm ||
1738 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
1739 continue;
1740
1741 /*
1742 * Skip inaccessible VMAs to avoid any confusion between
1743 * PROT_NONE and NUMA hinting ptes
1744 */
1745 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
975 continue; 1746 continue;
976 1747
977 do { 1748 do {
978 start = max(start, vma->vm_start); 1749 start = max(start, vma->vm_start);
979 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1750 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
980 end = min(end, vma->vm_end); 1751 end = min(end, vma->vm_end);
981 pages -= change_prot_numa(vma, start, end); 1752 nr_pte_updates += change_prot_numa(vma, start, end);
1753
1754 /*
1755 * Scan sysctl_numa_balancing_scan_size but ensure that
1756 * at least one PTE is updated so that unused virtual
1757 * address space is quickly skipped.
1758 */
1759 if (nr_pte_updates)
1760 pages -= (end - start) >> PAGE_SHIFT;
982 1761
983 start = end; 1762 start = end;
984 if (pages <= 0) 1763 if (pages <= 0)
@@ -988,10 +1767,10 @@ void task_numa_work(struct callback_head *work)
988 1767
989out: 1768out:
990 /* 1769 /*
991 * It is possible to reach the end of the VMA list but the last few VMAs are 1770 * It is possible to reach the end of the VMA list but the last few
992 * not guaranteed to be vma_migratable. If they are not, we would find the 1771 * VMAs are not guaranteed to be vma_migratable. If they are not, we
993 * !migratable VMA on the next scan but not reset the scanner to the start 1772 * would find the !migratable VMA on the next scan but not reset the
994 * so check it now. 1773 * scanner to the start so check it now.
995 */ 1774 */
996 if (vma) 1775 if (vma)
997 mm->numa_scan_offset = start; 1776 mm->numa_scan_offset = start;
@@ -1025,8 +1804,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1025 1804
1026 if (now - curr->node_stamp > period) { 1805 if (now - curr->node_stamp > period) {
1027 if (!curr->node_stamp) 1806 if (!curr->node_stamp)
1028 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1807 curr->numa_scan_period = task_scan_min(curr);
1029 curr->node_stamp = now; 1808 curr->node_stamp += period;
1030 1809
1031 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 1810 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1032 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ 1811 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1817,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1038static void task_tick_numa(struct rq *rq, struct task_struct *curr) 1817static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1039{ 1818{
1040} 1819}
1820
1821static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1822{
1823}
1824
1825static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1826{
1827}
1041#endif /* CONFIG_NUMA_BALANCING */ 1828#endif /* CONFIG_NUMA_BALANCING */
1042 1829
1043static void 1830static void
@@ -1047,8 +1834,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1047 if (!parent_entity(se)) 1834 if (!parent_entity(se))
1048 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 1835 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1049#ifdef CONFIG_SMP 1836#ifdef CONFIG_SMP
1050 if (entity_is_task(se)) 1837 if (entity_is_task(se)) {
1051 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 1838 struct rq *rq = rq_of(cfs_rq);
1839
1840 account_numa_enqueue(rq, task_of(se));
1841 list_add(&se->group_node, &rq->cfs_tasks);
1842 }
1052#endif 1843#endif
1053 cfs_rq->nr_running++; 1844 cfs_rq->nr_running++;
1054} 1845}
@@ -1059,8 +1850,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1059 update_load_sub(&cfs_rq->load, se->load.weight); 1850 update_load_sub(&cfs_rq->load, se->load.weight);
1060 if (!parent_entity(se)) 1851 if (!parent_entity(se))
1061 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 1852 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1062 if (entity_is_task(se)) 1853 if (entity_is_task(se)) {
1854 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1063 list_del_init(&se->group_node); 1855 list_del_init(&se->group_node);
1856 }
1064 cfs_rq->nr_running--; 1857 cfs_rq->nr_running--;
1065} 1858}
1066 1859
@@ -1378,7 +2171,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1378 long contrib; 2171 long contrib;
1379 2172
1380 /* The fraction of a cpu used by this cfs_rq */ 2173 /* The fraction of a cpu used by this cfs_rq */
1381 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, 2174 contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
1382 sa->runnable_avg_period + 1); 2175 sa->runnable_avg_period + 1);
1383 contrib -= cfs_rq->tg_runnable_contrib; 2176 contrib -= cfs_rq->tg_runnable_contrib;
1384 2177
@@ -2070,13 +2863,14 @@ static inline bool cfs_bandwidth_used(void)
2070 return static_key_false(&__cfs_bandwidth_used); 2863 return static_key_false(&__cfs_bandwidth_used);
2071} 2864}
2072 2865
2073void account_cfs_bandwidth_used(int enabled, int was_enabled) 2866void cfs_bandwidth_usage_inc(void)
2074{ 2867{
2075 /* only need to count groups transitioning between enabled/!enabled */ 2868 static_key_slow_inc(&__cfs_bandwidth_used);
2076 if (enabled && !was_enabled) 2869}
2077 static_key_slow_inc(&__cfs_bandwidth_used); 2870
2078 else if (!enabled && was_enabled) 2871void cfs_bandwidth_usage_dec(void)
2079 static_key_slow_dec(&__cfs_bandwidth_used); 2872{
2873 static_key_slow_dec(&__cfs_bandwidth_used);
2080} 2874}
2081#else /* HAVE_JUMP_LABEL */ 2875#else /* HAVE_JUMP_LABEL */
2082static bool cfs_bandwidth_used(void) 2876static bool cfs_bandwidth_used(void)
@@ -2084,7 +2878,8 @@ static bool cfs_bandwidth_used(void)
2084 return true; 2878 return true;
2085} 2879}
2086 2880
2087void account_cfs_bandwidth_used(int enabled, int was_enabled) {} 2881void cfs_bandwidth_usage_inc(void) {}
2882void cfs_bandwidth_usage_dec(void) {}
2088#endif /* HAVE_JUMP_LABEL */ 2883#endif /* HAVE_JUMP_LABEL */
2089 2884
2090/* 2885/*
@@ -2213,8 +3008,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2213 } 3008 }
2214} 3009}
2215 3010
2216static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 3011static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
2217 unsigned long delta_exec)
2218{ 3012{
2219 /* dock delta_exec before expiring quota (as it could span periods) */ 3013 /* dock delta_exec before expiring quota (as it could span periods) */
2220 cfs_rq->runtime_remaining -= delta_exec; 3014 cfs_rq->runtime_remaining -= delta_exec;
@@ -2232,7 +3026,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2232} 3026}
2233 3027
2234static __always_inline 3028static __always_inline
2235void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) 3029void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
2236{ 3030{
2237 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) 3031 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2238 return; 3032 return;
@@ -2335,6 +3129,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2335 cfs_rq->throttled_clock = rq_clock(rq); 3129 cfs_rq->throttled_clock = rq_clock(rq);
2336 raw_spin_lock(&cfs_b->lock); 3130 raw_spin_lock(&cfs_b->lock);
2337 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3131 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3132 if (!cfs_b->timer_active)
3133 __start_cfs_bandwidth(cfs_b);
2338 raw_spin_unlock(&cfs_b->lock); 3134 raw_spin_unlock(&cfs_b->lock);
2339} 3135}
2340 3136
@@ -2448,6 +3244,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2448 if (idle) 3244 if (idle)
2449 goto out_unlock; 3245 goto out_unlock;
2450 3246
3247 /*
3248 * if we have relooped after returning idle once, we need to update our
3249 * status as actually running, so that other cpus doing
3250 * __start_cfs_bandwidth will stop trying to cancel us.
3251 */
3252 cfs_b->timer_active = 1;
3253
2451 __refill_cfs_bandwidth_runtime(cfs_b); 3254 __refill_cfs_bandwidth_runtime(cfs_b);
2452 3255
2453 if (!throttled) { 3256 if (!throttled) {
@@ -2508,7 +3311,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2508/* how long we wait to gather additional slack before distributing */ 3311/* how long we wait to gather additional slack before distributing */
2509static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; 3312static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2510 3313
2511/* are we near the end of the current quota period? */ 3314/*
3315 * Are we near the end of the current quota period?
3316 *
3317 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3318 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3319 * migrate_hrtimers, base is never cleared, so we are fine.
3320 */
2512static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) 3321static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2513{ 3322{
2514 struct hrtimer *refresh_timer = &cfs_b->period_timer; 3323 struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2584,10 +3393,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2584 u64 expires; 3393 u64 expires;
2585 3394
2586 /* confirm we're still not at a refresh boundary */ 3395 /* confirm we're still not at a refresh boundary */
2587 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) 3396 raw_spin_lock(&cfs_b->lock);
3397 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3398 raw_spin_unlock(&cfs_b->lock);
2588 return; 3399 return;
3400 }
2589 3401
2590 raw_spin_lock(&cfs_b->lock);
2591 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3402 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2592 runtime = cfs_b->runtime; 3403 runtime = cfs_b->runtime;
2593 cfs_b->runtime = 0; 3404 cfs_b->runtime = 0;
@@ -2708,11 +3519,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2708 * (timer_active==0 becomes visible before the hrtimer call-back 3519 * (timer_active==0 becomes visible before the hrtimer call-back
2709 * terminates). In either case we ensure that it's re-programmed 3520 * terminates). In either case we ensure that it's re-programmed
2710 */ 3521 */
2711 while (unlikely(hrtimer_active(&cfs_b->period_timer))) { 3522 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3523 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3524 /* bounce the lock to allow do_sched_cfs_period_timer to run */
2712 raw_spin_unlock(&cfs_b->lock); 3525 raw_spin_unlock(&cfs_b->lock);
2713 /* ensure cfs_b->lock is available while we wait */ 3526 cpu_relax();
2714 hrtimer_cancel(&cfs_b->period_timer);
2715
2716 raw_spin_lock(&cfs_b->lock); 3527 raw_spin_lock(&cfs_b->lock);
2717 /* if someone else restarted the timer then we're done */ 3528 /* if someone else restarted the timer then we're done */
2718 if (cfs_b->timer_active) 3529 if (cfs_b->timer_active)
@@ -2755,8 +3566,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2755 return rq_clock_task(rq_of(cfs_rq)); 3566 return rq_clock_task(rq_of(cfs_rq));
2756} 3567}
2757 3568
2758static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 3569static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
2759 unsigned long delta_exec) {}
2760static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3570static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2761static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3571static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2762static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3572static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -3166,8 +3976,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3166} 3976}
3167#else 3977#else
3168 3978
3169static inline unsigned long effective_load(struct task_group *tg, int cpu, 3979static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3170 unsigned long wl, unsigned long wg)
3171{ 3980{
3172 return wl; 3981 return wl;
3173} 3982}
@@ -3420,11 +4229,10 @@ done:
3420 * preempt must be disabled. 4229 * preempt must be disabled.
3421 */ 4230 */
3422static int 4231static int
3423select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 4232select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
3424{ 4233{
3425 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4234 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3426 int cpu = smp_processor_id(); 4235 int cpu = smp_processor_id();
3427 int prev_cpu = task_cpu(p);
3428 int new_cpu = cpu; 4236 int new_cpu = cpu;
3429 int want_affine = 0; 4237 int want_affine = 0;
3430 int sync = wake_flags & WF_SYNC; 4238 int sync = wake_flags & WF_SYNC;
@@ -3904,9 +4712,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3904 4712
3905static unsigned long __read_mostly max_load_balance_interval = HZ/10; 4713static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3906 4714
4715enum fbq_type { regular, remote, all };
4716
3907#define LBF_ALL_PINNED 0x01 4717#define LBF_ALL_PINNED 0x01
3908#define LBF_NEED_BREAK 0x02 4718#define LBF_NEED_BREAK 0x02
3909#define LBF_SOME_PINNED 0x04 4719#define LBF_DST_PINNED 0x04
4720#define LBF_SOME_PINNED 0x08
3910 4721
3911struct lb_env { 4722struct lb_env {
3912 struct sched_domain *sd; 4723 struct sched_domain *sd;
@@ -3929,6 +4740,8 @@ struct lb_env {
3929 unsigned int loop; 4740 unsigned int loop;
3930 unsigned int loop_break; 4741 unsigned int loop_break;
3931 unsigned int loop_max; 4742 unsigned int loop_max;
4743
4744 enum fbq_type fbq_type;
3932}; 4745};
3933 4746
3934/* 4747/*
@@ -3975,6 +4788,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3975 return delta < (s64)sysctl_sched_migration_cost; 4788 return delta < (s64)sysctl_sched_migration_cost;
3976} 4789}
3977 4790
4791#ifdef CONFIG_NUMA_BALANCING
4792/* Returns true if the destination node has incurred more faults */
4793static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4794{
4795 int src_nid, dst_nid;
4796
4797 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4798 !(env->sd->flags & SD_NUMA)) {
4799 return false;
4800 }
4801
4802 src_nid = cpu_to_node(env->src_cpu);
4803 dst_nid = cpu_to_node(env->dst_cpu);
4804
4805 if (src_nid == dst_nid)
4806 return false;
4807
4808 /* Always encourage migration to the preferred node. */
4809 if (dst_nid == p->numa_preferred_nid)
4810 return true;
4811
4812 /* If both task and group weight improve, this move is a winner. */
4813 if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4814 group_weight(p, dst_nid) > group_weight(p, src_nid))
4815 return true;
4816
4817 return false;
4818}
4819
4820
4821static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4822{
4823 int src_nid, dst_nid;
4824
4825 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4826 return false;
4827
4828 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4829 return false;
4830
4831 src_nid = cpu_to_node(env->src_cpu);
4832 dst_nid = cpu_to_node(env->dst_cpu);
4833
4834 if (src_nid == dst_nid)
4835 return false;
4836
4837 /* Migrating away from the preferred node is always bad. */
4838 if (src_nid == p->numa_preferred_nid)
4839 return true;
4840
4841 /* If either task or group weight get worse, don't do it. */
4842 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4843 group_weight(p, dst_nid) < group_weight(p, src_nid))
4844 return true;
4845
4846 return false;
4847}
4848
4849#else
4850static inline bool migrate_improves_locality(struct task_struct *p,
4851 struct lb_env *env)
4852{
4853 return false;
4854}
4855
4856static inline bool migrate_degrades_locality(struct task_struct *p,
4857 struct lb_env *env)
4858{
4859 return false;
4860}
4861#endif
4862
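The two helpers added in this hunk gate load-balancer migrations on NUMA hinting-fault statistics: a move is encouraged when it lands on the task's preferred node or on a node with a higher recorded fault weight, and resisted when it would leave the preferred node or lower either the task or group weight. A minimal user-space sketch of that decision follows; the toy_task structure and its faults[] array are illustrative stand-ins for the kernel's task_weight()/group_weight(), not the real data structures.

/*
 * Rough sketch of the locality checks above. The kernel derives
 * task_weight()/group_weight() from per-node NUMA hinting-fault statistics;
 * a plain faults[] array stands in for that here.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

struct toy_task {
	int preferred_nid;
	unsigned long faults[MAX_NODES];	/* stand-in for task_weight() */
};

static bool improves_locality(const struct toy_task *p, int src, int dst)
{
	if (src == dst)
		return false;
	if (dst == p->preferred_nid)		/* always pull toward the preferred node */
		return true;
	return p->faults[dst] > p->faults[src];	/* more faults recorded on dst */
}

static bool degrades_locality(const struct toy_task *p, int src, int dst)
{
	if (src == dst)
		return false;
	if (src == p->preferred_nid)		/* never push off the preferred node */
		return true;
	return p->faults[dst] < p->faults[src];
}

int main(void)
{
	struct toy_task p = { .preferred_nid = 1, .faults = { 10, 40, 5, 0 } };

	printf("0 -> 1 improves: %d\n", improves_locality(&p, 0, 1));	/* 1 */
	printf("1 -> 2 degrades: %d\n", degrades_locality(&p, 1, 2));	/* 1 */
	printf("2 -> 0 improves: %d\n", improves_locality(&p, 2, 0));	/* 1 */
	return 0;
}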
3978/* 4863/*
3979 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 4864 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3980 */ 4865 */
@@ -3997,6 +4882,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3997 4882
3998 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 4883 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3999 4884
4885 env->flags |= LBF_SOME_PINNED;
4886
4000 /* 4887 /*
4001 * Remember if this task can be migrated to any other cpu in 4888 * Remember if this task can be migrated to any other cpu in
4002 * our sched_group. We may want to revisit it if we couldn't 4889 * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4892,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4005 * Also avoid computing new_dst_cpu if we have already computed 4892 * Also avoid computing new_dst_cpu if we have already computed
4006 * one in current iteration. 4893 * one in current iteration.
4007 */ 4894 */
4008 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 4895 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
4009 return 0; 4896 return 0;
4010 4897
4011 /* Prevent to re-select dst_cpu via env's cpus */ 4898 /* Prevent to re-select dst_cpu via env's cpus */
4012 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 4899 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4013 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 4900 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4014 env->flags |= LBF_SOME_PINNED; 4901 env->flags |= LBF_DST_PINNED;
4015 env->new_dst_cpu = cpu; 4902 env->new_dst_cpu = cpu;
4016 break; 4903 break;
4017 } 4904 }
@@ -4030,11 +4917,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4030 4917
4031 /* 4918 /*
4032 * Aggressive migration if: 4919 * Aggressive migration if:
4033 * 1) task is cache cold, or 4920 * 1) destination numa is preferred
4034 * 2) too many balance attempts have failed. 4921 * 2) task is cache cold, or
4922 * 3) too many balance attempts have failed.
4035 */ 4923 */
4036
4037 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 4924 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4925 if (!tsk_cache_hot)
4926 tsk_cache_hot = migrate_degrades_locality(p, env);
4927
4928 if (migrate_improves_locality(p, env)) {
4929#ifdef CONFIG_SCHEDSTATS
4930 if (tsk_cache_hot) {
4931 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4932 schedstat_inc(p, se.statistics.nr_forced_migrations);
4933 }
4934#endif
4935 return 1;
4936 }
4937
4038 if (!tsk_cache_hot || 4938 if (!tsk_cache_hot ||
4039 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 4939 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4040 4940
@@ -4077,8 +4977,6 @@ static int move_one_task(struct lb_env *env)
4077 return 0; 4977 return 0;
4078} 4978}
4079 4979
4080static unsigned long task_h_load(struct task_struct *p);
4081
4082static const unsigned int sched_nr_migrate_break = 32; 4980static const unsigned int sched_nr_migrate_break = 32;
4083 4981
4084/* 4982/*
@@ -4291,6 +5189,10 @@ struct sg_lb_stats {
4291 unsigned int group_weight; 5189 unsigned int group_weight;
4292 int group_imb; /* Is there an imbalance in the group ? */ 5190 int group_imb; /* Is there an imbalance in the group ? */
4293 int group_has_capacity; /* Is there extra capacity in the group? */ 5191 int group_has_capacity; /* Is there extra capacity in the group? */
5192#ifdef CONFIG_NUMA_BALANCING
5193 unsigned int nr_numa_running;
5194 unsigned int nr_preferred_running;
5195#endif
4294}; 5196};
4295 5197
4296/* 5198/*
@@ -4330,7 +5232,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4330/** 5232/**
4331 * get_sd_load_idx - Obtain the load index for a given sched domain. 5233 * get_sd_load_idx - Obtain the load index for a given sched domain.
4332 * @sd: The sched_domain whose load_idx is to be obtained. 5234 * @sd: The sched_domain whose load_idx is to be obtained.
4333 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 5235 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
4334 * 5236 *
4335 * Return: The load index. 5237 * Return: The load index.
4336 */ 5238 */
@@ -4447,7 +5349,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4447{ 5349{
4448 struct sched_domain *child = sd->child; 5350 struct sched_domain *child = sd->child;
4449 struct sched_group *group, *sdg = sd->groups; 5351 struct sched_group *group, *sdg = sd->groups;
4450 unsigned long power; 5352 unsigned long power, power_orig;
4451 unsigned long interval; 5353 unsigned long interval;
4452 5354
4453 interval = msecs_to_jiffies(sd->balance_interval); 5355 interval = msecs_to_jiffies(sd->balance_interval);
@@ -4459,7 +5361,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4459 return; 5361 return;
4460 } 5362 }
4461 5363
4462 power = 0; 5364 power_orig = power = 0;
4463 5365
4464 if (child->flags & SD_OVERLAP) { 5366 if (child->flags & SD_OVERLAP) {
4465 /* 5367 /*
@@ -4467,8 +5369,33 @@ void update_group_power(struct sched_domain *sd, int cpu)
4467 * span the current group. 5369 * span the current group.
4468 */ 5370 */
4469 5371
4470 for_each_cpu(cpu, sched_group_cpus(sdg)) 5372 for_each_cpu(cpu, sched_group_cpus(sdg)) {
4471 power += power_of(cpu); 5373 struct sched_group_power *sgp;
5374 struct rq *rq = cpu_rq(cpu);
5375
5376 /*
5377 * build_sched_domains() -> init_sched_groups_power()
5378 * gets here before we've attached the domains to the
5379 * runqueues.
5380 *
5381 * Use power_of(), which is set irrespective of domains
5382 * in update_cpu_power().
5383 *
5384 * This avoids power/power_orig from being 0 and
5385 * causing divide-by-zero issues on boot.
5386 *
5387 * Runtime updates will correct power_orig.
5388 */
5389 if (unlikely(!rq->sd)) {
5390 power_orig += power_of(cpu);
5391 power += power_of(cpu);
5392 continue;
5393 }
5394
5395 sgp = rq->sd->groups->sgp;
5396 power_orig += sgp->power_orig;
5397 power += sgp->power;
5398 }
4472 } else { 5399 } else {
4473 /* 5400 /*
4474 * !SD_OVERLAP domains can assume that child groups 5401 * !SD_OVERLAP domains can assume that child groups
@@ -4477,12 +5404,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
4477 5404
4478 group = child->groups; 5405 group = child->groups;
4479 do { 5406 do {
5407 power_orig += group->sgp->power_orig;
4480 power += group->sgp->power; 5408 power += group->sgp->power;
4481 group = group->next; 5409 group = group->next;
4482 } while (group != child->groups); 5410 } while (group != child->groups);
4483 } 5411 }
4484 5412
4485 sdg->sgp->power_orig = sdg->sgp->power = power; 5413 sdg->sgp->power_orig = power_orig;
5414 sdg->sgp->power = power;
4486} 5415}
4487 5416
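The hunk above sums power and power_orig from child groups, and falls back to the per-CPU value while the domain hierarchy is still being attached so neither total can end up zero and later divisions stay safe. A rough standalone sketch of that aggregation pattern, with toy structures standing in for sched_group_power and struct rq:

/*
 * Sketch of the aggregation above: sum each CPU's contribution, falling back
 * to the raw per-CPU value while the topology is still being built so the
 * total can never be zero. Structures and values are illustrative.
 */
#include <stdio.h>

struct toy_group_power { unsigned int power, power_orig; };
struct toy_cpu {
	unsigned int cpu_power;			/* always valid */
	struct toy_group_power *sgp;		/* NULL until domains attach */
};

int main(void)
{
	struct toy_group_power g1 = { .power = 900, .power_orig = 1024 };
	struct toy_cpu cpus[] = {
		{ .cpu_power = 1024, .sgp = &g1 },
		{ .cpu_power = 1024, .sgp = NULL },	/* not attached yet */
	};
	unsigned int power = 0, power_orig = 0;

	for (unsigned i = 0; i < 2; i++) {
		if (!cpus[i].sgp) {			/* boot-time fallback */
			power += cpus[i].cpu_power;
			power_orig += cpus[i].cpu_power;
			continue;
		}
		power += cpus[i].sgp->power;
		power_orig += cpus[i].sgp->power_orig;
	}

	/* power_orig can now safely be used as a divisor */
	printf("power=%u power_orig=%u\n", power, power_orig);	/* 1924 2048 */
	return 0;
}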
4488/* 5417/*
@@ -4526,13 +5455,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4526 * cpu 3 and leave one of the cpus in the second group unused. 5455 * cpu 3 and leave one of the cpus in the second group unused.
4527 * 5456 *
4528 * The current solution to this issue is detecting the skew in the first group 5457 * The current solution to this issue is detecting the skew in the first group
4529 * by noticing it has a cpu that is overloaded while the remaining cpus are 5458 * by noticing the lower domain failed to reach balance and had difficulty
4530 * idle -- or rather, there's a distinct imbalance in the cpus; see 5459 * moving tasks due to affinity constraints.
4531 * sg_imbalanced().
4532 * 5460 *
4533 * When this is so detected; this group becomes a candidate for busiest; see 5461 * When this is so detected; this group becomes a candidate for busiest; see
4534 * update_sd_pick_busiest(). And calculcate_imbalance() and 5462 * update_sd_pick_busiest(). And calculate_imbalance() and
4535 * find_busiest_group() avoid some of the usual balance conditional to allow it 5463 * find_busiest_group() avoid some of the usual balance conditions to allow it
4536 * to create an effective group imbalance. 5464 * to create an effective group imbalance.
4537 * 5465 *
4538 * This is a somewhat tricky proposition since the next run might not find the 5466 * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +5468,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4540 * subtle and fragile situation. 5468 * subtle and fragile situation.
4541 */ 5469 */
4542 5470
4543struct sg_imb_stats { 5471static inline int sg_imbalanced(struct sched_group *group)
4544 unsigned long max_nr_running, min_nr_running;
4545 unsigned long max_cpu_load, min_cpu_load;
4546};
4547
4548static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549{ 5472{
4550 sgi->max_cpu_load = sgi->max_nr_running = 0UL; 5473 return group->sgp->imbalance;
4551 sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552} 5474}
4553 5475
4554static inline void 5476/*
4555update_sg_imb_stats(struct sg_imb_stats *sgi, 5477 * Compute the group capacity.
4556 unsigned long load, unsigned long nr_running) 5478 *
5479 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5480 * first dividing out the smt factor and computing the actual number of cores
5481 * and limit power unit capacity with that.
5482 */
5483static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
4557{ 5484{
4558 if (load > sgi->max_cpu_load) 5485 unsigned int capacity, smt, cpus;
4559 sgi->max_cpu_load = load; 5486 unsigned int power, power_orig;
4560 if (sgi->min_cpu_load > load)
4561 sgi->min_cpu_load = load;
4562 5487
4563 if (nr_running > sgi->max_nr_running) 5488 power = group->sgp->power;
4564 sgi->max_nr_running = nr_running; 5489 power_orig = group->sgp->power_orig;
4565 if (sgi->min_nr_running > nr_running) 5490 cpus = group->group_weight;
4566 sgi->min_nr_running = nr_running;
4567}
4568 5491
4569static inline int 5492 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
4570sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) 5493 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
4571{ 5494 capacity = cpus / smt; /* cores */
4572 /*
4573 * Consider the group unbalanced when the imbalance is larger
4574 * than the average weight of a task.
4575 *
4576 * APZ: with cgroup the avg task weight can vary wildly and
4577 * might not be a suitable number - should we keep a
4578 * normalized nr_running number somewhere that negates
4579 * the hierarchy?
4580 */
4581 if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 return 1;
4584 5495
4585 return 0; 5496 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5497 if (!capacity)
5498 capacity = fix_small_capacity(env->sd, group);
5499
5500 return capacity;
4586} 5501}
4587 5502
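sg_capacity() above first divides out the SMT factor so that N threads whose fractional per-thread power adds up past a core boundary cannot produce a "phantom" core. A worked example, assuming SCHED_POWER_SCALE of 1024 and the stock smt_gain of 1178 split across two siblings (~589 each); the numbers are illustrative, not measurements:

/*
 * Worked example of the sg_capacity() computation above for an SMT2 package
 * with 4 cores / 8 threads, each thread contributing 589 units of power_orig.
 */
#include <stdio.h>

#define SCHED_POWER_SCALE 1024U
#define DIV_ROUND_UP(n, d)       (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)  (((n) + (d) / 2) / (d))

int main(void)
{
	unsigned int cpus = 8;			/* 4 cores x 2 SMT threads */
	unsigned int power_orig = cpus * 589;	/* 4712 */
	unsigned int power = power_orig;

	/* naive rounding invents a 5th "phantom" core: 4712/1024 -> 5 */
	printf("naive capacity: %u\n", DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));

	/* divide out the SMT factor first, as sg_capacity() does */
	unsigned int smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);	/* 2 */
	unsigned int capacity = cpus / smt;					/* 4 cores */

	if (capacity > DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE))
		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

	printf("sg_capacity-style: %u\n", capacity);	/* 4 */
	return 0;
}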
4588/** 5503/**
@@ -4597,12 +5512,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4597 struct sched_group *group, int load_idx, 5512 struct sched_group *group, int load_idx,
4598 int local_group, struct sg_lb_stats *sgs) 5513 int local_group, struct sg_lb_stats *sgs)
4599{ 5514{
4600 struct sg_imb_stats sgi;
4601 unsigned long nr_running; 5515 unsigned long nr_running;
4602 unsigned long load; 5516 unsigned long load;
4603 int i; 5517 int i;
4604 5518
4605 init_sg_imb_stats(&sgi); 5519 memset(sgs, 0, sizeof(*sgs));
4606 5520
4607 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5521 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4608 struct rq *rq = cpu_rq(i); 5522 struct rq *rq = cpu_rq(i);
@@ -4610,24 +5524,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4610 nr_running = rq->nr_running; 5524 nr_running = rq->nr_running;
4611 5525
4612 /* Bias balancing toward cpus of our domain */ 5526 /* Bias balancing toward cpus of our domain */
4613 if (local_group) { 5527 if (local_group)
4614 load = target_load(i, load_idx); 5528 load = target_load(i, load_idx);
4615 } else { 5529 else
4616 load = source_load(i, load_idx); 5530 load = source_load(i, load_idx);
4617 update_sg_imb_stats(&sgi, load, nr_running);
4618 }
4619 5531
4620 sgs->group_load += load; 5532 sgs->group_load += load;
4621 sgs->sum_nr_running += nr_running; 5533 sgs->sum_nr_running += nr_running;
5534#ifdef CONFIG_NUMA_BALANCING
5535 sgs->nr_numa_running += rq->nr_numa_running;
5536 sgs->nr_preferred_running += rq->nr_preferred_running;
5537#endif
4622 sgs->sum_weighted_load += weighted_cpuload(i); 5538 sgs->sum_weighted_load += weighted_cpuload(i);
4623 if (idle_cpu(i)) 5539 if (idle_cpu(i))
4624 sgs->idle_cpus++; 5540 sgs->idle_cpus++;
4625 } 5541 }
4626 5542
4627 if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4628 time_after_eq(jiffies, group->sgp->next_update)))
4629 update_group_power(env->sd, env->dst_cpu);
4630
4631 /* Adjust by relative CPU power of the group */ 5543 /* Adjust by relative CPU power of the group */
4632 sgs->group_power = group->sgp->power; 5544 sgs->group_power = group->sgp->power;
4633 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; 5545 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -4635,16 +5547,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4635 if (sgs->sum_nr_running) 5547 if (sgs->sum_nr_running)
4636 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5548 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4637 5549
4638 sgs->group_imb = sg_imbalanced(sgs, &sgi);
4639
4640 sgs->group_capacity =
4641 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4642
4643 if (!sgs->group_capacity)
4644 sgs->group_capacity = fix_small_capacity(env->sd, group);
4645
4646 sgs->group_weight = group->group_weight; 5550 sgs->group_weight = group->group_weight;
4647 5551
5552 sgs->group_imb = sg_imbalanced(group);
5553 sgs->group_capacity = sg_capacity(env, group);
5554
4648 if (sgs->group_capacity > sgs->sum_nr_running) 5555 if (sgs->group_capacity > sgs->sum_nr_running)
4649 sgs->group_has_capacity = 1; 5556 sgs->group_has_capacity = 1;
4650} 5557}
@@ -4693,14 +5600,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4693 return false; 5600 return false;
4694} 5601}
4695 5602
5603#ifdef CONFIG_NUMA_BALANCING
5604static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5605{
5606 if (sgs->sum_nr_running > sgs->nr_numa_running)
5607 return regular;
5608 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5609 return remote;
5610 return all;
5611}
5612
5613static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5614{
5615 if (rq->nr_running > rq->nr_numa_running)
5616 return regular;
5617 if (rq->nr_running > rq->nr_preferred_running)
5618 return remote;
5619 return all;
5620}
5621#else
5622static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5623{
5624 return all;
5625}
5626
5627static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5628{
5629 return regular;
5630}
5631#endif /* CONFIG_NUMA_BALANCING */
5632
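fbq_classify_group()/fbq_classify_rq() above order groups and runqueues as regular < remote < all so that find_busiest_queue() can skip queues that are already better placed than the group it is pulling from. A small sketch of that classification, with a toy runqueue structure in place of struct rq:

/*
 * Sketch of the fbq_type classification. In the kernel the counts come from
 * rq->nr_numa_running and rq->nr_preferred_running; these values are made up.
 */
#include <stdio.h>

enum fbq_type { regular, remote, all };	/* ordered: regular < remote < all */

struct toy_rq {
	unsigned int nr_running;
	unsigned int nr_numa_running;		/* tasks with a preferred node */
	unsigned int nr_preferred_running;	/* of those, already on it */
};

static enum fbq_type classify(const struct toy_rq *rq)
{
	if (rq->nr_running > rq->nr_numa_running)
		return regular;		/* some tasks don't care about NUMA */
	if (rq->nr_running > rq->nr_preferred_running)
		return remote;		/* only NUMA tasks, some misplaced */
	return all;			/* everything already well placed */
}

int main(void)
{
	struct toy_rq rqs[] = {
		{ .nr_running = 3, .nr_numa_running = 1, .nr_preferred_running = 1 },
		{ .nr_running = 2, .nr_numa_running = 2, .nr_preferred_running = 1 },
		{ .nr_running = 2, .nr_numa_running = 2, .nr_preferred_running = 2 },
	};
	enum fbq_type busiest_type = remote;	/* say the group classified as remote */

	for (int i = 0; i < 3; i++) {
		enum fbq_type t = classify(&rqs[i]);
		/* find_busiest_queue() skips queues "better placed" than the group */
		printf("rq%d type=%d %s\n", i, t,
		       t > busiest_type ? "skipped" : "considered");
	}
	return 0;
}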
4696/** 5633/**
4697 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 5634 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4698 * @env: The load balancing environment. 5635 * @env: The load balancing environment.
4699 * @balance: Should we balance.
4700 * @sds: variable to hold the statistics for this sched_domain. 5636 * @sds: variable to hold the statistics for this sched_domain.
4701 */ 5637 */
4702static inline void update_sd_lb_stats(struct lb_env *env, 5638static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
4703 struct sd_lb_stats *sds)
4704{ 5639{
4705 struct sched_domain *child = env->sd->child; 5640 struct sched_domain *child = env->sd->child;
4706 struct sched_group *sg = env->sd->groups; 5641 struct sched_group *sg = env->sd->groups;
@@ -4720,11 +5655,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4720 if (local_group) { 5655 if (local_group) {
4721 sds->local = sg; 5656 sds->local = sg;
4722 sgs = &sds->local_stat; 5657 sgs = &sds->local_stat;
5658
5659 if (env->idle != CPU_NEWLY_IDLE ||
5660 time_after_eq(jiffies, sg->sgp->next_update))
5661 update_group_power(env->sd, env->dst_cpu);
4723 } 5662 }
4724 5663
4725 memset(sgs, 0, sizeof(*sgs));
4726 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 5664 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4727 5665
5666 if (local_group)
5667 goto next_group;
5668
4728 /* 5669 /*
4729 * In case the child domain prefers tasks go to siblings 5670 * In case the child domain prefers tasks go to siblings
4730 * first, lower the sg capacity to one so that we'll try 5671 * first, lower the sg capacity to one so that we'll try
@@ -4735,21 +5676,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4735 * heaviest group when it is already under-utilized (possible 5676 * heaviest group when it is already under-utilized (possible
4736 * with a large weight task outweighs the tasks on the system). 5677 * with a large weight task outweighs the tasks on the system).
4737 */ 5678 */
4738 if (prefer_sibling && !local_group && 5679 if (prefer_sibling && sds->local &&
4739 sds->local && sds->local_stat.group_has_capacity) 5680 sds->local_stat.group_has_capacity)
4740 sgs->group_capacity = min(sgs->group_capacity, 1U); 5681 sgs->group_capacity = min(sgs->group_capacity, 1U);
4741 5682
4742 /* Now, start updating sd_lb_stats */ 5683 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
4743 sds->total_load += sgs->group_load;
4744 sds->total_pwr += sgs->group_power;
4745
4746 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4747 sds->busiest = sg; 5684 sds->busiest = sg;
4748 sds->busiest_stat = *sgs; 5685 sds->busiest_stat = *sgs;
4749 } 5686 }
4750 5687
5688next_group:
5689 /* Now, start updating sd_lb_stats */
5690 sds->total_load += sgs->group_load;
5691 sds->total_pwr += sgs->group_power;
5692
4751 sg = sg->next; 5693 sg = sg->next;
4752 } while (sg != env->sd->groups); 5694 } while (sg != env->sd->groups);
5695
5696 if (env->sd->flags & SD_NUMA)
5697 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4753} 5698}
4754 5699
4755/** 5700/**
@@ -5053,15 +5998,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
5053 int i; 5998 int i;
5054 5999
5055 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 6000 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5056 unsigned long power = power_of(i); 6001 unsigned long power, capacity, wl;
5057 unsigned long capacity = DIV_ROUND_CLOSEST(power, 6002 enum fbq_type rt;
5058 SCHED_POWER_SCALE); 6003
5059 unsigned long wl; 6004 rq = cpu_rq(i);
6005 rt = fbq_classify_rq(rq);
5060 6006
6007 /*
6008 * We classify groups/runqueues into three groups:
6009 * - regular: there are !numa tasks
6010 * - remote: there are numa tasks that run on the 'wrong' node
6011 * - all: there is no distinction
6012 *
6013 * In order to avoid migrating ideally placed numa tasks,
 6014 * ignore those when there are better options.
6015 *
6016 * If we ignore the actual busiest queue to migrate another
6017 * task, the next balance pass can still reduce the busiest
6018 * queue by moving tasks around inside the node.
6019 *
6020 * If we cannot move enough load due to this classification
6021 * the next pass will adjust the group classification and
6022 * allow migration of more tasks.
6023 *
6024 * Both cases only affect the total convergence complexity.
6025 */
6026 if (rt > env->fbq_type)
6027 continue;
6028
6029 power = power_of(i);
6030 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
5061 if (!capacity) 6031 if (!capacity)
5062 capacity = fix_small_capacity(env->sd, group); 6032 capacity = fix_small_capacity(env->sd, group);
5063 6033
5064 rq = cpu_rq(i);
5065 wl = weighted_cpuload(i); 6034 wl = weighted_cpuload(i);
5066 6035
5067 /* 6036 /*
@@ -5164,6 +6133,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5164 int *continue_balancing) 6133 int *continue_balancing)
5165{ 6134{
5166 int ld_moved, cur_ld_moved, active_balance = 0; 6135 int ld_moved, cur_ld_moved, active_balance = 0;
6136 struct sched_domain *sd_parent = sd->parent;
5167 struct sched_group *group; 6137 struct sched_group *group;
5168 struct rq *busiest; 6138 struct rq *busiest;
5169 unsigned long flags; 6139 unsigned long flags;
@@ -5177,6 +6147,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5177 .idle = idle, 6147 .idle = idle,
5178 .loop_break = sched_nr_migrate_break, 6148 .loop_break = sched_nr_migrate_break,
5179 .cpus = cpus, 6149 .cpus = cpus,
6150 .fbq_type = all,
5180 }; 6151 };
5181 6152
5182 /* 6153 /*
@@ -5268,17 +6239,17 @@ more_balance:
5268 * moreover subsequent load balance cycles should correct the 6239 * moreover subsequent load balance cycles should correct the
5269 * excess load moved. 6240 * excess load moved.
5270 */ 6241 */
5271 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6242 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6243
6244 /* Prevent to re-select dst_cpu via env's cpus */
6245 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5272 6246
5273 env.dst_rq = cpu_rq(env.new_dst_cpu); 6247 env.dst_rq = cpu_rq(env.new_dst_cpu);
5274 env.dst_cpu = env.new_dst_cpu; 6248 env.dst_cpu = env.new_dst_cpu;
5275 env.flags &= ~LBF_SOME_PINNED; 6249 env.flags &= ~LBF_DST_PINNED;
5276 env.loop = 0; 6250 env.loop = 0;
5277 env.loop_break = sched_nr_migrate_break; 6251 env.loop_break = sched_nr_migrate_break;
5278 6252
5279 /* Prevent to re-select dst_cpu via env's cpus */
5280 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5281
5282 /* 6253 /*
5283 * Go back to "more_balance" rather than "redo" since we 6254 * Go back to "more_balance" rather than "redo" since we
5284 * need to continue with same src_cpu. 6255 * need to continue with same src_cpu.
@@ -5286,6 +6257,18 @@ more_balance:
5286 goto more_balance; 6257 goto more_balance;
5287 } 6258 }
5288 6259
6260 /*
6261 * We failed to reach balance because of affinity.
6262 */
6263 if (sd_parent) {
6264 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6265
6266 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6267 *group_imbalance = 1;
6268 } else if (*group_imbalance)
6269 *group_imbalance = 0;
6270 }
6271
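The block above feeds affinity failures back to the parent level: if some tasks were pinned and an imbalance remains, the parent group's sgp->imbalance is set so sg_imbalanced() reports it on the next pass, and it is cleared again once balance is reached. A minimal sketch of that feedback, using the same flag values but toy structures:

/*
 * Sketch of the affinity feedback: mark the parent group imbalanced when a
 * pass could not move enough load because of pinned tasks, clear it otherwise.
 */
#include <stdio.h>

#define LBF_ALL_PINNED	0x01
#define LBF_NEED_BREAK	0x02
#define LBF_DST_PINNED	0x04
#define LBF_SOME_PINNED	0x08

struct toy_group { int imbalance; };

static void record_affinity_result(unsigned int flags, long imbalance_left,
				   struct toy_group *parent)
{
	if (!parent)
		return;
	if ((flags & LBF_SOME_PINNED) && imbalance_left > 0)
		parent->imbalance = 1;	/* sg_imbalanced() will see this */
	else if (parent->imbalance)
		parent->imbalance = 0;
}

int main(void)
{
	struct toy_group parent = { 0 };

	record_affinity_result(LBF_SOME_PINNED, 512, &parent);
	printf("after pinned, unbalanced pass: %d\n", parent.imbalance);	/* 1 */

	record_affinity_result(0, 0, &parent);
	printf("after clean pass: %d\n", parent.imbalance);			/* 0 */
	return 0;
}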
5289 /* All tasks on this runqueue were pinned by CPU affinity */ 6272 /* All tasks on this runqueue were pinned by CPU affinity */
5290 if (unlikely(env.flags & LBF_ALL_PINNED)) { 6273 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5291 cpumask_clear_cpu(cpu_of(busiest), cpus); 6274 cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5393,6 +6376,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5393 struct sched_domain *sd; 6376 struct sched_domain *sd;
5394 int pulled_task = 0; 6377 int pulled_task = 0;
5395 unsigned long next_balance = jiffies + HZ; 6378 unsigned long next_balance = jiffies + HZ;
6379 u64 curr_cost = 0;
5396 6380
5397 this_rq->idle_stamp = rq_clock(this_rq); 6381 this_rq->idle_stamp = rq_clock(this_rq);
5398 6382
@@ -5409,15 +6393,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5409 for_each_domain(this_cpu, sd) { 6393 for_each_domain(this_cpu, sd) {
5410 unsigned long interval; 6394 unsigned long interval;
5411 int continue_balancing = 1; 6395 int continue_balancing = 1;
6396 u64 t0, domain_cost;
5412 6397
5413 if (!(sd->flags & SD_LOAD_BALANCE)) 6398 if (!(sd->flags & SD_LOAD_BALANCE))
5414 continue; 6399 continue;
5415 6400
6401 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6402 break;
6403
5416 if (sd->flags & SD_BALANCE_NEWIDLE) { 6404 if (sd->flags & SD_BALANCE_NEWIDLE) {
6405 t0 = sched_clock_cpu(this_cpu);
6406
5417 /* If we've pulled tasks over stop searching: */ 6407 /* If we've pulled tasks over stop searching: */
5418 pulled_task = load_balance(this_cpu, this_rq, 6408 pulled_task = load_balance(this_cpu, this_rq,
5419 sd, CPU_NEWLY_IDLE, 6409 sd, CPU_NEWLY_IDLE,
5420 &continue_balancing); 6410 &continue_balancing);
6411
6412 domain_cost = sched_clock_cpu(this_cpu) - t0;
6413 if (domain_cost > sd->max_newidle_lb_cost)
6414 sd->max_newidle_lb_cost = domain_cost;
6415
6416 curr_cost += domain_cost;
5421 } 6417 }
5422 6418
5423 interval = msecs_to_jiffies(sd->balance_interval); 6419 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5439,6 +6435,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5439 */ 6435 */
5440 this_rq->next_balance = next_balance; 6436 this_rq->next_balance = next_balance;
5441 } 6437 }
6438
6439 if (curr_cost > this_rq->max_idle_balance_cost)
6440 this_rq->max_idle_balance_cost = curr_cost;
5442} 6441}
5443 6442
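The additions above give newly-idle balancing a time budget: each domain's balance cost is measured, accumulated, and compared against the runqueue's expected idle time, so deep (expensive) domains are skipped when the CPU is unlikely to stay idle long enough to pay for them. A standalone sketch of that budgeting, with made-up nanosecond costs:

/*
 * Sketch of the cost budgeting added to idle_balance(): stop walking the
 * domain hierarchy once the expected balancing cost exceeds the time we
 * expect to stay idle. All costs here are simulated values.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_domain {
	const char *name;
	uint64_t max_newidle_lb_cost;	/* worst cost seen so far */
	uint64_t this_cost;		/* what this pass will cost (simulated) */
};

int main(void)
{
	struct toy_domain domains[] = {
		{ "SMT",  20000,  15000 },
		{ "MC",   80000,  90000 },
		{ "NUMA", 900000, 850000 },
	};
	uint64_t avg_idle = 500000;	/* expected idle time, ns */
	uint64_t curr_cost = 0;

	for (unsigned i = 0; i < 3; i++) {
		struct toy_domain *sd = &domains[i];

		if (avg_idle < curr_cost + sd->max_newidle_lb_cost) {
			printf("%s: skipped, not worth it\n", sd->name);
			break;
		}

		/* "balance" and account for what it actually cost */
		curr_cost += sd->this_cost;
		if (sd->this_cost > sd->max_newidle_lb_cost)
			sd->max_newidle_lb_cost = sd->this_cost;

		printf("%s: balanced, curr_cost=%llu\n", sd->name,
		       (unsigned long long)curr_cost);
	}
	return 0;
}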
5444/* 6443/*
@@ -5572,16 +6571,16 @@ static inline void nohz_balance_exit_idle(int cpu)
5572static inline void set_cpu_sd_state_busy(void) 6571static inline void set_cpu_sd_state_busy(void)
5573{ 6572{
5574 struct sched_domain *sd; 6573 struct sched_domain *sd;
6574 int cpu = smp_processor_id();
5575 6575
5576 rcu_read_lock(); 6576 rcu_read_lock();
5577 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6577 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5578 6578
5579 if (!sd || !sd->nohz_idle) 6579 if (!sd || !sd->nohz_idle)
5580 goto unlock; 6580 goto unlock;
5581 sd->nohz_idle = 0; 6581 sd->nohz_idle = 0;
5582 6582
5583 for (; sd; sd = sd->parent) 6583 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5584 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5585unlock: 6584unlock:
5586 rcu_read_unlock(); 6585 rcu_read_unlock();
5587} 6586}
@@ -5589,16 +6588,16 @@ unlock:
5589void set_cpu_sd_state_idle(void) 6588void set_cpu_sd_state_idle(void)
5590{ 6589{
5591 struct sched_domain *sd; 6590 struct sched_domain *sd;
6591 int cpu = smp_processor_id();
5592 6592
5593 rcu_read_lock(); 6593 rcu_read_lock();
5594 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6594 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5595 6595
5596 if (!sd || sd->nohz_idle) 6596 if (!sd || sd->nohz_idle)
5597 goto unlock; 6597 goto unlock;
5598 sd->nohz_idle = 1; 6598 sd->nohz_idle = 1;
5599 6599
5600 for (; sd; sd = sd->parent) 6600 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5601 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5602unlock: 6601unlock:
5603 rcu_read_unlock(); 6602 rcu_read_unlock();
5604} 6603}
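These hunks stop walking the whole domain hierarchy on idle/busy transitions and instead update nr_busy_cpus once, on the cached sd_busy level, guarded by the per-domain nohz_idle flag so repeated notifications don't double count. A rough user-space sketch of that single-level counting; the per-package grouping and structures are illustrative only:

/*
 * Sketch of the busy-CPU bookkeeping: each CPU flips one shared counter on
 * idle<->busy transitions, with a flag to make the update idempotent.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdbool.h>

struct toy_pkg { atomic_int nr_busy_cpus; };
struct toy_cpu { bool nohz_idle; struct toy_pkg *pkg; };

static void cpu_goes_idle(struct toy_cpu *c)
{
	if (c->nohz_idle)			/* already accounted */
		return;
	c->nohz_idle = true;
	atomic_fetch_sub(&c->pkg->nr_busy_cpus, 1);
}

static void cpu_goes_busy(struct toy_cpu *c)
{
	if (!c->nohz_idle)
		return;
	c->nohz_idle = false;
	atomic_fetch_add(&c->pkg->nr_busy_cpus, 1);
}

int main(void)
{
	struct toy_pkg pkg = { .nr_busy_cpus = 4 };
	struct toy_cpu cpu1 = { .nohz_idle = false, .pkg = &pkg };

	cpu_goes_idle(&cpu1);
	cpu_goes_idle(&cpu1);	/* duplicate notification, ignored */
	printf("busy cpus: %d\n", atomic_load(&pkg.nr_busy_cpus));	/* 3 */

	cpu_goes_busy(&cpu1);
	printf("busy cpus: %d\n", atomic_load(&pkg.nr_busy_cpus));	/* 4 */
	return 0;
}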
@@ -5662,15 +6661,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5662 /* Earliest time when we have to do rebalance again */ 6661 /* Earliest time when we have to do rebalance again */
5663 unsigned long next_balance = jiffies + 60*HZ; 6662 unsigned long next_balance = jiffies + 60*HZ;
5664 int update_next_balance = 0; 6663 int update_next_balance = 0;
5665 int need_serialize; 6664 int need_serialize, need_decay = 0;
6665 u64 max_cost = 0;
5666 6666
5667 update_blocked_averages(cpu); 6667 update_blocked_averages(cpu);
5668 6668
5669 rcu_read_lock(); 6669 rcu_read_lock();
5670 for_each_domain(cpu, sd) { 6670 for_each_domain(cpu, sd) {
6671 /*
6672 * Decay the newidle max times here because this is a regular
6673 * visit to all the domains. Decay ~1% per second.
6674 */
6675 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6676 sd->max_newidle_lb_cost =
6677 (sd->max_newidle_lb_cost * 253) / 256;
6678 sd->next_decay_max_lb_cost = jiffies + HZ;
6679 need_decay = 1;
6680 }
6681 max_cost += sd->max_newidle_lb_cost;
6682
5671 if (!(sd->flags & SD_LOAD_BALANCE)) 6683 if (!(sd->flags & SD_LOAD_BALANCE))
5672 continue; 6684 continue;
5673 6685
6686 /*
6687 * Stop the load balance at this level. There is another
6688 * CPU in our sched group which is doing load balancing more
6689 * actively.
6690 */
6691 if (!continue_balancing) {
6692 if (need_decay)
6693 continue;
6694 break;
6695 }
6696
5674 interval = sd->balance_interval; 6697 interval = sd->balance_interval;
5675 if (idle != CPU_IDLE) 6698 if (idle != CPU_IDLE)
5676 interval *= sd->busy_factor; 6699 interval *= sd->busy_factor;
@@ -5689,7 +6712,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5689 if (time_after_eq(jiffies, sd->last_balance + interval)) { 6712 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5690 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { 6713 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5691 /* 6714 /*
5692 * The LBF_SOME_PINNED logic could have changed 6715 * The LBF_DST_PINNED logic could have changed
5693 * env->dst_cpu, so we can't know our idle 6716 * env->dst_cpu, so we can't know our idle
5694 * state even if we migrated tasks. Update it. 6717 * state even if we migrated tasks. Update it.
5695 */ 6718 */
@@ -5704,14 +6727,14 @@ out:
5704 next_balance = sd->last_balance + interval; 6727 next_balance = sd->last_balance + interval;
5705 update_next_balance = 1; 6728 update_next_balance = 1;
5706 } 6729 }
5707 6730 }
6731 if (need_decay) {
5708 /* 6732 /*
5709 * Stop the load balance at this level. There is another 6733 * Ensure the rq-wide value also decays but keep it at a
5710 * CPU in our sched group which is doing load balancing more 6734 * reasonable floor to avoid funnies with rq->avg_idle.
5711 * actively.
5712 */ 6735 */
5713 if (!continue_balancing) 6736 rq->max_idle_balance_cost =
5714 break; 6737 max((u64)sysctl_sched_migration_cost, max_cost);
5715 } 6738 }
5716 rcu_read_unlock(); 6739 rcu_read_unlock();
5717 6740
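The decay added above multiplies max_newidle_lb_cost by 253/256 once per second, i.e. roughly a 1% decay per pass, and the runqueue-wide copy is floored at sysctl_sched_migration_cost (500000 ns by default). A quick standalone check of how fast a stale value fades under that rule:

/*
 * Apply the 253/256 per-second decay to an arbitrary starting cost and show
 * the migration-cost floor kicking in. The starting value is illustrative.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cost = 1000000;	/* ns, illustrative */
	uint64_t floor = 500000;	/* sysctl_sched_migration_cost default */

	for (int sec = 1; sec <= 60; sec++) {
		cost = (cost * 253) / 256;
		if (sec % 20 == 0)
			printf("after %2ds: %llu ns\n", sec,
			       (unsigned long long)cost);
	}

	/* the rq-wide copy never drops below the migration-cost floor */
	uint64_t rq_max = cost > floor ? cost : floor;
	printf("rq->max_idle_balance_cost = %llu ns\n",
	       (unsigned long long)rq_max);
	return 0;
}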
@@ -5781,6 +6804,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5781{ 6804{
5782 unsigned long now = jiffies; 6805 unsigned long now = jiffies;
5783 struct sched_domain *sd; 6806 struct sched_domain *sd;
6807 struct sched_group_power *sgp;
6808 int nr_busy;
5784 6809
5785 if (unlikely(idle_cpu(cpu))) 6810 if (unlikely(idle_cpu(cpu)))
5786 return 0; 6811 return 0;
@@ -5806,22 +6831,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5806 goto need_kick; 6831 goto need_kick;
5807 6832
5808 rcu_read_lock(); 6833 rcu_read_lock();
5809 for_each_domain(cpu, sd) { 6834 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5810 struct sched_group *sg = sd->groups;
5811 struct sched_group_power *sgp = sg->sgp;
5812 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5813 6835
5814 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) 6836 if (sd) {
5815 goto need_kick_unlock; 6837 sgp = sd->groups->sgp;
6838 nr_busy = atomic_read(&sgp->nr_busy_cpus);
5816 6839
5817 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight 6840 if (nr_busy > 1)
5818 && (cpumask_first_and(nohz.idle_cpus_mask,
5819 sched_domain_span(sd)) < cpu))
5820 goto need_kick_unlock; 6841 goto need_kick_unlock;
5821
5822 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5823 break;
5824 } 6842 }
6843
6844 sd = rcu_dereference(per_cpu(sd_asym, cpu));
6845
6846 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
6847 sched_domain_span(sd)) < cpu))
6848 goto need_kick_unlock;
6849
5825 rcu_read_unlock(); 6850 rcu_read_unlock();
5826 return 0; 6851 return 0;
5827 6852
@@ -6214,7 +7239,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6214 se->cfs_rq = parent->my_q; 7239 se->cfs_rq = parent->my_q;
6215 7240
6216 se->my_q = cfs_rq; 7241 se->my_q = cfs_rq;
6217 update_load_set(&se->load, 0); 7242 /* guarantee group entities always have weight */
7243 update_load_set(&se->load, NICE_0_LOAD);
6218 se->parent = parent; 7244 se->parent = parent;
6219} 7245}
6220 7246
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 99399f8e4799..5716929a2e3a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
63/* 63/*
64 * Apply the automatic NUMA scheduling policy. Enabled automatically 64 * Apply the automatic NUMA scheduling policy. Enabled automatically
65 * at runtime if running on a NUMA machine. Can be controlled via 65 * at runtime if running on a NUMA machine. Can be controlled via
66 * numa_balancing=. Allow PTE scanning to be forced on UMA machines 66 * numa_balancing=
67 * for debugging the core machinery.
68 */ 67 */
69#ifdef CONFIG_NUMA_BALANCING 68#ifdef CONFIG_NUMA_BALANCING
70SCHED_FEAT(NUMA, false) 69SCHED_FEAT(NUMA, false)
71SCHED_FEAT(NUMA_FORCE, false) 70
71/*
72 * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
73 * higher number of hinting faults are recorded during active load
74 * balancing.
75 */
76SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
77
78/*
79 * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
80 * lower number of hinting faults have been recorded. As this has
81 * the potential to prevent a task ever migrating to a new node
82 * due to CPU overload it is disabled by default.
83 */
84SCHED_FEAT(NUMA_RESIST_LOWER, false)
72#endif 85#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..516c3d9ceea1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 12select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
 14 return task_cpu(p); /* IDLE tasks are never migrated */ 14 return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 01970c8e64df..1c4065575fa2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
246 * if we should look at the mask. It would be a shame 246 * if we should look at the mask. It would be a shame
247 * if we looked at the mask, but the mask was not 247 * if we looked at the mask, but the mask was not
248 * updated yet. 248 * updated yet.
249 *
250 * Matched by the barrier in pull_rt_task().
249 */ 251 */
250 wmb(); 252 smp_wmb();
251 atomic_inc(&rq->rd->rto_count); 253 atomic_inc(&rq->rd->rto_count);
252} 254}
253 255
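The smp_wmb() here pairs with the smp_rmb() added in pull_rt_task() further down in this diff: the overload bit is published before rto_count is bumped, so any CPU that sees the count must also see the bit. A user-space analogue of that pairing, using C11 fences in place of the kernel barriers (variable names mirror the kernel's but the program is only a sketch):

/*
 * Release/acquire fence pairing: the publisher sets the mask bit before
 * bumping the counter, so a reader that observes the counter also observes
 * the bit. Build with -lpthread.
 */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_int rto_mask;	/* stand-in for the overload cpumask bit */
static atomic_int rto_count;

static void *publisher(void *arg)
{
	(void)arg;
	atomic_store_explicit(&rto_mask, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */
	atomic_fetch_add_explicit(&rto_count, 1, memory_order_relaxed);
	return NULL;
}

static void *consumer(void *arg)
{
	(void)arg;
	if (atomic_load_explicit(&rto_count, memory_order_relaxed) == 0)
		return NULL;				/* not overloaded (timing dependent) */
	atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
	/* seeing the count implies seeing the mask bit */
	printf("mask=%d\n", atomic_load_explicit(&rto_mask, memory_order_relaxed));
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, publisher, NULL);
	pthread_create(&b, NULL, consumer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}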
@@ -899,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
899{ 901{
900 struct rq *rq = rq_of_rt_rq(rt_rq); 902 struct rq *rq = rq_of_rt_rq(rt_rq);
901 903
904#ifdef CONFIG_RT_GROUP_SCHED
905 /*
906 * Change rq's cpupri only if rt_rq is the top queue.
907 */
908 if (&rq->rt != rt_rq)
909 return;
910#endif
902 if (rq->online && prio < prev_prio) 911 if (rq->online && prio < prev_prio)
903 cpupri_set(&rq->rd->cpupri, rq->cpu, prio); 912 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
904} 913}
@@ -908,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
908{ 917{
909 struct rq *rq = rq_of_rt_rq(rt_rq); 918 struct rq *rq = rq_of_rt_rq(rt_rq);
910 919
920#ifdef CONFIG_RT_GROUP_SCHED
921 /*
922 * Change rq's cpupri only if rt_rq is the top queue.
923 */
924 if (&rq->rt != rt_rq)
925 return;
926#endif
911 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 927 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
912 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 928 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
913} 929}
@@ -1169,13 +1185,10 @@ static void yield_task_rt(struct rq *rq)
1169static int find_lowest_rq(struct task_struct *task); 1185static int find_lowest_rq(struct task_struct *task);
1170 1186
1171static int 1187static int
1172select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 1188select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1173{ 1189{
1174 struct task_struct *curr; 1190 struct task_struct *curr;
1175 struct rq *rq; 1191 struct rq *rq;
1176 int cpu;
1177
1178 cpu = task_cpu(p);
1179 1192
1180 if (p->nr_cpus_allowed == 1) 1193 if (p->nr_cpus_allowed == 1)
1181 goto out; 1194 goto out;
@@ -1213,8 +1226,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1213 */ 1226 */
1214 if (curr && unlikely(rt_task(curr)) && 1227 if (curr && unlikely(rt_task(curr)) &&
1215 (curr->nr_cpus_allowed < 2 || 1228 (curr->nr_cpus_allowed < 2 ||
1216 curr->prio <= p->prio) && 1229 curr->prio <= p->prio)) {
1217 (p->nr_cpus_allowed > 1)) {
1218 int target = find_lowest_rq(p); 1230 int target = find_lowest_rq(p);
1219 1231
1220 if (target != -1) 1232 if (target != -1)
@@ -1630,6 +1642,12 @@ static int pull_rt_task(struct rq *this_rq)
1630 if (likely(!rt_overloaded(this_rq))) 1642 if (likely(!rt_overloaded(this_rq)))
1631 return 0; 1643 return 0;
1632 1644
1645 /*
1646 * Match the barrier from rt_set_overloaded; this guarantees that if we
1647 * see overloaded we must also see the rto_mask bit.
1648 */
1649 smp_rmb();
1650
1633 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1651 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1634 if (this_cpu == cpu) 1652 if (this_cpu == cpu)
1635 continue; 1653 continue;
@@ -1931,8 +1949,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1931 p->rt.time_slice = sched_rr_timeslice; 1949 p->rt.time_slice = sched_rr_timeslice;
1932 1950
1933 /* 1951 /*
1934 * Requeue to the end of queue if we (and all of our ancestors) are the 1952 * Requeue to the end of queue if we (and all of our ancestors) are not
1935 * only element on the queue 1953 * the only element on the queue
1936 */ 1954 */
1937 for_each_sched_rt_entity(rt_se) { 1955 for_each_sched_rt_entity(rt_se) {
1938 if (rt_se->run_list.prev != rt_se->run_list.next) { 1956 if (rt_se->run_list.prev != rt_se->run_list.next) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3c5653e1dca..88c85b21d633 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h> 8#include <linux/tick.h>
9#include <linux/slab.h>
9 10
10#include "cpupri.h" 11#include "cpupri.h"
11#include "cpuacct.h" 12#include "cpuacct.h"
@@ -408,6 +409,10 @@ struct rq {
408 * remote CPUs use both these fields when doing load calculation. 409 * remote CPUs use both these fields when doing load calculation.
409 */ 410 */
410 unsigned int nr_running; 411 unsigned int nr_running;
412#ifdef CONFIG_NUMA_BALANCING
413 unsigned int nr_numa_running;
414 unsigned int nr_preferred_running;
415#endif
411 #define CPU_LOAD_IDX_MAX 5 416 #define CPU_LOAD_IDX_MAX 5
412 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 417 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
413 unsigned long last_load_update_tick; 418 unsigned long last_load_update_tick;
@@ -476,6 +481,9 @@ struct rq {
476 u64 age_stamp; 481 u64 age_stamp;
477 u64 idle_stamp; 482 u64 idle_stamp;
478 u64 avg_idle; 483 u64 avg_idle;
484
485 /* This is used to determine avg_idle's max value */
486 u64 max_idle_balance_cost;
479#endif 487#endif
480 488
481#ifdef CONFIG_IRQ_TIME_ACCOUNTING 489#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
552 return rq->clock_task; 560 return rq->clock_task;
553} 561}
554 562
563#ifdef CONFIG_NUMA_BALANCING
564extern void sched_setnuma(struct task_struct *p, int node);
565extern int migrate_task_to(struct task_struct *p, int cpu);
566extern int migrate_swap(struct task_struct *, struct task_struct *);
567#endif /* CONFIG_NUMA_BALANCING */
568
555#ifdef CONFIG_SMP 569#ifdef CONFIG_SMP
556 570
557#define rcu_dereference_check_sched_domain(p) \ 571#define rcu_dereference_check_sched_domain(p) \
@@ -593,9 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
593 return hsd; 607 return hsd;
594} 608}
595 609
610static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
611{
612 struct sched_domain *sd;
613
614 for_each_domain(cpu, sd) {
615 if (sd->flags & flag)
616 break;
617 }
618
619 return sd;
620}
621
596DECLARE_PER_CPU(struct sched_domain *, sd_llc); 622DECLARE_PER_CPU(struct sched_domain *, sd_llc);
597DECLARE_PER_CPU(int, sd_llc_size); 623DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 624DECLARE_PER_CPU(int, sd_llc_id);
625DECLARE_PER_CPU(struct sched_domain *, sd_numa);
626DECLARE_PER_CPU(struct sched_domain *, sd_busy);
627DECLARE_PER_CPU(struct sched_domain *, sd_asym);
599 628
600struct sched_group_power { 629struct sched_group_power {
601 atomic_t ref; 630 atomic_t ref;
@@ -605,6 +634,7 @@ struct sched_group_power {
605 */ 634 */
606 unsigned int power, power_orig; 635 unsigned int power, power_orig;
607 unsigned long next_update; 636 unsigned long next_update;
637 int imbalance; /* XXX unrelated to power but shared group state */
608 /* 638 /*
609 * Number of busy cpus in this group. 639 * Number of busy cpus in this group.
610 */ 640 */
@@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
719 */ 749 */
720 smp_wmb(); 750 smp_wmb();
721 task_thread_info(p)->cpu = cpu; 751 task_thread_info(p)->cpu = cpu;
752 p->wake_cpu = cpu;
722#endif 753#endif
723} 754}
724 755
@@ -974,7 +1005,7 @@ struct sched_class {
974 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1005 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
975 1006
976#ifdef CONFIG_SMP 1007#ifdef CONFIG_SMP
977 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1008 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
978 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1009 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
979 1010
980 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1011 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1220 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1251 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1221} 1252}
1222 1253
1254static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1255{
1256 if (l1 > l2)
1257 swap(l1, l2);
1258
1259 spin_lock(l1);
1260 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1261}
1262
1263static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1264{
1265 if (l1 > l2)
1266 swap(l1, l2);
1267
1268 raw_spin_lock(l1);
1269 raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1270}
1271
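double_lock()/double_raw_lock() above avoid ABBA deadlock by always acquiring the lock with the lower address first, regardless of the order the caller passes them in. The same idea as a standalone pthread sketch; the accounts and balances are just illustrative payload:

/*
 * Lock-ordering-by-address sketch: two callers locking the same pair of
 * locks in opposite argument order still acquire them in the same order.
 * Build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

struct account { pthread_mutex_t lock; long balance; };

static void double_account_lock(struct account *a, struct account *b)
{
	if (a > b) {			/* order by address, like double_lock() */
		struct account *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	pthread_mutex_lock(&b->lock);
}

static void double_account_unlock(struct account *a, struct account *b)
{
	pthread_mutex_unlock(&a->lock);
	pthread_mutex_unlock(&b->lock);
}

static void transfer(struct account *from, struct account *to, long amount)
{
	/* callers may pass (x, y) or (y, x); the locking order stays consistent */
	double_account_lock(from, to);
	from->balance -= amount;
	to->balance += amount;
	double_account_unlock(from, to);
}

int main(void)
{
	struct account x = { PTHREAD_MUTEX_INITIALIZER, 100 };
	struct account y = { PTHREAD_MUTEX_INITIALIZER, 100 };

	transfer(&x, &y, 30);
	transfer(&y, &x, 10);
	printf("x=%ld y=%ld\n", x.balance, y.balance);	/* x=80 y=120 */
	return 0;
}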
1223/* 1272/*
1224 * double_rq_lock - safely lock two runqueues 1273 * double_rq_lock - safely lock two runqueues
1225 * 1274 *
@@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1305extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1354extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1306extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1307 1356
1308extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1357extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void);
1309 1359
1310#ifdef CONFIG_NO_HZ_COMMON 1360#ifdef CONFIG_NO_HZ_COMMON
1311enum rq_nohz_flag_bits { 1361enum rq_nohz_flag_bits {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c7edee71bce8..4ab704339656 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The 59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew. 60 * delta taken on each cpu would annul the skew.
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
63{ 63{
64 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 64 unsigned long long now = rq_clock(rq), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
69 sched_info_reset_dequeued(t); 69 sched_info_reset_dequeued(t);
70 t->sched_info.run_delay += delta; 70 t->sched_info.run_delay += delta;
71 71
72 rq_sched_info_dequeued(task_rq(t), delta); 72 rq_sched_info_dequeued(rq, delta);
73} 73}
74 74
75/* 75/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
77 * long it was waiting to run. We also note when it began so that we 77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is. 78 * can keep stats on how long its timeslice is.
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct rq *rq, struct task_struct *t)
81{ 81{
82 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 82 unsigned long long now = rq_clock(rq), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
88 t->sched_info.last_arrival = now; 88 t->sched_info.last_arrival = now;
89 t->sched_info.pcount++; 89 t->sched_info.pcount++;
90 90
91 rq_sched_info_arrive(task_rq(t), delta); 91 rq_sched_info_arrive(rq, delta);
92} 92}
93 93
94/* 94/*
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t)
96 * the timestamp if it is already not set. It's assumed that 96 * the timestamp if it is already not set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate. 97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */ 98 */
99static inline void sched_info_queued(struct task_struct *t) 99static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = rq_clock(task_rq(t)); 103 t->sched_info.last_queued = rq_clock(rq);
104} 104}
105 105
106/* 106/*
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t)
111 * sched_info_queued() to mark that it has now again started waiting on 111 * sched_info_queued() to mark that it has now again started waiting on
112 * the runqueue. 112 * the runqueue.
113 */ 113 */
114static inline void sched_info_depart(struct task_struct *t) 114static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
115{ 115{
116 unsigned long long delta = rq_clock(task_rq(t)) - 116 unsigned long long delta = rq_clock(rq) -
117 t->sched_info.last_arrival; 117 t->sched_info.last_arrival;
118 118
119 rq_sched_info_depart(task_rq(t), delta); 119 rq_sched_info_depart(rq, delta);
120 120
121 if (t->state == TASK_RUNNING) 121 if (t->state == TASK_RUNNING)
122 sched_info_queued(t); 122 sched_info_queued(rq, t);
123} 123}
124 124
125/* 125/*
@@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
128 * the idle task.) We are only called when prev != next. 128 * the idle task.) We are only called when prev != next.
129 */ 129 */
130static inline void 130static inline void
131__sched_info_switch(struct task_struct *prev, struct task_struct *next) 131__sched_info_switch(struct rq *rq,
132 struct task_struct *prev, struct task_struct *next)
132{ 133{
133 struct rq *rq = task_rq(prev);
134
135 /* 134 /*
136 * prev now departs the cpu. It's not interesting to record 135 * prev now departs the cpu. It's not interesting to record
137 * stats about how efficient we were at scheduling the idle 136 * stats about how efficient we were at scheduling the idle
138 * process, however. 137 * process, however.
139 */ 138 */
140 if (prev != rq->idle) 139 if (prev != rq->idle)
141 sched_info_depart(prev); 140 sched_info_depart(rq, prev);
142 141
143 if (next != rq->idle) 142 if (next != rq->idle)
144 sched_info_arrive(next); 143 sched_info_arrive(rq, next);
145} 144}
146static inline void 145static inline void
147sched_info_switch(struct task_struct *prev, struct task_struct *next) 146sched_info_switch(struct rq *rq,
147 struct task_struct *prev, struct task_struct *next)
148{ 148{
149 if (unlikely(sched_info_on())) 149 if (unlikely(sched_info_on()))
150 __sched_info_switch(prev, next); 150 __sched_info_switch(rq, prev, next);
151} 151}
152#else 152#else
153#define sched_info_queued(t) do { } while (0) 153#define sched_info_queued(rq, t) do { } while (0)
154#define sched_info_reset_dequeued(t) do { } while (0) 154#define sched_info_reset_dequeued(t) do { } while (0)
155#define sched_info_dequeued(t) do { } while (0) 155#define sched_info_dequeued(rq, t) do { } while (0)
156#define sched_info_switch(t, next) do { } while (0) 156#define sched_info_depart(rq, t) do { } while (0)
157#define sched_info_arrive(rq, next) do { } while (0)
158#define sched_info_switch(rq, t, next) do { } while (0)
157#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 159#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
158 160
159/* 161/*
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index e08fbeeb54b9..47197de8abd9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
11 11
12#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
13static int 13static int
14select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) 14select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
15{ 15{
16 return task_cpu(p); /* stop tasks as never migrate */ 16 return task_cpu(p); /* stop tasks as never migrate */
17} 17}
diff --git a/kernel/wait.c b/kernel/sched/wait.c
index d550920e040c..7d50f794e248 100644
--- a/kernel/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue);
53 53
54 54
55/* 55/*
56 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
57 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
58 * number) then we wake all the non-exclusive tasks and one exclusive task.
59 *
60 * There are circumstances in which we can try to wake a task which has already
61 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
62 * zero in this (rare) case, and we handle it by continuing to scan the queue.
63 */
64static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
65 int nr_exclusive, int wake_flags, void *key)
66{
67 wait_queue_t *curr, *next;
68
69 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
70 unsigned flags = curr->flags;
71
72 if (curr->func(curr, mode, wake_flags, key) &&
73 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
74 break;
75 }
76}
77
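The comment above spells out the wakeup policy: non-exclusive waiters are always woken, and at most nr_exclusive exclusive waiters are. A simplified standalone sketch of that loop, with an array of toy waiters standing in for the wait_queue list:

/*
 * Sketch of the exclusive/non-exclusive wakeup loop. The waiter callback
 * returns true when the waiter was actually woken, as in the kernel.
 */
#include <stdbool.h>
#include <stdio.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct toy_waiter {
	const char *name;
	unsigned int flags;
	bool (*func)(struct toy_waiter *w);	/* returns true if actually woken */
};

static bool default_wake(struct toy_waiter *w)
{
	printf("waking %s\n", w->name);
	return true;
}

static void wake_up_common(struct toy_waiter *q, int n, int nr_exclusive)
{
	for (int i = 0; i < n; i++) {
		struct toy_waiter *curr = &q[i];

		if (curr->func(curr) &&
		    (curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;	/* woke enough exclusive waiters */
	}
}

int main(void)
{
	struct toy_waiter q[] = {
		{ "reader-1", 0, default_wake },
		{ "writer-1", WQ_FLAG_EXCLUSIVE, default_wake },
		{ "writer-2", WQ_FLAG_EXCLUSIVE, default_wake },	/* stays asleep */
	};

	wake_up_common(q, 3, 1);	/* wake all non-exclusive + one exclusive */
	return 0;
}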
78/**
79 * __wake_up - wake up threads blocked on a waitqueue.
80 * @q: the waitqueue
81 * @mode: which threads
82 * @nr_exclusive: how many wake-one or wake-many threads to wake up
83 * @key: is directly passed to the wakeup function
84 *
85 * It may be assumed that this function implies a write memory barrier before
86 * changing the task state if and only if any tasks are woken up.
87 */
88void __wake_up(wait_queue_head_t *q, unsigned int mode,
89 int nr_exclusive, void *key)
90{
91 unsigned long flags;
92
93 spin_lock_irqsave(&q->lock, flags);
94 __wake_up_common(q, mode, nr_exclusive, 0, key);
95 spin_unlock_irqrestore(&q->lock, flags);
96}
97EXPORT_SYMBOL(__wake_up);
98
99/*
100 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
101 */
102void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
103{
104 __wake_up_common(q, mode, nr, 0, NULL);
105}
106EXPORT_SYMBOL_GPL(__wake_up_locked);
107
108void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
109{
110 __wake_up_common(q, mode, 1, 0, key);
111}
112EXPORT_SYMBOL_GPL(__wake_up_locked_key);
113
114/**
115 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
116 * @q: the waitqueue
117 * @mode: which threads
118 * @nr_exclusive: how many wake-one or wake-many threads to wake up
119 * @key: opaque value to be passed to wakeup targets
120 *
121 * The sync wakeup differs that the waker knows that it will schedule
122 * away soon, so while the target thread will be woken up, it will not
123 * be migrated to another CPU - ie. the two threads are 'synchronized'
124 * with each other. This can prevent needless bouncing between CPUs.
125 *
126 * On UP it can prevent extra preemption.
127 *
128 * It may be assumed that this function implies a write memory barrier before
129 * changing the task state if and only if any tasks are woken up.
130 */
131void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
132 int nr_exclusive, void *key)
133{
134 unsigned long flags;
135 int wake_flags = 1; /* XXX WF_SYNC */
136
137 if (unlikely(!q))
138 return;
139
140 if (unlikely(nr_exclusive != 1))
141 wake_flags = 0;
142
143 spin_lock_irqsave(&q->lock, flags);
144 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
145 spin_unlock_irqrestore(&q->lock, flags);
146}
147EXPORT_SYMBOL_GPL(__wake_up_sync_key);
148
149/*
150 * __wake_up_sync - see __wake_up_sync_key()
151 */
152void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
153{
154 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
155}
156EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
157
158/*
56 * Note: we use "set_current_state()" _after_ the wait-queue add, 159 * Note: we use "set_current_state()" _after_ the wait-queue add,
57 * because we need a memory barrier there on SMP, so that any 160 * because we need a memory barrier there on SMP, so that any
58 * wake-function that tests for the wait-queue being active 161 * wake-function that tests for the wait-queue being active
@@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 195}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 196EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 197
198long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
199{
200 unsigned long flags;
201
202 if (signal_pending_state(state, current))
203 return -ERESTARTSYS;
204
205 wait->private = current;
206 wait->func = autoremove_wake_function;
207
208 spin_lock_irqsave(&q->lock, flags);
209 if (list_empty(&wait->task_list)) {
210 if (wait->flags & WQ_FLAG_EXCLUSIVE)
211 __add_wait_queue_tail(q, wait);
212 else
213 __add_wait_queue(q, wait);
214 }
215 set_current_state(state);
216 spin_unlock_irqrestore(&q->lock, flags);
217
218 return 0;
219}
220EXPORT_SYMBOL(prepare_to_wait_event);
221
95/** 222/**
96 * finish_wait - clean up after waiting in a queue 223 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 224 * @q: waitqueue waited on
diff --git a/kernel/signal.c b/kernel/signal.c
index ded28b91fa53..940b30ee9a30 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2723,7 +2723,7 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
2723 2723
2724#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER 2724#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
2725 2725
2726int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) 2726int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2727{ 2727{
2728 int err; 2728 int err;
2729 2729
diff --git a/kernel/smp.c b/kernel/smp.c
index 0564571dcdf7..bd9f94028838 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -15,9 +15,9 @@
15 15
16#include "smpboot.h" 16#include "smpboot.h"
17 17
18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19enum { 18enum {
20 CSD_FLAG_LOCK = 0x01, 19 CSD_FLAG_LOCK = 0x01,
20 CSD_FLAG_WAIT = 0x02,
21}; 21};
22 22
23struct call_function_data { 23struct call_function_data {
@@ -124,7 +124,7 @@ static void csd_lock(struct call_single_data *csd)
124 124
125static void csd_unlock(struct call_single_data *csd) 125static void csd_unlock(struct call_single_data *csd)
126{ 126{
127 WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); 127 WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK));
128 128
129 /* 129 /*
130 * ensure we're all done before releasing data: 130 * ensure we're all done before releasing data:
@@ -139,13 +139,15 @@ static void csd_unlock(struct call_single_data *csd)
139 * for execution on the given CPU. data must already have 139 * for execution on the given CPU. data must already have
140 * ->func, ->info, and ->flags set. 140 * ->func, ->info, and ->flags set.
141 */ 141 */
142static 142static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
143void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
144{ 143{
145 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 144 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
146 unsigned long flags; 145 unsigned long flags;
147 int ipi; 146 int ipi;
148 147
148 if (wait)
149 csd->flags |= CSD_FLAG_WAIT;
150
149 raw_spin_lock_irqsave(&dst->lock, flags); 151 raw_spin_lock_irqsave(&dst->lock, flags);
150 ipi = list_empty(&dst->list); 152 ipi = list_empty(&dst->list);
151 list_add_tail(&csd->list, &dst->list); 153 list_add_tail(&csd->list, &dst->list);
@@ -340,6 +342,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd,
340 } 342 }
341 put_cpu(); 343 put_cpu();
342} 344}
345EXPORT_SYMBOL_GPL(__smp_call_function_single);
343 346
344/** 347/**
345 * smp_call_function_many(): Run a function on a set of other CPUs. 348 * smp_call_function_many(): Run a function on a set of other CPUs.
@@ -459,7 +462,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
459 return 0; 462 return 0;
460} 463}
461EXPORT_SYMBOL(smp_call_function); 464EXPORT_SYMBOL(smp_call_function);
462#endif /* USE_GENERIC_SMP_HELPERS */
463 465
464/* Setup configured maximum number of CPUs to activate */ 466/* Setup configured maximum number of CPUs to activate */
465unsigned int setup_max_cpus = NR_CPUS; 467unsigned int setup_max_cpus = NR_CPUS;
@@ -524,6 +526,11 @@ void __init setup_nr_cpu_ids(void)
524 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; 526 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
525} 527}
526 528
529void __weak smp_announce(void)
530{
531 printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
532}
533
527/* Called by boot processor to activate the rest. */ 534/* Called by boot processor to activate the rest. */
528void __init smp_init(void) 535void __init smp_init(void)
529{ 536{
@@ -540,7 +547,7 @@ void __init smp_init(void)
540 } 547 }
541 548
542 /* Any cleanup work */ 549 /* Any cleanup work */
543 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); 550 smp_announce();
544 smp_cpus_done(setup_max_cpus); 551 smp_cpus_done(setup_max_cpus);
545} 552}
546 553
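
A hedged, self-contained illustration of the cross-CPU call path that the CSD_FLAG_WAIT bookkeeping above annotates; the function and variable names here are made up for the example.

#include <linux/smp.h>
#include <linux/printk.h>

/* Runs in interrupt context on the target CPU. */
static void demo_remote_fn(void *info)
{
        int *val = info;

        *val += 1;
}

static void demo_call(int target_cpu)
{
        int val = 0;

        /* wait=1: only return once demo_remote_fn() has finished, which
         * is roughly the case the new CSD_FLAG_WAIT bit marks. */
        smp_call_function_single(target_cpu, demo_remote_fn, &val, 1);
        pr_info("val is now %d\n", val);
}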
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d7d498d8cc4f..11025ccc06dd 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -6,8 +6,6 @@
6 * Distribute under GPLv2. 6 * Distribute under GPLv2.
7 * 7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 *
10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 9 */
12 10
13#include <linux/export.h> 11#include <linux/export.h>
@@ -29,7 +27,6 @@
29#define CREATE_TRACE_POINTS 27#define CREATE_TRACE_POINTS
30#include <trace/events/irq.h> 28#include <trace/events/irq.h>
31 29
32#include <asm/irq.h>
33/* 30/*
34 - No shared variables, all the data are CPU local. 31 - No shared variables, all the data are CPU local.
35 - If a softirq needs serialization, let it serialize itself 32 - If a softirq needs serialization, let it serialize itself
@@ -100,13 +97,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
100 97
101 raw_local_irq_save(flags); 98 raw_local_irq_save(flags);
102 /* 99 /*
103 * The preempt tracer hooks into add_preempt_count and will break 100 * The preempt tracer hooks into preempt_count_add and will break
104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 101 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
105 * is set and before current->softirq_enabled is cleared. 102 * is set and before current->softirq_enabled is cleared.
106 * We must manually increment preempt_count here and manually 103 * We must manually increment preempt_count here and manually
107 * call the trace_preempt_off later. 104 * call the trace_preempt_off later.
108 */ 105 */
109 preempt_count() += cnt; 106 __preempt_count_add(cnt);
110 /* 107 /*
111 * Were softirqs turned off above: 108 * Were softirqs turned off above:
112 */ 109 */
@@ -120,7 +117,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
120#else /* !CONFIG_TRACE_IRQFLAGS */ 117#else /* !CONFIG_TRACE_IRQFLAGS */
121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 118static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
122{ 119{
123 add_preempt_count(cnt); 120 preempt_count_add(cnt);
124 barrier(); 121 barrier();
125} 122}
126#endif /* CONFIG_TRACE_IRQFLAGS */ 123#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -134,12 +131,11 @@ EXPORT_SYMBOL(local_bh_disable);
134 131
135static void __local_bh_enable(unsigned int cnt) 132static void __local_bh_enable(unsigned int cnt)
136{ 133{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled()); 134 WARN_ON_ONCE(!irqs_disabled());
139 135
140 if (softirq_count() == cnt) 136 if (softirq_count() == cnt)
141 trace_softirqs_on(_RET_IP_); 137 trace_softirqs_on(_RET_IP_);
142 sub_preempt_count(cnt); 138 preempt_count_sub(cnt);
143} 139}
144 140
145/* 141/*
@@ -149,6 +145,7 @@ static void __local_bh_enable(unsigned int cnt)
149 */ 145 */
150void _local_bh_enable(void) 146void _local_bh_enable(void)
151{ 147{
148 WARN_ON_ONCE(in_irq());
152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); 149 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
153} 150}
154 151
@@ -169,12 +166,17 @@ static inline void _local_bh_enable_ip(unsigned long ip)
169 * Keep preemption disabled until we are done with 166 * Keep preemption disabled until we are done with
170 * softirq processing: 167 * softirq processing:
171 */ 168 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 169 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
173 170
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 171 if (unlikely(!in_interrupt() && local_softirq_pending())) {
172 /*
 172 * Run softirqs if any are pending, and do it on a separate stack,
 174 * as we may already be deep in a task call stack here.
175 */
175 do_softirq(); 176 do_softirq();
177 }
176 178
177 dec_preempt_count(); 179 preempt_count_dec();
178#ifdef CONFIG_TRACE_IRQFLAGS 180#ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 181 local_irq_enable();
180#endif 182#endif
@@ -256,7 +258,7 @@ restart:
256 " exited with %08x?\n", vec_nr, 258 " exited with %08x?\n", vec_nr,
257 softirq_to_name[vec_nr], h->action, 259 softirq_to_name[vec_nr], h->action,
258 prev_count, preempt_count()); 260 prev_count, preempt_count());
259 preempt_count() = prev_count; 261 preempt_count_set(prev_count);
260 } 262 }
261 263
262 rcu_bh_qs(cpu); 264 rcu_bh_qs(cpu);
@@ -280,10 +282,11 @@ restart:
280 282
281 account_irq_exit_time(current); 283 account_irq_exit_time(current);
282 __local_bh_enable(SOFTIRQ_OFFSET); 284 __local_bh_enable(SOFTIRQ_OFFSET);
285 WARN_ON_ONCE(in_interrupt());
283 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 286 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
284} 287}
285 288
286#ifndef __ARCH_HAS_DO_SOFTIRQ 289
287 290
288asmlinkage void do_softirq(void) 291asmlinkage void do_softirq(void)
289{ 292{
@@ -298,13 +301,11 @@ asmlinkage void do_softirq(void)
298 pending = local_softirq_pending(); 301 pending = local_softirq_pending();
299 302
300 if (pending) 303 if (pending)
301 __do_softirq(); 304 do_softirq_own_stack();
302 305
303 local_irq_restore(flags); 306 local_irq_restore(flags);
304} 307}
305 308
306#endif
307
308/* 309/*
309 * Enter an interrupt context. 310 * Enter an interrupt context.
310 */ 311 */
@@ -329,15 +330,21 @@ void irq_enter(void)
329static inline void invoke_softirq(void) 330static inline void invoke_softirq(void)
330{ 331{
331 if (!force_irqthreads) { 332 if (!force_irqthreads) {
333#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
332 /* 334 /*
333 * We can safely execute softirq on the current stack if 335 * We can safely execute softirq on the current stack if
334 * it is the irq stack, because it should be near empty 336 * it is the irq stack, because it should be near empty
335 * at this stage. But we have no way to know if the arch 337 * at this stage.
336 * calls irq_exit() on the irq stack. So call softirq
337 * in its own stack to prevent from any overrun on top
338 * of a potentially deep task stack.
339 */ 338 */
340 do_softirq(); 339 __do_softirq();
340#else
341 /*
 342 * Otherwise, irq_exit() is called on the task stack, which can
 343 * already be quite deep. So run the softirq on its own stack
 344 * to prevent any overrun.
345 */
346 do_softirq_own_stack();
347#endif
341 } else { 348 } else {
342 wakeup_softirqd(); 349 wakeup_softirqd();
343 } 350 }
@@ -369,7 +376,7 @@ void irq_exit(void)
369 376
370 account_irq_exit_time(current); 377 account_irq_exit_time(current);
371 trace_hardirq_exit(); 378 trace_hardirq_exit();
372 sub_preempt_count(HARDIRQ_OFFSET); 379 preempt_count_sub(HARDIRQ_OFFSET);
373 if (!in_interrupt() && local_softirq_pending()) 380 if (!in_interrupt() && local_softirq_pending())
374 invoke_softirq(); 381 invoke_softirq();
375 382
@@ -618,146 +625,17 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
618} 625}
619EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); 626EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
620 627
621/*
622 * Remote softirq bits
623 */
624
625DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
626EXPORT_PER_CPU_SYMBOL(softirq_work_list);
627
628static void __local_trigger(struct call_single_data *cp, int softirq)
629{
630 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
631
632 list_add_tail(&cp->list, head);
633
634 /* Trigger the softirq only if the list was previously empty. */
635 if (head->next == &cp->list)
636 raise_softirq_irqoff(softirq);
637}
638
639#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
640static void remote_softirq_receive(void *data)
641{
642 struct call_single_data *cp = data;
643 unsigned long flags;
644 int softirq;
645
646 softirq = *(int *)cp->info;
647 local_irq_save(flags);
648 __local_trigger(cp, softirq);
649 local_irq_restore(flags);
650}
651
652static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
653{
654 if (cpu_online(cpu)) {
655 cp->func = remote_softirq_receive;
656 cp->info = &softirq;
657 cp->flags = 0;
658
659 __smp_call_function_single(cpu, cp, 0);
660 return 0;
661 }
662 return 1;
663}
664#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
665static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
666{
667 return 1;
668}
669#endif
670
671/**
672 * __send_remote_softirq - try to schedule softirq work on a remote cpu
673 * @cp: private SMP call function data area
674 * @cpu: the remote cpu
675 * @this_cpu: the currently executing cpu
676 * @softirq: the softirq for the work
677 *
678 * Attempt to schedule softirq work on a remote cpu. If this cannot be
679 * done, the work is instead queued up on the local cpu.
680 *
681 * Interrupts must be disabled.
682 */
683void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
684{
685 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
686 __local_trigger(cp, softirq);
687}
688EXPORT_SYMBOL(__send_remote_softirq);
689
690/**
691 * send_remote_softirq - try to schedule softirq work on a remote cpu
692 * @cp: private SMP call function data area
693 * @cpu: the remote cpu
694 * @softirq: the softirq for the work
695 *
696 * Like __send_remote_softirq except that disabling interrupts and
697 * computing the current cpu is done for the caller.
698 */
699void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
700{
701 unsigned long flags;
702 int this_cpu;
703
704 local_irq_save(flags);
705 this_cpu = smp_processor_id();
706 __send_remote_softirq(cp, cpu, this_cpu, softirq);
707 local_irq_restore(flags);
708}
709EXPORT_SYMBOL(send_remote_softirq);
710
711static int remote_softirq_cpu_notify(struct notifier_block *self,
712 unsigned long action, void *hcpu)
713{
714 /*
715 * If a CPU goes away, splice its entries to the current CPU
716 * and trigger a run of the softirq
717 */
718 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
719 int cpu = (unsigned long) hcpu;
720 int i;
721
722 local_irq_disable();
723 for (i = 0; i < NR_SOFTIRQS; i++) {
724 struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
725 struct list_head *local_head;
726
727 if (list_empty(head))
728 continue;
729
730 local_head = &__get_cpu_var(softirq_work_list[i]);
731 list_splice_init(head, local_head);
732 raise_softirq_irqoff(i);
733 }
734 local_irq_enable();
735 }
736
737 return NOTIFY_OK;
738}
739
740static struct notifier_block remote_softirq_cpu_notifier = {
741 .notifier_call = remote_softirq_cpu_notify,
742};
743
744void __init softirq_init(void) 628void __init softirq_init(void)
745{ 629{
746 int cpu; 630 int cpu;
747 631
748 for_each_possible_cpu(cpu) { 632 for_each_possible_cpu(cpu) {
749 int i;
750
751 per_cpu(tasklet_vec, cpu).tail = 633 per_cpu(tasklet_vec, cpu).tail =
752 &per_cpu(tasklet_vec, cpu).head; 634 &per_cpu(tasklet_vec, cpu).head;
753 per_cpu(tasklet_hi_vec, cpu).tail = 635 per_cpu(tasklet_hi_vec, cpu).tail =
754 &per_cpu(tasklet_hi_vec, cpu).head; 636 &per_cpu(tasklet_hi_vec, cpu).head;
755 for (i = 0; i < NR_SOFTIRQS; i++)
756 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
757 } 637 }
758 638
759 register_hotcpu_notifier(&remote_softirq_cpu_notifier);
760
761 open_softirq(TASKLET_SOFTIRQ, tasklet_action); 639 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
762 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 640 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
763} 641}
@@ -771,6 +649,10 @@ static void run_ksoftirqd(unsigned int cpu)
771{ 649{
772 local_irq_disable(); 650 local_irq_disable();
773 if (local_softirq_pending()) { 651 if (local_softirq_pending()) {
652 /*
 653 * We can safely run softirqs inline on the current stack, as we
 654 * are not deep in the task stack here.
655 */
774 __do_softirq(); 656 __do_softirq();
775 rcu_note_context_switch(cpu); 657 rcu_note_context_switch(cpu);
776 local_irq_enable(); 658 local_irq_enable();
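
Since several hunks above rework the preempt-count bookkeeping behind local_bh_disable()/local_bh_enable(), here is a small, hypothetical example of the pattern those helpers exist for; it is illustration only, not code touched by this merge.

#include <linux/interrupt.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_stat);

static void demo_bump_stat(void)
{
        /* Keep softirqs (e.g. a tasklet that also updates demo_stat)
         * off this CPU while we touch the per-CPU counter. */
        local_bh_disable();
        __this_cpu_inc(demo_stat);
        local_bh_enable();
}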
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index c09f2955ae30..84571e09c907 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,6 +20,7 @@
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21#include <linux/smpboot.h> 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23#include <linux/lglock.h>
23 24
24/* 25/*
25 * Structure to determine completion condition and record errors. May 26 * Structure to determine completion condition and record errors. May
@@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); 44static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44static bool stop_machine_initialized = false; 45static bool stop_machine_initialized = false;
45 46
47/*
48 * Avoids a race between stop_two_cpus and global stop_cpus, where
49 * the stoppers could get queued up in reverse order, leading to
50 * system deadlock. Using an lglock means stop_two_cpus remains
51 * relatively cheap.
52 */
53DEFINE_STATIC_LGLOCK(stop_cpus_lock);
54
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 55static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
47{ 56{
48 memset(done, 0, sizeof(*done)); 57 memset(done, 0, sizeof(*done));
@@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
115 return done.executed ? done.ret : -ENOENT; 124 return done.executed ? done.ret : -ENOENT;
116} 125}
117 126
127/* This controls the threads on each CPU. */
128enum multi_stop_state {
129 /* Dummy starting state for thread. */
130 MULTI_STOP_NONE,
131 /* Awaiting everyone to be scheduled. */
132 MULTI_STOP_PREPARE,
133 /* Disable interrupts. */
134 MULTI_STOP_DISABLE_IRQ,
135 /* Run the function */
136 MULTI_STOP_RUN,
137 /* Exit */
138 MULTI_STOP_EXIT,
139};
140
141struct multi_stop_data {
142 int (*fn)(void *);
143 void *data;
144 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
145 unsigned int num_threads;
146 const struct cpumask *active_cpus;
147
148 enum multi_stop_state state;
149 atomic_t thread_ack;
150};
151
152static void set_state(struct multi_stop_data *msdata,
153 enum multi_stop_state newstate)
154{
155 /* Reset ack counter. */
156 atomic_set(&msdata->thread_ack, msdata->num_threads);
157 smp_wmb();
158 msdata->state = newstate;
159}
160
161/* Last one to ack a state moves to the next state. */
162static void ack_state(struct multi_stop_data *msdata)
163{
164 if (atomic_dec_and_test(&msdata->thread_ack))
165 set_state(msdata, msdata->state + 1);
166}
167
168/* This is the cpu_stop function which stops the CPU. */
169static int multi_cpu_stop(void *data)
170{
171 struct multi_stop_data *msdata = data;
172 enum multi_stop_state curstate = MULTI_STOP_NONE;
173 int cpu = smp_processor_id(), err = 0;
174 unsigned long flags;
175 bool is_active;
176
177 /*
178 * When called from stop_machine_from_inactive_cpu(), irq might
179 * already be disabled. Save the state and restore it on exit.
180 */
181 local_save_flags(flags);
182
183 if (!msdata->active_cpus)
184 is_active = cpu == cpumask_first(cpu_online_mask);
185 else
186 is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
187
188 /* Simple state machine */
189 do {
190 /* Chill out and ensure we re-read multi_stop_state. */
191 cpu_relax();
192 if (msdata->state != curstate) {
193 curstate = msdata->state;
194 switch (curstate) {
195 case MULTI_STOP_DISABLE_IRQ:
196 local_irq_disable();
197 hard_irq_disable();
198 break;
199 case MULTI_STOP_RUN:
200 if (is_active)
201 err = msdata->fn(msdata->data);
202 break;
203 default:
204 break;
205 }
206 ack_state(msdata);
207 }
208 } while (curstate != MULTI_STOP_EXIT);
209
210 local_irq_restore(flags);
211 return err;
212}
213
214struct irq_cpu_stop_queue_work_info {
215 int cpu1;
216 int cpu2;
217 struct cpu_stop_work *work1;
218 struct cpu_stop_work *work2;
219};
220
221/*
222 * This function is always run with irqs and preemption disabled.
223 * This guarantees that both work1 and work2 get queued, before
224 * our local migrate thread gets the chance to preempt us.
225 */
226static void irq_cpu_stop_queue_work(void *arg)
227{
228 struct irq_cpu_stop_queue_work_info *info = arg;
229 cpu_stop_queue_work(info->cpu1, info->work1);
230 cpu_stop_queue_work(info->cpu2, info->work2);
231}
232
233/**
234 * stop_two_cpus - stops two cpus
235 * @cpu1: the cpu to stop
236 * @cpu2: the other cpu to stop
237 * @fn: function to execute
238 * @arg: argument to @fn
239 *
240 * Stops both the current and specified CPU and runs @fn on one of them.
241 *
242 * returns when both are completed.
243 */
244int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
245{
246 struct cpu_stop_done done;
247 struct cpu_stop_work work1, work2;
248 struct irq_cpu_stop_queue_work_info call_args;
249 struct multi_stop_data msdata;
250
251 preempt_disable();
252 msdata = (struct multi_stop_data){
253 .fn = fn,
254 .data = arg,
255 .num_threads = 2,
256 .active_cpus = cpumask_of(cpu1),
257 };
258
259 work1 = work2 = (struct cpu_stop_work){
260 .fn = multi_cpu_stop,
261 .arg = &msdata,
262 .done = &done
263 };
264
265 call_args = (struct irq_cpu_stop_queue_work_info){
266 .cpu1 = cpu1,
267 .cpu2 = cpu2,
268 .work1 = &work1,
269 .work2 = &work2,
270 };
271
272 cpu_stop_init_done(&done, 2);
273 set_state(&msdata, MULTI_STOP_PREPARE);
274
275 /*
276 * If we observe both CPUs active we know _cpu_down() cannot yet have
277 * queued its stop_machine works and therefore ours will get executed
 278 * first. Or it's not either one of our CPUs that's getting unplugged,
279 * in which case we don't care.
280 *
281 * This relies on the stopper workqueues to be FIFO.
282 */
283 if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
284 preempt_enable();
285 return -ENOENT;
286 }
287
288 lg_local_lock(&stop_cpus_lock);
289 /*
290 * Queuing needs to be done by the lowest numbered CPU, to ensure
291 * that works are always queued in the same order on every CPU.
292 * This prevents deadlocks.
293 */
294 smp_call_function_single(min(cpu1, cpu2),
295 &irq_cpu_stop_queue_work,
296 &call_args, 0);
297 lg_local_unlock(&stop_cpus_lock);
298 preempt_enable();
299
300 wait_for_completion(&done.completion);
301
302 return done.executed ? done.ret : -ENOENT;
303}
304
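
The multi_cpu_stop() state machine above backs both stop_two_cpus() and the existing stop_machine() path. As a hedged illustration (names invented), a typical stop_machine() caller looks like the sketch below: the callback runs in the MULTI_STOP_RUN phase on one CPU while every other online CPU spins with interrupts disabled.

#include <linux/stop_machine.h>

static int demo_patch_step(void *data)
{
        unsigned long *value = data;

        /* All other online CPUs are parked in multi_cpu_stop() with
         * IRQs off, so this update cannot race with them. */
        *value ^= 1UL;
        return 0;
}

static int demo_apply(void)
{
        unsigned long value = 0;

        /* cpus == NULL: the callback runs on the first online CPU. */
        return stop_machine(demo_patch_step, &value, NULL);
}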
118/** 305/**
119 * stop_one_cpu_nowait - stop a cpu but don't wait for completion 306 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
120 * @cpu: cpu to stop 307 * @cpu: cpu to stop
@@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
159 * preempted by a stopper which might wait for other stoppers 346 * preempted by a stopper which might wait for other stoppers
160 * to enter @fn which can lead to deadlock. 347 * to enter @fn which can lead to deadlock.
161 */ 348 */
162 preempt_disable(); 349 lg_global_lock(&stop_cpus_lock);
163 for_each_cpu(cpu, cpumask) 350 for_each_cpu(cpu, cpumask)
164 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); 351 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
165 preempt_enable(); 352 lg_global_unlock(&stop_cpus_lock);
166} 353}
167 354
168static int __stop_cpus(const struct cpumask *cpumask, 355static int __stop_cpus(const struct cpumask *cpumask,
@@ -359,98 +546,14 @@ early_initcall(cpu_stop_init);
359 546
360#ifdef CONFIG_STOP_MACHINE 547#ifdef CONFIG_STOP_MACHINE
361 548
362/* This controls the threads on each CPU. */
363enum stopmachine_state {
364 /* Dummy starting state for thread. */
365 STOPMACHINE_NONE,
366 /* Awaiting everyone to be scheduled. */
367 STOPMACHINE_PREPARE,
368 /* Disable interrupts. */
369 STOPMACHINE_DISABLE_IRQ,
370 /* Run the function */
371 STOPMACHINE_RUN,
372 /* Exit */
373 STOPMACHINE_EXIT,
374};
375
376struct stop_machine_data {
377 int (*fn)(void *);
378 void *data;
379 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
380 unsigned int num_threads;
381 const struct cpumask *active_cpus;
382
383 enum stopmachine_state state;
384 atomic_t thread_ack;
385};
386
387static void set_state(struct stop_machine_data *smdata,
388 enum stopmachine_state newstate)
389{
390 /* Reset ack counter. */
391 atomic_set(&smdata->thread_ack, smdata->num_threads);
392 smp_wmb();
393 smdata->state = newstate;
394}
395
396/* Last one to ack a state moves to the next state. */
397static void ack_state(struct stop_machine_data *smdata)
398{
399 if (atomic_dec_and_test(&smdata->thread_ack))
400 set_state(smdata, smdata->state + 1);
401}
402
403/* This is the cpu_stop function which stops the CPU. */
404static int stop_machine_cpu_stop(void *data)
405{
406 struct stop_machine_data *smdata = data;
407 enum stopmachine_state curstate = STOPMACHINE_NONE;
408 int cpu = smp_processor_id(), err = 0;
409 unsigned long flags;
410 bool is_active;
411
412 /*
413 * When called from stop_machine_from_inactive_cpu(), irq might
414 * already be disabled. Save the state and restore it on exit.
415 */
416 local_save_flags(flags);
417
418 if (!smdata->active_cpus)
419 is_active = cpu == cpumask_first(cpu_online_mask);
420 else
421 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
422
423 /* Simple state machine */
424 do {
425 /* Chill out and ensure we re-read stopmachine_state. */
426 cpu_relax();
427 if (smdata->state != curstate) {
428 curstate = smdata->state;
429 switch (curstate) {
430 case STOPMACHINE_DISABLE_IRQ:
431 local_irq_disable();
432 hard_irq_disable();
433 break;
434 case STOPMACHINE_RUN:
435 if (is_active)
436 err = smdata->fn(smdata->data);
437 break;
438 default:
439 break;
440 }
441 ack_state(smdata);
442 }
443 } while (curstate != STOPMACHINE_EXIT);
444
445 local_irq_restore(flags);
446 return err;
447}
448
449int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 549int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
450{ 550{
451 struct stop_machine_data smdata = { .fn = fn, .data = data, 551 struct multi_stop_data msdata = {
452 .num_threads = num_online_cpus(), 552 .fn = fn,
453 .active_cpus = cpus }; 553 .data = data,
554 .num_threads = num_online_cpus(),
555 .active_cpus = cpus,
556 };
454 557
455 if (!stop_machine_initialized) { 558 if (!stop_machine_initialized) {
456 /* 559 /*
@@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
461 unsigned long flags; 564 unsigned long flags;
462 int ret; 565 int ret;
463 566
464 WARN_ON_ONCE(smdata.num_threads != 1); 567 WARN_ON_ONCE(msdata.num_threads != 1);
465 568
466 local_irq_save(flags); 569 local_irq_save(flags);
467 hard_irq_disable(); 570 hard_irq_disable();
@@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
472 } 575 }
473 576
474 /* Set the initial state and stop all online cpus. */ 577 /* Set the initial state and stop all online cpus. */
475 set_state(&smdata, STOPMACHINE_PREPARE); 578 set_state(&msdata, MULTI_STOP_PREPARE);
476 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 579 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
477} 580}
478 581
479int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 582int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
513int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, 616int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
514 const struct cpumask *cpus) 617 const struct cpumask *cpus)
515{ 618{
516 struct stop_machine_data smdata = { .fn = fn, .data = data, 619 struct multi_stop_data msdata = { .fn = fn, .data = data,
517 .active_cpus = cpus }; 620 .active_cpus = cpus };
518 struct cpu_stop_done done; 621 struct cpu_stop_done done;
519 int ret; 622 int ret;
520 623
521 /* Local CPU must be inactive and CPU hotplug in progress. */ 624 /* Local CPU must be inactive and CPU hotplug in progress. */
522 BUG_ON(cpu_active(raw_smp_processor_id())); 625 BUG_ON(cpu_active(raw_smp_processor_id()));
523 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ 626 msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
524 627
525 /* No proper task established and can't sleep - busy wait for lock. */ 628 /* No proper task established and can't sleep - busy wait for lock. */
526 while (!mutex_trylock(&stop_cpus_mutex)) 629 while (!mutex_trylock(&stop_cpus_mutex))
527 cpu_relax(); 630 cpu_relax();
528 631
529 /* Schedule work on other CPUs and execute directly for local CPU */ 632 /* Schedule work on other CPUs and execute directly for local CPU */
530 set_state(&smdata, STOPMACHINE_PREPARE); 633 set_state(&msdata, MULTI_STOP_PREPARE);
531 cpu_stop_init_done(&done, num_active_cpus()); 634 cpu_stop_init_done(&done, num_active_cpus());
532 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, 635 queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
533 &done); 636 &done);
534 ret = stop_machine_cpu_stop(&smdata); 637 ret = multi_cpu_stop(&msdata);
535 638
536 /* Busy wait for completion. */ 639 /* Busy wait for completion. */
537 while (!completion_done(&done.completion)) 640 while (!completion_done(&done.completion))
diff --git a/kernel/sys.c b/kernel/sys.c
index c18ecca575b4..c72311324ea7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,7 +16,6 @@
16#include <linux/perf_event.h> 16#include <linux/perf_event.h>
17#include <linux/resource.h> 17#include <linux/resource.h>
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/kexec.h>
20#include <linux/workqueue.h> 19#include <linux/workqueue.h>
21#include <linux/capability.h> 20#include <linux/capability.h>
22#include <linux/device.h> 21#include <linux/device.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2f06f3c6a3f..34a604726d0b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
190 190
191#ifdef CONFIG_MAGIC_SYSRQ 191#ifdef CONFIG_MAGIC_SYSRQ
 192/* Note: sysrq code uses its own private copy */ 192/* Note: sysrq code uses its own private copy */
193static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 193static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
194 194
195static int sysrq_sysctl_handler(ctl_table *table, int write, 195static int sysrq_sysctl_handler(ctl_table *table, int write,
196 void __user *buffer, size_t *lenp, 196 void __user *buffer, size_t *lenp,
@@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = {
371 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
372 }, 372 },
373 { 373 {
374 .procname = "numa_balancing_scan_period_reset",
375 .data = &sysctl_numa_balancing_scan_period_reset,
376 .maxlen = sizeof(unsigned int),
377 .mode = 0644,
378 .proc_handler = proc_dointvec,
379 },
380 {
381 .procname = "numa_balancing_scan_period_max_ms", 374 .procname = "numa_balancing_scan_period_max_ms",
382 .data = &sysctl_numa_balancing_scan_period_max, 375 .data = &sysctl_numa_balancing_scan_period_max,
383 .maxlen = sizeof(unsigned int), 376 .maxlen = sizeof(unsigned int),
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {
391 .mode = 0644, 384 .mode = 0644,
392 .proc_handler = proc_dointvec, 385 .proc_handler = proc_dointvec,
393 }, 386 },
387 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int),
398 .mode = 0644,
399 .proc_handler = proc_dointvec,
400 },
394#endif /* CONFIG_NUMA_BALANCING */ 401#endif /* CONFIG_NUMA_BALANCING */
395#endif /* CONFIG_SCHED_DEBUG */ 402#endif /* CONFIG_SCHED_DEBUG */
396 { 403 {
@@ -962,9 +969,10 @@ static struct ctl_table kern_table[] = {
962 { 969 {
963 .procname = "hung_task_check_count", 970 .procname = "hung_task_check_count",
964 .data = &sysctl_hung_task_check_count, 971 .data = &sysctl_hung_task_check_count,
965 .maxlen = sizeof(unsigned long), 972 .maxlen = sizeof(int),
966 .mode = 0644, 973 .mode = 0644,
967 .proc_handler = proc_doulongvec_minmax, 974 .proc_handler = proc_dointvec_minmax,
975 .extra1 = &zero,
968 }, 976 },
969 { 977 {
970 .procname = "hung_task_timeout_secs", 978 .procname = "hung_task_timeout_secs",
@@ -1049,6 +1057,7 @@ static struct ctl_table kern_table[] = {
1049 .maxlen = sizeof(sysctl_perf_event_sample_rate), 1057 .maxlen = sizeof(sysctl_perf_event_sample_rate),
1050 .mode = 0644, 1058 .mode = 0644,
1051 .proc_handler = perf_proc_update_handler, 1059 .proc_handler = perf_proc_update_handler,
1060 .extra1 = &one,
1052 }, 1061 },
1053 { 1062 {
1054 .procname = "perf_cpu_time_max_percent", 1063 .procname = "perf_cpu_time_max_percent",
@@ -2214,8 +2223,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2214 *i = val; 2223 *i = val;
2215 } else { 2224 } else {
2216 val = convdiv * (*i) / convmul; 2225 val = convdiv * (*i) / convmul;
2217 if (!first) 2226 if (!first) {
2218 err = proc_put_char(&buffer, &left, '\t'); 2227 err = proc_put_char(&buffer, &left, '\t');
2228 if (err)
2229 break;
2230 }
2219 err = proc_put_long(&buffer, &left, val, false); 2231 err = proc_put_long(&buffer, &left, val, false);
2220 if (err) 2232 if (err)
2221 break; 2233 break;
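
The hung_task_check_count conversion above (an unsigned long handler replaced by proc_dointvec_minmax with a zero floor) follows the usual pattern for a clamped integer sysctl. A minimal, hypothetical version of that pattern, using interfaces available in this kernel, is sketched below.

#include <linux/sysctl.h>

static int demo_count;
static int demo_zero;                   /* lower bound for .extra1 */

static struct ctl_table demo_table[] = {
        {
                .procname       = "demo_count",
                .data           = &demo_count,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &demo_zero,   /* reject negative writes */
        },
        { }
};

static struct ctl_table demo_dir[] = {
        {
                .procname       = "demo",
                .mode           = 0555,
                .child          = demo_table,
        },
        { }
};

static struct ctl_table_header *demo_header;

static int __init demo_sysctl_init(void)
{
        demo_header = register_sysctl_table(demo_dir);
        return demo_header ? 0 : -ENOMEM;
}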
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b609213ca9a2..653cbbd9e7ad 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1024,7 +1024,7 @@ static ssize_t bin_intvec(struct file *file,
1024 if (get_user(value, vec + i)) 1024 if (get_user(value, vec + i))
1025 goto out_kfree; 1025 goto out_kfree;
1026 1026
1027 str += snprintf(str, end - str, "%lu\t", value); 1027 str += scnprintf(str, end - str, "%lu\t", value);
1028 } 1028 }
1029 1029
1030 result = kernel_write(file, buffer, str - buffer, 0); 1030 result = kernel_write(file, buffer, str - buffer, 0);
@@ -1095,7 +1095,7 @@ static ssize_t bin_ulongvec(struct file *file,
1095 if (get_user(value, vec + i)) 1095 if (get_user(value, vec + i))
1096 goto out_kfree; 1096 goto out_kfree;
1097 1097
1098 str += snprintf(str, end - str, "%lu\t", value); 1098 str += scnprintf(str, end - str, "%lu\t", value);
1099 } 1099 }
1100 1100
1101 result = kernel_write(file, buffer, str - buffer, 0); 1101 result = kernel_write(file, buffer, str - buffer, 0);
@@ -1205,7 +1205,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1205 if (get_user(dnaddr, (__le16 __user *)newval)) 1205 if (get_user(dnaddr, (__le16 __user *)newval))
1206 goto out; 1206 goto out;
1207 1207
1208 len = snprintf(buf, sizeof(buf), "%hu.%hu", 1208 len = scnprintf(buf, sizeof(buf), "%hu.%hu",
1209 le16_to_cpu(dnaddr) >> 10, 1209 le16_to_cpu(dnaddr) >> 10,
1210 le16_to_cpu(dnaddr) & 0x3ff); 1210 le16_to_cpu(dnaddr) & 0x3ff);
1211 1211
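
The snprintf() to scnprintf() switches above matter because the two return different things: snprintf() reports the length the output would have had, so using its return value to advance a buffer cursor can walk past the end on truncation, while scnprintf() returns the number of characters actually stored. A hedged, stand-alone sketch of the safe pattern (invented names):

#include <linux/kernel.h>

static size_t demo_format_vec(char *buf, size_t size,
                              const unsigned long *vec, int n)
{
        char *str = buf;
        char *end = buf + size;
        int i;

        for (i = 0; i < n && str < end; i++) {
                /* scnprintf() never reports more than it wrote, so
                 * 'str' can never step past 'end'. */
                str += scnprintf(str, end - str, "%lu\t", vec[i]);
        }
        return str - buf;
}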
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
new file mode 100644
index 000000000000..3e9868d47535
--- /dev/null
+++ b/kernel/system_certificates.S
@@ -0,0 +1,20 @@
1#include <linux/export.h>
2#include <linux/init.h>
3
4 __INITRODATA
5
6 .align 8
7 .globl VMLINUX_SYMBOL(system_certificate_list)
8VMLINUX_SYMBOL(system_certificate_list):
9__cert_list_start:
10 .incbin "kernel/x509_certificate_list"
11__cert_list_end:
12
13 .align 8
14 .globl VMLINUX_SYMBOL(system_certificate_list_size)
15VMLINUX_SYMBOL(system_certificate_list_size):
16#ifdef CONFIG_64BIT
17 .quad __cert_list_end - __cert_list_start
18#else
19 .long __cert_list_end - __cert_list_start
20#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
new file mode 100644
index 000000000000..52ebc70263f4
--- /dev/null
+++ b/kernel/system_keyring.c
@@ -0,0 +1,105 @@
1/* System trusted keyring for trusted public keys
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/export.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/cred.h>
16#include <linux/err.h>
17#include <keys/asymmetric-type.h>
18#include <keys/system_keyring.h>
19#include "module-internal.h"
20
21struct key *system_trusted_keyring;
22EXPORT_SYMBOL_GPL(system_trusted_keyring);
23
24extern __initconst const u8 system_certificate_list[];
25extern __initconst const unsigned long system_certificate_list_size;
26
27/*
28 * Load the compiled-in keys
29 */
30static __init int system_trusted_keyring_init(void)
31{
32 pr_notice("Initialise system trusted keyring\n");
33
34 system_trusted_keyring =
35 keyring_alloc(".system_keyring",
36 KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
37 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
38 KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
39 KEY_ALLOC_NOT_IN_QUOTA, NULL);
40 if (IS_ERR(system_trusted_keyring))
41 panic("Can't allocate system trusted keyring\n");
42
43 set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
44 return 0;
45}
46
47/*
 48 * Must be initialised before we try to load the keys into the keyring.
49 */
50device_initcall(system_trusted_keyring_init);
51
52/*
53 * Load the compiled-in list of X.509 certificates.
54 */
55static __init int load_system_certificate_list(void)
56{
57 key_ref_t key;
58 const u8 *p, *end;
59 size_t plen;
60
61 pr_notice("Loading compiled-in X.509 certificates\n");
62
63 p = system_certificate_list;
64 end = p + system_certificate_list_size;
65 while (p < end) {
66 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
67 * than 256 bytes in size.
68 */
69 if (end - p < 4)
70 goto dodgy_cert;
71 if (p[0] != 0x30 &&
72 p[1] != 0x82)
73 goto dodgy_cert;
74 plen = (p[2] << 8) | p[3];
75 plen += 4;
76 if (plen > end - p)
77 goto dodgy_cert;
78
79 key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
80 "asymmetric",
81 NULL,
82 p,
83 plen,
84 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
85 KEY_USR_VIEW | KEY_USR_READ),
86 KEY_ALLOC_NOT_IN_QUOTA |
87 KEY_ALLOC_TRUSTED);
88 if (IS_ERR(key)) {
89 pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
90 PTR_ERR(key));
91 } else {
92 pr_notice("Loaded X.509 cert '%s'\n",
93 key_ref_to_ptr(key)->description);
94 key_ref_put(key);
95 }
96 p += plen;
97 }
98
99 return 0;
100
101dodgy_cert:
102 pr_err("Problem parsing in-kernel X.509 certificate list\n");
103 return 0;
104}
105late_initcall(load_system_certificate_list);
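
To make the length arithmetic in load_system_certificate_list() concrete, here is a stand-alone userspace check with a made-up certificate size: a DER SEQUENCE carrying 0x04a6 payload bytes starts with the header bytes 30 82 04 a6, and plen ends up covering the whole blob (payload plus the four header bytes) that is handed to key_create_or_update().

#include <stdio.h>

int main(void)
{
        /* Hypothetical first four bytes of a DER certificate. */
        const unsigned char p[4] = { 0x30, 0x82, 0x04, 0xa6 };
        unsigned int plen = (p[2] << 8) | p[3]; /* 0x04a6 = 1190 payload bytes */

        plen += 4;                              /* tag + length bytes */
        printf("blob handed to the keyring: %u bytes\n", plen); /* 1194 */
        return 0;
}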
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 145bb4d3bd4d..13d2f7cd65db 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,6 +290,7 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
290 struct listener_list *listeners; 290 struct listener_list *listeners;
291 struct listener *s, *tmp, *s2; 291 struct listener *s, *tmp, *s2;
292 unsigned int cpu; 292 unsigned int cpu;
293 int ret = 0;
293 294
294 if (!cpumask_subset(mask, cpu_possible_mask)) 295 if (!cpumask_subset(mask, cpu_possible_mask))
295 return -EINVAL; 296 return -EINVAL;
@@ -304,9 +305,10 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
304 for_each_cpu(cpu, mask) { 305 for_each_cpu(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), 306 s = kmalloc_node(sizeof(struct listener),
306 GFP_KERNEL, cpu_to_node(cpu)); 307 GFP_KERNEL, cpu_to_node(cpu));
307 if (!s) 308 if (!s) {
309 ret = -ENOMEM;
308 goto cleanup; 310 goto cleanup;
309 311 }
310 s->pid = pid; 312 s->pid = pid;
311 s->valid = 1; 313 s->valid = 1;
312 314
@@ -339,7 +341,7 @@ cleanup:
339 } 341 }
340 up_write(&listeners->sem); 342 up_write(&listeners->sem);
341 } 343 }
342 return 0; 344 return ret;
343} 345}
344 346
345static int parse(struct nlattr *na, struct cpumask *mask) 347static int parse(struct nlattr *na, struct cpumask *mask)
@@ -404,11 +406,15 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
404 if (!na) 406 if (!na)
405 goto err; 407 goto err;
406 408
407 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 409 if (nla_put(skb, type, sizeof(pid), &pid) < 0) {
410 nla_nest_cancel(skb, na);
408 goto err; 411 goto err;
412 }
409 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 413 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
410 if (!ret) 414 if (!ret) {
415 nla_nest_cancel(skb, na);
411 goto err; 416 goto err;
417 }
412 nla_nest_end(skb, na); 418 nla_nest_end(skb, na);
413 419
414 return nla_data(ret); 420 return nla_data(ret);
@@ -667,17 +673,18 @@ err:
667 nlmsg_free(rep_skb); 673 nlmsg_free(rep_skb);
668} 674}
669 675
670static struct genl_ops taskstats_ops = { 676static const struct genl_ops taskstats_ops[] = {
671 .cmd = TASKSTATS_CMD_GET, 677 {
672 .doit = taskstats_user_cmd, 678 .cmd = TASKSTATS_CMD_GET,
673 .policy = taskstats_cmd_get_policy, 679 .doit = taskstats_user_cmd,
674 .flags = GENL_ADMIN_PERM, 680 .policy = taskstats_cmd_get_policy,
675}; 681 .flags = GENL_ADMIN_PERM,
676 682 },
677static struct genl_ops cgroupstats_ops = { 683 {
678 .cmd = CGROUPSTATS_CMD_GET, 684 .cmd = CGROUPSTATS_CMD_GET,
679 .doit = cgroupstats_user_cmd, 685 .doit = cgroupstats_user_cmd,
680 .policy = cgroupstats_cmd_get_policy, 686 .policy = cgroupstats_cmd_get_policy,
687 },
681}; 688};
682 689
683/* Needed early in initialization */ 690/* Needed early in initialization */
@@ -696,26 +703,13 @@ static int __init taskstats_init(void)
696{ 703{
697 int rc; 704 int rc;
698 705
699 rc = genl_register_family(&family); 706 rc = genl_register_family_with_ops(&family, taskstats_ops);
700 if (rc) 707 if (rc)
701 return rc; 708 return rc;
702 709
703 rc = genl_register_ops(&family, &taskstats_ops);
704 if (rc < 0)
705 goto err;
706
707 rc = genl_register_ops(&family, &cgroupstats_ops);
708 if (rc < 0)
709 goto err_cgroup_ops;
710
711 family_registered = 1; 710 family_registered = 1;
712 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 711 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
713 return 0; 712 return 0;
714err_cgroup_ops:
715 genl_unregister_ops(&family, &taskstats_ops);
716err:
717 genl_unregister_family(&family);
718 return rc;
719} 713}
720 714
721/* 715/*
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 2b62fe86f9ec..3ce6e8c5f3fc 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -100,7 +100,7 @@ config NO_HZ_FULL
100 # RCU_USER_QS dependency 100 # RCU_USER_QS dependency
101 depends on HAVE_CONTEXT_TRACKING 101 depends on HAVE_CONTEXT_TRACKING
102 # VIRT_CPU_ACCOUNTING_GEN dependency 102 # VIRT_CPU_ACCOUNTING_GEN dependency
103 depends on 64BIT 103 depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
104 select NO_HZ_COMMON 104 select NO_HZ_COMMON
105 select RCU_USER_QS 105 select RCU_USER_QS
106 select RCU_NOCB_CPU 106 select RCU_NOCB_CPU
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index eec50fcef9e4..88c9c65a430d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; 490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
491 491
492 if (!alarmtimer_get_rtcdev()) 492 if (!alarmtimer_get_rtcdev())
493 return -ENOTSUPP; 493 return -EINVAL;
494 494
495 return hrtimer_get_res(baseid, tp); 495 return hrtimer_get_res(baseid, tp);
496} 496}
@@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; 507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
508 508
509 if (!alarmtimer_get_rtcdev()) 509 if (!alarmtimer_get_rtcdev())
510 return -ENOTSUPP; 510 return -EINVAL;
511 511
512 *tp = ktime_to_timespec(base->gettime()); 512 *tp = ktime_to_timespec(base->gettime());
513 return 0; 513 return 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 662c5798a685..086ad6043bcb 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -619,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
619 const char *buf, size_t count) 619 const char *buf, size_t count)
620{ 620{
621 char name[CS_NAME_LEN]; 621 char name[CS_NAME_LEN];
622 size_t ret = sysfs_get_uname(buf, name, count); 622 ssize_t ret = sysfs_get_uname(buf, name, count);
623 struct clock_event_device *ce; 623 struct clock_event_device *ce;
624 624
625 if (ret < 0) 625 if (ret < 0)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 50a8736757f3..ba3e502c955a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
479static inline void clocksource_resume_watchdog(void) { } 479static inline void clocksource_resume_watchdog(void) { }
480static inline int __clocksource_watchdog_kthread(void) { return 0; } 480static inline int __clocksource_watchdog_kthread(void) { return 0; }
481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } 481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
482void clocksource_mark_unstable(struct clocksource *cs) { }
482 483
483#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 484#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
484 485
@@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
537} 538}
538 539
539/** 540/**
540 * clocksource_max_deferment - Returns max time the clocksource can be deferred 541 * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
541 * @cs: Pointer to clocksource 542 * @mult: cycle to nanosecond multiplier
542 * 543 * @shift: cycle to nanosecond divisor (power of two)
544 * @maxadj: maximum adjustment value to mult (~11%)
545 * @mask: bitmask for two's complement subtraction of non 64 bit counters
543 */ 546 */
544static u64 clocksource_max_deferment(struct clocksource *cs) 547u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
545{ 548{
546 u64 max_nsecs, max_cycles; 549 u64 max_nsecs, max_cycles;
547 550
548 /* 551 /*
549 * Calculate the maximum number of cycles that we can pass to the 552 * Calculate the maximum number of cycles that we can pass to the
550 * cyc2ns function without overflowing a 64-bit signed result. The 553 * cyc2ns function without overflowing a 64-bit signed result. The
551 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) 554 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
552 * which is equivalent to the below. 555 * which is equivalent to the below.
553 * max_cycles < (2^63)/(cs->mult + cs->maxadj) 556 * max_cycles < (2^63)/(mult + maxadj)
554 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) 557 * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
555 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) 558 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
556 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) 559 * max_cycles < 2^(63 - log2(mult + maxadj))
557 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) 560 * max_cycles < 1 << (63 - log2(mult + maxadj))
558 * Please note that we add 1 to the result of the log2 to account for 561 * Please note that we add 1 to the result of the log2 to account for
559 * any rounding errors, ensure the above inequality is satisfied and 562 * any rounding errors, ensure the above inequality is satisfied and
560 * no overflow will occur. 563 * no overflow will occur.
561 */ 564 */
562 max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); 565 max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
563 566
564 /* 567 /*
565 * The actual maximum number of cycles we can defer the clocksource is 568 * The actual maximum number of cycles we can defer the clocksource is
566 * determined by the minimum of max_cycles and cs->mask. 569 * determined by the minimum of max_cycles and mask.
567 * Note: Here we subtract the maxadj to make sure we don't sleep for 570 * Note: Here we subtract the maxadj to make sure we don't sleep for
568 * too long if there's a large negative adjustment. 571 * too long if there's a large negative adjustment.
569 */ 572 */
570 max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 573 max_cycles = min(max_cycles, mask);
571 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, 574 max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
572 cs->shift); 575
576 return max_nsecs;
577}
578
579/**
580 * clocksource_max_deferment - Returns max time the clocksource can be deferred
581 * @cs: Pointer to clocksource
582 *
583 */
584static u64 clocksource_max_deferment(struct clocksource *cs)
585{
586 u64 max_nsecs;
573 587
588 max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
589 cs->mask);
574 /* 590 /*
575 * To ensure that the clocksource does not wrap whilst we are idle, 591 * To ensure that the clocksource does not wrap whilst we are idle,
576 * limit the time the clocksource can be deferred by 12.5%. Please 592 * limit the time the clocksource can be deferred by 12.5%. Please
@@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev,
893 return count; 909 return count;
894} 910}
895 911
896size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) 912ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
897{ 913{
898 size_t ret = cnt; 914 size_t ret = cnt;
899 915
@@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
924 struct device_attribute *attr, 940 struct device_attribute *attr,
925 const char *buf, size_t count) 941 const char *buf, size_t count)
926{ 942{
927 size_t ret; 943 ssize_t ret;
928 944
929 mutex_lock(&clocksource_mutex); 945 mutex_lock(&clocksource_mutex);
930 946
@@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
952{ 968{
953 struct clocksource *cs; 969 struct clocksource *cs;
954 char name[CS_NAME_LEN]; 970 char name[CS_NAME_LEN];
955 size_t ret; 971 ssize_t ret;
956 972
957 ret = sysfs_get_uname(buf, name, count); 973 ret = sysfs_get_uname(buf, name, count);
958 if (ret < 0) 974 if (ret < 0)
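
The clocks_calc_max_nsecs() derivation above can be sanity-checked outside the kernel. The stand-alone program below redoes the same arithmetic for an invented clocksource: a 1 MHz, 32-bit counter with mult = 1000 << 20, shift = 20, and an ~11% maxadj (mirroring clocksource_max_adjustment()). It lands at a deferment bound of roughly an hour, which matches the ~71-minute wrap time of such a counter minus the adjustment margin.

#include <stdint.h>
#include <stdio.h>

static int ilog2_u64(uint64_t v)
{
        int l = -1;

        while (v) {
                v >>= 1;
                l++;
        }
        return l;
}

int main(void)
{
        uint32_t shift = 20;
        uint32_t mult = 1000u << shift;                 /* 1 cycle == 1000 ns */
        uint32_t maxadj = (uint32_t)(mult * 11ULL / 100);
        uint64_t mask = 0xffffffffULL;                  /* 32-bit counter */
        uint64_t max_cycles, max_nsecs;

        /* max_cycles < 2^(63 - log2(mult + maxadj)), then clip to the mask. */
        max_cycles = 1ULL << (63 - (ilog2_u64(mult + maxadj) + 1));
        if (max_cycles > mask)
                max_cycles = mask;

        /* Convert with (mult - maxadj) to stay safe under adjustment. */
        max_nsecs = (max_cycles * (mult - maxadj)) >> shift;

        printf("max deferment: ~%llu s\n",
               (unsigned long long)(max_nsecs / 1000000000ULL));
        return 0;
}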
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index bb2215174f05..af8d1d4f3d55 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)
475 * called as close as possible to 500 ms before the new second starts. 475 * called as close as possible to 500 ms before the new second starts.
476 * This code is run on a timer. If the clock is set, that timer 476 * This code is run on a timer. If the clock is set, that timer
477 * may not expire at the correct time. Thus, we adjust... 477 * may not expire at the correct time. Thus, we adjust...
 478 * We want the clock to be within a couple of ticks of the target.
478 */ 479 */
479 if (!ntp_synced()) { 480 if (!ntp_synced()) {
480 /* 481 /*
@@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work)
485 } 486 }
486 487
487 getnstimeofday(&now); 488 getnstimeofday(&now);
488 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { 489 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
489 struct timespec adjust = now; 490 struct timespec adjust = now;
490 491
491 fail = -ENODEV; 492 fail = -ENODEV;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0b479a6a22bb..0abb36464281 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,25 +8,28 @@
8#include <linux/clocksource.h> 8#include <linux/clocksource.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/jiffies.h> 10#include <linux/jiffies.h>
11#include <linux/ktime.h>
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/syscore_ops.h> 15#include <linux/syscore_ops.h>
15#include <linux/timer.h> 16#include <linux/hrtimer.h>
16#include <linux/sched_clock.h> 17#include <linux/sched_clock.h>
18#include <linux/seqlock.h>
19#include <linux/bitops.h>
17 20
18struct clock_data { 21struct clock_data {
22 ktime_t wrap_kt;
19 u64 epoch_ns; 23 u64 epoch_ns;
20 u32 epoch_cyc; 24 u64 epoch_cyc;
21 u32 epoch_cyc_copy; 25 seqcount_t seq;
22 unsigned long rate; 26 unsigned long rate;
23 u32 mult; 27 u32 mult;
24 u32 shift; 28 u32 shift;
25 bool suspended; 29 bool suspended;
26}; 30};
27 31
28static void sched_clock_poll(unsigned long wrap_ticks); 32static struct hrtimer sched_clock_timer;
29static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
30static int irqtime = -1; 33static int irqtime = -1;
31 34
32core_param(irqtime, irqtime, int, 0400); 35core_param(irqtime, irqtime, int, 0400);
@@ -35,42 +38,46 @@ static struct clock_data cd = {
35 .mult = NSEC_PER_SEC / HZ, 38 .mult = NSEC_PER_SEC / HZ,
36}; 39};
37 40
38static u32 __read_mostly sched_clock_mask = 0xffffffff; 41static u64 __read_mostly sched_clock_mask;
39 42
40static u32 notrace jiffy_sched_clock_read(void) 43static u64 notrace jiffy_sched_clock_read(void)
41{ 44{
42 return (u32)(jiffies - INITIAL_JIFFIES); 45 /*
46 * We don't need to use get_jiffies_64 on 32-bit arches here
47 * because we register with BITS_PER_LONG
48 */
49 return (u64)(jiffies - INITIAL_JIFFIES);
43} 50}
44 51
45static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 52static u32 __read_mostly (*read_sched_clock_32)(void);
53
54static u64 notrace read_sched_clock_32_wrapper(void)
55{
56 return read_sched_clock_32();
57}
58
59static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
46 60
47static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 61static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
48{ 62{
49 return (cyc * mult) >> shift; 63 return (cyc * mult) >> shift;
50} 64}
51 65
52static unsigned long long notrace sched_clock_32(void) 66unsigned long long notrace sched_clock(void)
53{ 67{
54 u64 epoch_ns; 68 u64 epoch_ns;
55 u32 epoch_cyc; 69 u64 epoch_cyc;
56 u32 cyc; 70 u64 cyc;
71 unsigned long seq;
57 72
58 if (cd.suspended) 73 if (cd.suspended)
59 return cd.epoch_ns; 74 return cd.epoch_ns;
60 75
61 /*
62 * Load the epoch_cyc and epoch_ns atomically. We do this by
63 * ensuring that we always write epoch_cyc, epoch_ns and
64 * epoch_cyc_copy in strict order, and read them in strict order.
65 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
66 * the middle of an update, and we should repeat the load.
67 */
68 do { 76 do {
77 seq = raw_read_seqcount_begin(&cd.seq);
69 epoch_cyc = cd.epoch_cyc; 78 epoch_cyc = cd.epoch_cyc;
70 smp_rmb();
71 epoch_ns = cd.epoch_ns; 79 epoch_ns = cd.epoch_ns;
72 smp_rmb(); 80 } while (read_seqcount_retry(&cd.seq, seq));
73 } while (epoch_cyc != cd.epoch_cyc_copy);
74 81
75 cyc = read_sched_clock(); 82 cyc = read_sched_clock();
76 cyc = (cyc - epoch_cyc) & sched_clock_mask; 83 cyc = (cyc - epoch_cyc) & sched_clock_mask;
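
The hunk above replaces the hand-rolled epoch_cyc/epoch_cyc_copy double read with a seqcount. Stripped of the sched_clock specifics, the reader/writer pattern it now relies on looks like the hedged sketch below (all names invented).

#include <linux/seqlock.h>
#include <linux/types.h>

struct demo_epoch {
        seqcount_t seq;
        u64 cyc;
        u64 ns;
};

static struct demo_epoch demo;

/* Writer: callers already serialize against each other with IRQs off. */
static void demo_publish(u64 cyc, u64 ns)
{
        raw_write_seqcount_begin(&demo.seq);
        demo.cyc = cyc;
        demo.ns = ns;
        raw_write_seqcount_end(&demo.seq);
}

/* Reader: retry until a consistent (cyc, ns) pair is observed. */
static void demo_snapshot(u64 *cyc, u64 *ns)
{
        unsigned long seq;

        do {
                seq = raw_read_seqcount_begin(&demo.seq);
                *cyc = demo.cyc;
                *ns = demo.ns;
        } while (read_seqcount_retry(&demo.seq, seq));
}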
@@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void)
83static void notrace update_sched_clock(void) 90static void notrace update_sched_clock(void)
84{ 91{
85 unsigned long flags; 92 unsigned long flags;
86 u32 cyc; 93 u64 cyc;
87 u64 ns; 94 u64 ns;
88 95
89 cyc = read_sched_clock(); 96 cyc = read_sched_clock();
90 ns = cd.epoch_ns + 97 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, 98 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift); 99 cd.mult, cd.shift);
93 /* 100
94 * Write epoch_cyc and epoch_ns in a way that the update is
95 * detectable in cyc_to_fixed_sched_clock().
96 */
97 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
98 cd.epoch_cyc_copy = cyc; 102 raw_write_seqcount_begin(&cd.seq);
99 smp_wmb();
100 cd.epoch_ns = ns; 103 cd.epoch_ns = ns;
101 smp_wmb();
102 cd.epoch_cyc = cyc; 104 cd.epoch_cyc = cyc;
105 raw_write_seqcount_end(&cd.seq);
103 raw_local_irq_restore(flags); 106 raw_local_irq_restore(flags);
104} 107}
105 108
106static void sched_clock_poll(unsigned long wrap_ticks) 109static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
107{ 110{
108 mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
109 update_sched_clock(); 111 update_sched_clock();
112 hrtimer_forward_now(hrt, cd.wrap_kt);
113 return HRTIMER_RESTART;
110} 114}
111 115
112void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) 116void __init sched_clock_register(u64 (*read)(void), int bits,
117 unsigned long rate)
113{ 118{
114 unsigned long r, w; 119 unsigned long r;
115 u64 res, wrap; 120 u64 res, wrap;
116 char r_unit; 121 char r_unit;
117 122
118 if (cd.rate > rate) 123 if (cd.rate > rate)
119 return; 124 return;
120 125
121 BUG_ON(bits > 32);
122 WARN_ON(!irqs_disabled()); 126 WARN_ON(!irqs_disabled());
123 read_sched_clock = read; 127 read_sched_clock = read;
124 sched_clock_mask = (1ULL << bits) - 1; 128 sched_clock_mask = CLOCKSOURCE_MASK(bits);
125 cd.rate = rate; 129 cd.rate = rate;
126 130
127 /* calculate the mult/shift to convert counter ticks to ns. */ 131 /* calculate the mult/shift to convert counter ticks to ns. */
128 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); 132 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
129 133
130 r = rate; 134 r = rate;
131 if (r >= 4000000) { 135 if (r >= 4000000) {
@@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
138 r_unit = ' '; 142 r_unit = ' ';
139 143
140 /* calculate how many ns until we wrap */ 144 /* calculate how many ns until we wrap */
141 wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); 145 wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
142 do_div(wrap, NSEC_PER_MSEC); 146 cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
143 w = wrap;
144 147
145 /* calculate the ns resolution of this counter */ 148 /* calculate the ns resolution of this counter */
146 res = cyc_to_ns(1ULL, cd.mult, cd.shift); 149 res = cyc_to_ns(1ULL, cd.mult, cd.shift);
147 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", 150 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
148 bits, r, r_unit, res, w); 151 bits, r, r_unit, res, wrap);
149 152
150 /*
151 * Start the timer to keep sched_clock() properly updated and
152 * sets the initial epoch.
153 */
154 sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
155 update_sched_clock(); 153 update_sched_clock();
156 154
157 /* 155 /*
@@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
166 pr_debug("Registered %pF as sched_clock source\n", read); 164 pr_debug("Registered %pF as sched_clock source\n", read);
167} 165}
168 166
169unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; 167void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
170
171unsigned long long notrace sched_clock(void)
172{ 168{
173 return sched_clock_func(); 169 read_sched_clock_32 = read;
170 sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
174} 171}
175 172
176void __init sched_clock_postinit(void) 173void __init sched_clock_postinit(void)
@@ -180,14 +177,22 @@ void __init sched_clock_postinit(void)
 180 * make it the final one one. 177 * make it the final one one.
 181 */ 178 */
 182 if (read_sched_clock == jiffy_sched_clock_read) 179 if (read_sched_clock == jiffy_sched_clock_read)
183 setup_sched_clock(jiffy_sched_clock_read, 32, HZ); 180 sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
184 181
185 sched_clock_poll(sched_clock_timer.data); 182 update_sched_clock();
183
184 /*
185 * Start the timer to keep sched_clock() properly updated and
186 * sets the initial epoch.
187 */
188 hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
189 sched_clock_timer.function = sched_clock_poll;
190 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
186} 191}
187 192
188static int sched_clock_suspend(void) 193static int sched_clock_suspend(void)
189{ 194{
190 sched_clock_poll(sched_clock_timer.data); 195 sched_clock_poll(&sched_clock_timer);
191 cd.suspended = true; 196 cd.suspended = true;
192 return 0; 197 return 0;
193} 198}
@@ -195,7 +200,6 @@ static int sched_clock_suspend(void)
195static void sched_clock_resume(void) 200static void sched_clock_resume(void)
196{ 201{
197 cd.epoch_cyc = read_sched_clock(); 202 cd.epoch_cyc = read_sched_clock();
198 cd.epoch_cyc_copy = cd.epoch_cyc;
199 cd.suspended = false; 203 cd.suspended = false;
200} 204}
201 205
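Taken together, these hunks turn the old setup_sched_clock() into a thin 32-bit wrapper around the new sched_clock_register(): the read callback is now u64, the mask comes from CLOCKSOURCE_MASK(bits), and the refresh period is derived from clocks_calc_max_nsecs() and armed on an hrtimer at 7/8 of the wrap time instead of a jiffies timer. The underlying conversion is unchanged: mask the cycle delta to the counter width, then scale by a precomputed mult/shift pair. A small standalone C example of that arithmetic, with an assumed 56-bit counter at 24 MHz; the mult/shift values are hand-picked for the example, not produced by clocks_calc_mult_shift():

#include <stdint.h>
#include <stdio.h>

/* cyc_to_ns(): cycles -> nanoseconds via fixed-point mult/shift. */
static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
    return (cyc * mult) >> shift;
}

int main(void)
{
    const uint64_t mask  = (1ULL << 56) - 1;   /* CLOCKSOURCE_MASK(56)        */
    const uint32_t mult  = 2796202667u;        /* ~ (1e9 / 24e6) << 26        */
    const uint32_t shift = 26;

    uint64_t epoch_cyc = mask - 10;            /* epoch taken just before wrap */
    uint64_t now_cyc   = 5;                    /* counter has wrapped          */

    /* Masking keeps a wrapped counter looking like a small forward delta. */
    uint64_t delta = (now_cyc - epoch_cyc) & mask;

    printf("delta cycles = %llu\n", (unsigned long long)delta);           /* 16   */
    printf("delta ns     = %llu\n",
           (unsigned long long)cyc_to_ns(delta, mult, shift));            /* ~666 */
    return 0;
}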
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 218bcb565fed..9532690daaa9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
 70 struct clock_event_device *newdev) 70 struct clock_event_device *newdev)
 71{ 71{
72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || 72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
73 (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
73 (newdev->features & CLOCK_EVT_FEAT_C3STOP)) 74 (newdev->features & CLOCK_EVT_FEAT_C3STOP))
74 return false; 75 return false;
75 76
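The one-line change above widens the broadcast-device filter: a clock_event_device flagged CLOCK_EVT_FEAT_PERCPU can only service its own CPU, so tick_check_broadcast_device() now rejects it alongside dummy and C3STOP devices. A reduced sketch of that feature-bit screening; the struct and flag values below are stand-ins, not the kernel's definitions:

#include <stdbool.h>
#include <stdio.h>

#define FEAT_DUMMY   0x1   /* placeholder flag values */
#define FEAT_PERCPU  0x2
#define FEAT_C3STOP  0x4

struct clock_event_dev {
    const char *name;
    unsigned int features;
};

static bool suitable_for_broadcast(const struct clock_event_dev *dev)
{
    /* Any of these features disqualifies the device as a broadcast source. */
    return !(dev->features & (FEAT_DUMMY | FEAT_PERCPU | FEAT_C3STOP));
}

int main(void)
{
    struct clock_event_dev percpu_timer = { "per-cpu timer", FEAT_PERCPU };
    struct clock_event_dev global_timer = { "global timer", 0 };

    printf("%s: %s\n", percpu_timer.name,
           suitable_for_broadcast(&percpu_timer) ? "usable" : "rejected");
    printf("%s: %s\n", global_timer.name,
           suitable_for_broadcast(&global_timer) ? "usable" : "rejected");
    return 0;
}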
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 64522ecdfe0e..162b03ab0ad2 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33 */ 33 */
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36
37/*
38 * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
39 * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
40 * variable has two functions:
41 *
42 * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the
43 * timekeeping lock all at once. Only the CPU which is assigned to do the
44 * update is handling it.
45 *
46 * 2) Hand off the duty in the NOHZ idle case by setting the value to
47 * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks
48 * at it will take over and keep the time keeping alive. The handover
49 * procedure also covers cpu hotplug.
50 */
 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 51int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
 37 52
 38/* 53/*
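The new comment documents the two jobs of tick_do_timer_cpu: it pins the do_timer() duty to a single CPU so there is no stampede on the timekeeping lock, and under NOHZ the owner can write TICK_DO_TIMER_NONE so the next ticking CPU takes over. A compressed illustration of that handover; the real code just assigns and compares inside the tick path, the atomics and names here only keep the toy example self-contained:

#include <stdatomic.h>
#include <stdio.h>

#define DO_TIMER_NONE (-1)              /* stand-in for TICK_DO_TIMER_NONE */

static _Atomic int do_timer_cpu = 0;    /* boot CPU owns the duty initially */

/* Called from each CPU's tick: claim the duty if nobody owns it. */
static void tick_on_cpu(int cpu)
{
    int none = DO_TIMER_NONE;

    if (atomic_compare_exchange_strong(&do_timer_cpu, &none, cpu))
        printf("cpu%d takes over timekeeping\n", cpu);

    if (atomic_load(&do_timer_cpu) == cpu)
        printf("cpu%d runs do_timer()\n", cpu);
}

/* Called when the owning CPU goes idle under NOHZ: hand the duty off. */
static void enter_idle(int cpu)
{
    int self = cpu;

    if (atomic_compare_exchange_strong(&do_timer_cpu, &self, DO_TIMER_NONE))
        printf("cpu%d hands the duty off\n", cpu);
}

int main(void)
{
    tick_on_cpu(0);    /* cpu0 owns it from boot           */
    enter_idle(0);     /* cpu0 idles, duty becomes NONE    */
    tick_on_cpu(2);    /* next ticking CPU picks it up     */
    return 0;
}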
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bc906cad709b..18e71f7fbc2a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev);
31 31
32extern void clockevents_shutdown(struct clock_event_device *dev); 32extern void clockevents_shutdown(struct clock_event_device *dev);
33 33
34extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); 34extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
35 35
36/* 36/*
37 * NO_HZ / high resolution timer shared code 37 * NO_HZ / high resolution timer shared code
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3612fc77f834..ea20f7d1ac2c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -361,8 +361,8 @@ void __init tick_nohz_init(void)
361/* 361/*
362 * NO HZ enabled ? 362 * NO HZ enabled ?
363 */ 363 */
364int tick_nohz_enabled __read_mostly = 1; 364static int tick_nohz_enabled __read_mostly = 1;
365 365int tick_nohz_active __read_mostly;
366/* 366/*
367 * Enable / Disable tickless mode 367 * Enable / Disable tickless mode
368 */ 368 */
@@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
465 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 465 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
466 ktime_t now, idle; 466 ktime_t now, idle;
467 467
468 if (!tick_nohz_enabled) 468 if (!tick_nohz_active)
469 return -1; 469 return -1;
470 470
471 now = ktime_get(); 471 now = ktime_get();
@@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
506 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 506 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
507 ktime_t now, iowait; 507 ktime_t now, iowait;
508 508
509 if (!tick_nohz_enabled) 509 if (!tick_nohz_active)
510 return -1; 510 return -1;
511 511
512 now = ktime_get(); 512 now = ktime_get();
@@ -711,8 +711,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
711 return false; 711 return false;
712 } 712 }
713 713
714 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 714 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
715 ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
715 return false; 716 return false;
717 }
716 718
717 if (need_resched()) 719 if (need_resched())
718 return false; 720 return false;
@@ -799,11 +801,6 @@ void tick_nohz_idle_enter(void)
799 local_irq_disable(); 801 local_irq_disable();
800 802
801 ts = &__get_cpu_var(tick_cpu_sched); 803 ts = &__get_cpu_var(tick_cpu_sched);
802 /*
803 * set ts->inidle unconditionally. even if the system did not
804 * switch to nohz mode the cpu frequency governers rely on the
805 * update of the idle time accounting in tick_nohz_start_idle().
806 */
807 ts->inidle = 1; 804 ts->inidle = 1;
808 __tick_nohz_idle_enter(ts); 805 __tick_nohz_idle_enter(ts);
809 806
@@ -973,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void)
973 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 970 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
974 ktime_t next; 971 ktime_t next;
975 972
976 if (!tick_nohz_enabled) 973 if (!tick_nohz_active)
977 return; 974 return;
978 975
979 local_irq_disable(); 976 local_irq_disable();
@@ -981,7 +978,7 @@ static void tick_nohz_switch_to_nohz(void)
981 local_irq_enable(); 978 local_irq_enable();
982 return; 979 return;
983 } 980 }
984 981 tick_nohz_active = 1;
985 ts->nohz_mode = NOHZ_MODE_LOWRES; 982 ts->nohz_mode = NOHZ_MODE_LOWRES;
986 983
987 /* 984 /*
@@ -1139,8 +1136,10 @@ void tick_setup_sched_timer(void)
1139 } 1136 }
1140 1137
1141#ifdef CONFIG_NO_HZ_COMMON 1138#ifdef CONFIG_NO_HZ_COMMON
1142 if (tick_nohz_enabled) 1139 if (tick_nohz_enabled) {
1143 ts->nohz_mode = NOHZ_MODE_HIGHRES; 1140 ts->nohz_mode = NOHZ_MODE_HIGHRES;
1141 tick_nohz_active = 1;
1142 }
1144#endif 1143#endif
1145} 1144}
1146#endif /* HIGH_RES_TIMERS */ 1145#endif /* HIGH_RES_TIMERS */
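The tick-sched hunks split one flag into two: tick_nohz_enabled (now static) only records that NOHZ was configured or requested, while the new tick_nohz_active is set once a CPU actually switches into low-res or high-res NOHZ mode. The idle/iowait accounting helpers and the switch-over path now key off the latter, so they only report NOHZ-based accounting once it is really in use. A small sketch of that two-flag gating; names shortened and values made up:

#include <stdbool.h>
#include <stdio.h>

static bool nohz_enabled = true;   /* "nohz=on": the feature is requested  */
static bool nohz_active;           /* set only once a CPU really runs NOHZ */

static void switch_to_nohz(void)
{
    if (!nohz_enabled)
        return;
    /* ... mode setup would happen here ... */
    nohz_active = true;
}

static long get_cpu_idle_time_us(void)
{
    /* Accounting is only meaningful after the switch actually happened. */
    if (!nohz_active)
        return -1;
    return 123456;                 /* placeholder idle time */
}

int main(void)
{
    printf("before switch: %ld\n", get_cpu_idle_time_us());   /* -1     */
    switch_to_nohz();
    printf("after switch:  %ld\n", get_cpu_idle_time_us());   /* 123456 */
    return 0;
}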
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 947ba25a95a0..87b4f00284c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
1347 tk->xtime_nsec -= remainder; 1347 tk->xtime_nsec -= remainder;
1348 tk->xtime_nsec += 1ULL << tk->shift; 1348 tk->xtime_nsec += 1ULL << tk->shift;
1349 tk->ntp_error += remainder << tk->ntp_error_shift; 1349 tk->ntp_error += remainder << tk->ntp_error_shift;
1350 1350 tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
1351} 1351}
1352#else 1352#else
1353#define old_vsyscall_fixup(tk) 1353#define old_vsyscall_fixup(tk)
@@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1613 * ktime_get_update_offsets - hrtimer helper 1613 * ktime_get_update_offsets - hrtimer helper
1614 * @offs_real: pointer to storage for monotonic -> realtime offset 1614 * @offs_real: pointer to storage for monotonic -> realtime offset
1615 * @offs_boot: pointer to storage for monotonic -> boottime offset 1615 * @offs_boot: pointer to storage for monotonic -> boottime offset
1616 * @offs_tai: pointer to storage for monotonic -> clock tai offset
1616 * 1617 *
1617 * Returns current monotonic time and updates the offsets 1618 * Returns current monotonic time and updates the offsets
1618 * Called from hrtimer_interupt() or retrigger_next_event() 1619 * Called from hrtimer_interrupt() or retrigger_next_event()
1619 */ 1620 */
1620ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, 1621ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
1621 ktime_t *offs_tai) 1622 ktime_t *offs_tai)
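The old_vsyscall_fixup() change is a correctness fix: the helper rounds xtime_nsec up to the next whole (1 << shift) unit for the legacy vsyscall path, and the added line now also charges that injected unit back to ntp_error, so the round-up is paid back by the NTP adjustment machinery instead of accumulating as drift. The arithmetic, reduced to plain integers with invented values (field names kept from struct timekeeper):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t shift = 8;                 /* assumed clocksource shift          */
    uint32_t ntp_error_shift = 4;       /* assumed extra ntp_error precision  */
    uint64_t xtime_nsec = 1000;         /* shifted ns, deliberately unaligned */
    int64_t  ntp_error = 0;

    /* Round xtime_nsec up to a whole 1 << shift boundary ... */
    uint64_t remainder = xtime_nsec & ((1ULL << shift) - 1);
    xtime_nsec -= remainder;
    xtime_nsec += 1ULL << shift;

    /* ... and record both the removed remainder and the injected unit in
     * ntp_error, so the adjustment nets out instead of drifting. */
    ntp_error += remainder << ntp_error_shift;
    ntp_error -= (1ULL << shift) << ntp_error_shift;    /* the added line */

    printf("xtime_nsec=%llu ntp_error=%lld\n",
           (unsigned long long)xtime_nsec, (long long)ntp_error);
    /* xtime_nsec=1024 (4 * 256), ntp_error=-384 = (232 - 256) << 4 */
    return 0;
}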
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 0b537f27b559..1fb08f21302e 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
298 period = ktime_to_timespec(time); 298 period = ktime_to_timespec(time);
299 ms = period.tv_nsec / 1000000; 299 ms = period.tv_nsec / 1000000;
300 300
301 seq_puts(m, "Timer Stats Version: v0.2\n"); 301 seq_puts(m, "Timer Stats Version: v0.3\n");
302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); 302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
303 if (atomic_read(&overflow_count)) 303 if (atomic_read(&overflow_count))
304 seq_printf(m, "Overflow: %d entries\n", 304 seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
305 atomic_read(&overflow_count)); 305 seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
306 306
307 for (i = 0; i < nr_entries; i++) { 307 for (i = 0; i < nr_entries; i++) {
308 entry = entries + i; 308 entry = entries + i;
309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { 309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
310 seq_printf(m, "%4luD, %5d %-16s ", 310 seq_printf(m, "%4luD, %5d %-16s ",
311 entry->count, entry->pid, entry->comm); 311 entry->count, entry->pid, entry->comm);
312 } else { 312 } else {
diff --git a/kernel/timer.c b/kernel/timer.c
index 4296d13db3d1..accfd241b9e5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), 1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1093 unsigned long data) 1093 unsigned long data)
1094{ 1094{
1095 int preempt_count = preempt_count(); 1095 int count = preempt_count();
1096 1096
1097#ifdef CONFIG_LOCKDEP 1097#ifdef CONFIG_LOCKDEP
1098 /* 1098 /*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1119 1119
1120 lock_map_release(&lockdep_map); 1120 lock_map_release(&lockdep_map);
1121 1121
1122 if (preempt_count != preempt_count()) { 1122 if (count != preempt_count()) {
1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", 1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1124 fn, preempt_count, preempt_count()); 1124 fn, count, preempt_count());
1125 /* 1125 /*
1126 * Restore the preempt count. That gives us a decent 1126 * Restore the preempt count. That gives us a decent
1127 * chance to survive and extract information. If the 1127 * chance to survive and extract information. If the
1128 * callback kept a lock held, bad luck, but not worse 1128 * callback kept a lock held, bad luck, but not worse
1129 * than the BUG() we had. 1129 * than the BUG() we had.
1130 */ 1130 */
1131 preempt_count() = preempt_count; 1131 preempt_count_set(count);
1132 } 1132 }
1133} 1133}
1134 1134
@@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu)
1518 /* 1518 /*
1519 * The APs use this path later in boot 1519 * The APs use this path later in boot
1520 */ 1520 */
1521 base = kmalloc_node(sizeof(*base), 1521 base = kzalloc_node(sizeof(*base), GFP_KERNEL,
1522 GFP_KERNEL | __GFP_ZERO, 1522 cpu_to_node(cpu));
1523 cpu_to_node(cpu));
1524 if (!base) 1523 if (!base)
1525 return -ENOMEM; 1524 return -ENOMEM;
1526 1525
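call_timer_fn() keeps its snapshot-and-compare check for callbacks that leak preempt count, but the local is renamed to count (it used to shadow the preempt_count() macro) and the recovery now goes through preempt_count_set(), since preempt_count() is no longer usable as an lvalue after the preempt-count rework. A stripped-down version of the pattern; the callback and counter are stand-ins, not kernel APIs:

#include <stdio.h>

static int preempt_depth;                      /* stand-in for preempt_count() */

static void buggy_callback(void)
{
    preempt_depth++;                           /* "forgets" to re-enable       */
}

static void call_timer_fn(void (*fn)(void), const char *name)
{
    int count = preempt_depth;                 /* snapshot before the call     */

    fn();

    if (count != preempt_depth) {
        fprintf(stderr, "timer: %s preempt leak: %d -> %d\n",
                name, count, preempt_depth);
        preempt_depth = count;                 /* restore so we can carry on   */
    }
}

int main(void)
{
    call_timer_fn(buggy_callback, "buggy_callback");
    printf("preempt depth after cleanup: %d\n", preempt_depth);   /* 0 */
    return 0;
}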
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b8b8560bfb95..f785aef65799 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -26,6 +26,7 @@
26#include <linux/export.h> 26#include <linux/export.h>
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/list.h>
29 30
30#include <trace/events/block.h> 31#include <trace/events/block.h>
31 32
@@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1;
38static struct trace_array *blk_tr; 39static struct trace_array *blk_tr;
39static bool blk_tracer_enabled __read_mostly; 40static bool blk_tracer_enabled __read_mostly;
40 41
42static LIST_HEAD(running_trace_list);
43static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
44
41/* Select an alternative, minimalistic output than the original one */ 45/* Select an alternative, minimalistic output than the original one */
42#define TRACE_BLK_OPT_CLASSIC 0x1 46#define TRACE_BLK_OPT_CLASSIC 0x1
43 47
@@ -107,10 +111,18 @@ record_it:
107 * Send out a notify for this process, if we haven't done so since a trace 111 * Send out a notify for this process, if we haven't done so since a trace
108 * started 112 * started
109 */ 113 */
110static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) 114static void trace_note_tsk(struct task_struct *tsk)
111{ 115{
116 unsigned long flags;
117 struct blk_trace *bt;
118
112 tsk->btrace_seq = blktrace_seq; 119 tsk->btrace_seq = blktrace_seq;
113 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); 120 spin_lock_irqsave(&running_trace_lock, flags);
121 list_for_each_entry(bt, &running_trace_list, running_list) {
122 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
123 sizeof(tsk->comm));
124 }
125 spin_unlock_irqrestore(&running_trace_lock, flags);
114} 126}
115 127
116static void trace_note_time(struct blk_trace *bt) 128static void trace_note_time(struct blk_trace *bt)
@@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
229 goto record_it; 241 goto record_it;
230 } 242 }
231 243
244 if (unlikely(tsk->btrace_seq != blktrace_seq))
245 trace_note_tsk(tsk);
246
232 /* 247 /*
233 * A word about the locking here - we disable interrupts to reserve 248 * A word about the locking here - we disable interrupts to reserve
234 * some space in the relay per-cpu buffer, to prevent an irq 249 * some space in the relay per-cpu buffer, to prevent an irq
235 * from coming in and stepping on our toes. 250 * from coming in and stepping on our toes.
236 */ 251 */
237 local_irq_save(flags); 252 local_irq_save(flags);
238
239 if (unlikely(tsk->btrace_seq != blktrace_seq))
240 trace_note_tsk(bt, tsk);
241
242 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); 253 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
243 if (t) { 254 if (t) {
244 sequence = per_cpu_ptr(bt->sequence, cpu); 255 sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
477 bt->dir = dir; 488 bt->dir = dir;
478 bt->dev = dev; 489 bt->dev = dev;
479 atomic_set(&bt->dropped, 0); 490 atomic_set(&bt->dropped, 0);
491 INIT_LIST_HEAD(&bt->running_list);
480 492
481 ret = -EIO; 493 ret = -EIO;
482 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, 494 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
@@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
567 .end_lba = cbuts.end_lba, 579 .end_lba = cbuts.end_lba,
568 .pid = cbuts.pid, 580 .pid = cbuts.pid,
569 }; 581 };
570 memcpy(&buts.name, &cbuts.name, 32);
571 582
572 ret = do_blk_trace_setup(q, name, dev, bdev, &buts); 583 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
573 if (ret) 584 if (ret)
574 return ret; 585 return ret;
575 586
576 if (copy_to_user(arg, &buts.name, 32)) { 587 if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
577 blk_trace_remove(q); 588 blk_trace_remove(q);
578 return -EFAULT; 589 return -EFAULT;
579 } 590 }
@@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
601 blktrace_seq++; 612 blktrace_seq++;
602 smp_mb(); 613 smp_mb();
603 bt->trace_state = Blktrace_running; 614 bt->trace_state = Blktrace_running;
615 spin_lock_irq(&running_trace_lock);
616 list_add(&bt->running_list, &running_trace_list);
617 spin_unlock_irq(&running_trace_lock);
604 618
605 trace_note_time(bt); 619 trace_note_time(bt);
606 ret = 0; 620 ret = 0;
@@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
608 } else { 622 } else {
609 if (bt->trace_state == Blktrace_running) { 623 if (bt->trace_state == Blktrace_running) {
610 bt->trace_state = Blktrace_stopped; 624 bt->trace_state = Blktrace_stopped;
625 spin_lock_irq(&running_trace_lock);
626 list_del_init(&bt->running_list);
627 spin_unlock_irq(&running_trace_lock);
611 relay_flush(bt->rchan); 628 relay_flush(bt->rchan);
612 ret = 0; 629 ret = 0;
613 } 630 }
@@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q)
1472 if (atomic_dec_and_test(&blk_probes_ref)) 1489 if (atomic_dec_and_test(&blk_probes_ref))
1473 blk_unregister_tracepoints(); 1490 blk_unregister_tracepoints();
1474 1491
1492 spin_lock_irq(&running_trace_lock);
1493 list_del(&bt->running_list);
1494 spin_unlock_irq(&running_trace_lock);
1475 blk_trace_free(bt); 1495 blk_trace_free(bt);
1476 return 0; 1496 return 0;
1477} 1497}
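The blktrace change replaces the single-trace trace_note_tsk() with a global running_trace_list under a spinlock: when a task is first seen after blktrace_seq bumps, a process-name note is emitted to every trace currently running, and traces add or remove themselves in blk_trace_startstop() and blk_trace_remove_queue(). A userspace sketch of that broadcast-to-all-running-traces idea using pthreads and a simple singly linked list (types invented for the sketch; build with -pthread):

#include <pthread.h>
#include <stdio.h>

struct blk_trace_demo {
    const char *dev;
    struct blk_trace_demo *next;
};

static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;
static struct blk_trace_demo *running_list;

static void trace_start(struct blk_trace_demo *bt)
{
    pthread_mutex_lock(&running_lock);
    bt->next = running_list;
    running_list = bt;
    pthread_mutex_unlock(&running_lock);
}

/* Emit a "process name" note to every trace that is currently running. */
static void trace_note_tsk(int pid, const char *comm)
{
    struct blk_trace_demo *bt;

    pthread_mutex_lock(&running_lock);
    for (bt = running_list; bt; bt = bt->next)
        printf("%s: note pid=%d comm=%s\n", bt->dev, pid, comm);
    pthread_mutex_unlock(&running_lock);
}

int main(void)
{
    struct blk_trace_demo sda = { "sda", NULL };
    struct blk_trace_demo sdb = { "sdb", NULL };

    trace_start(&sda);
    trace_start(&sdb);
    trace_note_tsk(1234, "fio");
    return 0;
}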
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 03cf44ac54d3..72a0f81dc5a8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -367,9 +367,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
367 367
368static int __register_ftrace_function(struct ftrace_ops *ops) 368static int __register_ftrace_function(struct ftrace_ops *ops)
369{ 369{
370 if (unlikely(ftrace_disabled))
371 return -ENODEV;
372
373 if (FTRACE_WARN_ON(ops == &global_ops)) 370 if (FTRACE_WARN_ON(ops == &global_ops))
374 return -EINVAL; 371 return -EINVAL;
375 372
@@ -428,9 +425,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
428{ 425{
429 int ret; 426 int ret;
430 427
431 if (ftrace_disabled)
432 return -ENODEV;
433
434 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) 428 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
435 return -EBUSY; 429 return -EBUSY;
436 430
@@ -781,7 +775,7 @@ static int ftrace_profile_init(void)
781 int cpu; 775 int cpu;
782 int ret = 0; 776 int ret = 0;
783 777
784 for_each_online_cpu(cpu) { 778 for_each_possible_cpu(cpu) {
785 ret = ftrace_profile_init_cpu(cpu); 779 ret = ftrace_profile_init_cpu(cpu);
786 if (ret) 780 if (ret)
787 break; 781 break;
@@ -2088,10 +2082,15 @@ static void ftrace_startup_enable(int command)
2088static int ftrace_startup(struct ftrace_ops *ops, int command) 2082static int ftrace_startup(struct ftrace_ops *ops, int command)
2089{ 2083{
2090 bool hash_enable = true; 2084 bool hash_enable = true;
2085 int ret;
2091 2086
2092 if (unlikely(ftrace_disabled)) 2087 if (unlikely(ftrace_disabled))
2093 return -ENODEV; 2088 return -ENODEV;
2094 2089
2090 ret = __register_ftrace_function(ops);
2091 if (ret)
2092 return ret;
2093
2095 ftrace_start_up++; 2094 ftrace_start_up++;
2096 command |= FTRACE_UPDATE_CALLS; 2095 command |= FTRACE_UPDATE_CALLS;
2097 2096
@@ -2113,12 +2112,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2113 return 0; 2112 return 0;
2114} 2113}
2115 2114
2116static void ftrace_shutdown(struct ftrace_ops *ops, int command) 2115static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2117{ 2116{
2118 bool hash_disable = true; 2117 bool hash_disable = true;
2118 int ret;
2119 2119
2120 if (unlikely(ftrace_disabled)) 2120 if (unlikely(ftrace_disabled))
2121 return; 2121 return -ENODEV;
2122
2123 ret = __unregister_ftrace_function(ops);
2124 if (ret)
2125 return ret;
2122 2126
2123 ftrace_start_up--; 2127 ftrace_start_up--;
2124 /* 2128 /*
@@ -2153,9 +2157,10 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
2153 } 2157 }
2154 2158
2155 if (!command || !ftrace_enabled) 2159 if (!command || !ftrace_enabled)
2156 return; 2160 return 0;
2157 2161
2158 ftrace_run_update_code(command); 2162 ftrace_run_update_code(command);
2163 return 0;
2159} 2164}
2160 2165
2161static void ftrace_startup_sysctl(void) 2166static void ftrace_startup_sysctl(void)
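A recurring theme in the ftrace.c hunks: __register_ftrace_function() and __unregister_ftrace_function() are no longer called separately by each user but are folded into ftrace_startup() and ftrace_shutdown(), which now return errors; callers such as register_ftrace_function(), the probe code and the function-graph code shrink to a single call. A schematic of the consolidation, with placeholder types and error handling rather than the real ftrace_ops machinery:

#include <errno.h>
#include <stdio.h>

struct ops { int registered; int enabled; };

static int register_ops(struct ops *o)
{
    if (o->registered)
        return -EBUSY;
    o->registered = 1;
    return 0;
}

/* After the change: startup owns registration, so every caller gets both
 * steps (and the error handling) from one call. */
static int ops_startup(struct ops *o)
{
    int ret = register_ops(o);

    if (ret)
        return ret;
    o->enabled = 1;                 /* arm the tracing machinery */
    return 0;
}

static int ops_shutdown(struct ops *o)
{
    if (!o->registered)
        return -ENODEV;
    o->registered = 0;
    o->enabled = 0;
    return 0;
}

int main(void)
{
    struct ops o = { 0, 0 };

    printf("startup:  %d\n", ops_startup(&o));     /* 0      */
    printf("again:    %d\n", ops_startup(&o));     /* -EBUSY */
    printf("shutdown: %d\n", ops_shutdown(&o));    /* 0      */
    return 0;
}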
@@ -3060,16 +3065,13 @@ static void __enable_ftrace_function_probe(void)
3060 if (i == FTRACE_FUNC_HASHSIZE) 3065 if (i == FTRACE_FUNC_HASHSIZE)
3061 return; 3066 return;
3062 3067
3063 ret = __register_ftrace_function(&trace_probe_ops); 3068 ret = ftrace_startup(&trace_probe_ops, 0);
3064 if (!ret)
3065 ret = ftrace_startup(&trace_probe_ops, 0);
3066 3069
3067 ftrace_probe_registered = 1; 3070 ftrace_probe_registered = 1;
3068} 3071}
3069 3072
3070static void __disable_ftrace_function_probe(void) 3073static void __disable_ftrace_function_probe(void)
3071{ 3074{
3072 int ret;
3073 int i; 3075 int i;
3074 3076
3075 if (!ftrace_probe_registered) 3077 if (!ftrace_probe_registered)
@@ -3082,9 +3084,7 @@ static void __disable_ftrace_function_probe(void)
3082 } 3084 }
3083 3085
3084 /* no more funcs left */ 3086 /* no more funcs left */
3085 ret = __unregister_ftrace_function(&trace_probe_ops); 3087 ftrace_shutdown(&trace_probe_ops, 0);
3086 if (!ret)
3087 ftrace_shutdown(&trace_probe_ops, 0);
3088 3088
3089 ftrace_probe_registered = 0; 3089 ftrace_probe_registered = 0;
3090} 3090}
@@ -3307,7 +3307,11 @@ void unregister_ftrace_function_probe_all(char *glob)
3307static LIST_HEAD(ftrace_commands); 3307static LIST_HEAD(ftrace_commands);
3308static DEFINE_MUTEX(ftrace_cmd_mutex); 3308static DEFINE_MUTEX(ftrace_cmd_mutex);
3309 3309
3310int register_ftrace_command(struct ftrace_func_command *cmd) 3310/*
3311 * Currently we only register ftrace commands from __init, so mark this
3312 * __init too.
3313 */
3314__init int register_ftrace_command(struct ftrace_func_command *cmd)
3311{ 3315{
3312 struct ftrace_func_command *p; 3316 struct ftrace_func_command *p;
3313 int ret = 0; 3317 int ret = 0;
@@ -3326,7 +3330,11 @@ int register_ftrace_command(struct ftrace_func_command *cmd)
3326 return ret; 3330 return ret;
3327} 3331}
3328 3332
3329int unregister_ftrace_command(struct ftrace_func_command *cmd) 3333/*
3334 * Currently we only unregister ftrace commands from __init, so mark
3335 * this __init too.
3336 */
3337__init int unregister_ftrace_command(struct ftrace_func_command *cmd)
3330{ 3338{
3331 struct ftrace_func_command *p, *n; 3339 struct ftrace_func_command *p, *n;
3332 int ret = -ENODEV; 3340 int ret = -ENODEV;
@@ -3641,7 +3649,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
3641 3649
3642#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3650#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3643static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 3651static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3644static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 3652static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
3645 3653
3646static int __init set_graph_function(char *str) 3654static int __init set_graph_function(char *str)
3647{ 3655{
@@ -3659,7 +3667,7 @@ static void __init set_ftrace_early_graph(char *buf)
3659 func = strsep(&buf, ","); 3667 func = strsep(&buf, ",");
3660 /* we allow only one expression at a time */ 3668 /* we allow only one expression at a time */
3661 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 3669 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
3662 func); 3670 FTRACE_GRAPH_MAX_FUNCS, func);
3663 if (ret) 3671 if (ret)
3664 printk(KERN_DEBUG "ftrace: function %s not " 3672 printk(KERN_DEBUG "ftrace: function %s not "
3665 "traceable\n", func); 3673 "traceable\n", func);
@@ -3776,15 +3784,25 @@ static const struct file_operations ftrace_notrace_fops = {
3776static DEFINE_MUTEX(graph_lock); 3784static DEFINE_MUTEX(graph_lock);
3777 3785
3778int ftrace_graph_count; 3786int ftrace_graph_count;
3779int ftrace_graph_filter_enabled; 3787int ftrace_graph_notrace_count;
3780unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 3788unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
3789unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
3790
3791struct ftrace_graph_data {
3792 unsigned long *table;
3793 size_t size;
3794 int *count;
3795 const struct seq_operations *seq_ops;
3796};
3781 3797
3782static void * 3798static void *
3783__g_next(struct seq_file *m, loff_t *pos) 3799__g_next(struct seq_file *m, loff_t *pos)
3784{ 3800{
3785 if (*pos >= ftrace_graph_count) 3801 struct ftrace_graph_data *fgd = m->private;
3802
3803 if (*pos >= *fgd->count)
3786 return NULL; 3804 return NULL;
3787 return &ftrace_graph_funcs[*pos]; 3805 return &fgd->table[*pos];
3788} 3806}
3789 3807
3790static void * 3808static void *
@@ -3796,10 +3814,12 @@ g_next(struct seq_file *m, void *v, loff_t *pos)
3796 3814
3797static void *g_start(struct seq_file *m, loff_t *pos) 3815static void *g_start(struct seq_file *m, loff_t *pos)
3798{ 3816{
3817 struct ftrace_graph_data *fgd = m->private;
3818
3799 mutex_lock(&graph_lock); 3819 mutex_lock(&graph_lock);
3800 3820
3801 /* Nothing, tell g_show to print all functions are enabled */ 3821 /* Nothing, tell g_show to print all functions are enabled */
3802 if (!ftrace_graph_filter_enabled && !*pos) 3822 if (!*fgd->count && !*pos)
3803 return (void *)1; 3823 return (void *)1;
3804 3824
3805 return __g_next(m, pos); 3825 return __g_next(m, pos);
@@ -3835,38 +3855,88 @@ static const struct seq_operations ftrace_graph_seq_ops = {
3835}; 3855};
3836 3856
3837static int 3857static int
3838ftrace_graph_open(struct inode *inode, struct file *file) 3858__ftrace_graph_open(struct inode *inode, struct file *file,
3859 struct ftrace_graph_data *fgd)
3839{ 3860{
3840 int ret = 0; 3861 int ret = 0;
3841 3862
3842 if (unlikely(ftrace_disabled))
3843 return -ENODEV;
3844
3845 mutex_lock(&graph_lock); 3863 mutex_lock(&graph_lock);
3846 if ((file->f_mode & FMODE_WRITE) && 3864 if ((file->f_mode & FMODE_WRITE) &&
3847 (file->f_flags & O_TRUNC)) { 3865 (file->f_flags & O_TRUNC)) {
3848 ftrace_graph_filter_enabled = 0; 3866 *fgd->count = 0;
3849 ftrace_graph_count = 0; 3867 memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));
3850 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
3851 } 3868 }
3852 mutex_unlock(&graph_lock); 3869 mutex_unlock(&graph_lock);
3853 3870
3854 if (file->f_mode & FMODE_READ) 3871 if (file->f_mode & FMODE_READ) {
3855 ret = seq_open(file, &ftrace_graph_seq_ops); 3872 ret = seq_open(file, fgd->seq_ops);
3873 if (!ret) {
3874 struct seq_file *m = file->private_data;
3875 m->private = fgd;
3876 }
3877 } else
3878 file->private_data = fgd;
3856 3879
3857 return ret; 3880 return ret;
3858} 3881}
3859 3882
3860static int 3883static int
3884ftrace_graph_open(struct inode *inode, struct file *file)
3885{
3886 struct ftrace_graph_data *fgd;
3887
3888 if (unlikely(ftrace_disabled))
3889 return -ENODEV;
3890
3891 fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
3892 if (fgd == NULL)
3893 return -ENOMEM;
3894
3895 fgd->table = ftrace_graph_funcs;
3896 fgd->size = FTRACE_GRAPH_MAX_FUNCS;
3897 fgd->count = &ftrace_graph_count;
3898 fgd->seq_ops = &ftrace_graph_seq_ops;
3899
3900 return __ftrace_graph_open(inode, file, fgd);
3901}
3902
3903static int
3904ftrace_graph_notrace_open(struct inode *inode, struct file *file)
3905{
3906 struct ftrace_graph_data *fgd;
3907
3908 if (unlikely(ftrace_disabled))
3909 return -ENODEV;
3910
3911 fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
3912 if (fgd == NULL)
3913 return -ENOMEM;
3914
3915 fgd->table = ftrace_graph_notrace_funcs;
3916 fgd->size = FTRACE_GRAPH_MAX_FUNCS;
3917 fgd->count = &ftrace_graph_notrace_count;
3918 fgd->seq_ops = &ftrace_graph_seq_ops;
3919
3920 return __ftrace_graph_open(inode, file, fgd);
3921}
3922
3923static int
3861ftrace_graph_release(struct inode *inode, struct file *file) 3924ftrace_graph_release(struct inode *inode, struct file *file)
3862{ 3925{
3863 if (file->f_mode & FMODE_READ) 3926 if (file->f_mode & FMODE_READ) {
3927 struct seq_file *m = file->private_data;
3928
3929 kfree(m->private);
3864 seq_release(inode, file); 3930 seq_release(inode, file);
3931 } else {
3932 kfree(file->private_data);
3933 }
3934
3865 return 0; 3935 return 0;
3866} 3936}
3867 3937
3868static int 3938static int
3869ftrace_set_func(unsigned long *array, int *idx, char *buffer) 3939ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
3870{ 3940{
3871 struct dyn_ftrace *rec; 3941 struct dyn_ftrace *rec;
3872 struct ftrace_page *pg; 3942 struct ftrace_page *pg;
@@ -3879,7 +3949,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3879 3949
3880 /* decode regex */ 3950 /* decode regex */
3881 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3951 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
3882 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3952 if (!not && *idx >= size)
3883 return -EBUSY; 3953 return -EBUSY;
3884 3954
3885 search_len = strlen(search); 3955 search_len = strlen(search);
@@ -3907,7 +3977,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3907 fail = 0; 3977 fail = 0;
3908 if (!exists) { 3978 if (!exists) {
3909 array[(*idx)++] = rec->ip; 3979 array[(*idx)++] = rec->ip;
3910 if (*idx >= FTRACE_GRAPH_MAX_FUNCS) 3980 if (*idx >= size)
3911 goto out; 3981 goto out;
3912 } 3982 }
3913 } else { 3983 } else {
@@ -3925,8 +3995,6 @@ out:
3925 if (fail) 3995 if (fail)
3926 return -EINVAL; 3996 return -EINVAL;
3927 3997
3928 ftrace_graph_filter_enabled = !!(*idx);
3929
3930 return 0; 3998 return 0;
3931} 3999}
3932 4000
@@ -3935,36 +4003,33 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
3935 size_t cnt, loff_t *ppos) 4003 size_t cnt, loff_t *ppos)
3936{ 4004{
3937 struct trace_parser parser; 4005 struct trace_parser parser;
3938 ssize_t read, ret; 4006 ssize_t read, ret = 0;
4007 struct ftrace_graph_data *fgd = file->private_data;
3939 4008
3940 if (!cnt) 4009 if (!cnt)
3941 return 0; 4010 return 0;
3942 4011
3943 mutex_lock(&graph_lock); 4012 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
3944 4013 return -ENOMEM;
3945 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
3946 ret = -ENOMEM;
3947 goto out_unlock;
3948 }
3949 4014
3950 read = trace_get_user(&parser, ubuf, cnt, ppos); 4015 read = trace_get_user(&parser, ubuf, cnt, ppos);
3951 4016
3952 if (read >= 0 && trace_parser_loaded((&parser))) { 4017 if (read >= 0 && trace_parser_loaded((&parser))) {
3953 parser.buffer[parser.idx] = 0; 4018 parser.buffer[parser.idx] = 0;
3954 4019
4020 mutex_lock(&graph_lock);
4021
3955 /* we allow only one expression at a time */ 4022 /* we allow only one expression at a time */
3956 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 4023 ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
3957 parser.buffer); 4024 parser.buffer);
3958 if (ret) 4025
3959 goto out_free; 4026 mutex_unlock(&graph_lock);
3960 } 4027 }
3961 4028
3962 ret = read; 4029 if (!ret)
4030 ret = read;
3963 4031
3964out_free:
3965 trace_parser_put(&parser); 4032 trace_parser_put(&parser);
3966out_unlock:
3967 mutex_unlock(&graph_lock);
3968 4033
3969 return ret; 4034 return ret;
3970} 4035}
@@ -3976,6 +4041,14 @@ static const struct file_operations ftrace_graph_fops = {
3976 .llseek = ftrace_filter_lseek, 4041 .llseek = ftrace_filter_lseek,
3977 .release = ftrace_graph_release, 4042 .release = ftrace_graph_release,
3978}; 4043};
4044
4045static const struct file_operations ftrace_graph_notrace_fops = {
4046 .open = ftrace_graph_notrace_open,
4047 .read = seq_read,
4048 .write = ftrace_graph_write,
4049 .llseek = ftrace_filter_lseek,
4050 .release = ftrace_graph_release,
4051};
3979#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4052#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3980 4053
3981static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 4054static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
@@ -3997,6 +4070,9 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3997 trace_create_file("set_graph_function", 0444, d_tracer, 4070 trace_create_file("set_graph_function", 0444, d_tracer,
3998 NULL, 4071 NULL,
3999 &ftrace_graph_fops); 4072 &ftrace_graph_fops);
4073 trace_create_file("set_graph_notrace", 0444, d_tracer,
4074 NULL,
4075 &ftrace_graph_notrace_fops);
4000#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4076#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
4001 4077
4002 return 0; 4078 return 0;
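set_graph_function and the new set_graph_notrace debugfs file share one implementation: each open() allocates a small ftrace_graph_data that points at the right table, size and count, and the common __ftrace_graph_open() / ftrace_graph_write() paths work purely through that descriptor. A reduced model of that one-engine-two-tables layout; struct and table names are illustrative:

#include <stdio.h>
#include <stddef.h>

#define MAX_FUNCS 32

struct graph_data {                 /* analogue of ftrace_graph_data */
    unsigned long *table;
    size_t size;
    int *count;
};

static unsigned long filter_funcs[MAX_FUNCS];
static unsigned long notrace_funcs[MAX_FUNCS];
static int filter_count, notrace_count;

static int graph_add(struct graph_data *d, unsigned long ip)
{
    if (*d->count >= (int)d->size)
        return -1;                  /* table full */
    d->table[(*d->count)++] = ip;
    return 0;
}

int main(void)
{
    struct graph_data filter  = { filter_funcs, MAX_FUNCS, &filter_count };
    struct graph_data notrace = { notrace_funcs, MAX_FUNCS, &notrace_count };

    graph_add(&filter, 0xc0100000UL);   /* "graph-trace only this one"   */
    graph_add(&notrace, 0xc0200000UL);  /* "never graph-trace this one"  */

    printf("filter entries: %d, notrace entries: %d\n",
           filter_count, notrace_count);
    return 0;
}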
@@ -4290,12 +4366,15 @@ core_initcall(ftrace_nodyn_init);
4290static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4366static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4291static inline void ftrace_startup_enable(int command) { } 4367static inline void ftrace_startup_enable(int command) { }
4292/* Keep as macros so we do not need to define the commands */ 4368/* Keep as macros so we do not need to define the commands */
4293# define ftrace_startup(ops, command) \ 4369# define ftrace_startup(ops, command) \
4294 ({ \ 4370 ({ \
4295 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ 4371 int ___ret = __register_ftrace_function(ops); \
4296 0; \ 4372 if (!___ret) \
4373 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
4374 ___ret; \
4297 }) 4375 })
4298# define ftrace_shutdown(ops, command) do { } while (0) 4376# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops)
4377
4299# define ftrace_startup_sysctl() do { } while (0) 4378# define ftrace_startup_sysctl() do { } while (0)
4300# define ftrace_shutdown_sysctl() do { } while (0) 4379# define ftrace_shutdown_sysctl() do { } while (0)
4301 4380
@@ -4320,12 +4399,21 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4320 */ 4399 */
4321 preempt_disable_notrace(); 4400 preempt_disable_notrace();
4322 trace_recursion_set(TRACE_CONTROL_BIT); 4401 trace_recursion_set(TRACE_CONTROL_BIT);
4402
4403 /*
4404 * Control funcs (perf) uses RCU. Only trace if
4405 * RCU is currently active.
4406 */
4407 if (!rcu_is_watching())
4408 goto out;
4409
4323 do_for_each_ftrace_op(op, ftrace_control_list) { 4410 do_for_each_ftrace_op(op, ftrace_control_list) {
4324 if (!(op->flags & FTRACE_OPS_FL_STUB) && 4411 if (!(op->flags & FTRACE_OPS_FL_STUB) &&
4325 !ftrace_function_local_disabled(op) && 4412 !ftrace_function_local_disabled(op) &&
4326 ftrace_ops_test(op, ip, regs)) 4413 ftrace_ops_test(op, ip, regs))
4327 op->func(ip, parent_ip, op, regs); 4414 op->func(ip, parent_ip, op, regs);
4328 } while_for_each_ftrace_op(op); 4415 } while_for_each_ftrace_op(op);
4416 out:
4329 trace_recursion_clear(TRACE_CONTROL_BIT); 4417 trace_recursion_clear(TRACE_CONTROL_BIT);
4330 preempt_enable_notrace(); 4418 preempt_enable_notrace();
4331} 4419}
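The control-ops hunk adds an early bail-out: the control callbacks (perf) depend on RCU, so if rcu_is_watching() reports that the CPU is in a context where RCU is not active, the whole callback walk is skipped rather than running RCU-dependent code without protection. The guard pattern in miniature; the boolean below only simulates rcu_is_watching():

#include <stdbool.h>
#include <stdio.h>

static bool rcu_watching = true;     /* pretend per-CPU RCU state */

static void perf_callback(unsigned long ip)
{
    printf("traced function at %#lx\n", ip);
}

static void control_func(unsigned long ip)
{
    /* Callbacks below depend on RCU; only run them when it is active. */
    if (!rcu_watching)
        return;
    perf_callback(ip);
}

int main(void)
{
    control_func(0xc0100000UL);      /* runs the callback   */
    rcu_watching = false;            /* e.g. CPU went idle  */
    control_func(0xc0100000UL);      /* silently skipped    */
    return 0;
}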
@@ -4695,9 +4783,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
4695 4783
4696 mutex_lock(&ftrace_lock); 4784 mutex_lock(&ftrace_lock);
4697 4785
4698 ret = __register_ftrace_function(ops); 4786 ret = ftrace_startup(ops, 0);
4699 if (!ret)
4700 ret = ftrace_startup(ops, 0);
4701 4787
4702 mutex_unlock(&ftrace_lock); 4788 mutex_unlock(&ftrace_lock);
4703 4789
@@ -4716,9 +4802,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
4716 int ret; 4802 int ret;
4717 4803
4718 mutex_lock(&ftrace_lock); 4804 mutex_lock(&ftrace_lock);
4719 ret = __unregister_ftrace_function(ops); 4805 ret = ftrace_shutdown(ops, 0);
4720 if (!ret)
4721 ftrace_shutdown(ops, 0);
4722 mutex_unlock(&ftrace_lock); 4806 mutex_unlock(&ftrace_lock);
4723 4807
4724 return ret; 4808 return ret;
@@ -4912,6 +4996,13 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
4912 return NOTIFY_DONE; 4996 return NOTIFY_DONE;
4913} 4997}
4914 4998
4999/* Just a place holder for function graph */
5000static struct ftrace_ops fgraph_ops __read_mostly = {
5001 .func = ftrace_stub,
5002 .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
5003 FTRACE_OPS_FL_RECURSION_SAFE,
5004};
5005
4915int register_ftrace_graph(trace_func_graph_ret_t retfunc, 5006int register_ftrace_graph(trace_func_graph_ret_t retfunc,
4916 trace_func_graph_ent_t entryfunc) 5007 trace_func_graph_ent_t entryfunc)
4917{ 5008{
@@ -4938,7 +5029,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
4938 ftrace_graph_return = retfunc; 5029 ftrace_graph_return = retfunc;
4939 ftrace_graph_entry = entryfunc; 5030 ftrace_graph_entry = entryfunc;
4940 5031
4941 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 5032 ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
4942 5033
4943out: 5034out:
4944 mutex_unlock(&ftrace_lock); 5035 mutex_unlock(&ftrace_lock);
@@ -4955,7 +5046,7 @@ void unregister_ftrace_graph(void)
4955 ftrace_graph_active--; 5046 ftrace_graph_active--;
4956 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5047 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
4957 ftrace_graph_entry = ftrace_graph_entry_stub; 5048 ftrace_graph_entry = ftrace_graph_entry_stub;
4958 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 5049 ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
4959 unregister_pm_notifier(&ftrace_suspend_notifier); 5050 unregister_pm_notifier(&ftrace_suspend_notifier);
4960 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5051 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
4961 5052
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7974ba20557d..9d20cd9743ef 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -235,13 +235,33 @@ void trace_array_put(struct trace_array *this_tr)
235 mutex_unlock(&trace_types_lock); 235 mutex_unlock(&trace_types_lock);
236} 236}
237 237
238int filter_current_check_discard(struct ring_buffer *buffer, 238int filter_check_discard(struct ftrace_event_file *file, void *rec,
239 struct ftrace_event_call *call, void *rec, 239 struct ring_buffer *buffer,
240 struct ring_buffer_event *event) 240 struct ring_buffer_event *event)
241{ 241{
242 return filter_check_discard(call, rec, buffer, event); 242 if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) &&
243 !filter_match_preds(file->filter, rec)) {
244 ring_buffer_discard_commit(buffer, event);
245 return 1;
246 }
247
248 return 0;
249}
250EXPORT_SYMBOL_GPL(filter_check_discard);
251
252int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
253 struct ring_buffer *buffer,
254 struct ring_buffer_event *event)
255{
256 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
257 !filter_match_preds(call->filter, rec)) {
258 ring_buffer_discard_commit(buffer, event);
259 return 1;
260 }
261
262 return 0;
243} 263}
244EXPORT_SYMBOL_GPL(filter_current_check_discard); 264EXPORT_SYMBOL_GPL(call_filter_check_discard);
245 265
246cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) 266cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
247{ 267{
@@ -843,9 +863,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
843 if (isspace(ch)) { 863 if (isspace(ch)) {
844 parser->buffer[parser->idx] = 0; 864 parser->buffer[parser->idx] = 0;
845 parser->cont = false; 865 parser->cont = false;
846 } else { 866 } else if (parser->idx < parser->size - 1) {
847 parser->cont = true; 867 parser->cont = true;
848 parser->buffer[parser->idx++] = ch; 868 parser->buffer[parser->idx++] = ch;
869 } else {
870 ret = -EINVAL;
871 goto out;
849 } 872 }
850 873
851 *ppos += read; 874 *ppos += read;
@@ -1261,21 +1284,6 @@ int is_tracing_stopped(void)
1261} 1284}
1262 1285
1263/** 1286/**
1264 * ftrace_off_permanent - disable all ftrace code permanently
1265 *
1266 * This should only be called when a serious anomally has
1267 * been detected. This will turn off the function tracing,
1268 * ring buffers, and other tracing utilites. It takes no
1269 * locks and can be called from any context.
1270 */
1271void ftrace_off_permanent(void)
1272{
1273 tracing_disabled = 1;
1274 ftrace_stop();
1275 tracing_off_permanent();
1276}
1277
1278/**
1279 * tracing_start - quick start of the tracer 1287 * tracing_start - quick start of the tracer
1280 * 1288 *
1281 * If tracing is enabled but was stopped by tracing_stop, 1289 * If tracing is enabled but was stopped by tracing_stop,
@@ -1509,7 +1517,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1509#endif 1517#endif
1510 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | 1518 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
1511 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 1519 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
1512 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 1520 (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
1521 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
1513} 1522}
1514EXPORT_SYMBOL_GPL(tracing_generic_entry_update); 1523EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
1515 1524
@@ -1630,7 +1639,7 @@ trace_function(struct trace_array *tr,
1630 entry->ip = ip; 1639 entry->ip = ip;
1631 entry->parent_ip = parent_ip; 1640 entry->parent_ip = parent_ip;
1632 1641
1633 if (!filter_check_discard(call, entry, buffer, event)) 1642 if (!call_filter_check_discard(call, entry, buffer, event))
1634 __buffer_unlock_commit(buffer, event); 1643 __buffer_unlock_commit(buffer, event);
1635} 1644}
1636 1645
@@ -1714,7 +1723,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1714 1723
1715 entry->size = trace.nr_entries; 1724 entry->size = trace.nr_entries;
1716 1725
1717 if (!filter_check_discard(call, entry, buffer, event)) 1726 if (!call_filter_check_discard(call, entry, buffer, event))
1718 __buffer_unlock_commit(buffer, event); 1727 __buffer_unlock_commit(buffer, event);
1719 1728
1720 out: 1729 out:
@@ -1816,7 +1825,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1816 trace.entries = entry->caller; 1825 trace.entries = entry->caller;
1817 1826
1818 save_stack_trace_user(&trace); 1827 save_stack_trace_user(&trace);
1819 if (!filter_check_discard(call, entry, buffer, event)) 1828 if (!call_filter_check_discard(call, entry, buffer, event))
1820 __buffer_unlock_commit(buffer, event); 1829 __buffer_unlock_commit(buffer, event);
1821 1830
1822 out_drop_count: 1831 out_drop_count:
@@ -2008,7 +2017,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
2008 entry->fmt = fmt; 2017 entry->fmt = fmt;
2009 2018
2010 memcpy(entry->buf, tbuffer, sizeof(u32) * len); 2019 memcpy(entry->buf, tbuffer, sizeof(u32) * len);
2011 if (!filter_check_discard(call, entry, buffer, event)) { 2020 if (!call_filter_check_discard(call, entry, buffer, event)) {
2012 __buffer_unlock_commit(buffer, event); 2021 __buffer_unlock_commit(buffer, event);
2013 ftrace_trace_stack(buffer, flags, 6, pc); 2022 ftrace_trace_stack(buffer, flags, 6, pc);
2014 } 2023 }
@@ -2063,7 +2072,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2063 2072
2064 memcpy(&entry->buf, tbuffer, len); 2073 memcpy(&entry->buf, tbuffer, len);
2065 entry->buf[len] = '\0'; 2074 entry->buf[len] = '\0';
2066 if (!filter_check_discard(call, entry, buffer, event)) { 2075 if (!call_filter_check_discard(call, entry, buffer, event)) {
2067 __buffer_unlock_commit(buffer, event); 2076 __buffer_unlock_commit(buffer, event);
2068 ftrace_trace_stack(buffer, flags, 6, pc); 2077 ftrace_trace_stack(buffer, flags, 6, pc);
2069 } 2078 }
@@ -2760,7 +2769,7 @@ static void show_snapshot_main_help(struct seq_file *m)
2760 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); 2769 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
2761 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2770 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2762 seq_printf(m, "# Takes a snapshot of the main buffer.\n"); 2771 seq_printf(m, "# Takes a snapshot of the main buffer.\n");
2763 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); 2772 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n");
2764 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2773 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
2765 seq_printf(m, "# is not a '0' or '1')\n"); 2774 seq_printf(m, "# is not a '0' or '1')\n");
2766} 2775}
@@ -2964,6 +2973,11 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2964 return 0; 2973 return 0;
2965} 2974}
2966 2975
2976bool tracing_is_disabled(void)
2977{
2978 return (tracing_disabled) ? true: false;
2979}
2980
2967/* 2981/*
2968 * Open and update trace_array ref count. 2982 * Open and update trace_array ref count.
2969 * Must have the current trace_array passed to it. 2983 * Must have the current trace_array passed to it.
@@ -5454,12 +5468,12 @@ static struct ftrace_func_command ftrace_snapshot_cmd = {
5454 .func = ftrace_trace_snapshot_callback, 5468 .func = ftrace_trace_snapshot_callback,
5455}; 5469};
5456 5470
5457static int register_snapshot_cmd(void) 5471static __init int register_snapshot_cmd(void)
5458{ 5472{
5459 return register_ftrace_command(&ftrace_snapshot_cmd); 5473 return register_ftrace_command(&ftrace_snapshot_cmd);
5460} 5474}
5461#else 5475#else
5462static inline int register_snapshot_cmd(void) { return 0; } 5476static inline __init int register_snapshot_cmd(void) { return 0; }
5463#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ 5477#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
5464 5478
5465struct dentry *tracing_init_dentry_tr(struct trace_array *tr) 5479struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
@@ -6253,6 +6267,17 @@ void trace_init_global_iter(struct trace_iterator *iter)
6253 iter->trace = iter->tr->current_trace; 6267 iter->trace = iter->tr->current_trace;
6254 iter->cpu_file = RING_BUFFER_ALL_CPUS; 6268 iter->cpu_file = RING_BUFFER_ALL_CPUS;
6255 iter->trace_buffer = &global_trace.trace_buffer; 6269 iter->trace_buffer = &global_trace.trace_buffer;
6270
6271 if (iter->trace && iter->trace->open)
6272 iter->trace->open(iter);
6273
6274 /* Annotate start of buffers if we had overruns */
6275 if (ring_buffer_overruns(iter->trace_buffer->buffer))
6276 iter->iter_flags |= TRACE_FILE_ANNOTATE;
6277
6278 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
6279 if (trace_clocks[iter->tr->clock_id].in_ns)
6280 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
6256} 6281}
6257 6282
6258void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) 6283void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
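Two of the trace.c changes are easy to miss. First, filter_current_check_discard() is split into filter_check_discard(), which works on an ftrace_event_file and its FTRACE_EVENT_FL_FILTERED flag, and call_filter_check_discard() for ftrace_event_call, and every tracer call site moves to the latter. Second, trace_get_user() now refuses to write past parser->size - 1 and returns -EINVAL instead of overflowing the parser buffer. The bounds check on its own, as a plain C routine with the user-space copy stripped out:

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct parser { char buffer[8]; size_t idx; size_t size; };

static int parser_feed(struct parser *p, const char *in)
{
    for (; *in; in++) {
        if (*in == ' ') {                    /* token finished          */
            p->buffer[p->idx] = '\0';
            return 0;
        }
        if (p->idx < p->size - 1) {          /* leave room for the NUL  */
            p->buffer[p->idx++] = *in;
        } else {
            return -EINVAL;                  /* token too long: reject  */
        }
    }
    p->buffer[p->idx] = '\0';
    return 0;
}

int main(void)
{
    struct parser p = { "", 0, sizeof(p.buffer) };

    printf("short token: %d (%s)\n", parser_feed(&p, "foo "), p.buffer);
    p.idx = 0;
    printf("long token:  %d\n", parser_feed(&p, "way_too_long_token "));
    return 0;
}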
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 10c86fb7a2b4..ea189e027b80 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -124,6 +124,7 @@ enum trace_flag_type {
124 TRACE_FLAG_NEED_RESCHED = 0x04, 124 TRACE_FLAG_NEED_RESCHED = 0x04,
125 TRACE_FLAG_HARDIRQ = 0x08, 125 TRACE_FLAG_HARDIRQ = 0x08,
126 TRACE_FLAG_SOFTIRQ = 0x10, 126 TRACE_FLAG_SOFTIRQ = 0x10,
127 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
127}; 128};
128 129
129#define TRACE_BUF_SIZE 1024 130#define TRACE_BUF_SIZE 1024
@@ -192,8 +193,8 @@ struct trace_array {
192#ifdef CONFIG_FTRACE_SYSCALLS 193#ifdef CONFIG_FTRACE_SYSCALLS
193 int sys_refcount_enter; 194 int sys_refcount_enter;
194 int sys_refcount_exit; 195 int sys_refcount_exit;
195 DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 196 struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls];
196 DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 197 struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls];
197#endif 198#endif
198 int stop_count; 199 int stop_count;
199 int clock_id; 200 int clock_id;
@@ -514,6 +515,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf);
514void tracing_reset_current(int cpu); 515void tracing_reset_current(int cpu);
515void tracing_reset_all_online_cpus(void); 516void tracing_reset_all_online_cpus(void);
516int tracing_open_generic(struct inode *inode, struct file *filp); 517int tracing_open_generic(struct inode *inode, struct file *filp);
518bool tracing_is_disabled(void);
517struct dentry *trace_create_file(const char *name, 519struct dentry *trace_create_file(const char *name,
518 umode_t mode, 520 umode_t mode,
519 struct dentry *parent, 521 struct dentry *parent,
@@ -711,6 +713,8 @@ extern unsigned long trace_flags;
711#define TRACE_GRAPH_PRINT_PROC 0x8 713#define TRACE_GRAPH_PRINT_PROC 0x8
712#define TRACE_GRAPH_PRINT_DURATION 0x10 714#define TRACE_GRAPH_PRINT_DURATION 0x10
713#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 715#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
716#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
717#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
714 718
715extern enum print_line_t 719extern enum print_line_t
716print_graph_function_flags(struct trace_iterator *iter, u32 flags); 720print_graph_function_flags(struct trace_iterator *iter, u32 flags);
@@ -730,15 +734,16 @@ extern void __trace_graph_return(struct trace_array *tr,
730#ifdef CONFIG_DYNAMIC_FTRACE 734#ifdef CONFIG_DYNAMIC_FTRACE
731/* TODO: make this variable */ 735/* TODO: make this variable */
732#define FTRACE_GRAPH_MAX_FUNCS 32 736#define FTRACE_GRAPH_MAX_FUNCS 32
733extern int ftrace_graph_filter_enabled;
734extern int ftrace_graph_count; 737extern int ftrace_graph_count;
735extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 738extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
739extern int ftrace_graph_notrace_count;
740extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];
736 741
737static inline int ftrace_graph_addr(unsigned long addr) 742static inline int ftrace_graph_addr(unsigned long addr)
738{ 743{
739 int i; 744 int i;
740 745
741 if (!ftrace_graph_filter_enabled) 746 if (!ftrace_graph_count)
742 return 1; 747 return 1;
743 748
744 for (i = 0; i < ftrace_graph_count; i++) { 749 for (i = 0; i < ftrace_graph_count; i++) {
@@ -758,11 +763,31 @@ static inline int ftrace_graph_addr(unsigned long addr)
758 763
759 return 0; 764 return 0;
760} 765}
766
767static inline int ftrace_graph_notrace_addr(unsigned long addr)
768{
769 int i;
770
771 if (!ftrace_graph_notrace_count)
772 return 0;
773
774 for (i = 0; i < ftrace_graph_notrace_count; i++) {
775 if (addr == ftrace_graph_notrace_funcs[i])
776 return 1;
777 }
778
779 return 0;
780}
761#else 781#else
762static inline int ftrace_graph_addr(unsigned long addr) 782static inline int ftrace_graph_addr(unsigned long addr)
763{ 783{
764 return 1; 784 return 1;
765} 785}
786
787static inline int ftrace_graph_notrace_addr(unsigned long addr)
788{
789 return 0;
790}
766#endif /* CONFIG_DYNAMIC_FTRACE */ 791#endif /* CONFIG_DYNAMIC_FTRACE */
767#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 792#else /* CONFIG_FUNCTION_GRAPH_TRACER */
768static inline enum print_line_t 793static inline enum print_line_t
@@ -986,9 +1011,9 @@ struct filter_pred {
986 1011
987extern enum regex_type 1012extern enum regex_type
988filter_parse_regex(char *buff, int len, char **search, int *not); 1013filter_parse_regex(char *buff, int len, char **search, int *not);
989extern void print_event_filter(struct ftrace_event_call *call, 1014extern void print_event_filter(struct ftrace_event_file *file,
990 struct trace_seq *s); 1015 struct trace_seq *s);
991extern int apply_event_filter(struct ftrace_event_call *call, 1016extern int apply_event_filter(struct ftrace_event_file *file,
992 char *filter_string); 1017 char *filter_string);
993extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, 1018extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
994 char *filter_string); 1019 char *filter_string);
@@ -999,20 +1024,6 @@ extern int filter_assign_type(const char *type);
999struct ftrace_event_field * 1024struct ftrace_event_field *
1000trace_find_event_field(struct ftrace_event_call *call, char *name); 1025trace_find_event_field(struct ftrace_event_call *call, char *name);
1001 1026
1002static inline int
1003filter_check_discard(struct ftrace_event_call *call, void *rec,
1004 struct ring_buffer *buffer,
1005 struct ring_buffer_event *event)
1006{
1007 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
1008 !filter_match_preds(call->filter, rec)) {
1009 ring_buffer_discard_commit(buffer, event);
1010 return 1;
1011 }
1012
1013 return 0;
1014}
1015
1016extern void trace_event_enable_cmd_record(bool enable); 1027extern void trace_event_enable_cmd_record(bool enable);
1017extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); 1028extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1018extern int event_trace_del_tracer(struct trace_array *tr); 1029extern int event_trace_del_tracer(struct trace_array *tr);
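trace.h drops ftrace_graph_filter_enabled (an empty filter is now simply ftrace_graph_count == 0) and gains ftrace_graph_notrace_addr(), the mirror of ftrace_graph_addr(): the filter table answers "trace it" when empty, while the notrace table only blocks on an explicit hit. A standalone copy of the two lookups; tables and addresses are made up:

#include <stdio.h>

static unsigned long graph_funcs[] = { 0xc0101000UL };
static int graph_count = 1;

static unsigned long notrace_funcs[] = { 0xc0202000UL };
static int notrace_count = 1;

/* Empty filter means "trace everything"; otherwise require a match. */
static int graph_addr(unsigned long addr)
{
    int i;

    if (!graph_count)
        return 1;
    for (i = 0; i < graph_count; i++)
        if (addr == graph_funcs[i])
            return 1;
    return 0;
}

/* Empty notrace list blocks nothing; a hit means "skip this function". */
static int graph_notrace_addr(unsigned long addr)
{
    int i;

    for (i = 0; i < notrace_count; i++)
        if (addr == notrace_funcs[i])
            return 1;
    return 0;
}

int main(void)
{
    printf("0xc0101000: trace=%d notrace=%d\n",
           graph_addr(0xc0101000UL), graph_notrace_addr(0xc0101000UL));
    printf("0xc0202000: trace=%d notrace=%d\n",
           graph_addr(0xc0202000UL), graph_notrace_addr(0xc0202000UL));
    return 0;
}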
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index d594da0dc03c..697fb9bac8f0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -78,7 +78,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
78 entry->line = f->line; 78 entry->line = f->line;
79 entry->correct = val == expect; 79 entry->correct = val == expect;
80 80
81 if (!filter_check_discard(call, entry, buffer, event)) 81 if (!call_filter_check_discard(call, entry, buffer, event))
82 __buffer_unlock_commit(buffer, event); 82 __buffer_unlock_commit(buffer, event);
83 83
84 out: 84 out:
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 80c36bcf66e8..e854f420e033 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,9 +24,15 @@ static int total_ref_count;
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event, 24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 if (tp_event->perf_perm) {
28 int ret = tp_event->perf_perm(tp_event, p_event);
29 if (ret)
30 return ret;
31 }
32
27 /* The ftrace function trace is allowed only for root. */ 33 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) && 34 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 35 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
30 return -EPERM; 36 return -EPERM;
31 37
32 /* No tracing, just counting, so no obvious leak */ 38 /* No tracing, just counting, so no obvious leak */
@@ -173,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
173int perf_trace_init(struct perf_event *p_event) 179int perf_trace_init(struct perf_event *p_event)
174{ 180{
175 struct ftrace_event_call *tp_event; 181 struct ftrace_event_call *tp_event;
176 int event_id = p_event->attr.config; 182 u64 event_id = p_event->attr.config;
177 int ret = -EINVAL; 183 int ret = -EINVAL;
178 184
179 mutex_lock(&event_mutex); 185 mutex_lock(&event_mutex);
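
The new perf_perm hook lets an individual event veto perf attachment before the generic permission checks run: a non-zero return from tp_event->perf_perm() is propagated as the error. A minimal sketch of such a callback; the callback name and the way it gets wired into a struct ftrace_event_call are assumptions for illustration:

/* Hypothetical permission callback: restrict this event to CAP_SYS_ADMIN. */
static int my_event_perf_perm(struct ftrace_event_call *call,
			      struct perf_event *p_event)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;	/* no objection; the generic checks still apply */
}
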
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 368a4d50cc30..a11800ae96de 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -989,7 +989,7 @@ static ssize_t
989event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 989event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
990 loff_t *ppos) 990 loff_t *ppos)
991{ 991{
992 struct ftrace_event_call *call; 992 struct ftrace_event_file *file;
993 struct trace_seq *s; 993 struct trace_seq *s;
994 int r = -ENODEV; 994 int r = -ENODEV;
995 995
@@ -1004,12 +1004,12 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1004 trace_seq_init(s); 1004 trace_seq_init(s);
1005 1005
1006 mutex_lock(&event_mutex); 1006 mutex_lock(&event_mutex);
1007 call = event_file_data(filp); 1007 file = event_file_data(filp);
1008 if (call) 1008 if (file)
1009 print_event_filter(call, s); 1009 print_event_filter(file, s);
1010 mutex_unlock(&event_mutex); 1010 mutex_unlock(&event_mutex);
1011 1011
1012 if (call) 1012 if (file)
1013 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1013 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
1014 1014
1015 kfree(s); 1015 kfree(s);
@@ -1021,7 +1021,7 @@ static ssize_t
1021event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, 1021event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
1022 loff_t *ppos) 1022 loff_t *ppos)
1023{ 1023{
1024 struct ftrace_event_call *call; 1024 struct ftrace_event_file *file;
1025 char *buf; 1025 char *buf;
1026 int err = -ENODEV; 1026 int err = -ENODEV;
1027 1027
@@ -1039,9 +1039,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
1039 buf[cnt] = '\0'; 1039 buf[cnt] = '\0';
1040 1040
1041 mutex_lock(&event_mutex); 1041 mutex_lock(&event_mutex);
1042 call = event_file_data(filp); 1042 file = event_file_data(filp);
1043 if (call) 1043 if (file)
1044 err = apply_event_filter(call, buf); 1044 err = apply_event_filter(file, buf);
1045 mutex_unlock(&event_mutex); 1045 mutex_unlock(&event_mutex);
1046 1046
1047 free_page((unsigned long) buf); 1047 free_page((unsigned long) buf);
@@ -1062,6 +1062,9 @@ static int subsystem_open(struct inode *inode, struct file *filp)
1062 struct trace_array *tr; 1062 struct trace_array *tr;
1063 int ret; 1063 int ret;
1064 1064
1065 if (tracing_is_disabled())
1066 return -ENODEV;
1067
1065 /* Make sure the system still exists */ 1068 /* Make sure the system still exists */
1066 mutex_lock(&trace_types_lock); 1069 mutex_lock(&trace_types_lock);
1067 mutex_lock(&event_mutex); 1070 mutex_lock(&event_mutex);
@@ -1108,6 +1111,9 @@ static int system_tr_open(struct inode *inode, struct file *filp)
1108 struct trace_array *tr = inode->i_private; 1111 struct trace_array *tr = inode->i_private;
1109 int ret; 1112 int ret;
1110 1113
1114 if (tracing_is_disabled())
1115 return -ENODEV;
1116
1111 if (trace_array_get(tr) < 0) 1117 if (trace_array_get(tr) < 0)
1112 return -ENODEV; 1118 return -ENODEV;
1113 1119
@@ -1124,11 +1130,12 @@ static int system_tr_open(struct inode *inode, struct file *filp)
1124 if (ret < 0) { 1130 if (ret < 0) {
1125 trace_array_put(tr); 1131 trace_array_put(tr);
1126 kfree(dir); 1132 kfree(dir);
1133 return ret;
1127 } 1134 }
1128 1135
1129 filp->private_data = dir; 1136 filp->private_data = dir;
1130 1137
1131 return ret; 1138 return 0;
1132} 1139}
1133 1140
1134static int subsystem_release(struct inode *inode, struct file *file) 1141static int subsystem_release(struct inode *inode, struct file *file)
@@ -1539,7 +1546,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1539 return -1; 1546 return -1;
1540 } 1547 }
1541 } 1548 }
1542 trace_create_file("filter", 0644, file->dir, call, 1549 trace_create_file("filter", 0644, file->dir, file,
1543 &ftrace_event_filter_fops); 1550 &ftrace_event_filter_fops);
1544 1551
1545 trace_create_file("format", 0444, file->dir, call, 1552 trace_create_file("format", 0444, file->dir, call,
@@ -1577,6 +1584,7 @@ static void event_remove(struct ftrace_event_call *call)
1577 if (file->event_call != call) 1584 if (file->event_call != call)
1578 continue; 1585 continue;
1579 ftrace_event_enable_disable(file, 0); 1586 ftrace_event_enable_disable(file, 0);
1587 destroy_preds(file);
1580 /* 1588 /*
1581 * The do_for_each_event_file() is 1589 * The do_for_each_event_file() is
1582 * a double loop. After finding the call for this 1590 * a double loop. After finding the call for this
@@ -1700,7 +1708,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
1700{ 1708{
1701 event_remove(call); 1709 event_remove(call);
1702 trace_destroy_fields(call); 1710 trace_destroy_fields(call);
1703 destroy_preds(call); 1711 destroy_call_preds(call);
1704} 1712}
1705 1713
1706static int probe_remove_event_call(struct ftrace_event_call *call) 1714static int probe_remove_event_call(struct ftrace_event_call *call)
@@ -2306,6 +2314,9 @@ int event_trace_del_tracer(struct trace_array *tr)
2306 /* Disable any running events */ 2314 /* Disable any running events */
2307 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); 2315 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
2308 2316
2317 /* Access to events are within rcu_read_lock_sched() */
2318 synchronize_sched();
2319
2309 down_write(&trace_event_sem); 2320 down_write(&trace_event_sem);
2310 __trace_remove_event_dirs(tr); 2321 __trace_remove_event_dirs(tr);
2311 debugfs_remove_recursive(tr->event_dir); 2322 debugfs_remove_recursive(tr->event_dir);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 97daa8cf958d..2468f56dc5db 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -637,10 +637,18 @@ static void append_filter_err(struct filter_parse_state *ps,
637 free_page((unsigned long) buf); 637 free_page((unsigned long) buf);
638} 638}
639 639
640static inline struct event_filter *event_filter(struct ftrace_event_file *file)
641{
642 if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
643 return file->event_call->filter;
644 else
645 return file->filter;
646}
647
640/* caller must hold event_mutex */ 648/* caller must hold event_mutex */
641void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 649void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s)
642{ 650{
643 struct event_filter *filter = call->filter; 651 struct event_filter *filter = event_filter(file);
644 652
645 if (filter && filter->filter_string) 653 if (filter && filter->filter_string)
646 trace_seq_printf(s, "%s\n", filter->filter_string); 654 trace_seq_printf(s, "%s\n", filter->filter_string);
@@ -766,11 +774,21 @@ static void __free_preds(struct event_filter *filter)
766 filter->n_preds = 0; 774 filter->n_preds = 0;
767} 775}
768 776
769static void filter_disable(struct ftrace_event_call *call) 777static void call_filter_disable(struct ftrace_event_call *call)
770{ 778{
771 call->flags &= ~TRACE_EVENT_FL_FILTERED; 779 call->flags &= ~TRACE_EVENT_FL_FILTERED;
772} 780}
773 781
782static void filter_disable(struct ftrace_event_file *file)
783{
784 struct ftrace_event_call *call = file->event_call;
785
786 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
787 call_filter_disable(call);
788 else
789 file->flags &= ~FTRACE_EVENT_FL_FILTERED;
790}
791
774static void __free_filter(struct event_filter *filter) 792static void __free_filter(struct event_filter *filter)
775{ 793{
776 if (!filter) 794 if (!filter)
@@ -781,16 +799,30 @@ static void __free_filter(struct event_filter *filter)
781 kfree(filter); 799 kfree(filter);
782} 800}
783 801
802void destroy_call_preds(struct ftrace_event_call *call)
803{
804 __free_filter(call->filter);
805 call->filter = NULL;
806}
807
808static void destroy_file_preds(struct ftrace_event_file *file)
809{
810 __free_filter(file->filter);
811 file->filter = NULL;
812}
813
784/* 814/*
785 * Called when destroying the ftrace_event_call. 815 * Called when destroying the ftrace_event_file.
786 * The call is being freed, so we do not need to worry about 816 * The file is being freed, so we do not need to worry about
787 * the call being currently used. This is for module code removing 817 * the file being currently used. This is for module code removing
788 * the tracepoints from within it. 818 * the tracepoints from within it.
789 */ 819 */
790void destroy_preds(struct ftrace_event_call *call) 820void destroy_preds(struct ftrace_event_file *file)
791{ 821{
792 __free_filter(call->filter); 822 if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
793 call->filter = NULL; 823 destroy_call_preds(file->event_call);
824 else
825 destroy_file_preds(file);
794} 826}
795 827
796static struct event_filter *__alloc_filter(void) 828static struct event_filter *__alloc_filter(void)
@@ -825,28 +857,56 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
825 return 0; 857 return 0;
826} 858}
827 859
828static void filter_free_subsystem_preds(struct event_subsystem *system) 860static inline void __remove_filter(struct ftrace_event_file *file)
829{ 861{
862 struct ftrace_event_call *call = file->event_call;
863
864 filter_disable(file);
865 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
866 remove_filter_string(call->filter);
867 else
868 remove_filter_string(file->filter);
869}
870
871static void filter_free_subsystem_preds(struct event_subsystem *system,
872 struct trace_array *tr)
873{
874 struct ftrace_event_file *file;
830 struct ftrace_event_call *call; 875 struct ftrace_event_call *call;
831 876
832 list_for_each_entry(call, &ftrace_events, list) { 877 list_for_each_entry(file, &tr->events, list) {
878 call = file->event_call;
833 if (strcmp(call->class->system, system->name) != 0) 879 if (strcmp(call->class->system, system->name) != 0)
834 continue; 880 continue;
835 881
836 filter_disable(call); 882 __remove_filter(file);
837 remove_filter_string(call->filter);
838 } 883 }
839} 884}
840 885
841static void filter_free_subsystem_filters(struct event_subsystem *system) 886static inline void __free_subsystem_filter(struct ftrace_event_file *file)
842{ 887{
888 struct ftrace_event_call *call = file->event_call;
889
890 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
891 __free_filter(call->filter);
892 call->filter = NULL;
893 } else {
894 __free_filter(file->filter);
895 file->filter = NULL;
896 }
897}
898
899static void filter_free_subsystem_filters(struct event_subsystem *system,
900 struct trace_array *tr)
901{
902 struct ftrace_event_file *file;
843 struct ftrace_event_call *call; 903 struct ftrace_event_call *call;
844 904
845 list_for_each_entry(call, &ftrace_events, list) { 905 list_for_each_entry(file, &tr->events, list) {
906 call = file->event_call;
846 if (strcmp(call->class->system, system->name) != 0) 907 if (strcmp(call->class->system, system->name) != 0)
847 continue; 908 continue;
848 __free_filter(call->filter); 909 __free_subsystem_filter(file);
849 call->filter = NULL;
850 } 910 }
851} 911}
852 912
@@ -1617,15 +1677,85 @@ fail:
1617 return err; 1677 return err;
1618} 1678}
1619 1679
1680static inline void event_set_filtered_flag(struct ftrace_event_file *file)
1681{
1682 struct ftrace_event_call *call = file->event_call;
1683
1684 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1685 call->flags |= TRACE_EVENT_FL_FILTERED;
1686 else
1687 file->flags |= FTRACE_EVENT_FL_FILTERED;
1688}
1689
1690static inline void event_set_filter(struct ftrace_event_file *file,
1691 struct event_filter *filter)
1692{
1693 struct ftrace_event_call *call = file->event_call;
1694
1695 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1696 rcu_assign_pointer(call->filter, filter);
1697 else
1698 rcu_assign_pointer(file->filter, filter);
1699}
1700
1701static inline void event_clear_filter(struct ftrace_event_file *file)
1702{
1703 struct ftrace_event_call *call = file->event_call;
1704
1705 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1706 RCU_INIT_POINTER(call->filter, NULL);
1707 else
1708 RCU_INIT_POINTER(file->filter, NULL);
1709}
1710
1711static inline void
1712event_set_no_set_filter_flag(struct ftrace_event_file *file)
1713{
1714 struct ftrace_event_call *call = file->event_call;
1715
1716 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1717 call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
1718 else
1719 file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER;
1720}
1721
1722static inline void
1723event_clear_no_set_filter_flag(struct ftrace_event_file *file)
1724{
1725 struct ftrace_event_call *call = file->event_call;
1726
1727 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1728 call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
1729 else
1730 file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER;
1731}
1732
1733static inline bool
1734event_no_set_filter_flag(struct ftrace_event_file *file)
1735{
1736 struct ftrace_event_call *call = file->event_call;
1737
1738 if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER)
1739 return true;
1740
1741 if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
1742 (call->flags & TRACE_EVENT_FL_NO_SET_FILTER))
1743 return true;
1744
1745 return false;
1746}
1747
1620struct filter_list { 1748struct filter_list {
1621 struct list_head list; 1749 struct list_head list;
1622 struct event_filter *filter; 1750 struct event_filter *filter;
1623}; 1751};
1624 1752
1625static int replace_system_preds(struct event_subsystem *system, 1753static int replace_system_preds(struct event_subsystem *system,
1754 struct trace_array *tr,
1626 struct filter_parse_state *ps, 1755 struct filter_parse_state *ps,
1627 char *filter_string) 1756 char *filter_string)
1628{ 1757{
1758 struct ftrace_event_file *file;
1629 struct ftrace_event_call *call; 1759 struct ftrace_event_call *call;
1630 struct filter_list *filter_item; 1760 struct filter_list *filter_item;
1631 struct filter_list *tmp; 1761 struct filter_list *tmp;
@@ -1633,8 +1763,8 @@ static int replace_system_preds(struct event_subsystem *system,
1633 bool fail = true; 1763 bool fail = true;
1634 int err; 1764 int err;
1635 1765
1636 list_for_each_entry(call, &ftrace_events, list) { 1766 list_for_each_entry(file, &tr->events, list) {
1637 1767 call = file->event_call;
1638 if (strcmp(call->class->system, system->name) != 0) 1768 if (strcmp(call->class->system, system->name) != 0)
1639 continue; 1769 continue;
1640 1770
@@ -1644,18 +1774,20 @@ static int replace_system_preds(struct event_subsystem *system,
1644 */ 1774 */
1645 err = replace_preds(call, NULL, ps, filter_string, true); 1775 err = replace_preds(call, NULL, ps, filter_string, true);
1646 if (err) 1776 if (err)
1647 call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; 1777 event_set_no_set_filter_flag(file);
1648 else 1778 else
1649 call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; 1779 event_clear_no_set_filter_flag(file);
1650 } 1780 }
1651 1781
1652 list_for_each_entry(call, &ftrace_events, list) { 1782 list_for_each_entry(file, &tr->events, list) {
1653 struct event_filter *filter; 1783 struct event_filter *filter;
1654 1784
1785 call = file->event_call;
1786
1655 if (strcmp(call->class->system, system->name) != 0) 1787 if (strcmp(call->class->system, system->name) != 0)
1656 continue; 1788 continue;
1657 1789
1658 if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) 1790 if (event_no_set_filter_flag(file))
1659 continue; 1791 continue;
1660 1792
1661 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); 1793 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
@@ -1676,17 +1808,17 @@ static int replace_system_preds(struct event_subsystem *system,
1676 1808
1677 err = replace_preds(call, filter, ps, filter_string, false); 1809 err = replace_preds(call, filter, ps, filter_string, false);
1678 if (err) { 1810 if (err) {
1679 filter_disable(call); 1811 filter_disable(file);
1680 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1812 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1681 append_filter_err(ps, filter); 1813 append_filter_err(ps, filter);
1682 } else 1814 } else
1683 call->flags |= TRACE_EVENT_FL_FILTERED; 1815 event_set_filtered_flag(file);
1684 /* 1816 /*
1685 * Regardless of if this returned an error, we still 1817 * Regardless of if this returned an error, we still
1686 * replace the filter for the call. 1818 * replace the filter for the call.
1687 */ 1819 */
1688 filter = call->filter; 1820 filter = event_filter(file);
1689 rcu_assign_pointer(call->filter, filter_item->filter); 1821 event_set_filter(file, filter_item->filter);
1690 filter_item->filter = filter; 1822 filter_item->filter = filter;
1691 1823
1692 fail = false; 1824 fail = false;
@@ -1816,6 +1948,7 @@ static int create_filter(struct ftrace_event_call *call,
1816 * and always remembers @filter_str. 1948 * and always remembers @filter_str.
1817 */ 1949 */
1818static int create_system_filter(struct event_subsystem *system, 1950static int create_system_filter(struct event_subsystem *system,
1951 struct trace_array *tr,
1819 char *filter_str, struct event_filter **filterp) 1952 char *filter_str, struct event_filter **filterp)
1820{ 1953{
1821 struct event_filter *filter = NULL; 1954 struct event_filter *filter = NULL;
@@ -1824,7 +1957,7 @@ static int create_system_filter(struct event_subsystem *system,
1824 1957
1825 err = create_filter_start(filter_str, true, &ps, &filter); 1958 err = create_filter_start(filter_str, true, &ps, &filter);
1826 if (!err) { 1959 if (!err) {
1827 err = replace_system_preds(system, ps, filter_str); 1960 err = replace_system_preds(system, tr, ps, filter_str);
1828 if (!err) { 1961 if (!err) {
1829 /* System filters just show a default message */ 1962 /* System filters just show a default message */
1830 kfree(filter->filter_string); 1963 kfree(filter->filter_string);
@@ -1840,20 +1973,25 @@ static int create_system_filter(struct event_subsystem *system,
1840} 1973}
1841 1974
1842/* caller must hold event_mutex */ 1975/* caller must hold event_mutex */
1843int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1976int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
1844{ 1977{
1978 struct ftrace_event_call *call = file->event_call;
1845 struct event_filter *filter; 1979 struct event_filter *filter;
1846 int err; 1980 int err;
1847 1981
1848 if (!strcmp(strstrip(filter_string), "0")) { 1982 if (!strcmp(strstrip(filter_string), "0")) {
1849 filter_disable(call); 1983 filter_disable(file);
1850 filter = call->filter; 1984 filter = event_filter(file);
1985
1851 if (!filter) 1986 if (!filter)
1852 return 0; 1987 return 0;
1853 RCU_INIT_POINTER(call->filter, NULL); 1988
1989 event_clear_filter(file);
1990
1854 /* Make sure the filter is not being used */ 1991 /* Make sure the filter is not being used */
1855 synchronize_sched(); 1992 synchronize_sched();
1856 __free_filter(filter); 1993 __free_filter(filter);
1994
1857 return 0; 1995 return 0;
1858 } 1996 }
1859 1997
@@ -1866,14 +2004,15 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1866 * string 2004 * string
1867 */ 2005 */
1868 if (filter) { 2006 if (filter) {
1869 struct event_filter *tmp = call->filter; 2007 struct event_filter *tmp;
1870 2008
2009 tmp = event_filter(file);
1871 if (!err) 2010 if (!err)
1872 call->flags |= TRACE_EVENT_FL_FILTERED; 2011 event_set_filtered_flag(file);
1873 else 2012 else
1874 filter_disable(call); 2013 filter_disable(file);
1875 2014
1876 rcu_assign_pointer(call->filter, filter); 2015 event_set_filter(file, filter);
1877 2016
1878 if (tmp) { 2017 if (tmp) {
1879 /* Make sure the call is done with the filter */ 2018 /* Make sure the call is done with the filter */
@@ -1889,6 +2028,7 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1889 char *filter_string) 2028 char *filter_string)
1890{ 2029{
1891 struct event_subsystem *system = dir->subsystem; 2030 struct event_subsystem *system = dir->subsystem;
2031 struct trace_array *tr = dir->tr;
1892 struct event_filter *filter; 2032 struct event_filter *filter;
1893 int err = 0; 2033 int err = 0;
1894 2034
@@ -1901,18 +2041,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1901 } 2041 }
1902 2042
1903 if (!strcmp(strstrip(filter_string), "0")) { 2043 if (!strcmp(strstrip(filter_string), "0")) {
1904 filter_free_subsystem_preds(system); 2044 filter_free_subsystem_preds(system, tr);
1905 remove_filter_string(system->filter); 2045 remove_filter_string(system->filter);
1906 filter = system->filter; 2046 filter = system->filter;
1907 system->filter = NULL; 2047 system->filter = NULL;
1908 /* Ensure all filters are no longer used */ 2048 /* Ensure all filters are no longer used */
1909 synchronize_sched(); 2049 synchronize_sched();
1910 filter_free_subsystem_filters(system); 2050 filter_free_subsystem_filters(system, tr);
1911 __free_filter(filter); 2051 __free_filter(filter);
1912 goto out_unlock; 2052 goto out_unlock;
1913 } 2053 }
1914 2054
1915 err = create_system_filter(system, filter_string, &filter); 2055 err = create_system_filter(system, tr, filter_string, &filter);
1916 if (filter) { 2056 if (filter) {
1917 /* 2057 /*
1918 * No event actually uses the system filter 2058 * No event actually uses the system filter
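
The filter swap in apply_event_filter() above follows the usual sched-RCU publish/retire pattern: readers pick the filter up inside rcu_read_lock_sched() (the tracepoint handlers run there), the writer publishes the replacement with rcu_assign_pointer() and only frees the old filter after synchronize_sched(). A generic sketch of that pattern, independent of the tracing structures; struct my_cfg and cfg_mutex are made up for illustration:

struct my_cfg { int threshold; };
static struct my_cfg __rcu *active_cfg;
static DEFINE_MUTEX(cfg_mutex);

/* Reader: assumed to run under rcu_read_lock_sched(), like a tracepoint probe. */
static bool over_threshold(int val)
{
	struct my_cfg *cfg = rcu_dereference_sched(active_cfg);

	return cfg && val >= cfg->threshold;
}

/* Writer: publish the new config, wait out readers, then free the old one. */
static void replace_cfg(struct my_cfg *new_cfg)
{
	struct my_cfg *old;

	mutex_lock(&cfg_mutex);
	old = rcu_dereference_protected(active_cfg, lockdep_is_held(&cfg_mutex));
	rcu_assign_pointer(active_cfg, new_cfg);
	mutex_unlock(&cfg_mutex);

	synchronize_sched();		/* all sched-RCU readers are done with 'old' */
	kfree(old);
}
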
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d21a74670088..7c3e3e72e2b6 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -180,7 +180,7 @@ struct ftrace_event_call __used event_##call = { \
180 .event.type = etype, \ 180 .event.type = etype, \
181 .class = &event_class_ftrace_##call, \ 181 .class = &event_class_ftrace_##call, \
182 .print_fmt = print, \ 182 .print_fmt = print, \
183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ 183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \
184}; \ 184}; \
185struct ftrace_event_call __used \ 185struct ftrace_event_call __used \
186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b5c09242683d..0b99120d395c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -82,9 +82,9 @@ static struct trace_array *graph_array;
82 * to fill in space into DURATION column. 82 * to fill in space into DURATION column.
83 */ 83 */
84enum { 84enum {
85 DURATION_FILL_FULL = -1, 85 FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT,
86 DURATION_FILL_START = -2, 86 FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT,
87 DURATION_FILL_END = -3, 87 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
88}; 88};
89 89
90static enum print_line_t 90static enum print_line_t
@@ -114,16 +114,37 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
114 return -EBUSY; 114 return -EBUSY;
115 } 115 }
116 116
117 /*
118 * The curr_ret_stack is an index to ftrace return stack of
119 * current task. Its value should be in [0, FTRACE_RETFUNC_
120 * DEPTH) when the function graph tracer is used. To support
121 * filtering out specific functions, it makes the index
122 * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH)
123 * so when it sees a negative index the ftrace will ignore
124 * the record. And the index gets recovered when returning
125 * from the filtered function by adding the FTRACE_NOTRACE_
126 * DEPTH and then it'll continue to record functions normally.
127 *
128 * The curr_ret_stack is initialized to -1 and get increased
129 * in this function. So it can be less than -1 only if it was
130 * filtered out via ftrace_graph_notrace_addr() which can be
131 * set from set_graph_notrace file in debugfs by user.
132 */
133 if (current->curr_ret_stack < -1)
134 return -EBUSY;
135
117 calltime = trace_clock_local(); 136 calltime = trace_clock_local();
118 137
119 index = ++current->curr_ret_stack; 138 index = ++current->curr_ret_stack;
139 if (ftrace_graph_notrace_addr(func))
140 current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;
120 barrier(); 141 barrier();
121 current->ret_stack[index].ret = ret; 142 current->ret_stack[index].ret = ret;
122 current->ret_stack[index].func = func; 143 current->ret_stack[index].func = func;
123 current->ret_stack[index].calltime = calltime; 144 current->ret_stack[index].calltime = calltime;
124 current->ret_stack[index].subtime = 0; 145 current->ret_stack[index].subtime = 0;
125 current->ret_stack[index].fp = frame_pointer; 146 current->ret_stack[index].fp = frame_pointer;
126 *depth = index; 147 *depth = current->curr_ret_stack;
127 148
128 return 0; 149 return 0;
129} 150}
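
The comment above describes the trick: a valid return-stack index is biased by a large constant when a set_graph_notrace'd function is entered, so every record pushed while it stays negative is ignored, and the original index is recovered on return. A small illustration of the arithmetic with made-up values; the real constants are FTRACE_RETFUNC_DEPTH and FTRACE_NOTRACE_DEPTH from the ftrace headers and are not shown in this hunk:

/* Illustration only; constants are stand-ins for the real ftrace values. */
#define RETFUNC_DEPTH	50
#define NOTRACE_BIAS	65536

static inline int bias_index(int idx)		/* entering a notrace'd function */
{
	return idx - NOTRACE_BIAS;		/* e.g. 3 -> -65533: negative, ignored */
}

static inline int recover_index(int idx)	/* returning from that function */
{
	return idx + NOTRACE_BIAS;		/* -65533 -> 3: tracing resumes */
}
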
@@ -137,7 +158,17 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
137 158
138 index = current->curr_ret_stack; 159 index = current->curr_ret_stack;
139 160
140 if (unlikely(index < 0)) { 161 /*
162 * A negative index here means that it's just returned from a
163 * notrace'd function. Recover index to get an original
164 * return address. See ftrace_push_return_trace().
165 *
166 * TODO: Need to check whether the stack gets corrupted.
167 */
168 if (index < 0)
169 index += FTRACE_NOTRACE_DEPTH;
170
171 if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
141 ftrace_graph_stop(); 172 ftrace_graph_stop();
142 WARN_ON(1); 173 WARN_ON(1);
143 /* Might as well panic, otherwise we have no where to go */ 174 /* Might as well panic, otherwise we have no where to go */
@@ -193,6 +224,15 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
193 trace.rettime = trace_clock_local(); 224 trace.rettime = trace_clock_local();
194 barrier(); 225 barrier();
195 current->curr_ret_stack--; 226 current->curr_ret_stack--;
227 /*
228 * The curr_ret_stack can be less than -1 only if it was
229 * filtered out and it's about to return from the function.
230 * Recover the index and continue to trace normal functions.
231 */
232 if (current->curr_ret_stack < -1) {
233 current->curr_ret_stack += FTRACE_NOTRACE_DEPTH;
234 return ret;
235 }
196 236
197 /* 237 /*
198 * The trace should run after decrementing the ret counter 238 * The trace should run after decrementing the ret counter
@@ -230,7 +270,7 @@ int __trace_graph_entry(struct trace_array *tr,
230 return 0; 270 return 0;
231 entry = ring_buffer_event_data(event); 271 entry = ring_buffer_event_data(event);
232 entry->graph_ent = *trace; 272 entry->graph_ent = *trace;
233 if (!filter_current_check_discard(buffer, call, entry, event)) 273 if (!call_filter_check_discard(call, entry, buffer, event))
234 __buffer_unlock_commit(buffer, event); 274 __buffer_unlock_commit(buffer, event);
235 275
236 return 1; 276 return 1;
@@ -259,10 +299,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
259 299
260 /* trace it when it is-nested-in or is a function enabled. */ 300 /* trace it when it is-nested-in or is a function enabled. */
261 if ((!(trace->depth || ftrace_graph_addr(trace->func)) || 301 if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
262 ftrace_graph_ignore_irqs()) || 302 ftrace_graph_ignore_irqs()) || (trace->depth < 0) ||
263 (max_depth && trace->depth >= max_depth)) 303 (max_depth && trace->depth >= max_depth))
264 return 0; 304 return 0;
265 305
306 /*
307 * Do not trace a function if it's filtered by set_graph_notrace.
308 * Make the index of ret stack negative to indicate that it should
309 * ignore further functions. But it needs its own ret stack entry
310 * to recover the original index in order to continue tracing after
311 * returning from the function.
312 */
313 if (ftrace_graph_notrace_addr(trace->func))
314 return 1;
315
266 local_irq_save(flags); 316 local_irq_save(flags);
267 cpu = raw_smp_processor_id(); 317 cpu = raw_smp_processor_id();
268 data = per_cpu_ptr(tr->trace_buffer.data, cpu); 318 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -335,7 +385,7 @@ void __trace_graph_return(struct trace_array *tr,
335 return; 385 return;
336 entry = ring_buffer_event_data(event); 386 entry = ring_buffer_event_data(event);
337 entry->ret = *trace; 387 entry->ret = *trace;
338 if (!filter_current_check_discard(buffer, call, entry, event)) 388 if (!call_filter_check_discard(call, entry, buffer, event))
339 __buffer_unlock_commit(buffer, event); 389 __buffer_unlock_commit(buffer, event);
340} 390}
341 391
@@ -652,7 +702,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
652 } 702 }
653 703
654 /* No overhead */ 704 /* No overhead */
655 ret = print_graph_duration(DURATION_FILL_START, s, flags); 705 ret = print_graph_duration(0, s, flags | FLAGS_FILL_START);
656 if (ret != TRACE_TYPE_HANDLED) 706 if (ret != TRACE_TYPE_HANDLED)
657 return ret; 707 return ret;
658 708
@@ -664,7 +714,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
664 if (!ret) 714 if (!ret)
665 return TRACE_TYPE_PARTIAL_LINE; 715 return TRACE_TYPE_PARTIAL_LINE;
666 716
667 ret = print_graph_duration(DURATION_FILL_END, s, flags); 717 ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
668 if (ret != TRACE_TYPE_HANDLED) 718 if (ret != TRACE_TYPE_HANDLED)
669 return ret; 719 return ret;
670 720
@@ -729,14 +779,14 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
729 return TRACE_TYPE_HANDLED; 779 return TRACE_TYPE_HANDLED;
730 780
731 /* No real data, just filling the column with spaces */ 781 /* No real data, just filling the column with spaces */
732 switch (duration) { 782 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
733 case DURATION_FILL_FULL: 783 case FLAGS_FILL_FULL:
734 ret = trace_seq_puts(s, " | "); 784 ret = trace_seq_puts(s, " | ");
735 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 785 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
736 case DURATION_FILL_START: 786 case FLAGS_FILL_START:
737 ret = trace_seq_puts(s, " "); 787 ret = trace_seq_puts(s, " ");
738 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 788 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
739 case DURATION_FILL_END: 789 case FLAGS_FILL_END:
740 ret = trace_seq_puts(s, " |"); 790 ret = trace_seq_puts(s, " |");
741 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 791 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
742 } 792 }
@@ -852,7 +902,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
852 } 902 }
853 903
854 /* No time */ 904 /* No time */
855 ret = print_graph_duration(DURATION_FILL_FULL, s, flags); 905 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
856 if (ret != TRACE_TYPE_HANDLED) 906 if (ret != TRACE_TYPE_HANDLED)
857 return ret; 907 return ret;
858 908
@@ -1172,7 +1222,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1172 return TRACE_TYPE_PARTIAL_LINE; 1222 return TRACE_TYPE_PARTIAL_LINE;
1173 1223
1174 /* No time */ 1224 /* No time */
1175 ret = print_graph_duration(DURATION_FILL_FULL, s, flags); 1225 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
1176 if (ret != TRACE_TYPE_HANDLED) 1226 if (ret != TRACE_TYPE_HANDLED)
1177 return ret; 1227 return ret;
1178 1228
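
The DURATION_FILL_* sentinels passed in place of a duration are replaced by a small fill code carried in the high bits of the flags word, so print_graph_duration() always receives a real duration argument. A sketch of that packing scheme with an assumed shift value; the real TRACE_GRAPH_PRINT_FILL_SHIFT and _MASK live in the graph tracer header and are not shown here:

/* Assumed shift/mask values; illustrates the packing scheme only. */
#define FILL_SHIFT	28
#define FILL_MASK	(0x3UL << FILL_SHIFT)
#define FILL_FULL	(1UL << FILL_SHIFT)
#define FILL_START	(2UL << FILL_SHIFT)
#define FILL_END	(3UL << FILL_SHIFT)

static int wants_fill_start(unsigned long flags)
{
	/* the caller passed (flags | FILL_START); the printer strips it back out */
	return (flags & FILL_MASK) == FILL_START;
}
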
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 243f6834d026..dae9541ada9e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -835,7 +835,7 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
835 entry->ip = (unsigned long)tp->rp.kp.addr; 835 entry->ip = (unsigned long)tp->rp.kp.addr;
836 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 836 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
837 837
838 if (!filter_current_check_discard(buffer, call, entry, event)) 838 if (!filter_check_discard(ftrace_file, entry, buffer, event))
839 trace_buffer_unlock_commit_regs(buffer, event, 839 trace_buffer_unlock_commit_regs(buffer, event,
840 irq_flags, pc, regs); 840 irq_flags, pc, regs);
841} 841}
@@ -884,7 +884,7 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
884 entry->ret_ip = (unsigned long)ri->ret_addr; 884 entry->ret_ip = (unsigned long)ri->ret_addr;
885 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 885 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
886 886
887 if (!filter_current_check_discard(buffer, call, entry, event)) 887 if (!filter_check_discard(ftrace_file, entry, buffer, event))
888 trace_buffer_unlock_commit_regs(buffer, event, 888 trace_buffer_unlock_commit_regs(buffer, event,
889 irq_flags, pc, regs); 889 irq_flags, pc, regs);
890} 890}
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index b3dcfb2f0fef..0abd9b863474 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -323,7 +323,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
323 entry = ring_buffer_event_data(event); 323 entry = ring_buffer_event_data(event);
324 entry->rw = *rw; 324 entry->rw = *rw;
325 325
326 if (!filter_check_discard(call, entry, buffer, event)) 326 if (!call_filter_check_discard(call, entry, buffer, event))
327 trace_buffer_unlock_commit(buffer, event, 0, pc); 327 trace_buffer_unlock_commit(buffer, event, 0, pc);
328} 328}
329 329
@@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
353 entry = ring_buffer_event_data(event); 353 entry = ring_buffer_event_data(event);
354 entry->map = *map; 354 entry->map = *map;
355 355
356 if (!filter_check_discard(call, entry, buffer, event)) 356 if (!call_filter_check_discard(call, entry, buffer, event))
357 trace_buffer_unlock_commit(buffer, event, 0, pc); 357 trace_buffer_unlock_commit(buffer, event, 0, pc);
358} 358}
359 359
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 34e7cbac0c9c..ed32284fbe32 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : 619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
620 '.'; 620 '.';
621 need_resched = 621
622 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; 622 switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
623 TRACE_FLAG_PREEMPT_RESCHED)) {
624 case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
625 need_resched = 'N';
626 break;
627 case TRACE_FLAG_NEED_RESCHED:
628 need_resched = 'n';
629 break;
630 case TRACE_FLAG_PREEMPT_RESCHED:
631 need_resched = 'p';
632 break;
633 default:
634 need_resched = '.';
635 break;
636 }
637
623 hardsoft_irq = 638 hardsoft_irq =
624 (hardirq && softirq) ? 'H' : 639 (hardirq && softirq) ? 'H' :
625 hardirq ? 'h' : 640 hardirq ? 'h' :
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 4e98e3b257a3..3f34dc9b40f3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
45 entry->next_state = next->state; 45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next); 46 entry->next_cpu = task_cpu(next);
47 47
48 if (!filter_check_discard(call, entry, buffer, event)) 48 if (!call_filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc); 49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50} 50}
51 51
@@ -101,7 +101,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
101 entry->next_state = wakee->state; 101 entry->next_state = wakee->state;
102 entry->next_cpu = task_cpu(wakee); 102 entry->next_cpu = task_cpu(wakee);
103 103
104 if (!filter_check_discard(call, entry, buffer, event)) 104 if (!call_filter_check_discard(call, entry, buffer, event))
105 trace_buffer_unlock_commit(buffer, event, flags, pc); 105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106} 106}
107 107
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 847f88a6194b..7af67360b330 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -43,46 +43,15 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
43/* The root directory for all stat files */ 43/* The root directory for all stat files */
44static struct dentry *stat_dir; 44static struct dentry *stat_dir;
45 45
46/* 46static void __reset_stat_session(struct stat_session *session)
47 * Iterate through the rbtree using a post order traversal path
48 * to release the next node.
49 * It won't necessary release one at each iteration
50 * but it will at least advance closer to the next one
51 * to be released.
52 */
53static struct rb_node *release_next(struct tracer_stat *ts,
54 struct rb_node *node)
55{ 47{
56 struct stat_node *snode; 48 struct stat_node *snode, *n;
57 struct rb_node *parent = rb_parent(node);
58
59 if (node->rb_left)
60 return node->rb_left;
61 else if (node->rb_right)
62 return node->rb_right;
63 else {
64 if (!parent)
65 ;
66 else if (parent->rb_left == node)
67 parent->rb_left = NULL;
68 else
69 parent->rb_right = NULL;
70 49
71 snode = container_of(node, struct stat_node, node); 50 rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) {
72 if (ts->stat_release) 51 if (session->ts->stat_release)
73 ts->stat_release(snode->stat); 52 session->ts->stat_release(snode->stat);
74 kfree(snode); 53 kfree(snode);
75
76 return parent;
77 } 54 }
78}
79
80static void __reset_stat_session(struct stat_session *session)
81{
82 struct rb_node *node = session->stat_root.rb_node;
83
84 while (node)
85 node = release_next(session->ts, node);
86 55
87 session->stat_root = RB_ROOT; 56 session->stat_root = RB_ROOT;
88} 57}
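
rbtree_postorder_for_each_entry_safe() walks the tree in post-order and tolerates freeing the node currently being visited, which is exactly what the open-coded release_next() loop above did by hand. A minimal sketch of the usual "free the whole tree" idiom with a hypothetical node type:

#include <linux/rbtree.h>
#include <linux/slab.h>

struct my_node {
	struct rb_node rb;
	void *payload;
};

static void free_all(struct rb_root *root)
{
	struct my_node *pos, *n;

	rbtree_postorder_for_each_entry_safe(pos, n, root, rb)
		kfree(pos);		/* children are always visited before the parent */

	*root = RB_ROOT;		/* leave an empty, reusable tree behind */
}
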
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 559329d9bd2f..ea90eb5f6f17 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -302,6 +302,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
302static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) 302static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
303{ 303{
304 struct trace_array *tr = data; 304 struct trace_array *tr = data;
305 struct ftrace_event_file *ftrace_file;
305 struct syscall_trace_enter *entry; 306 struct syscall_trace_enter *entry;
306 struct syscall_metadata *sys_data; 307 struct syscall_metadata *sys_data;
307 struct ring_buffer_event *event; 308 struct ring_buffer_event *event;
@@ -314,7 +315,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
314 syscall_nr = trace_get_syscall_nr(current, regs); 315 syscall_nr = trace_get_syscall_nr(current, regs);
315 if (syscall_nr < 0) 316 if (syscall_nr < 0)
316 return; 317 return;
317 if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) 318
319 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
320 ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
321 if (!ftrace_file)
322 return;
323
324 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
318 return; 325 return;
319 326
320 sys_data = syscall_nr_to_meta(syscall_nr); 327 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -336,8 +343,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
336 entry->nr = syscall_nr; 343 entry->nr = syscall_nr;
337 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 344 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
338 345
339 if (!filter_current_check_discard(buffer, sys_data->enter_event, 346 if (!filter_check_discard(ftrace_file, entry, buffer, event))
340 entry, event))
341 trace_current_buffer_unlock_commit(buffer, event, 347 trace_current_buffer_unlock_commit(buffer, event,
342 irq_flags, pc); 348 irq_flags, pc);
343} 349}
@@ -345,6 +351,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
345static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) 351static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
346{ 352{
347 struct trace_array *tr = data; 353 struct trace_array *tr = data;
354 struct ftrace_event_file *ftrace_file;
348 struct syscall_trace_exit *entry; 355 struct syscall_trace_exit *entry;
349 struct syscall_metadata *sys_data; 356 struct syscall_metadata *sys_data;
350 struct ring_buffer_event *event; 357 struct ring_buffer_event *event;
@@ -356,7 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
356 syscall_nr = trace_get_syscall_nr(current, regs); 363 syscall_nr = trace_get_syscall_nr(current, regs);
357 if (syscall_nr < 0) 364 if (syscall_nr < 0)
358 return; 365 return;
359 if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) 366
367 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
368 ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
369 if (!ftrace_file)
370 return;
371
372 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
360 return; 373 return;
361 374
362 sys_data = syscall_nr_to_meta(syscall_nr); 375 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -377,8 +390,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
377 entry->nr = syscall_nr; 390 entry->nr = syscall_nr;
378 entry->ret = syscall_get_return_value(current, regs); 391 entry->ret = syscall_get_return_value(current, regs);
379 392
380 if (!filter_current_check_discard(buffer, sys_data->exit_event, 393 if (!filter_check_discard(ftrace_file, entry, buffer, event))
381 entry, event))
382 trace_current_buffer_unlock_commit(buffer, event, 394 trace_current_buffer_unlock_commit(buffer, event,
383 irq_flags, pc); 395 irq_flags, pc);
384} 396}
@@ -397,7 +409,7 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file,
397 if (!tr->sys_refcount_enter) 409 if (!tr->sys_refcount_enter)
398 ret = register_trace_sys_enter(ftrace_syscall_enter, tr); 410 ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
399 if (!ret) { 411 if (!ret) {
400 set_bit(num, tr->enabled_enter_syscalls); 412 rcu_assign_pointer(tr->enter_syscall_files[num], file);
401 tr->sys_refcount_enter++; 413 tr->sys_refcount_enter++;
402 } 414 }
403 mutex_unlock(&syscall_trace_lock); 415 mutex_unlock(&syscall_trace_lock);
@@ -415,7 +427,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
415 return; 427 return;
416 mutex_lock(&syscall_trace_lock); 428 mutex_lock(&syscall_trace_lock);
417 tr->sys_refcount_enter--; 429 tr->sys_refcount_enter--;
418 clear_bit(num, tr->enabled_enter_syscalls); 430 rcu_assign_pointer(tr->enter_syscall_files[num], NULL);
419 if (!tr->sys_refcount_enter) 431 if (!tr->sys_refcount_enter)
420 unregister_trace_sys_enter(ftrace_syscall_enter, tr); 432 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
421 mutex_unlock(&syscall_trace_lock); 433 mutex_unlock(&syscall_trace_lock);
@@ -435,7 +447,7 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file,
435 if (!tr->sys_refcount_exit) 447 if (!tr->sys_refcount_exit)
436 ret = register_trace_sys_exit(ftrace_syscall_exit, tr); 448 ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
437 if (!ret) { 449 if (!ret) {
438 set_bit(num, tr->enabled_exit_syscalls); 450 rcu_assign_pointer(tr->exit_syscall_files[num], file);
439 tr->sys_refcount_exit++; 451 tr->sys_refcount_exit++;
440 } 452 }
441 mutex_unlock(&syscall_trace_lock); 453 mutex_unlock(&syscall_trace_lock);
@@ -453,7 +465,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
453 return; 465 return;
454 mutex_lock(&syscall_trace_lock); 466 mutex_lock(&syscall_trace_lock);
455 tr->sys_refcount_exit--; 467 tr->sys_refcount_exit--;
456 clear_bit(num, tr->enabled_exit_syscalls); 468 rcu_assign_pointer(tr->exit_syscall_files[num], NULL);
457 if (!tr->sys_refcount_exit) 469 if (!tr->sys_refcount_exit)
458 unregister_trace_sys_exit(ftrace_syscall_exit, tr); 470 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
459 mutex_unlock(&syscall_trace_lock); 471 mutex_unlock(&syscall_trace_lock);
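
As the new comments note, __DO_TRACE() already wraps tracepoint probes in rcu_read_lock_sched(), so the handlers can replace the old enabled-syscall bitmaps with a plain rcu_dereference_sched() of a per-syscall file pointer. A compressed sketch of that reader pattern; the array name and bound are placeholders:

#define NR_SLOTS 512					/* placeholder bound */
static struct ftrace_event_file __rcu *slot_files[NR_SLOTS];

/* Probe body: already running under rcu_read_lock_sched() via __DO_TRACE(). */
static void probe(int nr)
{
	struct ftrace_event_file *file;

	file = rcu_dereference_sched(slot_files[nr]);
	if (!file)					/* concurrently unregistered */
		return;
	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags))
		return;					/* soft-disabled: skip the record */
	/* ... reserve ring-buffer space and commit as in the hunks above ... */
}
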
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 272261b5f94f..b6dcc42ef7f5 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -128,6 +128,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
128 if (is_ret) 128 if (is_ret)
129 tu->consumer.ret_handler = uretprobe_dispatcher; 129 tu->consumer.ret_handler = uretprobe_dispatcher;
130 init_trace_uprobe_filter(&tu->filter); 130 init_trace_uprobe_filter(&tu->filter);
131 tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
131 return tu; 132 return tu;
132 133
133error: 134error:
@@ -561,7 +562,7 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
561 for (i = 0; i < tu->nr_args; i++) 562 for (i = 0; i < tu->nr_args; i++)
562 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 563 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
563 564
564 if (!filter_current_check_discard(buffer, call, entry, event)) 565 if (!call_filter_check_discard(call, entry, buffer, event))
565 trace_buffer_unlock_commit(buffer, event, 0, 0); 566 trace_buffer_unlock_commit(buffer, event, 0, 0);
566} 567}
567 568
diff --git a/kernel/up.c b/kernel/up.c
index 630d72bf7e41..509403e3fbc6 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
22} 22}
23EXPORT_SYMBOL(smp_call_function_single); 23EXPORT_SYMBOL(smp_call_function_single);
24 24
25void __smp_call_function_single(int cpu, struct call_single_data *csd,
26 int wait)
27{
28 unsigned long flags;
29
30 local_irq_save(flags);
31 csd->func(csd->info);
32 local_irq_restore(flags);
33}
34EXPORT_SYMBOL(__smp_call_function_single);
35
25int on_each_cpu(smp_call_func_t func, void *info, int wait) 36int on_each_cpu(smp_call_func_t func, void *info, int wait)
26{ 37{
27 unsigned long flags; 38 unsigned long flags;
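
On UP there is only one CPU, so the cross-call degenerates into running the function locally with interrupts disabled; the wait argument is moot because the call is synchronous by construction. A sketch of a caller exercising the UP path shown above; the callback and payload are made up, and on SMP kernels the same call performs a real cross-CPU IPI:

static void bump(void *info)
{
	int *counter = info;
	(*counter)++;
}

static struct call_single_data my_csd;
static int my_counter;

static void kick_cpu0(void)
{
	my_csd.func = bump;
	my_csd.info = &my_counter;
	__smp_call_function_single(0, &my_csd, 0);	/* runs bump() locally on UP */
}
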
diff --git a/kernel/user.c b/kernel/user.c
index 5bbb91988e69..c006131beb77 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,6 +51,10 @@ struct user_namespace init_user_ns = {
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
54#ifdef CONFIG_PERSISTENT_KEYRINGS
55 .persistent_keyring_register_sem =
56 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
57#endif
54}; 58};
55EXPORT_SYMBOL_GPL(init_user_ns); 59EXPORT_SYMBOL_GPL(init_user_ns);
56 60
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 13fb1134ba58..240fb62cf394 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -101,6 +101,9 @@ int create_user_ns(struct cred *new)
101 101
102 set_cred_user_ns(new, ns); 102 set_cred_user_ns(new, ns);
103 103
104#ifdef CONFIG_PERSISTENT_KEYRINGS
105 init_rwsem(&ns->persistent_keyring_register_sem);
106#endif
104 return 0; 107 return 0;
105} 108}
106 109
@@ -130,6 +133,9 @@ void free_user_ns(struct user_namespace *ns)
130 133
131 do { 134 do {
132 parent = ns->parent; 135 parent = ns->parent;
136#ifdef CONFIG_PERSISTENT_KEYRINGS
137 key_put(ns->persistent_keyring_register);
138#endif
133 proc_free_inum(ns->proc_inum); 139 proc_free_inum(ns->proc_inum);
134 kmem_cache_free(user_ns_cachep, ns); 140 kmem_cache_free(user_ns_cachep, ns);
135 ns = parent; 141 ns = parent;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 987293d03ebc..b010eac595d2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
305/* I: attributes used when instantiating standard unbound pools on demand */ 305/* I: attributes used when instantiating standard unbound pools on demand */
306static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; 306static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
307 307
308/* I: attributes used when instantiating ordered pools on demand */
309static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
310
308struct workqueue_struct *system_wq __read_mostly; 311struct workqueue_struct *system_wq __read_mostly;
309EXPORT_SYMBOL(system_wq); 312EXPORT_SYMBOL(system_wq);
310struct workqueue_struct *system_highpri_wq __read_mostly; 313struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { }
518static inline void debug_work_deactivate(struct work_struct *work) { } 521static inline void debug_work_deactivate(struct work_struct *work) { }
519#endif 522#endif
520 523
521/* allocate ID and assign it to @pool */ 524/**
525 * worker_pool_assign_id - allocate ID and assign it to @pool
526 * @pool: the pool pointer of interest
527 *
528 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
529 * successfully, -errno on failure.
530 */
522static int worker_pool_assign_id(struct worker_pool *pool) 531static int worker_pool_assign_id(struct worker_pool *pool)
523{ 532{
524 int ret; 533 int ret;
525 534
526 lockdep_assert_held(&wq_pool_mutex); 535 lockdep_assert_held(&wq_pool_mutex);
527 536
528 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); 537 ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
538 GFP_KERNEL);
529 if (ret >= 0) { 539 if (ret >= 0) {
530 pool->id = ret; 540 pool->id = ret;
531 return 0; 541 return 0;
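
idr_alloc() takes a [start, end) range, so passing WORK_OFFQ_POOL_NONE as the end keeps allocated pool IDs representable in the off-queue bits and makes the BUILD_BUG_ON removed further down unnecessary. A generic sketch of bounded ID allocation; the idr instance and the bound are placeholders:

#define MY_ID_LIMIT 1024			/* placeholder upper bound */
static DEFINE_IDR(my_idr);

/* Allocate an ID in [0, MY_ID_LIMIT); returns the ID or a negative errno. */
static int my_assign_id(void *obj)
{
	return idr_alloc(&my_idr, obj, 0, MY_ID_LIMIT, GFP_KERNEL);
}
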
@@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
1320 1330
1321 debug_work_activate(work); 1331 debug_work_activate(work);
1322 1332
1323 /* if dying, only works from the same workqueue are allowed */ 1333 /* if draining, only works from the same workqueue are allowed */
1324 if (unlikely(wq->flags & __WQ_DRAINING) && 1334 if (unlikely(wq->flags & __WQ_DRAINING) &&
1325 WARN_ON_ONCE(!is_chained_work(wq))) 1335 WARN_ON_ONCE(!is_chained_work(wq)))
1326 return; 1336 return;
@@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool)
1736 if (IS_ERR(worker->task)) 1746 if (IS_ERR(worker->task))
1737 goto fail; 1747 goto fail;
1738 1748
1749 set_user_nice(worker->task, pool->attrs->nice);
1750
1751 /* prevent userland from meddling with cpumask of workqueue workers */
1752 worker->task->flags |= PF_NO_SETAFFINITY;
1753
1739 /* 1754 /*
1740 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any 1755 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1741 * online CPUs. It'll be re-applied when any of the CPUs come up. 1756 * online CPUs. It'll be re-applied when any of the CPUs come up.
1742 */ 1757 */
1743 set_user_nice(worker->task, pool->attrs->nice);
1744 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); 1758 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1745 1759
1746 /* prevent userland from meddling with cpumask of workqueue workers */
1747 worker->task->flags |= PF_NO_SETAFFINITY;
1748
1749 /* 1760 /*
1750 * The caller is responsible for ensuring %POOL_DISASSOCIATED 1761 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1751 * remains stable across this function. See the comments above the 1762 * remains stable across this function. See the comments above the
@@ -2840,19 +2851,6 @@ already_gone:
2840 return false; 2851 return false;
2841} 2852}
2842 2853
2843static bool __flush_work(struct work_struct *work)
2844{
2845 struct wq_barrier barr;
2846
2847 if (start_flush_work(work, &barr)) {
2848 wait_for_completion(&barr.done);
2849 destroy_work_on_stack(&barr.work);
2850 return true;
2851 } else {
2852 return false;
2853 }
2854}
2855
2856/** 2854/**
2857 * flush_work - wait for a work to finish executing the last queueing instance 2855 * flush_work - wait for a work to finish executing the last queueing instance
2858 * @work: the work to flush 2856 * @work: the work to flush
@@ -2866,10 +2864,18 @@ static bool __flush_work(struct work_struct *work)
2866 */ 2864 */
2867bool flush_work(struct work_struct *work) 2865bool flush_work(struct work_struct *work)
2868{ 2866{
2867 struct wq_barrier barr;
2868
2869 lock_map_acquire(&work->lockdep_map); 2869 lock_map_acquire(&work->lockdep_map);
2870 lock_map_release(&work->lockdep_map); 2870 lock_map_release(&work->lockdep_map);
2871 2871
2872 return __flush_work(work); 2872 if (start_flush_work(work, &barr)) {
2873 wait_for_completion(&barr.done);
2874 destroy_work_on_stack(&barr.work);
2875 return true;
2876 } else {
2877 return false;
2878 }
2873} 2879}
2874EXPORT_SYMBOL_GPL(flush_work); 2880EXPORT_SYMBOL_GPL(flush_work);
2875 2881
@@ -4106,7 +4112,7 @@ out_unlock:
4106static int alloc_and_link_pwqs(struct workqueue_struct *wq) 4112static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4107{ 4113{
4108 bool highpri = wq->flags & WQ_HIGHPRI; 4114 bool highpri = wq->flags & WQ_HIGHPRI;
4109 int cpu; 4115 int cpu, ret;
4110 4116
4111 if (!(wq->flags & WQ_UNBOUND)) { 4117 if (!(wq->flags & WQ_UNBOUND)) {
4112 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); 4118 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
@@ -4126,6 +4132,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4126 mutex_unlock(&wq->mutex); 4132 mutex_unlock(&wq->mutex);
4127 } 4133 }
4128 return 0; 4134 return 0;
4135 } else if (wq->flags & __WQ_ORDERED) {
4136 ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
4137 /* there should only be single pwq for ordering guarantee */
4138 WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
4139 wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
4140 "ordering guarantee broken for workqueue %s\n", wq->name);
4141 return ret;
4129 } else { 4142 } else {
4130 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); 4143 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
4131 } 4144 }
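
Ordered workqueues now get their own attrs with NUMA disabled so a single pool_workqueue serves every node and max_active can enforce strict ordering; the WARN above catches any violation of that single-pwq assumption. Nothing changes for callers; an ordered queue is still requested the usual way (queue name below is hypothetical):

static struct workqueue_struct *my_wq;

static int my_init(void)
{
	/* one work item runs at a time, in submission order */
	my_wq = alloc_ordered_workqueue("my-ordered", 0);
	if (!my_wq)
		return -ENOMEM;
	return 0;
}
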
@@ -4814,14 +4827,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4814 4827
4815 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 4828 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
4816 schedule_work_on(cpu, &wfc.work); 4829 schedule_work_on(cpu, &wfc.work);
4817 4830 flush_work(&wfc.work);
4818 /*
4819 * The work item is on-stack and can't lead to deadlock through
4820 * flushing. Use __flush_work() to avoid spurious lockdep warnings
4821 * when work_on_cpu()s are nested.
4822 */
4823 __flush_work(&wfc.work);
4824
4825 return wfc.ret; 4831 return wfc.ret;
4826} 4832}
4827EXPORT_SYMBOL_GPL(work_on_cpu); 4833EXPORT_SYMBOL_GPL(work_on_cpu);
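
With __flush_work() folded back into flush_work(), work_on_cpu() is again a thin schedule-and-flush wrapper around an on-stack work item. Typical usage, with a made-up callback; the signature is the one shown in the hunk header above:

/* Hypothetical callback: runs on the CPU passed to work_on_cpu(). */
static long add_one(void *arg)
{
	long *val = arg;
	return *val + 1;
}

static long query_cpu(int cpu)
{
	long seed = 41;

	/* Sleeps until the callback has run on 'cpu', then returns its value. */
	return work_on_cpu(cpu, add_one, &seed);
}
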
@@ -5009,10 +5015,6 @@ static int __init init_workqueues(void)
5009 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; 5015 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
5010 int i, cpu; 5016 int i, cpu;
5011 5017
5012 /* make sure we have enough bits for OFFQ pool ID */
5013 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
5014 WORK_CPU_END * NR_STD_WORKER_POOLS);
5015
5016 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 5018 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5017 5019
5018 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5020 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
@@ -5051,13 +5053,23 @@ static int __init init_workqueues(void)
5051 } 5053 }
5052 } 5054 }
5053 5055
5054 /* create default unbound wq attrs */ 5056 /* create default unbound and ordered wq attrs */
5055 for (i = 0; i < NR_STD_WORKER_POOLS; i++) { 5057 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
5056 struct workqueue_attrs *attrs; 5058 struct workqueue_attrs *attrs;
5057 5059
5058 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); 5060 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5059 attrs->nice = std_nice[i]; 5061 attrs->nice = std_nice[i];
5060 unbound_std_wq_attrs[i] = attrs; 5062 unbound_std_wq_attrs[i] = attrs;
5063
5064 /*
5065 * An ordered wq should have only one pwq as ordering is
5066 * guaranteed by max_active which is enforced by pwqs.
5067 * Turn off NUMA so that dfl_pwq is used for all nodes.
5068 */
5069 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5070 attrs->nice = std_nice[i];
5071 attrs->no_numa = true;
5072 ordered_wq_attrs[i] = attrs;
5061 } 5073 }
5062 5074
5063 system_wq = alloc_workqueue("events", 0, 0); 5075 system_wq = alloc_workqueue("events", 0, 0);