author     Eric Paris <eparis@redhat.com>    2014-03-07 11:41:32 -0500
committer  Eric Paris <eparis@redhat.com>    2014-03-07 11:41:32 -0500
commit     b7d3622a39fde7658170b7f3cf6c6889bb8db30d
tree       64f4e781ecb2a85d675e234072b988560bcd25f1 /kernel
parent     f3411cb2b2e396a41ed3a439863f028db7140a34
parent     d8ec26d7f8287f5788a494f56e8814210f0e64be
Merge tag 'v3.13' into for-3.15

Linux 3.13

Conflicts:
	include/net/xfrm.h

Simple merge where v3.13 removed 'extern' from definitions and the audit
tree did s/u32/unsigned int/ to the same definitions.
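The conflict note is terse, so here is a purely illustrative sketch of the kind of declaration clash it describes; the function name is hypothetical and this is not the actual include/net/xfrm.h content. One side dropped the extern keyword, the other changed the parameter type, and the resolution keeps both changes:

    /* Hypothetical declaration, NOT the real xfrm.h code -- conflict sketch only. */
    typedef unsigned int u32;                          /* kernel typedef, for the sketch  */

    extern int xfrm_audit_example(u32 secid);          /* common ancestor (hypothetical)  */
    int xfrm_audit_example(u32 secid);                 /* v3.13 side: 'extern' dropped    */
    extern int xfrm_audit_example(unsigned int secid); /* audit side: s/u32/unsigned int/ */
    int xfrm_audit_example(unsigned int secid);        /* merge resolution: both changes  */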
Diffstat (limited to 'kernel'): 147 files changed, 6384 insertions, 2588 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..790d83c7d160 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,3 +5,4 @@ config_data.h | |||
5 | config_data.gz | 5 | config_data.gz |
6 | timeconst.h | 6 | timeconst.h |
7 | hz.bc | 7 | hz.bc |
8 | x509_certificate_list | ||
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 94fabd534b03..2a202a846757 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ | |||
55 | default 1000 if HZ_1000 | 55 | default 1000 if HZ_1000 |
56 | 56 | ||
57 | config SCHED_HRTICK | 57 | config SCHED_HRTICK |
58 | def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) | 58 | def_bool HIGH_RES_TIMERS |
diff --git a/kernel/Makefile b/kernel/Makefile
index 1ce47553fb02..bc010ee272b6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,56 +6,44 @@ obj-y = fork.o exec_domain.o panic.o \ | |||
6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ |
7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ |
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o sys_ni.o posix-cpu-timers.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o nsproxy.o \ |
12 | notifier.o ksysfs.o cred.o reboot.o \ | 12 | notifier.o ksysfs.o cred.o reboot.o \ |
13 | async.o range.o groups.o lglock.o smpboot.o | 13 | async.o range.o groups.o smpboot.o |
14 | 14 | ||
15 | ifdef CONFIG_FUNCTION_TRACER | 15 | ifdef CONFIG_FUNCTION_TRACER |
16 | # Do not trace debug files and internal ftrace files | 16 | # Do not trace debug files and internal ftrace files |
17 | CFLAGS_REMOVE_lockdep.o = -pg | ||
18 | CFLAGS_REMOVE_lockdep_proc.o = -pg | ||
19 | CFLAGS_REMOVE_mutex-debug.o = -pg | ||
20 | CFLAGS_REMOVE_rtmutex-debug.o = -pg | ||
21 | CFLAGS_REMOVE_cgroup-debug.o = -pg | 17 | CFLAGS_REMOVE_cgroup-debug.o = -pg |
22 | CFLAGS_REMOVE_irq_work.o = -pg | 18 | CFLAGS_REMOVE_irq_work.o = -pg |
23 | endif | 19 | endif |
24 | 20 | ||
25 | obj-y += sched/ | 21 | obj-y += sched/ |
22 | obj-y += locking/ | ||
26 | obj-y += power/ | 23 | obj-y += power/ |
27 | obj-y += printk/ | 24 | obj-y += printk/ |
28 | obj-y += cpu/ | 25 | obj-y += cpu/ |
29 | obj-y += irq/ | 26 | obj-y += irq/ |
27 | obj-y += rcu/ | ||
30 | 28 | ||
31 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o | 29 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o |
32 | obj-$(CONFIG_FREEZER) += freezer.o | 30 | obj-$(CONFIG_FREEZER) += freezer.o |
33 | obj-$(CONFIG_PROFILING) += profile.o | 31 | obj-$(CONFIG_PROFILING) += profile.o |
34 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | 32 | obj-$(CONFIG_STACKTRACE) += stacktrace.o |
35 | obj-y += time/ | 33 | obj-y += time/ |
36 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o | ||
37 | obj-$(CONFIG_LOCKDEP) += lockdep.o | ||
38 | ifeq ($(CONFIG_PROC_FS),y) | ||
39 | obj-$(CONFIG_LOCKDEP) += lockdep_proc.o | ||
40 | endif | ||
41 | obj-$(CONFIG_FUTEX) += futex.o | 34 | obj-$(CONFIG_FUTEX) += futex.o |
42 | ifeq ($(CONFIG_COMPAT),y) | 35 | ifeq ($(CONFIG_COMPAT),y) |
43 | obj-$(CONFIG_FUTEX) += futex_compat.o | 36 | obj-$(CONFIG_FUTEX) += futex_compat.o |
44 | endif | 37 | endif |
45 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | ||
46 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | ||
47 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | ||
48 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 38 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
49 | obj-$(CONFIG_SMP) += smp.o | 39 | obj-$(CONFIG_SMP) += smp.o |
50 | ifneq ($(CONFIG_SMP),y) | 40 | ifneq ($(CONFIG_SMP),y) |
51 | obj-y += up.o | 41 | obj-y += up.o |
52 | endif | 42 | endif |
53 | obj-$(CONFIG_SMP) += spinlock.o | ||
54 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | ||
55 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | ||
56 | obj-$(CONFIG_UID16) += uid16.o | 43 | obj-$(CONFIG_UID16) += uid16.o |
44 | obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o | ||
57 | obj-$(CONFIG_MODULES) += module.o | 45 | obj-$(CONFIG_MODULES) += module.o |
58 | obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o | 46 | obj-$(CONFIG_MODULE_SIG) += module_signing.o |
59 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 47 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
60 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 48 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
61 | obj-$(CONFIG_KEXEC) += kexec.o | 49 | obj-$(CONFIG_KEXEC) += kexec.o |
@@ -81,12 +69,6 @@ obj-$(CONFIG_KGDB) += debug/ | |||
81 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o | 69 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o |
82 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o | 70 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o |
83 | obj-$(CONFIG_SECCOMP) += seccomp.o | 71 | obj-$(CONFIG_SECCOMP) += seccomp.o |
84 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | ||
85 | obj-$(CONFIG_TREE_RCU) += rcutree.o | ||
86 | obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o | ||
87 | obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o | ||
88 | obj-$(CONFIG_TINY_RCU) += rcutiny.o | ||
89 | obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o | ||
90 | obj-$(CONFIG_RELAY) += relay.o | 72 | obj-$(CONFIG_RELAY) += relay.o |
91 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 73 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
92 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 74 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
@@ -141,19 +123,53 @@ targets += timeconst.h | |||
141 | $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE | 123 | $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE |
142 | $(call if_changed,bc) | 124 | $(call if_changed,bc) |
143 | 125 | ||
144 | ifeq ($(CONFIG_MODULE_SIG),y) | 126 | ############################################################################### |
127 | # | ||
128 | # Roll all the X.509 certificates that we can find together and pull them into | ||
129 | # the kernel so that they get loaded into the system trusted keyring during | ||
130 | # boot. | ||
145 | # | 131 | # |
146 | # Pull the signing certificate and any extra certificates into the kernel | 132 | # We look in the source root and the build root for all files whose name ends |
133 | # in ".x509". Unfortunately, this will generate duplicate filenames, so we | ||
134 | # have make canonicalise the pathnames and then sort them to discard the | ||
135 | # duplicates. | ||
147 | # | 136 | # |
137 | ############################################################################### | ||
138 | ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) | ||
139 | X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) | ||
140 | X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509 | ||
141 | X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ | ||
142 | $(or $(realpath $(CERT)),$(CERT)))) | ||
143 | X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw)) | ||
144 | |||
145 | ifeq ($(X509_CERTIFICATES),) | ||
146 | $(warning *** No X.509 certificates found ***) | ||
147 | endif | ||
148 | 148 | ||
149 | quiet_cmd_touch = TOUCH $@ | 149 | ifneq ($(wildcard $(obj)/.x509.list),) |
150 | cmd_touch = touch $@ | 150 | ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) |
151 | $(info X.509 certificate list changed) | ||
152 | $(shell rm $(obj)/.x509.list) | ||
153 | endif | ||
154 | endif | ||
155 | |||
156 | kernel/system_certificates.o: $(obj)/x509_certificate_list | ||
157 | |||
158 | quiet_cmd_x509certs = CERTS $@ | ||
159 | cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") | ||
151 | 160 | ||
152 | extra_certificates: | 161 | targets += $(obj)/x509_certificate_list |
153 | $(call cmd,touch) | 162 | $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list |
163 | $(call if_changed,x509certs) | ||
154 | 164 | ||
155 | kernel/modsign_certificate.o: signing_key.x509 extra_certificates | 165 | targets += $(obj)/.x509.list |
166 | $(obj)/.x509.list: | ||
167 | @echo $(X509_CERTIFICATES) >$@ | ||
168 | endif | ||
169 | |||
170 | clean-files := x509_certificate_list .x509.list | ||
156 | 171 | ||
172 | ifeq ($(CONFIG_MODULE_SIG),y) | ||
157 | ############################################################################### | 173 | ############################################################################### |
158 | # | 174 | # |
159 | # If module signing is requested, say by allyesconfig, but a key has not been | 175 | # If module signing is requested, say by allyesconfig, but a key has not been |
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..9fd4246b04b8 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,8 @@ | |||
10 | #include <linux/mmzone.h> | 10 | #include <linux/mmzone.h> |
11 | #include <linux/kbuild.h> | 11 | #include <linux/kbuild.h> |
12 | #include <linux/page_cgroup.h> | 12 | #include <linux/page_cgroup.h> |
13 | #include <linux/log2.h> | ||
14 | #include <linux/spinlock_types.h> | ||
13 | 15 | ||
14 | void foo(void) | 16 | void foo(void) |
15 | { | 17 | { |
@@ -17,5 +19,9 @@ void foo(void) | |||
17 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); | 19 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); |
18 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); | 20 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); |
19 | DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); | 21 | DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); |
22 | #ifdef CONFIG_SMP | ||
23 | DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); | ||
24 | #endif | ||
25 | DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); | ||
20 | /* End of constants */ | 26 | /* End of constants */ |
21 | } | 27 | } |
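The two new DEFINE() lines above export ilog2(CONFIG_NR_CPUS) and sizeof(spinlock_t) as build-time constants. A standalone sketch of what the first of those computes, assuming an example CONFIG_NR_CPUS of 256; in the kernel, ilog2() from <linux/log2.h> folds to a constant at compile time and DEFINE() emits the value for kbuild to scrape:

    #include <stdio.h>

    #define CONFIG_NR_CPUS 256                      /* assumed example .config value */

    static unsigned int ilog2_demo(unsigned long n) /* floor(log2(n)) */
    {
            unsigned int bits = 0;

            while (n >>= 1)
                    bits++;
            return bits;
    }

    int main(void)
    {
            /* NR_CPUS_BITS: how many bits it takes to store a CPU number,
             * which lets a CPU id be packed into a bitfield next to other
             * data.  SPINLOCK_SIZE is simply sizeof(spinlock_t), which is
             * architecture- and config-dependent. */
            printf("NR_CPUS_BITS = %u\n", ilog2_demo(CONFIG_NR_CPUS));  /* prints 8 */
            return 0;
    }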
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8bd9cfdc70d7..bc1dcabe9217 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex); | |||
90 | static DEFINE_MUTEX(cgroup_root_mutex); | 90 | static DEFINE_MUTEX(cgroup_root_mutex); |
91 | 91 | ||
92 | /* | 92 | /* |
93 | * cgroup destruction makes heavy use of work items and there can be a lot | ||
94 | * of concurrent destructions. Use a separate workqueue so that cgroup | ||
95 | * destruction work items don't end up filling up max_active of system_wq | ||
96 | * which may lead to deadlock. | ||
97 | */ | ||
98 | static struct workqueue_struct *cgroup_destroy_wq; | ||
99 | |||
100 | /* | ||
93 | * Generate an array of cgroup subsystem pointers. At boot time, this is | 101 | * Generate an array of cgroup subsystem pointers. At boot time, this is |
94 | * populated with the built in subsystems, and modular subsystems are | 102 | * populated with the built in subsystems, and modular subsystems are |
95 | * registered after that. The mutable section of this array is protected by | 103 | * registered after that. The mutable section of this array is protected by |
@@ -125,38 +133,6 @@ struct cfent { | |||
125 | }; | 133 | }; |
126 | 134 | ||
127 | /* | 135 | /* |
128 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when | ||
129 | * cgroup_subsys->use_id != 0. | ||
130 | */ | ||
131 | #define CSS_ID_MAX (65535) | ||
132 | struct css_id { | ||
133 | /* | ||
134 | * The css to which this ID points. This pointer is set to valid value | ||
135 | * after cgroup is populated. If cgroup is removed, this will be NULL. | ||
136 | * This pointer is expected to be RCU-safe because destroy() | ||
137 | * is called after synchronize_rcu(). But for safe use, css_tryget() | ||
138 | * should be used for avoiding race. | ||
139 | */ | ||
140 | struct cgroup_subsys_state __rcu *css; | ||
141 | /* | ||
142 | * ID of this css. | ||
143 | */ | ||
144 | unsigned short id; | ||
145 | /* | ||
146 | * Depth in hierarchy which this ID belongs to. | ||
147 | */ | ||
148 | unsigned short depth; | ||
149 | /* | ||
150 | * ID is freed by RCU. (and lookup routine is RCU safe.) | ||
151 | */ | ||
152 | struct rcu_head rcu_head; | ||
153 | /* | ||
154 | * Hierarchy of CSS ID belongs to. | ||
155 | */ | ||
156 | unsigned short stack[0]; /* Array of Length (depth+1) */ | ||
157 | }; | ||
158 | |||
159 | /* | ||
160 | * cgroup_event represents events which userspace want to receive. | 136 | * cgroup_event represents events which userspace want to receive. |
161 | */ | 137 | */ |
162 | struct cgroup_event { | 138 | struct cgroup_event { |
@@ -223,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp); | |||
223 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 199 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
224 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 200 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], |
225 | bool is_add); | 201 | bool is_add); |
202 | static int cgroup_file_release(struct inode *inode, struct file *file); | ||
226 | 203 | ||
227 | /** | 204 | /** |
228 | * cgroup_css - obtain a cgroup's css for the specified subsystem | 205 | * cgroup_css - obtain a cgroup's css for the specified subsystem |
@@ -387,9 +364,6 @@ struct cgrp_cset_link { | |||
387 | static struct css_set init_css_set; | 364 | static struct css_set init_css_set; |
388 | static struct cgrp_cset_link init_cgrp_cset_link; | 365 | static struct cgrp_cset_link init_cgrp_cset_link; |
389 | 366 | ||
390 | static int cgroup_init_idr(struct cgroup_subsys *ss, | ||
391 | struct cgroup_subsys_state *css); | ||
392 | |||
393 | /* | 367 | /* |
394 | * css_set_lock protects the list of css_set objects, and the chain of | 368 | * css_set_lock protects the list of css_set objects, and the chain of |
395 | * tasks off each css_set. Nests outside task->alloc_lock due to | 369 | * tasks off each css_set. Nests outside task->alloc_lock due to |
@@ -841,8 +815,6 @@ static struct backing_dev_info cgroup_backing_dev_info = { | |||
841 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 815 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
842 | }; | 816 | }; |
843 | 817 | ||
844 | static int alloc_css_id(struct cgroup_subsys_state *child_css); | ||
845 | |||
846 | static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | 818 | static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) |
847 | { | 819 | { |
848 | struct inode *inode = new_inode(sb); | 820 | struct inode *inode = new_inode(sb); |
@@ -908,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head) | |||
908 | struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); | 880 | struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); |
909 | 881 | ||
910 | INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); | 882 | INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); |
911 | schedule_work(&cgrp->destroy_work); | 883 | queue_work(cgroup_destroy_wq, &cgrp->destroy_work); |
912 | } | 884 | } |
913 | 885 | ||
914 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 886 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
@@ -918,6 +890,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
918 | struct cgroup *cgrp = dentry->d_fsdata; | 890 | struct cgroup *cgrp = dentry->d_fsdata; |
919 | 891 | ||
920 | BUG_ON(!(cgroup_is_dead(cgrp))); | 892 | BUG_ON(!(cgroup_is_dead(cgrp))); |
893 | |||
894 | /* | ||
895 | * XXX: cgrp->id is only used to look up css's. As cgroup | ||
896 | * and css's lifetimes will be decoupled, it should be made | ||
897 | * per-subsystem and moved to css->id so that lookups are | ||
898 | * successful until the target css is released. | ||
899 | */ | ||
900 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | ||
901 | cgrp->id = -1; | ||
902 | |||
921 | call_rcu(&cgrp->rcu_head, cgroup_free_rcu); | 903 | call_rcu(&cgrp->rcu_head, cgroup_free_rcu); |
922 | } else { | 904 | } else { |
923 | struct cfent *cfe = __d_cfe(dentry); | 905 | struct cfent *cfe = __d_cfe(dentry); |
@@ -932,11 +914,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
932 | iput(inode); | 914 | iput(inode); |
933 | } | 915 | } |
934 | 916 | ||
935 | static int cgroup_delete(const struct dentry *d) | ||
936 | { | ||
937 | return 1; | ||
938 | } | ||
939 | |||
940 | static void remove_dir(struct dentry *d) | 917 | static void remove_dir(struct dentry *d) |
941 | { | 918 | { |
942 | struct dentry *parent = dget(d->d_parent); | 919 | struct dentry *parent = dget(d->d_parent); |
@@ -1523,7 +1500,7 @@ static int cgroup_get_rootdir(struct super_block *sb) | |||
1523 | { | 1500 | { |
1524 | static const struct dentry_operations cgroup_dops = { | 1501 | static const struct dentry_operations cgroup_dops = { |
1525 | .d_iput = cgroup_diput, | 1502 | .d_iput = cgroup_diput, |
1526 | .d_delete = cgroup_delete, | 1503 | .d_delete = always_delete_dentry, |
1527 | }; | 1504 | }; |
1528 | 1505 | ||
1529 | struct inode *inode = | 1506 | struct inode *inode = |
@@ -2463,7 +2440,7 @@ static const struct file_operations cgroup_seqfile_operations = { | |||
2463 | .read = seq_read, | 2440 | .read = seq_read, |
2464 | .write = cgroup_file_write, | 2441 | .write = cgroup_file_write, |
2465 | .llseek = seq_lseek, | 2442 | .llseek = seq_lseek, |
2466 | .release = single_release, | 2443 | .release = cgroup_file_release, |
2467 | }; | 2444 | }; |
2468 | 2445 | ||
2469 | static int cgroup_file_open(struct inode *inode, struct file *file) | 2446 | static int cgroup_file_open(struct inode *inode, struct file *file) |
@@ -2524,6 +2501,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file) | |||
2524 | ret = cft->release(inode, file); | 2501 | ret = cft->release(inode, file); |
2525 | if (css->ss) | 2502 | if (css->ss) |
2526 | css_put(css); | 2503 | css_put(css); |
2504 | if (file->f_op == &cgroup_seqfile_operations) | ||
2505 | single_release(inode, file); | ||
2527 | return ret; | 2506 | return ret; |
2528 | } | 2507 | } |
2529 | 2508 | ||
@@ -4240,21 +4219,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) | |||
4240 | goto err; | 4219 | goto err; |
4241 | } | 4220 | } |
4242 | } | 4221 | } |
4243 | |||
4244 | /* This cgroup is ready now */ | ||
4245 | for_each_root_subsys(cgrp->root, ss) { | ||
4246 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
4247 | struct css_id *id = rcu_dereference_protected(css->id, true); | ||
4248 | |||
4249 | /* | ||
4250 | * Update id->css pointer and make this css visible from | ||
4251 | * CSS ID functions. This pointer will be dereferened | ||
4252 | * from RCU-read-side without locks. | ||
4253 | */ | ||
4254 | if (id) | ||
4255 | rcu_assign_pointer(id->css, css); | ||
4256 | } | ||
4257 | |||
4258 | return 0; | 4222 | return 0; |
4259 | err: | 4223 | err: |
4260 | cgroup_clear_dir(cgrp, subsys_mask); | 4224 | cgroup_clear_dir(cgrp, subsys_mask); |
@@ -4306,7 +4270,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) | |||
4306 | * css_put(). dput() requires process context which we don't have. | 4270 | * css_put(). dput() requires process context which we don't have. |
4307 | */ | 4271 | */ |
4308 | INIT_WORK(&css->destroy_work, css_free_work_fn); | 4272 | INIT_WORK(&css->destroy_work, css_free_work_fn); |
4309 | schedule_work(&css->destroy_work); | 4273 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
4310 | } | 4274 | } |
4311 | 4275 | ||
4312 | static void css_release(struct percpu_ref *ref) | 4276 | static void css_release(struct percpu_ref *ref) |
@@ -4314,6 +4278,7 @@ static void css_release(struct percpu_ref *ref) | |||
4314 | struct cgroup_subsys_state *css = | 4278 | struct cgroup_subsys_state *css = |
4315 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4279 | container_of(ref, struct cgroup_subsys_state, refcnt); |
4316 | 4280 | ||
4281 | rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); | ||
4317 | call_rcu(&css->rcu_head, css_free_rcu_fn); | 4282 | call_rcu(&css->rcu_head, css_free_rcu_fn); |
4318 | } | 4283 | } |
4319 | 4284 | ||
@@ -4323,7 +4288,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, | |||
4323 | css->cgroup = cgrp; | 4288 | css->cgroup = cgrp; |
4324 | css->ss = ss; | 4289 | css->ss = ss; |
4325 | css->flags = 0; | 4290 | css->flags = 0; |
4326 | css->id = NULL; | ||
4327 | 4291 | ||
4328 | if (cgrp->parent) | 4292 | if (cgrp->parent) |
4329 | css->parent = cgroup_css(cgrp->parent, ss); | 4293 | css->parent = cgroup_css(cgrp->parent, ss); |
@@ -4455,12 +4419,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4455 | goto err_free_all; | 4419 | goto err_free_all; |
4456 | 4420 | ||
4457 | init_css(css, ss, cgrp); | 4421 | init_css(css, ss, cgrp); |
4458 | |||
4459 | if (ss->use_id) { | ||
4460 | err = alloc_css_id(css); | ||
4461 | if (err) | ||
4462 | goto err_free_all; | ||
4463 | } | ||
4464 | } | 4422 | } |
4465 | 4423 | ||
4466 | /* | 4424 | /* |
@@ -4479,14 +4437,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4479 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4437 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); |
4480 | root->number_of_cgroups++; | 4438 | root->number_of_cgroups++; |
4481 | 4439 | ||
4482 | /* each css holds a ref to the cgroup's dentry and the parent css */ | ||
4483 | for_each_root_subsys(root, ss) { | ||
4484 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
4485 | |||
4486 | dget(dentry); | ||
4487 | css_get(css->parent); | ||
4488 | } | ||
4489 | |||
4490 | /* hold a ref to the parent's dentry */ | 4440 | /* hold a ref to the parent's dentry */ |
4491 | dget(parent->dentry); | 4441 | dget(parent->dentry); |
4492 | 4442 | ||
@@ -4498,6 +4448,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4498 | if (err) | 4448 | if (err) |
4499 | goto err_destroy; | 4449 | goto err_destroy; |
4500 | 4450 | ||
4451 | /* each css holds a ref to the cgroup's dentry and parent css */ | ||
4452 | dget(dentry); | ||
4453 | css_get(css->parent); | ||
4454 | |||
4455 | /* mark it consumed for error path */ | ||
4456 | css_ar[ss->subsys_id] = NULL; | ||
4457 | |||
4501 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | 4458 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && |
4502 | parent->parent) { | 4459 | parent->parent) { |
4503 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", | 4460 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", |
@@ -4544,6 +4501,14 @@ err_free_cgrp: | |||
4544 | return err; | 4501 | return err; |
4545 | 4502 | ||
4546 | err_destroy: | 4503 | err_destroy: |
4504 | for_each_root_subsys(root, ss) { | ||
4505 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
4506 | |||
4507 | if (css) { | ||
4508 | percpu_ref_cancel_init(&css->refcnt); | ||
4509 | ss->css_free(css); | ||
4510 | } | ||
4511 | } | ||
4547 | cgroup_destroy_locked(cgrp); | 4512 | cgroup_destroy_locked(cgrp); |
4548 | mutex_unlock(&cgroup_mutex); | 4513 | mutex_unlock(&cgroup_mutex); |
4549 | mutex_unlock(&dentry->d_inode->i_mutex); | 4514 | mutex_unlock(&dentry->d_inode->i_mutex); |
@@ -4603,7 +4568,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) | |||
4603 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4568 | container_of(ref, struct cgroup_subsys_state, refcnt); |
4604 | 4569 | ||
4605 | INIT_WORK(&css->destroy_work, css_killed_work_fn); | 4570 | INIT_WORK(&css->destroy_work, css_killed_work_fn); |
4606 | schedule_work(&css->destroy_work); | 4571 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
4607 | } | 4572 | } |
4608 | 4573 | ||
4609 | /** | 4574 | /** |
@@ -4705,8 +4670,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4705 | * will be invoked to perform the rest of destruction once the | 4670 | * will be invoked to perform the rest of destruction once the |
4706 | * percpu refs of all css's are confirmed to be killed. | 4671 | * percpu refs of all css's are confirmed to be killed. |
4707 | */ | 4672 | */ |
4708 | for_each_root_subsys(cgrp->root, ss) | 4673 | for_each_root_subsys(cgrp->root, ss) { |
4709 | kill_css(cgroup_css(cgrp, ss)); | 4674 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); |
4675 | |||
4676 | if (css) | ||
4677 | kill_css(css); | ||
4678 | } | ||
4710 | 4679 | ||
4711 | /* | 4680 | /* |
4712 | * Mark @cgrp dead. This prevents further task migration and child | 4681 | * Mark @cgrp dead. This prevents further task migration and child |
@@ -4775,14 +4744,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) | |||
4775 | /* delete this cgroup from parent->children */ | 4744 | /* delete this cgroup from parent->children */ |
4776 | list_del_rcu(&cgrp->sibling); | 4745 | list_del_rcu(&cgrp->sibling); |
4777 | 4746 | ||
4778 | /* | ||
4779 | * We should remove the cgroup object from idr before its grace | ||
4780 | * period starts, so we won't be looking up a cgroup while the | ||
4781 | * cgroup is being freed. | ||
4782 | */ | ||
4783 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | ||
4784 | cgrp->id = -1; | ||
4785 | |||
4786 | dput(d); | 4747 | dput(d); |
4787 | 4748 | ||
4788 | set_bit(CGRP_RELEASABLE, &parent->flags); | 4749 | set_bit(CGRP_RELEASABLE, &parent->flags); |
@@ -4925,12 +4886,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4925 | 4886 | ||
4926 | /* our new subsystem will be attached to the dummy hierarchy. */ | 4887 | /* our new subsystem will be attached to the dummy hierarchy. */ |
4927 | init_css(css, ss, cgroup_dummy_top); | 4888 | init_css(css, ss, cgroup_dummy_top); |
4928 | /* init_idr must be after init_css() because it sets css->id. */ | ||
4929 | if (ss->use_id) { | ||
4930 | ret = cgroup_init_idr(ss, css); | ||
4931 | if (ret) | ||
4932 | goto err_unload; | ||
4933 | } | ||
4934 | 4889 | ||
4935 | /* | 4890 | /* |
4936 | * Now we need to entangle the css into the existing css_sets. unlike | 4891 | * Now we need to entangle the css into the existing css_sets. unlike |
@@ -4996,9 +4951,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4996 | 4951 | ||
4997 | offline_css(cgroup_css(cgroup_dummy_top, ss)); | 4952 | offline_css(cgroup_css(cgroup_dummy_top, ss)); |
4998 | 4953 | ||
4999 | if (ss->use_id) | ||
5000 | idr_destroy(&ss->idr); | ||
5001 | |||
5002 | /* deassign the subsys_id */ | 4954 | /* deassign the subsys_id */ |
5003 | cgroup_subsys[ss->subsys_id] = NULL; | 4955 | cgroup_subsys[ss->subsys_id] = NULL; |
5004 | 4956 | ||
@@ -5025,8 +4977,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
5025 | /* | 4977 | /* |
5026 | * remove subsystem's css from the cgroup_dummy_top and free it - | 4978 | * remove subsystem's css from the cgroup_dummy_top and free it - |
5027 | * need to free before marking as null because ss->css_free needs | 4979 | * need to free before marking as null because ss->css_free needs |
5028 | * the cgrp->subsys pointer to find their state. note that this | 4980 | * the cgrp->subsys pointer to find their state. |
5029 | * also takes care of freeing the css_id. | ||
5030 | */ | 4981 | */ |
5031 | ss->css_free(cgroup_css(cgroup_dummy_top, ss)); | 4982 | ss->css_free(cgroup_css(cgroup_dummy_top, ss)); |
5032 | RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); | 4983 | RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); |
@@ -5097,8 +5048,6 @@ int __init cgroup_init(void) | |||
5097 | for_each_builtin_subsys(ss, i) { | 5048 | for_each_builtin_subsys(ss, i) { |
5098 | if (!ss->early_init) | 5049 | if (!ss->early_init) |
5099 | cgroup_init_subsys(ss); | 5050 | cgroup_init_subsys(ss); |
5100 | if (ss->use_id) | ||
5101 | cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); | ||
5102 | } | 5051 | } |
5103 | 5052 | ||
5104 | /* allocate id for the dummy hierarchy */ | 5053 | /* allocate id for the dummy hierarchy */ |
@@ -5139,6 +5088,22 @@ out: | |||
5139 | return err; | 5088 | return err; |
5140 | } | 5089 | } |
5141 | 5090 | ||
5091 | static int __init cgroup_wq_init(void) | ||
5092 | { | ||
5093 | /* | ||
5094 | * There isn't much point in executing destruction path in | ||
5095 | * parallel. Good chunk is serialized with cgroup_mutex anyway. | ||
5096 | * Use 1 for @max_active. | ||
5097 | * | ||
5098 | * We would prefer to do this in cgroup_init() above, but that | ||
5099 | * is called before init_workqueues(): so leave this until after. | ||
5100 | */ | ||
5101 | cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); | ||
5102 | BUG_ON(!cgroup_destroy_wq); | ||
5103 | return 0; | ||
5104 | } | ||
5105 | core_initcall(cgroup_wq_init); | ||
5106 | |||
5142 | /* | 5107 | /* |
5143 | * proc_cgroup_show() | 5108 | * proc_cgroup_show() |
5144 | * - Print task's cgroup paths into seq_file, one line for each hierarchy | 5109 | * - Print task's cgroup paths into seq_file, one line for each hierarchy |
@@ -5518,181 +5483,6 @@ static int __init cgroup_disable(char *str) | |||
5518 | } | 5483 | } |
5519 | __setup("cgroup_disable=", cgroup_disable); | 5484 | __setup("cgroup_disable=", cgroup_disable); |
5520 | 5485 | ||
5521 | /* | ||
5522 | * Functons for CSS ID. | ||
5523 | */ | ||
5524 | |||
5525 | /* to get ID other than 0, this should be called when !cgroup_is_dead() */ | ||
5526 | unsigned short css_id(struct cgroup_subsys_state *css) | ||
5527 | { | ||
5528 | struct css_id *cssid; | ||
5529 | |||
5530 | /* | ||
5531 | * This css_id() can return correct value when somone has refcnt | ||
5532 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | ||
5533 | * it's unchanged until freed. | ||
5534 | */ | ||
5535 | cssid = rcu_dereference_raw(css->id); | ||
5536 | |||
5537 | if (cssid) | ||
5538 | return cssid->id; | ||
5539 | return 0; | ||
5540 | } | ||
5541 | EXPORT_SYMBOL_GPL(css_id); | ||
5542 | |||
5543 | /** | ||
5544 | * css_is_ancestor - test "root" css is an ancestor of "child" | ||
5545 | * @child: the css to be tested. | ||
5546 | * @root: the css supporsed to be an ancestor of the child. | ||
5547 | * | ||
5548 | * Returns true if "root" is an ancestor of "child" in its hierarchy. Because | ||
5549 | * this function reads css->id, the caller must hold rcu_read_lock(). | ||
5550 | * But, considering usual usage, the csses should be valid objects after test. | ||
5551 | * Assuming that the caller will do some action to the child if this returns | ||
5552 | * returns true, the caller must take "child";s reference count. | ||
5553 | * If "child" is valid object and this returns true, "root" is valid, too. | ||
5554 | */ | ||
5555 | |||
5556 | bool css_is_ancestor(struct cgroup_subsys_state *child, | ||
5557 | const struct cgroup_subsys_state *root) | ||
5558 | { | ||
5559 | struct css_id *child_id; | ||
5560 | struct css_id *root_id; | ||
5561 | |||
5562 | child_id = rcu_dereference(child->id); | ||
5563 | if (!child_id) | ||
5564 | return false; | ||
5565 | root_id = rcu_dereference(root->id); | ||
5566 | if (!root_id) | ||
5567 | return false; | ||
5568 | if (child_id->depth < root_id->depth) | ||
5569 | return false; | ||
5570 | if (child_id->stack[root_id->depth] != root_id->id) | ||
5571 | return false; | ||
5572 | return true; | ||
5573 | } | ||
5574 | |||
5575 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | ||
5576 | { | ||
5577 | struct css_id *id = rcu_dereference_protected(css->id, true); | ||
5578 | |||
5579 | /* When this is called before css_id initialization, id can be NULL */ | ||
5580 | if (!id) | ||
5581 | return; | ||
5582 | |||
5583 | BUG_ON(!ss->use_id); | ||
5584 | |||
5585 | rcu_assign_pointer(id->css, NULL); | ||
5586 | rcu_assign_pointer(css->id, NULL); | ||
5587 | spin_lock(&ss->id_lock); | ||
5588 | idr_remove(&ss->idr, id->id); | ||
5589 | spin_unlock(&ss->id_lock); | ||
5590 | kfree_rcu(id, rcu_head); | ||
5591 | } | ||
5592 | EXPORT_SYMBOL_GPL(free_css_id); | ||
5593 | |||
5594 | /* | ||
5595 | * This is called by init or create(). Then, calls to this function are | ||
5596 | * always serialized (By cgroup_mutex() at create()). | ||
5597 | */ | ||
5598 | |||
5599 | static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | ||
5600 | { | ||
5601 | struct css_id *newid; | ||
5602 | int ret, size; | ||
5603 | |||
5604 | BUG_ON(!ss->use_id); | ||
5605 | |||
5606 | size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); | ||
5607 | newid = kzalloc(size, GFP_KERNEL); | ||
5608 | if (!newid) | ||
5609 | return ERR_PTR(-ENOMEM); | ||
5610 | |||
5611 | idr_preload(GFP_KERNEL); | ||
5612 | spin_lock(&ss->id_lock); | ||
5613 | /* Don't use 0. allocates an ID of 1-65535 */ | ||
5614 | ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); | ||
5615 | spin_unlock(&ss->id_lock); | ||
5616 | idr_preload_end(); | ||
5617 | |||
5618 | /* Returns error when there are no free spaces for new ID.*/ | ||
5619 | if (ret < 0) | ||
5620 | goto err_out; | ||
5621 | |||
5622 | newid->id = ret; | ||
5623 | newid->depth = depth; | ||
5624 | return newid; | ||
5625 | err_out: | ||
5626 | kfree(newid); | ||
5627 | return ERR_PTR(ret); | ||
5628 | |||
5629 | } | ||
5630 | |||
5631 | static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, | ||
5632 | struct cgroup_subsys_state *rootcss) | ||
5633 | { | ||
5634 | struct css_id *newid; | ||
5635 | |||
5636 | spin_lock_init(&ss->id_lock); | ||
5637 | idr_init(&ss->idr); | ||
5638 | |||
5639 | newid = get_new_cssid(ss, 0); | ||
5640 | if (IS_ERR(newid)) | ||
5641 | return PTR_ERR(newid); | ||
5642 | |||
5643 | newid->stack[0] = newid->id; | ||
5644 | RCU_INIT_POINTER(newid->css, rootcss); | ||
5645 | RCU_INIT_POINTER(rootcss->id, newid); | ||
5646 | return 0; | ||
5647 | } | ||
5648 | |||
5649 | static int alloc_css_id(struct cgroup_subsys_state *child_css) | ||
5650 | { | ||
5651 | struct cgroup_subsys_state *parent_css = css_parent(child_css); | ||
5652 | struct css_id *child_id, *parent_id; | ||
5653 | int i, depth; | ||
5654 | |||
5655 | parent_id = rcu_dereference_protected(parent_css->id, true); | ||
5656 | depth = parent_id->depth + 1; | ||
5657 | |||
5658 | child_id = get_new_cssid(child_css->ss, depth); | ||
5659 | if (IS_ERR(child_id)) | ||
5660 | return PTR_ERR(child_id); | ||
5661 | |||
5662 | for (i = 0; i < depth; i++) | ||
5663 | child_id->stack[i] = parent_id->stack[i]; | ||
5664 | child_id->stack[depth] = child_id->id; | ||
5665 | /* | ||
5666 | * child_id->css pointer will be set after this cgroup is available | ||
5667 | * see cgroup_populate_dir() | ||
5668 | */ | ||
5669 | rcu_assign_pointer(child_css->id, child_id); | ||
5670 | |||
5671 | return 0; | ||
5672 | } | ||
5673 | |||
5674 | /** | ||
5675 | * css_lookup - lookup css by id | ||
5676 | * @ss: cgroup subsys to be looked into. | ||
5677 | * @id: the id | ||
5678 | * | ||
5679 | * Returns pointer to cgroup_subsys_state if there is valid one with id. | ||
5680 | * NULL if not. Should be called under rcu_read_lock() | ||
5681 | */ | ||
5682 | struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) | ||
5683 | { | ||
5684 | struct css_id *cssid = NULL; | ||
5685 | |||
5686 | BUG_ON(!ss->use_id); | ||
5687 | cssid = idr_find(&ss->idr, id); | ||
5688 | |||
5689 | if (unlikely(!cssid)) | ||
5690 | return NULL; | ||
5691 | |||
5692 | return rcu_dereference(cssid->css); | ||
5693 | } | ||
5694 | EXPORT_SYMBOL_GPL(css_lookup); | ||
5695 | |||
5696 | /** | 5486 | /** |
5697 | * css_from_dir - get corresponding css from the dentry of a cgroup dir | 5487 | * css_from_dir - get corresponding css from the dentry of a cgroup dir |
5698 | * @dentry: directory dentry of interest | 5488 | * @dentry: directory dentry of interest |
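Several cgroup.c hunks above swap schedule_work() for queue_work(cgroup_destroy_wq, ...) and add cgroup_wq_init(); per the added comment, a burst of concurrent destructions could otherwise fill system_wq's max_active and deadlock. A minimal sketch of that pattern with hypothetical names (my_destroy_wq, my_object, my_release and my_schedule_destroy are illustrative, not cgroup code):

    #include <linux/errno.h>
    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *my_destroy_wq;

    struct my_object {
            struct work_struct destroy_work;
            /* ... payload ... */
    };

    static void my_release(struct work_struct *work)
    {
            struct my_object *obj = container_of(work, struct my_object, destroy_work);

            kfree(obj);                     /* slow/blocking teardown runs in the worker */
    }

    static void my_schedule_destroy(struct my_object *obj)
    {
            INIT_WORK(&obj->destroy_work, my_release);
            queue_work(my_destroy_wq, &obj->destroy_work);  /* not schedule_work() */
    }

    static int __init my_wq_init(void)
    {
            /* max_active = 1: destruction is largely serialized anyway, and a
             * private queue keeps teardown bursts from exhausting system_wq. */
            my_destroy_wq = alloc_workqueue("my_destroy", 0, 1);
            return my_destroy_wq ? 0 : -ENOMEM;
    }
    core_initcall(my_wq_init);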
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 859c8dfd78a1..e5f3917aa05b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void) | |||
120 | * instead of preempt_schedule() to exit user context if needed before | 120 | * instead of preempt_schedule() to exit user context if needed before |
121 | * calling the scheduler. | 121 | * calling the scheduler. |
122 | */ | 122 | */ |
123 | void __sched notrace preempt_schedule_context(void) | 123 | asmlinkage void __sched notrace preempt_schedule_context(void) |
124 | { | 124 | { |
125 | enum ctx_state prev_ctx; | 125 | enum ctx_state prev_ctx; |
126 | 126 | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d7f07a2da5a6..deff2e693766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -306,8 +306,28 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
306 | __func__, cpu); | 306 | __func__, cpu); |
307 | goto out_release; | 307 | goto out_release; |
308 | } | 308 | } |
309 | |||
310 | /* | ||
311 | * By now we've cleared cpu_active_mask, wait for all preempt-disabled | ||
312 | * and RCU users of this state to go away such that all new such users | ||
313 | * will observe it. | ||
314 | * | ||
315 | * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might | ||
316 | * not imply sync_sched(), so explicitly call both. | ||
317 | * | ||
318 | * Do sync before park smpboot threads to take care the rcu boost case. | ||
319 | */ | ||
320 | #ifdef CONFIG_PREEMPT | ||
321 | synchronize_sched(); | ||
322 | #endif | ||
323 | synchronize_rcu(); | ||
324 | |||
309 | smpboot_park_threads(cpu); | 325 | smpboot_park_threads(cpu); |
310 | 326 | ||
327 | /* | ||
328 | * So now all preempt/rcu users must observe !cpu_active(). | ||
329 | */ | ||
330 | |||
311 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); | 331 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); |
312 | if (err) { | 332 | if (err) { |
313 | /* CPU didn't die: tell everyone. Can't complain. */ | 333 | /* CPU didn't die: tell everyone. Can't complain. */ |
@@ -420,11 +440,6 @@ int cpu_up(unsigned int cpu) | |||
420 | { | 440 | { |
421 | int err = 0; | 441 | int err = 0; |
422 | 442 | ||
423 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
424 | int nid; | ||
425 | pg_data_t *pgdat; | ||
426 | #endif | ||
427 | |||
428 | if (!cpu_possible(cpu)) { | 443 | if (!cpu_possible(cpu)) { |
429 | printk(KERN_ERR "can't online cpu %d because it is not " | 444 | printk(KERN_ERR "can't online cpu %d because it is not " |
430 | "configured as may-hotadd at boot time\n", cpu); | 445 | "configured as may-hotadd at boot time\n", cpu); |
@@ -435,27 +450,9 @@ int cpu_up(unsigned int cpu) | |||
435 | return -EINVAL; | 450 | return -EINVAL; |
436 | } | 451 | } |
437 | 452 | ||
438 | #ifdef CONFIG_MEMORY_HOTPLUG | 453 | err = try_online_node(cpu_to_node(cpu)); |
439 | nid = cpu_to_node(cpu); | 454 | if (err) |
440 | if (!node_online(nid)) { | 455 | return err; |
441 | err = mem_online_node(nid); | ||
442 | if (err) | ||
443 | return err; | ||
444 | } | ||
445 | |||
446 | pgdat = NODE_DATA(nid); | ||
447 | if (!pgdat) { | ||
448 | printk(KERN_ERR | ||
449 | "Can't online cpu %d due to NULL pgdat\n", cpu); | ||
450 | return -ENOMEM; | ||
451 | } | ||
452 | |||
453 | if (pgdat->node_zonelists->_zonerefs->zone == NULL) { | ||
454 | mutex_lock(&zonelists_mutex); | ||
455 | build_all_zonelists(NULL, NULL); | ||
456 | mutex_unlock(&zonelists_mutex); | ||
457 | } | ||
458 | #endif | ||
459 | 456 | ||
460 | cpu_maps_update_begin(); | 457 | cpu_maps_update_begin(); |
461 | 458 | ||
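The first cpu.c hunk above inserts an explicit grace-period wait between clearing cpu_active_mask and parking the smpboot threads. A condensed sketch of just that ordering; the real _cpu_down() wraps it in hotplug locking, notifier calls and error handling, all omitted here:

    #include <linux/rcupdate.h>

    static void sketch_wait_for_cpu_active_readers(void)
    {
            /* The caller has already cleared this CPU from cpu_active_mask. */

            /* Flush every pre-existing preempt-disabled region and RCU
             * read-side critical section, so anything that runs afterwards
             * must observe !cpu_active().  Preemptible RCU's
             * synchronize_rcu() does not imply synchronize_sched(), hence
             * both calls under CONFIG_PREEMPT. */
    #ifdef CONFIG_PREEMPT
            synchronize_sched();
    #endif
            synchronize_rcu();

            /* Only after this does the real code park the per-cpu smpboot
             * threads and run take_cpu_down() via __stop_machine(). */
    }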
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb5..988573a9a387 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void) | |||
44 | rcu_idle_enter(); | 44 | rcu_idle_enter(); |
45 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 45 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
46 | local_irq_enable(); | 46 | local_irq_enable(); |
47 | while (!need_resched()) | 47 | while (!tif_need_resched()) |
48 | cpu_relax(); | 48 | cpu_relax(); |
49 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 49 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
50 | rcu_idle_exit(); | 50 | rcu_idle_exit(); |
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void) | |||
92 | if (cpu_idle_force_poll || tick_check_broadcast_expired()) { | 92 | if (cpu_idle_force_poll || tick_check_broadcast_expired()) { |
93 | cpu_idle_poll(); | 93 | cpu_idle_poll(); |
94 | } else { | 94 | } else { |
95 | current_clr_polling(); | 95 | if (!current_clr_polling_and_test()) { |
96 | if (!need_resched()) { | ||
97 | stop_critical_timings(); | 96 | stop_critical_timings(); |
98 | rcu_idle_enter(); | 97 | rcu_idle_enter(); |
99 | arch_cpu_idle(); | 98 | arch_cpu_idle(); |
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void) | |||
103 | } else { | 102 | } else { |
104 | local_irq_enable(); | 103 | local_irq_enable(); |
105 | } | 104 | } |
106 | current_set_polling(); | 105 | __current_set_polling(); |
107 | } | 106 | } |
108 | arch_cpu_idle_exit(); | 107 | arch_cpu_idle_exit(); |
108 | /* | ||
109 | * We need to test and propagate the TIF_NEED_RESCHED | ||
110 | * bit here because we might not have send the | ||
111 | * reschedule IPI to idle tasks. | ||
112 | */ | ||
113 | if (tif_need_resched()) | ||
114 | set_preempt_need_resched(); | ||
109 | } | 115 | } |
110 | tick_nohz_idle_exit(); | 116 | tick_nohz_idle_exit(); |
111 | schedule_preempt_disabled(); | 117 | schedule_preempt_disabled(); |
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
129 | */ | 135 | */ |
130 | boot_init_stack_canary(); | 136 | boot_init_stack_canary(); |
131 | #endif | 137 | #endif |
132 | current_set_polling(); | 138 | __current_set_polling(); |
133 | arch_cpu_idle_prepare(); | 139 | arch_cpu_idle_prepare(); |
134 | cpu_idle_loop(); | 140 | cpu_idle_loop(); |
135 | } | 141 | } |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6bf981e13c43..4772034b4b17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
1033 | need_loop = task_has_mempolicy(tsk) || | 1033 | need_loop = task_has_mempolicy(tsk) || |
1034 | !nodes_intersects(*newmems, tsk->mems_allowed); | 1034 | !nodes_intersects(*newmems, tsk->mems_allowed); |
1035 | 1035 | ||
1036 | if (need_loop) | 1036 | if (need_loop) { |
1037 | local_irq_disable(); | ||
1037 | write_seqcount_begin(&tsk->mems_allowed_seq); | 1038 | write_seqcount_begin(&tsk->mems_allowed_seq); |
1039 | } | ||
1038 | 1040 | ||
1039 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); | 1041 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); |
1040 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); | 1042 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); |
@@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
1042 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); | 1044 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); |
1043 | tsk->mems_allowed = *newmems; | 1045 | tsk->mems_allowed = *newmems; |
1044 | 1046 | ||
1045 | if (need_loop) | 1047 | if (need_loop) { |
1046 | write_seqcount_end(&tsk->mems_allowed_seq); | 1048 | write_seqcount_end(&tsk->mems_allowed_seq); |
1049 | local_irq_enable(); | ||
1050 | } | ||
1047 | 1051 | ||
1048 | task_unlock(tsk); | 1052 | task_unlock(tsk); |
1049 | } | 1053 | } |
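The cpuset.c hunk above wraps the mems_allowed_seq write side in local_irq_disable()/local_irq_enable(). A sketch of the underlying seqcount pattern, on the assumption that a reader can run from interrupt context on the same CPU: if an interrupt landed between write_seqcount_begin() and write_seqcount_end(), that reader would spin forever on the odd sequence count. my_seq, my_data, my_update() and my_read() are illustrative names, not the cpuset code:

    #include <linux/irqflags.h>
    #include <linux/seqlock.h>

    static seqcount_t my_seq = SEQCNT_ZERO;     /* SEQCNT_ZERO(my_seq) on newer kernels */
    static int my_data;

    static void my_update(int new_val)
    {
            local_irq_disable();                /* keep same-CPU irq readers out */
            write_seqcount_begin(&my_seq);
            my_data = new_val;
            write_seqcount_end(&my_seq);
            local_irq_enable();
    }

    static int my_read(void)                    /* safe even from irq context */
    {
            unsigned int seq;
            int val;

            do {
                    seq = read_seqcount_begin(&my_seq);
                    val = my_data;
            } while (read_seqcount_retry(&my_seq, seq));

            return val;
    }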
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0506d447aed2..7d2f35e5df2f 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -575,8 +575,12 @@ return_normal: | |||
575 | raw_spin_lock(&dbg_slave_lock); | 575 | raw_spin_lock(&dbg_slave_lock); |
576 | 576 | ||
577 | #ifdef CONFIG_SMP | 577 | #ifdef CONFIG_SMP |
578 | /* If send_ready set, slaves are already waiting */ | ||
579 | if (ks->send_ready) | ||
580 | atomic_set(ks->send_ready, 1); | ||
581 | |||
578 | /* Signal the other CPUs to enter kgdb_wait() */ | 582 | /* Signal the other CPUs to enter kgdb_wait() */ |
579 | if ((!kgdb_single_step) && kgdb_do_roundup) | 583 | else if ((!kgdb_single_step) && kgdb_do_roundup) |
580 | kgdb_roundup_cpus(flags); | 584 | kgdb_roundup_cpus(flags); |
581 | #endif | 585 | #endif |
582 | 586 | ||
@@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
678 | if (arch_kgdb_ops.enable_nmi) | 682 | if (arch_kgdb_ops.enable_nmi) |
679 | arch_kgdb_ops.enable_nmi(0); | 683 | arch_kgdb_ops.enable_nmi(0); |
680 | 684 | ||
685 | memset(ks, 0, sizeof(struct kgdb_state)); | ||
681 | ks->cpu = raw_smp_processor_id(); | 686 | ks->cpu = raw_smp_processor_id(); |
682 | ks->ex_vector = evector; | 687 | ks->ex_vector = evector; |
683 | ks->signo = signo; | 688 | ks->signo = signo; |
684 | ks->err_code = ecode; | 689 | ks->err_code = ecode; |
685 | ks->kgdb_usethreadid = 0; | ||
686 | ks->linux_regs = regs; | 690 | ks->linux_regs = regs; |
687 | 691 | ||
688 | if (kgdb_reenter_check(ks)) | 692 | if (kgdb_reenter_check(ks)) |
@@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs) | |||
732 | return 1; | 736 | return 1; |
733 | } | 737 | } |
734 | 738 | ||
739 | int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) | ||
740 | { | ||
741 | #ifdef CONFIG_SMP | ||
742 | if (!kgdb_io_ready(0) || !send_ready) | ||
743 | return 1; | ||
744 | |||
745 | if (kgdb_info[cpu].enter_kgdb == 0) { | ||
746 | struct kgdb_state kgdb_var; | ||
747 | struct kgdb_state *ks = &kgdb_var; | ||
748 | |||
749 | memset(ks, 0, sizeof(struct kgdb_state)); | ||
750 | ks->cpu = cpu; | ||
751 | ks->ex_vector = trapnr; | ||
752 | ks->signo = SIGTRAP; | ||
753 | ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI; | ||
754 | ks->linux_regs = regs; | ||
755 | ks->send_ready = send_ready; | ||
756 | kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); | ||
757 | return 0; | ||
758 | } | ||
759 | #endif | ||
760 | return 1; | ||
761 | } | ||
762 | |||
735 | static void kgdb_console_write(struct console *co, const char *s, | 763 | static void kgdb_console_write(struct console *co, const char *s, |
736 | unsigned count) | 764 | unsigned count) |
737 | { | 765 | { |
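kgdb_nmicallin() is new in this merge: it lets an architecture's system-NMI handler pull the current CPU into the debugger as master while the other CPUs, already stopped in their own NMI handlers, wait for @send_ready and then join via kgdb_nmicallback(). A hedged sketch of how an arch handler might use the pair; the handler, the master election and the names are hypothetical, only the two kgdb calls and the send_ready handshake come from the code above:

    #include <linux/atomic.h>
    #include <linux/kgdb.h>
    #include <linux/types.h>
    #include <asm/processor.h>          /* cpu_relax() */

    static atomic_t my_send_ready = ATOMIC_INIT(0);

    /* Hypothetical: called on every CPU from an arch system-NMI handler. */
    static void my_nmi_debug_entry(int cpu, int trapnr, struct pt_regs *regs, bool master)
    {
            if (master) {
                    /* Enter kgdb as master; debug_core sets *send_ready
                     * instead of sending a roundup IPI, since the slaves are
                     * already sitting in NMI context. */
                    kgdb_nmicallin(cpu, trapnr, regs, &my_send_ready);
            } else {
                    /* Wait for the master's signal, then call in as a slave. */
                    while (atomic_read(&my_send_ready) == 0)
                            cpu_relax();
                    kgdb_nmicallback(cpu, regs);
            }
    }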
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 2235967e78b0..572aa4f5677c 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -26,6 +26,7 @@ struct kgdb_state { | |||
26 | unsigned long threadid; | 26 | unsigned long threadid; |
27 | long kgdb_usethreadid; | 27 | long kgdb_usethreadid; |
28 | struct pt_regs *linux_regs; | 28 | struct pt_regs *linux_regs; |
29 | atomic_t *send_ready; | ||
29 | }; | 30 | }; |
30 | 31 | ||
31 | /* Exception state values */ | 32 | /* Exception state values */ |
@@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks); | |||
74 | extern int kdb_parse(const char *cmdstr); | 75 | extern int kdb_parse(const char *cmdstr); |
75 | extern int kdb_common_init_state(struct kgdb_state *ks); | 76 | extern int kdb_common_init_state(struct kgdb_state *ks); |
76 | extern int kdb_common_deinit_state(void); | 77 | extern int kdb_common_deinit_state(void); |
78 | #define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI | ||
77 | #else /* ! CONFIG_KGDB_KDB */ | 79 | #else /* ! CONFIG_KGDB_KDB */ |
78 | static inline int kdb_stub(struct kgdb_state *ks) | 80 | static inline int kdb_stub(struct kgdb_state *ks) |
79 | { | 81 | { |
80 | return DBG_PASS_EVENT; | 82 | return DBG_PASS_EVENT; |
81 | } | 83 | } |
84 | #define KGDB_KDB_REASON_SYSTEM_NMI 0 | ||
82 | #endif /* CONFIG_KGDB_KDB */ | 85 | #endif /* CONFIG_KGDB_KDB */ |
83 | 86 | ||
84 | #endif /* _DEBUG_CORE_H_ */ | 87 | #endif /* _DEBUG_CORE_H_ */ |
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 328d18ef31e4..8859ca34dcfe 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks) | |||
69 | if (atomic_read(&kgdb_setting_breakpoint)) | 69 | if (atomic_read(&kgdb_setting_breakpoint)) |
70 | reason = KDB_REASON_KEYBOARD; | 70 | reason = KDB_REASON_KEYBOARD; |
71 | 71 | ||
72 | if (in_nmi()) | 72 | if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP) |
73 | reason = KDB_REASON_SYSTEM_NMI; | ||
74 | |||
75 | else if (in_nmi()) | ||
73 | reason = KDB_REASON_NMI; | 76 | reason = KDB_REASON_NMI; |
74 | 77 | ||
75 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { | 78 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 00eb8f7fbf41..0b097c8a1e50 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, | |||
1200 | instruction_pointer(regs)); | 1200 | instruction_pointer(regs)); |
1201 | kdb_dumpregs(regs); | 1201 | kdb_dumpregs(regs); |
1202 | break; | 1202 | break; |
1203 | case KDB_REASON_SYSTEM_NMI: | ||
1204 | kdb_printf("due to System NonMaskable Interrupt\n"); | ||
1205 | break; | ||
1203 | case KDB_REASON_NMI: | 1206 | case KDB_REASON_NMI: |
1204 | kdb_printf("due to NonMaskable Interrupt @ " | 1207 | kdb_printf("due to NonMaskable Interrupt @ " |
1205 | kdb_machreg_fmt "\n", | 1208 | kdb_machreg_fmt "\n", |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index d473988c1d0b..54996b71e66d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -108,12 +108,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | |||
108 | struct timespec ts; | 108 | struct timespec ts; |
109 | cputime_t utime, stime, stimescaled, utimescaled; | 109 | cputime_t utime, stime, stimescaled, utimescaled; |
110 | 110 | ||
111 | /* Though tsk->delays accessed later, early exit avoids | ||
112 | * unnecessary returning of other data | ||
113 | */ | ||
114 | if (!tsk->delays) | ||
115 | goto done; | ||
116 | |||
117 | tmp = (s64)d->cpu_run_real_total; | 111 | tmp = (s64)d->cpu_run_real_total; |
118 | task_cputime(tsk, &utime, &stime); | 112 | task_cputime(tsk, &utime, &stime); |
119 | cputime_to_timespec(utime + stime, &ts); | 113 | cputime_to_timespec(utime + stime, &ts); |
@@ -158,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | |||
158 | d->freepages_count += tsk->delays->freepages_count; | 152 | d->freepages_count += tsk->delays->freepages_count; |
159 | spin_unlock_irqrestore(&tsk->delays->lock, flags); | 153 | spin_unlock_irqrestore(&tsk->delays->lock, flags); |
160 | 154 | ||
161 | done: | ||
162 | return 0; | 155 | return 0; |
163 | } | 156 | } |
164 | 157 | ||
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index ff915efef66d..e556751d15d9 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -1,23 +1,19 @@ | |||
1 | #include <linux/elf.h> | 1 | #include <linux/elf.h> |
2 | #include <linux/fs.h> | 2 | #include <linux/fs.h> |
3 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
4 | 4 | #include <linux/binfmts.h> | |
5 | #include <asm/elf.h> | ||
6 | |||
7 | 5 | ||
8 | Elf_Half __weak elf_core_extra_phdrs(void) | 6 | Elf_Half __weak elf_core_extra_phdrs(void) |
9 | { | 7 | { |
10 | return 0; | 8 | return 0; |
11 | } | 9 | } |
12 | 10 | ||
13 | int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, | 11 | int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) |
14 | unsigned long limit) | ||
15 | { | 12 | { |
16 | return 1; | 13 | return 1; |
17 | } | 14 | } |
18 | 15 | ||
19 | int __weak elf_core_write_extra_data(struct file *file, size_t *size, | 16 | int __weak elf_core_write_extra_data(struct coredump_params *cprm) |
20 | unsigned long limit) | ||
21 | { | 17 | { |
22 | return 1; | 18 | return 1; |
23 | } | 19 | } |
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 953c14348375..f5744010a8d2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | |||
175 | static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | 175 | static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); |
176 | static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; | 176 | static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; |
177 | 177 | ||
178 | static atomic_t perf_sample_allowed_ns __read_mostly = | 178 | static int perf_sample_allowed_ns __read_mostly = |
179 | ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); | 179 | DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; |
180 | 180 | ||
181 | void update_perf_cpu_limits(void) | 181 | void update_perf_cpu_limits(void) |
182 | { | 182 | { |
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void) | |||
184 | 184 | ||
185 | tmp *= sysctl_perf_cpu_time_max_percent; | 185 | tmp *= sysctl_perf_cpu_time_max_percent; |
186 | do_div(tmp, 100); | 186 | do_div(tmp, 100); |
187 | atomic_set(&perf_sample_allowed_ns, tmp); | 187 | ACCESS_ONCE(perf_sample_allowed_ns) = tmp; |
188 | } | 188 | } |
189 | 189 | ||
190 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); | 190 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); |
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, | |||
193 | void __user *buffer, size_t *lenp, | 193 | void __user *buffer, size_t *lenp, |
194 | loff_t *ppos) | 194 | loff_t *ppos) |
195 | { | 195 | { |
196 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | 196 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
197 | 197 | ||
198 | if (ret || !write) | 198 | if (ret || !write) |
199 | return ret; | 199 | return ret; |
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, | |||
228 | * we detect that events are taking too long. | 228 | * we detect that events are taking too long. |
229 | */ | 229 | */ |
230 | #define NR_ACCUMULATED_SAMPLES 128 | 230 | #define NR_ACCUMULATED_SAMPLES 128 |
231 | DEFINE_PER_CPU(u64, running_sample_length); | 231 | static DEFINE_PER_CPU(u64, running_sample_length); |
232 | 232 | ||
233 | void perf_sample_event_took(u64 sample_len_ns) | 233 | void perf_sample_event_took(u64 sample_len_ns) |
234 | { | 234 | { |
235 | u64 avg_local_sample_len; | 235 | u64 avg_local_sample_len; |
236 | u64 local_samples_len; | 236 | u64 local_samples_len; |
237 | u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); | ||
237 | 238 | ||
238 | if (atomic_read(&perf_sample_allowed_ns) == 0) | 239 | if (allowed_ns == 0) |
239 | return; | 240 | return; |
240 | 241 | ||
241 | /* decay the counter by 1 average sample */ | 242 | /* decay the counter by 1 average sample */ |
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns) | |||
251 | */ | 252 | */ |
252 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; | 253 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; |
253 | 254 | ||
254 | if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) | 255 | if (avg_local_sample_len <= allowed_ns) |
255 | return; | 256 | return; |
256 | 257 | ||
257 | if (max_samples_per_tick <= 1) | 258 | if (max_samples_per_tick <= 1) |
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns) | |||
262 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | 263 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; |
263 | 264 | ||
264 | printk_ratelimited(KERN_WARNING | 265 | printk_ratelimited(KERN_WARNING |
265 | "perf samples too long (%lld > %d), lowering " | 266 | "perf samples too long (%lld > %lld), lowering " |
266 | "kernel.perf_event_max_sample_rate to %d\n", | 267 | "kernel.perf_event_max_sample_rate to %d\n", |
267 | avg_local_sample_len, | 268 | avg_local_sample_len, allowed_ns, |
268 | atomic_read(&perf_sample_allowed_ns), | ||
269 | sysctl_perf_event_sample_rate); | 269 | sysctl_perf_event_sample_rate); |
270 | 270 | ||
271 | update_perf_cpu_limits(); | 271 | update_perf_cpu_limits(); |
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx) | |||
899 | put_ctx(ctx->parent_ctx); | 899 | put_ctx(ctx->parent_ctx); |
900 | ctx->parent_ctx = NULL; | 900 | ctx->parent_ctx = NULL; |
901 | } | 901 | } |
902 | ctx->generation++; | ||
902 | } | 903 | } |
903 | 904 | ||
904 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | 905 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) |
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
1136 | ctx->nr_events++; | 1137 | ctx->nr_events++; |
1137 | if (event->attr.inherit_stat) | 1138 | if (event->attr.inherit_stat) |
1138 | ctx->nr_stat++; | 1139 | ctx->nr_stat++; |
1140 | |||
1141 | ctx->generation++; | ||
1139 | } | 1142 | } |
1140 | 1143 | ||
1141 | /* | 1144 | /* |
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event) | |||
1201 | if (sample_type & PERF_SAMPLE_DATA_SRC) | 1204 | if (sample_type & PERF_SAMPLE_DATA_SRC) |
1202 | size += sizeof(data->data_src.val); | 1205 | size += sizeof(data->data_src.val); |
1203 | 1206 | ||
1207 | if (sample_type & PERF_SAMPLE_TRANSACTION) | ||
1208 | size += sizeof(data->txn); | ||
1209 | |||
1204 | event->header_size = size; | 1210 | event->header_size = size; |
1205 | } | 1211 | } |
1206 | 1212 | ||
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
1310 | */ | 1316 | */ |
1311 | if (event->state > PERF_EVENT_STATE_OFF) | 1317 | if (event->state > PERF_EVENT_STATE_OFF) |
1312 | event->state = PERF_EVENT_STATE_OFF; | 1318 | event->state = PERF_EVENT_STATE_OFF; |
1319 | |||
1320 | ctx->generation++; | ||
1313 | } | 1321 | } |
1314 | 1322 | ||
1315 | static void perf_group_detach(struct perf_event *event) | 1323 | static void perf_group_detach(struct perf_event *event) |
@@ -1388,6 +1396,8 @@ event_sched_out(struct perf_event *event, | |||
1388 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 1396 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
1389 | return; | 1397 | return; |
1390 | 1398 | ||
1399 | perf_pmu_disable(event->pmu); | ||
1400 | |||
1391 | event->state = PERF_EVENT_STATE_INACTIVE; | 1401 | event->state = PERF_EVENT_STATE_INACTIVE; |
1392 | if (event->pending_disable) { | 1402 | if (event->pending_disable) { |
1393 | event->pending_disable = 0; | 1403 | event->pending_disable = 0; |
@@ -1404,6 +1414,8 @@ event_sched_out(struct perf_event *event, | |||
1404 | ctx->nr_freq--; | 1414 | ctx->nr_freq--; |
1405 | if (event->attr.exclusive || !cpuctx->active_oncpu) | 1415 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
1406 | cpuctx->exclusive = 0; | 1416 | cpuctx->exclusive = 0; |
1417 | |||
1418 | perf_pmu_enable(event->pmu); | ||
1407 | } | 1419 | } |
1408 | 1420 | ||
1409 | static void | 1421 | static void |
@@ -1644,6 +1656,7 @@ event_sched_in(struct perf_event *event, | |||
1644 | struct perf_event_context *ctx) | 1656 | struct perf_event_context *ctx) |
1645 | { | 1657 | { |
1646 | u64 tstamp = perf_event_time(event); | 1658 | u64 tstamp = perf_event_time(event); |
1659 | int ret = 0; | ||
1647 | 1660 | ||
1648 | if (event->state <= PERF_EVENT_STATE_OFF) | 1661 | if (event->state <= PERF_EVENT_STATE_OFF) |
1649 | return 0; | 1662 | return 0; |
@@ -1666,10 +1679,13 @@ event_sched_in(struct perf_event *event, | |||
1666 | */ | 1679 | */ |
1667 | smp_wmb(); | 1680 | smp_wmb(); |
1668 | 1681 | ||
1682 | perf_pmu_disable(event->pmu); | ||
1683 | |||
1669 | if (event->pmu->add(event, PERF_EF_START)) { | 1684 | if (event->pmu->add(event, PERF_EF_START)) { |
1670 | event->state = PERF_EVENT_STATE_INACTIVE; | 1685 | event->state = PERF_EVENT_STATE_INACTIVE; |
1671 | event->oncpu = -1; | 1686 | event->oncpu = -1; |
1672 | return -EAGAIN; | 1687 | ret = -EAGAIN; |
1688 | goto out; | ||
1673 | } | 1689 | } |
1674 | 1690 | ||
1675 | event->tstamp_running += tstamp - event->tstamp_stopped; | 1691 | event->tstamp_running += tstamp - event->tstamp_stopped; |
@@ -1685,7 +1701,10 @@ event_sched_in(struct perf_event *event, | |||
1685 | if (event->attr.exclusive) | 1701 | if (event->attr.exclusive) |
1686 | cpuctx->exclusive = 1; | 1702 | cpuctx->exclusive = 1; |
1687 | 1703 | ||
1688 | return 0; | 1704 | out: |
1705 | perf_pmu_enable(event->pmu); | ||
1706 | |||
1707 | return ret; | ||
1689 | } | 1708 | } |
1690 | 1709 | ||
1691 | static int | 1710 | static int |
@@ -2146,22 +2165,38 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
2146 | } | 2165 | } |
2147 | 2166 | ||
2148 | /* | 2167 | /* |
2149 | * Test whether two contexts are equivalent, i.e. whether they | 2168 | * Test whether two contexts are equivalent, i.e. whether they have both been |
2150 | * have both been cloned from the same version of the same context | 2169 | * cloned from the same version of the same context. |
2151 | * and they both have the same number of enabled events. | 2170 | * |
2152 | * If the number of enabled events is the same, then the set | 2171 | * Equivalence is measured using a generation number in the context that is |
2153 | * of enabled events should be the same, because these are both | 2172 | * incremented on each modification to it; see unclone_ctx(), list_add_event() |
2154 | * inherited contexts, therefore we can't access individual events | 2173 | * and list_del_event(). |
2155 | * in them directly with an fd; we can only enable/disable all | ||
2156 | * events via prctl, or enable/disable all events in a family | ||
2157 | * via ioctl, which will have the same effect on both contexts. | ||
2158 | */ | 2174 | */ |
2159 | static int context_equiv(struct perf_event_context *ctx1, | 2175 | static int context_equiv(struct perf_event_context *ctx1, |
2160 | struct perf_event_context *ctx2) | 2176 | struct perf_event_context *ctx2) |
2161 | { | 2177 | { |
2162 | return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx | 2178 | /* Pinning disables the swap optimization */ |
2163 | && ctx1->parent_gen == ctx2->parent_gen | 2179 | if (ctx1->pin_count || ctx2->pin_count) |
2164 | && !ctx1->pin_count && !ctx2->pin_count; | 2180 | return 0; |
2181 | |||
2182 | /* If ctx1 is the parent of ctx2 */ | ||
2183 | if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) | ||
2184 | return 1; | ||
2185 | |||
2186 | /* If ctx2 is the parent of ctx1 */ | ||
2187 | if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) | ||
2188 | return 1; | ||
2189 | |||
2190 | /* | ||
2191 | * If ctx1 and ctx2 have the same parent; we flatten the parent | ||
2192 | * hierarchy, see perf_event_init_context(). | ||
2193 | */ | ||
2194 | if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && | ||
2195 | ctx1->parent_gen == ctx2->parent_gen) | ||
2196 | return 1; | ||
2197 | |||
2198 | /* Unmatched */ | ||
2199 | return 0; | ||
2165 | } | 2200 | } |
2166 | 2201 | ||
2167 | static void __perf_event_sync_stat(struct perf_event *event, | 2202 | static void __perf_event_sync_stat(struct perf_event *event, |
@@ -2210,9 +2245,6 @@ static void __perf_event_sync_stat(struct perf_event *event, | |||
2210 | perf_event_update_userpage(next_event); | 2245 | perf_event_update_userpage(next_event); |
2211 | } | 2246 | } |
2212 | 2247 | ||
2213 | #define list_next_entry(pos, member) \ | ||
2214 | list_entry(pos->member.next, typeof(*pos), member) | ||
2215 | |||
2216 | static void perf_event_sync_stat(struct perf_event_context *ctx, | 2248 | static void perf_event_sync_stat(struct perf_event_context *ctx, |
2217 | struct perf_event_context *next_ctx) | 2249 | struct perf_event_context *next_ctx) |
2218 | { | 2250 | { |
@@ -2244,7 +2276,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
2244 | { | 2276 | { |
2245 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; | 2277 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
2246 | struct perf_event_context *next_ctx; | 2278 | struct perf_event_context *next_ctx; |
2247 | struct perf_event_context *parent; | 2279 | struct perf_event_context *parent, *next_parent; |
2248 | struct perf_cpu_context *cpuctx; | 2280 | struct perf_cpu_context *cpuctx; |
2249 | int do_switch = 1; | 2281 | int do_switch = 1; |
2250 | 2282 | ||
@@ -2256,10 +2288,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
2256 | return; | 2288 | return; |
2257 | 2289 | ||
2258 | rcu_read_lock(); | 2290 | rcu_read_lock(); |
2259 | parent = rcu_dereference(ctx->parent_ctx); | ||
2260 | next_ctx = next->perf_event_ctxp[ctxn]; | 2291 | next_ctx = next->perf_event_ctxp[ctxn]; |
2261 | if (parent && next_ctx && | 2292 | if (!next_ctx) |
2262 | rcu_dereference(next_ctx->parent_ctx) == parent) { | 2293 | goto unlock; |
2294 | |||
2295 | parent = rcu_dereference(ctx->parent_ctx); | ||
2296 | next_parent = rcu_dereference(next_ctx->parent_ctx); | ||
2297 | |||
2298 | /* If neither context has a parent context, they cannot be clones. */ | ||
2299 | if (!parent && !next_parent) | ||
2300 | goto unlock; | ||
2301 | |||
2302 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { | ||
2263 | /* | 2303 | /* |
2264 | * Looks like the two contexts are clones, so we might be | 2304 | * Looks like the two contexts are clones, so we might be |
2265 | * able to optimize the context switch. We lock both | 2305 | * able to optimize the context switch. We lock both |
@@ -2287,6 +2327,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
2287 | raw_spin_unlock(&next_ctx->lock); | 2327 | raw_spin_unlock(&next_ctx->lock); |
2288 | raw_spin_unlock(&ctx->lock); | 2328 | raw_spin_unlock(&ctx->lock); |
2289 | } | 2329 | } |
2330 | unlock: | ||
2290 | rcu_read_unlock(); | 2331 | rcu_read_unlock(); |
2291 | 2332 | ||
2292 | if (do_switch) { | 2333 | if (do_switch) { |
@@ -2713,6 +2754,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, | |||
2713 | if (!event_filter_match(event)) | 2754 | if (!event_filter_match(event)) |
2714 | continue; | 2755 | continue; |
2715 | 2756 | ||
2757 | perf_pmu_disable(event->pmu); | ||
2758 | |||
2716 | hwc = &event->hw; | 2759 | hwc = &event->hw; |
2717 | 2760 | ||
2718 | if (hwc->interrupts == MAX_INTERRUPTS) { | 2761 | if (hwc->interrupts == MAX_INTERRUPTS) { |
@@ -2722,7 +2765,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, | |||
2722 | } | 2765 | } |
2723 | 2766 | ||
2724 | if (!event->attr.freq || !event->attr.sample_freq) | 2767 | if (!event->attr.freq || !event->attr.sample_freq) |
2725 | continue; | 2768 | goto next; |
2726 | 2769 | ||
2727 | /* | 2770 | /* |
2728 | * stop the event and update event->count | 2771 | * stop the event and update event->count |
@@ -2744,6 +2787,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, | |||
2744 | perf_adjust_period(event, period, delta, false); | 2787 | perf_adjust_period(event, period, delta, false); |
2745 | 2788 | ||
2746 | event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); | 2789 | event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); |
2790 | next: | ||
2791 | perf_pmu_enable(event->pmu); | ||
2747 | } | 2792 | } |
2748 | 2793 | ||
2749 | perf_pmu_enable(ctx->pmu); | 2794 | perf_pmu_enable(ctx->pmu); |
@@ -4572,6 +4617,9 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4572 | if (sample_type & PERF_SAMPLE_DATA_SRC) | 4617 | if (sample_type & PERF_SAMPLE_DATA_SRC) |
4573 | perf_output_put(handle, data->data_src.val); | 4618 | perf_output_put(handle, data->data_src.val); |
4574 | 4619 | ||
4620 | if (sample_type & PERF_SAMPLE_TRANSACTION) | ||
4621 | perf_output_put(handle, data->txn); | ||
4622 | |||
4575 | if (!event->attr.watermark) { | 4623 | if (!event->attr.watermark) { |
4576 | int wakeup_events = event->attr.wakeup_events; | 4624 | int wakeup_events = event->attr.wakeup_events; |
4577 | 4625 | ||
@@ -5100,27 +5148,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
5100 | unsigned int size; | 5148 | unsigned int size; |
5101 | char tmp[16]; | 5149 | char tmp[16]; |
5102 | char *buf = NULL; | 5150 | char *buf = NULL; |
5103 | const char *name; | 5151 | char *name; |
5104 | |||
5105 | memset(tmp, 0, sizeof(tmp)); | ||
5106 | 5152 | ||
5107 | if (file) { | 5153 | if (file) { |
5108 | struct inode *inode; | 5154 | struct inode *inode; |
5109 | dev_t dev; | 5155 | dev_t dev; |
5156 | |||
5157 | buf = kmalloc(PATH_MAX, GFP_KERNEL); | ||
5158 | if (!buf) { | ||
5159 | name = "//enomem"; | ||
5160 | goto cpy_name; | ||
5161 | } | ||
5110 | /* | 5162 | /* |
5111 | * d_path works from the end of the rb backwards, so we | 5163 | * d_path() works from the end of the rb backwards, so we |
5112 | * need to add enough zero bytes after the string to handle | 5164 | * need to add enough zero bytes after the string to handle |
5113 | * the 64bit alignment we do later. | 5165 | * the 64bit alignment we do later. |
5114 | */ | 5166 | */ |
5115 | buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); | 5167 | name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); |
5116 | if (!buf) { | ||
5117 | name = strncpy(tmp, "//enomem", sizeof(tmp)); | ||
5118 | goto got_name; | ||
5119 | } | ||
5120 | name = d_path(&file->f_path, buf, PATH_MAX); | ||
5121 | if (IS_ERR(name)) { | 5168 | if (IS_ERR(name)) { |
5122 | name = strncpy(tmp, "//toolong", sizeof(tmp)); | 5169 | name = "//toolong"; |
5123 | goto got_name; | 5170 | goto cpy_name; |
5124 | } | 5171 | } |
5125 | inode = file_inode(vma->vm_file); | 5172 | inode = file_inode(vma->vm_file); |
5126 | dev = inode->i_sb->s_dev; | 5173 | dev = inode->i_sb->s_dev; |
@@ -5128,34 +5175,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
5128 | gen = inode->i_generation; | 5175 | gen = inode->i_generation; |
5129 | maj = MAJOR(dev); | 5176 | maj = MAJOR(dev); |
5130 | min = MINOR(dev); | 5177 | min = MINOR(dev); |
5131 | 5178 | goto got_name; | |
5132 | } else { | 5179 | } else { |
5133 | if (arch_vma_name(mmap_event->vma)) { | 5180 | name = (char *)arch_vma_name(vma); |
5134 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), | 5181 | if (name) |
5135 | sizeof(tmp) - 1); | 5182 | goto cpy_name; |
5136 | tmp[sizeof(tmp) - 1] = '\0'; | ||
5137 | goto got_name; | ||
5138 | } | ||
5139 | 5183 | ||
5140 | if (!vma->vm_mm) { | 5184 | if (vma->vm_start <= vma->vm_mm->start_brk && |
5141 | name = strncpy(tmp, "[vdso]", sizeof(tmp)); | ||
5142 | goto got_name; | ||
5143 | } else if (vma->vm_start <= vma->vm_mm->start_brk && | ||
5144 | vma->vm_end >= vma->vm_mm->brk) { | 5185 | vma->vm_end >= vma->vm_mm->brk) { |
5145 | name = strncpy(tmp, "[heap]", sizeof(tmp)); | 5186 | name = "[heap]"; |
5146 | goto got_name; | 5187 | goto cpy_name; |
5147 | } else if (vma->vm_start <= vma->vm_mm->start_stack && | 5188 | } |
5189 | if (vma->vm_start <= vma->vm_mm->start_stack && | ||
5148 | vma->vm_end >= vma->vm_mm->start_stack) { | 5190 | vma->vm_end >= vma->vm_mm->start_stack) { |
5149 | name = strncpy(tmp, "[stack]", sizeof(tmp)); | 5191 | name = "[stack]"; |
5150 | goto got_name; | 5192 | goto cpy_name; |
5151 | } | 5193 | } |
5152 | 5194 | ||
5153 | name = strncpy(tmp, "//anon", sizeof(tmp)); | 5195 | name = "//anon"; |
5154 | goto got_name; | 5196 | goto cpy_name; |
5155 | } | 5197 | } |
5156 | 5198 | ||
5199 | cpy_name: | ||
5200 | strlcpy(tmp, name, sizeof(tmp)); | ||
5201 | name = tmp; | ||
5157 | got_name: | 5202 | got_name: |
5158 | size = ALIGN(strlen(name)+1, sizeof(u64)); | 5203 | /* |
5204 | * Since our buffer works in 8 byte units we need to align our string | ||
5205 | * size to a multiple of 8. However, we must guarantee the tail end is | ||
5206 | * zero'd out to avoid leaking random bits to userspace. | ||
5207 | */ | ||
5208 | size = strlen(name)+1; | ||
5209 | while (!IS_ALIGNED(size, sizeof(u64))) | ||
5210 | name[size++] = '\0'; | ||
5159 | 5211 | ||
5160 | mmap_event->file_name = name; | 5212 | mmap_event->file_name = name; |
5161 | mmap_event->file_size = size; | 5213 | mmap_event->file_size = size; |
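The got_name comment above describes padding the file name with NUL bytes up to an 8-byte multiple so no stale buffer contents leak into the record. A minimal userspace sketch of just that padding step (illustrative only, not the kernel code; the fixed name is made up):

#include <stdio.h>
#include <string.h>

int main(void)
{
        char name[32] = "[heap]";

        size_t size = strlen(name) + 1;              /* include the trailing NUL */
        while (size % sizeof(unsigned long long))    /* pad to a multiple of 8 */
                name[size++] = '\0';                 /* explicit zeroes, no stale bytes */

        printf("padded size = %zu\n", size);         /* 8 for "[heap]" */
        return 0;
}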
@@ -5643,11 +5695,6 @@ static void swevent_hlist_put(struct perf_event *event) | |||
5643 | { | 5695 | { |
5644 | int cpu; | 5696 | int cpu; |
5645 | 5697 | ||
5646 | if (event->cpu != -1) { | ||
5647 | swevent_hlist_put_cpu(event, event->cpu); | ||
5648 | return; | ||
5649 | } | ||
5650 | |||
5651 | for_each_possible_cpu(cpu) | 5698 | for_each_possible_cpu(cpu) |
5652 | swevent_hlist_put_cpu(event, cpu); | 5699 | swevent_hlist_put_cpu(event, cpu); |
5653 | } | 5700 | } |
@@ -5681,9 +5728,6 @@ static int swevent_hlist_get(struct perf_event *event) | |||
5681 | int err; | 5728 | int err; |
5682 | int cpu, failed_cpu; | 5729 | int cpu, failed_cpu; |
5683 | 5730 | ||
5684 | if (event->cpu != -1) | ||
5685 | return swevent_hlist_get_cpu(event, event->cpu); | ||
5686 | |||
5687 | get_online_cpus(); | 5731 | get_online_cpus(); |
5688 | for_each_possible_cpu(cpu) { | 5732 | for_each_possible_cpu(cpu) { |
5689 | err = swevent_hlist_get_cpu(event, cpu); | 5733 | err = swevent_hlist_get_cpu(event, cpu); |
@@ -6292,6 +6336,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) | |||
6292 | 6336 | ||
6293 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); | 6337 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); |
6294 | } | 6338 | } |
6339 | static DEVICE_ATTR_RO(type); | ||
6295 | 6340 | ||
6296 | static ssize_t | 6341 | static ssize_t |
6297 | perf_event_mux_interval_ms_show(struct device *dev, | 6342 | perf_event_mux_interval_ms_show(struct device *dev, |
@@ -6336,17 +6381,19 @@ perf_event_mux_interval_ms_store(struct device *dev, | |||
6336 | 6381 | ||
6337 | return count; | 6382 | return count; |
6338 | } | 6383 | } |
6384 | static DEVICE_ATTR_RW(perf_event_mux_interval_ms); | ||
6339 | 6385 | ||
6340 | static struct device_attribute pmu_dev_attrs[] = { | 6386 | static struct attribute *pmu_dev_attrs[] = { |
6341 | __ATTR_RO(type), | 6387 | &dev_attr_type.attr, |
6342 | __ATTR_RW(perf_event_mux_interval_ms), | 6388 | &dev_attr_perf_event_mux_interval_ms.attr, |
6343 | __ATTR_NULL, | 6389 | NULL, |
6344 | }; | 6390 | }; |
6391 | ATTRIBUTE_GROUPS(pmu_dev); | ||
6345 | 6392 | ||
6346 | static int pmu_bus_running; | 6393 | static int pmu_bus_running; |
6347 | static struct bus_type pmu_bus = { | 6394 | static struct bus_type pmu_bus = { |
6348 | .name = "event_source", | 6395 | .name = "event_source", |
6349 | .dev_attrs = pmu_dev_attrs, | 6396 | .dev_groups = pmu_dev_groups, |
6350 | }; | 6397 | }; |
6351 | 6398 | ||
6352 | static void pmu_dev_release(struct device *dev) | 6399 | static void pmu_dev_release(struct device *dev) |
@@ -7126,7 +7173,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7126 | } | 7173 | } |
7127 | 7174 | ||
7128 | perf_install_in_context(ctx, event, event->cpu); | 7175 | perf_install_in_context(ctx, event, event->cpu); |
7129 | ++ctx->generation; | ||
7130 | perf_unpin_context(ctx); | 7176 | perf_unpin_context(ctx); |
7131 | mutex_unlock(&ctx->mutex); | 7177 | mutex_unlock(&ctx->mutex); |
7132 | 7178 | ||
@@ -7209,7 +7255,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
7209 | WARN_ON_ONCE(ctx->parent_ctx); | 7255 | WARN_ON_ONCE(ctx->parent_ctx); |
7210 | mutex_lock(&ctx->mutex); | 7256 | mutex_lock(&ctx->mutex); |
7211 | perf_install_in_context(ctx, event, cpu); | 7257 | perf_install_in_context(ctx, event, cpu); |
7212 | ++ctx->generation; | ||
7213 | perf_unpin_context(ctx); | 7258 | perf_unpin_context(ctx); |
7214 | mutex_unlock(&ctx->mutex); | 7259 | mutex_unlock(&ctx->mutex); |
7215 | 7260 | ||
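The context_equiv() rework above keys clone equivalence off a ctx->generation counter that is bumped in unclone_ctx(), list_add_event() and list_del_event(). A stripped-down sketch of that idea outside the kernel (hypothetical struct and helpers; only the "ctx1 is the parent of ctx2" case is modelled):

#include <stdio.h>

struct ctx {
        struct ctx *parent;             /* context this one was cloned from */
        unsigned long long parent_gen;  /* parent's generation at clone time */
        unsigned long long generation;  /* bumped on every modification */
        int nr_events;
};

static void ctx_add_event(struct ctx *c)
{
        c->nr_events++;
        c->generation++;
}

static struct ctx ctx_clone(struct ctx *parent)
{
        struct ctx child = {
                .parent     = parent,
                .parent_gen = parent->generation,
                .nr_events  = parent->nr_events,
        };
        return child;
}

/* Equivalent only while the parent is unmodified since the clone was taken. */
static int ctx_equiv(const struct ctx *parent, const struct ctx *child)
{
        return child->parent == parent &&
               child->parent_gen == parent->generation;
}

int main(void)
{
        struct ctx parent = { 0 };
        ctx_add_event(&parent);

        struct ctx child = ctx_clone(&parent);
        printf("equiv after clone:  %d\n", ctx_equiv(&parent, &child)); /* 1 */

        ctx_add_event(&parent);         /* any later change bumps the generation */
        printf("equiv after change: %d\n", ctx_equiv(&parent, &child)); /* 0 */
        return 0;
}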
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ca6599723be5..569b218782ad 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) | |||
82 | } | 82 | } |
83 | 83 | ||
84 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | 84 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ |
85 | static inline unsigned int \ | 85 | static inline unsigned long \ |
86 | func_name(struct perf_output_handle *handle, \ | 86 | func_name(struct perf_output_handle *handle, \ |
87 | const void *buf, unsigned int len) \ | 87 | const void *buf, unsigned long len) \ |
88 | { \ | 88 | { \ |
89 | unsigned long size, written; \ | 89 | unsigned long size, written; \ |
90 | \ | 90 | \ |
91 | do { \ | 91 | do { \ |
92 | size = min_t(unsigned long, handle->size, len); \ | 92 | size = min(handle->size, len); \ |
93 | \ | ||
94 | written = memcpy_func(handle->addr, buf, size); \ | 93 | written = memcpy_func(handle->addr, buf, size); \ |
94 | written = size - written; \ | ||
95 | \ | 95 | \ |
96 | len -= written; \ | 96 | len -= written; \ |
97 | handle->addr += written; \ | 97 | handle->addr += written; \ |
@@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \ | |||
110 | return len; \ | 110 | return len; \ |
111 | } | 111 | } |
112 | 112 | ||
113 | static inline int memcpy_common(void *dst, const void *src, size_t n) | 113 | static inline unsigned long |
114 | memcpy_common(void *dst, const void *src, unsigned long n) | ||
114 | { | 115 | { |
115 | memcpy(dst, src, n); | 116 | memcpy(dst, src, n); |
116 | return n; | 117 | return 0; |
117 | } | 118 | } |
118 | 119 | ||
119 | DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) | 120 | DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) |
120 | 121 | ||
121 | #define MEMCPY_SKIP(dst, src, n) (n) | 122 | static inline unsigned long |
123 | memcpy_skip(void *dst, const void *src, unsigned long n) | ||
124 | { | ||
125 | return 0; | ||
126 | } | ||
122 | 127 | ||
123 | DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) | 128 | DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip) |
124 | 129 | ||
125 | #ifndef arch_perf_out_copy_user | 130 | #ifndef arch_perf_out_copy_user |
126 | #define arch_perf_out_copy_user __copy_from_user_inatomic | 131 | #define arch_perf_out_copy_user arch_perf_out_copy_user |
132 | |||
133 | static inline unsigned long | ||
134 | arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) | ||
135 | { | ||
136 | unsigned long ret; | ||
137 | |||
138 | pagefault_disable(); | ||
139 | ret = __copy_from_user_inatomic(dst, src, n); | ||
140 | pagefault_enable(); | ||
141 | |||
142 | return ret; | ||
143 | } | ||
127 | #endif | 144 | #endif |
128 | 145 | ||
129 | DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) | 146 | DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) |
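The DEFINE_OUTPUT_COPY() change above switches the low-level copy helpers to the __copy_from_user_inatomic() convention: return the number of bytes that could not be copied (0 on success), and let the caller derive written = size - ret. A small userspace sketch of that loop shape (illustrative; copy_chunk() and the chunk size are made up):

#include <stdio.h>
#include <string.h>

/* Pretend low-level copier: always succeeds, so 0 bytes are left over. */
static unsigned long copy_chunk(void *dst, const void *src, unsigned long n)
{
        memcpy(dst, src, n);
        return 0;                               /* bytes NOT copied */
}

/* Copy len bytes in bounded chunks; returns the bytes left uncopied. */
static unsigned long output_copy(char *dst, const char *src,
                                 unsigned long len, unsigned long chunk)
{
        while (len) {
                unsigned long size = len < chunk ? len : chunk;
                unsigned long not_copied = copy_chunk(dst, src, size);
                unsigned long written = size - not_copied;

                if (!written)                   /* no progress: bail out */
                        break;
                len -= written;
                dst += written;
                src += written;
        }
        return len;
}

int main(void)
{
        char src[64] = "perf ring buffer payload";
        char dst[64] = { 0 };

        unsigned long left = output_copy(dst, src, sizeof(src), 16);
        printf("left=%lu dst=\"%s\"\n", left, dst);
        return 0;
}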
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 9c2ddfbf4525..e8b168af135b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -12,40 +12,10 @@ | |||
12 | #include <linux/perf_event.h> | 12 | #include <linux/perf_event.h> |
13 | #include <linux/vmalloc.h> | 13 | #include <linux/vmalloc.h> |
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/circ_buf.h> | ||
15 | 16 | ||
16 | #include "internal.h" | 17 | #include "internal.h" |
17 | 18 | ||
18 | static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, | ||
19 | unsigned long offset, unsigned long head) | ||
20 | { | ||
21 | unsigned long sz = perf_data_size(rb); | ||
22 | unsigned long mask = sz - 1; | ||
23 | |||
24 | /* | ||
25 | * check if user-writable | ||
26 | * overwrite : over-write its own tail | ||
27 | * !overwrite: buffer possibly drops events. | ||
28 | */ | ||
29 | if (rb->overwrite) | ||
30 | return true; | ||
31 | |||
32 | /* | ||
33 | * verify that payload is not bigger than buffer | ||
34 | * otherwise masking logic may fail to detect | ||
35 | * the "not enough space" condition | ||
36 | */ | ||
37 | if ((head - offset) > sz) | ||
38 | return false; | ||
39 | |||
40 | offset = (offset - tail) & mask; | ||
41 | head = (head - tail) & mask; | ||
42 | |||
43 | if ((int)(head - offset) < 0) | ||
44 | return false; | ||
45 | |||
46 | return true; | ||
47 | } | ||
48 | |||
49 | static void perf_output_wakeup(struct perf_output_handle *handle) | 19 | static void perf_output_wakeup(struct perf_output_handle *handle) |
50 | { | 20 | { |
51 | atomic_set(&handle->rb->poll, POLL_IN); | 21 | atomic_set(&handle->rb->poll, POLL_IN); |
@@ -115,8 +85,8 @@ again: | |||
115 | rb->user_page->data_head = head; | 85 | rb->user_page->data_head = head; |
116 | 86 | ||
117 | /* | 87 | /* |
118 | * Now check if we missed an update, rely on the (compiler) | 88 | * Now check if we missed an update -- rely on previous implied |
119 | * barrier in atomic_dec_and_test() to re-read rb->head. | 89 | * compiler barriers to force a re-read. |
120 | */ | 90 | */ |
121 | if (unlikely(head != local_read(&rb->head))) { | 91 | if (unlikely(head != local_read(&rb->head))) { |
122 | local_inc(&rb->nest); | 92 | local_inc(&rb->nest); |
@@ -135,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
135 | { | 105 | { |
136 | struct ring_buffer *rb; | 106 | struct ring_buffer *rb; |
137 | unsigned long tail, offset, head; | 107 | unsigned long tail, offset, head; |
138 | int have_lost; | 108 | int have_lost, page_shift; |
139 | struct perf_sample_data sample_data; | ||
140 | struct { | 109 | struct { |
141 | struct perf_event_header header; | 110 | struct perf_event_header header; |
142 | u64 id; | 111 | u64 id; |
@@ -151,57 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
151 | event = event->parent; | 120 | event = event->parent; |
152 | 121 | ||
153 | rb = rcu_dereference(event->rb); | 122 | rb = rcu_dereference(event->rb); |
154 | if (!rb) | 123 | if (unlikely(!rb)) |
155 | goto out; | 124 | goto out; |
156 | 125 | ||
157 | handle->rb = rb; | 126 | if (unlikely(!rb->nr_pages)) |
158 | handle->event = event; | ||
159 | |||
160 | if (!rb->nr_pages) | ||
161 | goto out; | 127 | goto out; |
162 | 128 | ||
129 | handle->rb = rb; | ||
130 | handle->event = event; | ||
131 | |||
163 | have_lost = local_read(&rb->lost); | 132 | have_lost = local_read(&rb->lost); |
164 | if (have_lost) { | 133 | if (unlikely(have_lost)) { |
165 | lost_event.header.size = sizeof(lost_event); | 134 | size += sizeof(lost_event); |
166 | perf_event_header__init_id(&lost_event.header, &sample_data, | 135 | if (event->attr.sample_id_all) |
167 | event); | 136 | size += event->id_header_size; |
168 | size += lost_event.header.size; | ||
169 | } | 137 | } |
170 | 138 | ||
171 | perf_output_get_handle(handle); | 139 | perf_output_get_handle(handle); |
172 | 140 | ||
173 | do { | 141 | do { |
174 | /* | ||
175 | * Userspace could choose to issue a mb() before updating the | ||
176 | * tail pointer. So that all reads will be completed before the | ||
177 | * write is issued. | ||
178 | * | ||
179 | * See perf_output_put_handle(). | ||
180 | */ | ||
181 | tail = ACCESS_ONCE(rb->user_page->data_tail); | 142 | tail = ACCESS_ONCE(rb->user_page->data_tail); |
182 | smp_mb(); | ||
183 | offset = head = local_read(&rb->head); | 143 | offset = head = local_read(&rb->head); |
184 | head += size; | 144 | if (!rb->overwrite && |
185 | if (unlikely(!perf_output_space(rb, tail, offset, head))) | 145 | unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) |
186 | goto fail; | 146 | goto fail; |
147 | head += size; | ||
187 | } while (local_cmpxchg(&rb->head, offset, head) != offset); | 148 | } while (local_cmpxchg(&rb->head, offset, head) != offset); |
188 | 149 | ||
189 | if (head - local_read(&rb->wakeup) > rb->watermark) | 150 | /* |
151 | * Separate the userpage->tail read from the data stores below. | ||
152 | * Matches the MB userspace SHOULD issue after reading the data | ||
153 | * and before storing the new tail position. | ||
154 | * | ||
155 | * See perf_output_put_handle(). | ||
156 | */ | ||
157 | smp_mb(); | ||
158 | |||
159 | if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) | ||
190 | local_add(rb->watermark, &rb->wakeup); | 160 | local_add(rb->watermark, &rb->wakeup); |
191 | 161 | ||
192 | handle->page = offset >> (PAGE_SHIFT + page_order(rb)); | 162 | page_shift = PAGE_SHIFT + page_order(rb); |
193 | handle->page &= rb->nr_pages - 1; | ||
194 | handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); | ||
195 | handle->addr = rb->data_pages[handle->page]; | ||
196 | handle->addr += handle->size; | ||
197 | handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; | ||
198 | 163 | ||
199 | if (have_lost) { | 164 | handle->page = (offset >> page_shift) & (rb->nr_pages - 1); |
165 | offset &= (1UL << page_shift) - 1; | ||
166 | handle->addr = rb->data_pages[handle->page] + offset; | ||
167 | handle->size = (1UL << page_shift) - offset; | ||
168 | |||
169 | if (unlikely(have_lost)) { | ||
170 | struct perf_sample_data sample_data; | ||
171 | |||
172 | lost_event.header.size = sizeof(lost_event); | ||
200 | lost_event.header.type = PERF_RECORD_LOST; | 173 | lost_event.header.type = PERF_RECORD_LOST; |
201 | lost_event.header.misc = 0; | 174 | lost_event.header.misc = 0; |
202 | lost_event.id = event->id; | 175 | lost_event.id = event->id; |
203 | lost_event.lost = local_xchg(&rb->lost, 0); | 176 | lost_event.lost = local_xchg(&rb->lost, 0); |
204 | 177 | ||
178 | perf_event_header__init_id(&lost_event.header, | ||
179 | &sample_data, event); | ||
205 | perf_output_put(handle, lost_event); | 180 | perf_output_put(handle, lost_event); |
206 | perf_event__output_id_sample(event, handle, &sample_data); | 181 | perf_event__output_id_sample(event, handle, &sample_data); |
207 | } | 182 | } |
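perf_output_begin() above now rejects records with CIRC_SPACE() instead of the old perf_output_space() helper. The macro definitions below mirror include/linux/circ_buf.h for a power-of-two buffer; the surrounding program and its numbers are invented to show a record being refused when it does not fit:

#include <stdio.h>

#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

int main(void)
{
        unsigned long size = 16;        /* buffer bytes, power of two */
        unsigned long tail = 3;         /* consumer (userspace) position */
        unsigned long head = 13;        /* producer (kernel) position */
        unsigned long record = 8;       /* bytes we want to reserve */
        int overwrite = 0;              /* mirrors rb->overwrite */

        /* Free bytes, keeping one byte spare so head == tail means empty. */
        unsigned long space = CIRC_SPACE(head, tail, size);
        printf("space=%lu\n", space);   /* 5 */

        if (!overwrite && space < record)
                printf("%lu-byte record dropped\n", record);
        return 0;
}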
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ad8e1bdca70e..24b7d6ca871b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/kdebug.h> /* notifier mechanism */ | 35 | #include <linux/kdebug.h> /* notifier mechanism */ |
36 | #include "../../mm/internal.h" /* munlock_vma_page */ | 36 | #include "../../mm/internal.h" /* munlock_vma_page */ |
37 | #include <linux/percpu-rwsem.h> | 37 | #include <linux/percpu-rwsem.h> |
38 | #include <linux/task_work.h> | ||
38 | 39 | ||
39 | #include <linux/uprobes.h> | 40 | #include <linux/uprobes.h> |
40 | 41 | ||
@@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t | |||
244 | * the architecture. If an arch has variable length instructions and the | 245 | * the architecture. If an arch has variable length instructions and the |
245 | * breakpoint instruction is not of the smallest length instruction | 246 | * breakpoint instruction is not of the smallest length instruction |
246 | * supported by that architecture then we need to modify is_trap_at_addr and | 247 | * supported by that architecture then we need to modify is_trap_at_addr and |
247 | * write_opcode accordingly. This would never be a problem for archs that | 248 | * uprobe_write_opcode accordingly. This would never be a problem for archs |
248 | * have fixed length instructions. | 249 | * that have fixed length instructions. |
249 | */ | 250 | */ |
250 | 251 | ||
251 | /* | 252 | /* |
252 | * write_opcode - write the opcode at a given virtual address. | 253 | * uprobe_write_opcode - write the opcode at a given virtual address. |
253 | * @mm: the probed process address space. | 254 | * @mm: the probed process address space. |
254 | * @vaddr: the virtual address to store the opcode. | 255 | * @vaddr: the virtual address to store the opcode. |
255 | * @opcode: opcode to be written at @vaddr. | 256 | * @opcode: opcode to be written at @vaddr. |
@@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t | |||
260 | * For mm @mm, write the opcode at @vaddr. | 261 | * For mm @mm, write the opcode at @vaddr. |
261 | * Return 0 (success) or a negative errno. | 262 | * Return 0 (success) or a negative errno. |
262 | */ | 263 | */ |
263 | static int write_opcode(struct mm_struct *mm, unsigned long vaddr, | 264 | int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, |
264 | uprobe_opcode_t opcode) | 265 | uprobe_opcode_t opcode) |
265 | { | 266 | { |
266 | struct page *old_page, *new_page; | 267 | struct page *old_page, *new_page; |
@@ -314,7 +315,7 @@ put_old: | |||
314 | */ | 315 | */ |
315 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 316 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
316 | { | 317 | { |
317 | return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); | 318 | return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); |
318 | } | 319 | } |
319 | 320 | ||
320 | /** | 321 | /** |
@@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned | |||
329 | int __weak | 330 | int __weak |
330 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 331 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
331 | { | 332 | { |
332 | return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); | 333 | return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); |
333 | } | 334 | } |
334 | 335 | ||
335 | static int match_uprobe(struct uprobe *l, struct uprobe *r) | 336 | static int match_uprobe(struct uprobe *l, struct uprobe *r) |
@@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) | |||
503 | return ret; | 504 | return ret; |
504 | } | 505 | } |
505 | 506 | ||
506 | static int | 507 | static int __copy_insn(struct address_space *mapping, struct file *filp, |
507 | __copy_insn(struct address_space *mapping, struct file *filp, char *insn, | 508 | void *insn, int nbytes, loff_t offset) |
508 | unsigned long nbytes, loff_t offset) | ||
509 | { | 509 | { |
510 | struct page *page; | 510 | struct page *page; |
511 | 511 | ||
@@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, | |||
527 | 527 | ||
528 | static int copy_insn(struct uprobe *uprobe, struct file *filp) | 528 | static int copy_insn(struct uprobe *uprobe, struct file *filp) |
529 | { | 529 | { |
530 | struct address_space *mapping; | 530 | struct address_space *mapping = uprobe->inode->i_mapping; |
531 | unsigned long nbytes; | 531 | loff_t offs = uprobe->offset; |
532 | int bytes; | 532 | void *insn = uprobe->arch.insn; |
533 | 533 | int size = MAX_UINSN_BYTES; | |
534 | nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); | 534 | int len, err = -EIO; |
535 | mapping = uprobe->inode->i_mapping; | ||
536 | 535 | ||
537 | /* Instruction at end of binary; copy only available bytes */ | 536 | /* Copy only available bytes, -EIO if nothing was read */ |
538 | if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) | 537 | do { |
539 | bytes = uprobe->inode->i_size - uprobe->offset; | 538 | if (offs >= i_size_read(uprobe->inode)) |
540 | else | 539 | break; |
541 | bytes = MAX_UINSN_BYTES; | ||
542 | 540 | ||
543 | /* Instruction at the page-boundary; copy bytes in second page */ | 541 | len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); |
544 | if (nbytes < bytes) { | 542 | err = __copy_insn(mapping, filp, insn, len, offs); |
545 | int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, | ||
546 | bytes - nbytes, uprobe->offset + nbytes); | ||
547 | if (err) | 543 | if (err) |
548 | return err; | 544 | break; |
549 | bytes = nbytes; | 545 | |
550 | } | 546 | insn += len; |
551 | return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); | 547 | offs += len; |
548 | size -= len; | ||
549 | } while (size); | ||
550 | |||
551 | return err; | ||
552 | } | 552 | } |
553 | 553 | ||
554 | static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | 554 | static int prepare_uprobe(struct uprobe *uprobe, struct file *file, |
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | |||
576 | if (ret) | 576 | if (ret) |
577 | goto out; | 577 | goto out; |
578 | 578 | ||
579 | /* write_opcode() assumes we don't cross page boundary */ | 579 | /* uprobe_write_opcode() assumes we don't cross page boundary */ |
580 | BUG_ON((uprobe->offset & ~PAGE_MASK) + | 580 | BUG_ON((uprobe->offset & ~PAGE_MASK) + |
581 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | 581 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); |
582 | 582 | ||
@@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon | |||
1096 | } | 1096 | } |
1097 | 1097 | ||
1098 | /* Slot allocation for XOL */ | 1098 | /* Slot allocation for XOL */ |
1099 | static int xol_add_vma(struct xol_area *area) | 1099 | static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) |
1100 | { | 1100 | { |
1101 | struct mm_struct *mm = current->mm; | ||
1102 | int ret = -EALREADY; | 1101 | int ret = -EALREADY; |
1103 | 1102 | ||
1104 | down_write(&mm->mmap_sem); | 1103 | down_write(&mm->mmap_sem); |
1105 | if (mm->uprobes_state.xol_area) | 1104 | if (mm->uprobes_state.xol_area) |
1106 | goto fail; | 1105 | goto fail; |
1107 | 1106 | ||
1108 | ret = -ENOMEM; | 1107 | if (!area->vaddr) { |
1109 | /* Try to map as high as possible, this is only a hint. */ | 1108 | /* Try to map as high as possible, this is only a hint. */ |
1110 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); | 1109 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, |
1111 | if (area->vaddr & ~PAGE_MASK) { | 1110 | PAGE_SIZE, 0, 0); |
1112 | ret = area->vaddr; | 1111 | if (area->vaddr & ~PAGE_MASK) { |
1113 | goto fail; | 1112 | ret = area->vaddr; |
1113 | goto fail; | ||
1114 | } | ||
1114 | } | 1115 | } |
1115 | 1116 | ||
1116 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, | 1117 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, |
@@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area) | |||
1120 | 1121 | ||
1121 | smp_wmb(); /* pairs with get_xol_area() */ | 1122 | smp_wmb(); /* pairs with get_xol_area() */ |
1122 | mm->uprobes_state.xol_area = area; | 1123 | mm->uprobes_state.xol_area = area; |
1123 | ret = 0; | ||
1124 | fail: | 1124 | fail: |
1125 | up_write(&mm->mmap_sem); | 1125 | up_write(&mm->mmap_sem); |
1126 | 1126 | ||
1127 | return ret; | 1127 | return ret; |
1128 | } | 1128 | } |
1129 | 1129 | ||
1130 | /* | 1130 | static struct xol_area *__create_xol_area(unsigned long vaddr) |
1131 | * get_xol_area - Allocate process's xol_area if necessary. | ||
1132 | * This area will be used for storing instructions for execution out of line. | ||
1133 | * | ||
1134 | * Returns the allocated area or NULL. | ||
1135 | */ | ||
1136 | static struct xol_area *get_xol_area(void) | ||
1137 | { | 1131 | { |
1138 | struct mm_struct *mm = current->mm; | 1132 | struct mm_struct *mm = current->mm; |
1139 | struct xol_area *area; | ||
1140 | uprobe_opcode_t insn = UPROBE_SWBP_INSN; | 1133 | uprobe_opcode_t insn = UPROBE_SWBP_INSN; |
1134 | struct xol_area *area; | ||
1141 | 1135 | ||
1142 | area = mm->uprobes_state.xol_area; | 1136 | area = kmalloc(sizeof(*area), GFP_KERNEL); |
1143 | if (area) | ||
1144 | goto ret; | ||
1145 | |||
1146 | area = kzalloc(sizeof(*area), GFP_KERNEL); | ||
1147 | if (unlikely(!area)) | 1137 | if (unlikely(!area)) |
1148 | goto out; | 1138 | goto out; |
1149 | 1139 | ||
@@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void) | |||
1155 | if (!area->page) | 1145 | if (!area->page) |
1156 | goto free_bitmap; | 1146 | goto free_bitmap; |
1157 | 1147 | ||
1158 | /* allocate first slot of task's xol_area for the return probes */ | 1148 | area->vaddr = vaddr; |
1149 | init_waitqueue_head(&area->wq); | ||
1150 | /* Reserve the 1st slot for get_trampoline_vaddr() */ | ||
1159 | set_bit(0, area->bitmap); | 1151 | set_bit(0, area->bitmap); |
1160 | copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); | ||
1161 | atomic_set(&area->slot_count, 1); | 1152 | atomic_set(&area->slot_count, 1); |
1162 | init_waitqueue_head(&area->wq); | 1153 | copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); |
1163 | 1154 | ||
1164 | if (!xol_add_vma(area)) | 1155 | if (!xol_add_vma(mm, area)) |
1165 | return area; | 1156 | return area; |
1166 | 1157 | ||
1167 | __free_page(area->page); | 1158 | __free_page(area->page); |
@@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void) | |||
1170 | free_area: | 1161 | free_area: |
1171 | kfree(area); | 1162 | kfree(area); |
1172 | out: | 1163 | out: |
1164 | return NULL; | ||
1165 | } | ||
1166 | |||
1167 | /* | ||
1168 | * get_xol_area - Allocate process's xol_area if necessary. | ||
1169 | * This area will be used for storing instructions for execution out of line. | ||
1170 | * | ||
1171 | * Returns the allocated area or NULL. | ||
1172 | */ | ||
1173 | static struct xol_area *get_xol_area(void) | ||
1174 | { | ||
1175 | struct mm_struct *mm = current->mm; | ||
1176 | struct xol_area *area; | ||
1177 | |||
1178 | if (!mm->uprobes_state.xol_area) | ||
1179 | __create_xol_area(0); | ||
1180 | |||
1173 | area = mm->uprobes_state.xol_area; | 1181 | area = mm->uprobes_state.xol_area; |
1174 | ret: | 1182 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ |
1175 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ | ||
1176 | return area; | 1183 | return area; |
1177 | } | 1184 | } |
1178 | 1185 | ||
@@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) | |||
1256 | return 0; | 1263 | return 0; |
1257 | 1264 | ||
1258 | /* Initialize the slot */ | 1265 | /* Initialize the slot */ |
1259 | copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); | 1266 | copy_to_page(area->page, xol_vaddr, |
1267 | uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); | ||
1260 | /* | 1268 | /* |
1261 | * We probably need flush_icache_user_range() but it needs vma. | 1269 | * We probably need flush_icache_user_range() but it needs vma. |
1262 | * This should work on supported architectures too. | 1270 | * This should work on supported architectures too. |
@@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t) | |||
1345 | } | 1353 | } |
1346 | 1354 | ||
1347 | /* | 1355 | /* |
1348 | * Called in context of a new clone/fork from copy_process. | ||
1349 | */ | ||
1350 | void uprobe_copy_process(struct task_struct *t) | ||
1351 | { | ||
1352 | t->utask = NULL; | ||
1353 | } | ||
1354 | |||
1355 | /* | ||
1356 | * Allocate a uprobe_task object for the task if necessary. | 1356 |
1357 | * Called when the thread hits a breakpoint. | 1357 | * Called when the thread hits a breakpoint. |
1358 | * | 1358 | * |
@@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void) | |||
1367 | return current->utask; | 1367 | return current->utask; |
1368 | } | 1368 | } |
1369 | 1369 | ||
1370 | static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) | ||
1371 | { | ||
1372 | struct uprobe_task *n_utask; | ||
1373 | struct return_instance **p, *o, *n; | ||
1374 | |||
1375 | n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); | ||
1376 | if (!n_utask) | ||
1377 | return -ENOMEM; | ||
1378 | t->utask = n_utask; | ||
1379 | |||
1380 | p = &n_utask->return_instances; | ||
1381 | for (o = o_utask->return_instances; o; o = o->next) { | ||
1382 | n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); | ||
1383 | if (!n) | ||
1384 | return -ENOMEM; | ||
1385 | |||
1386 | *n = *o; | ||
1387 | atomic_inc(&n->uprobe->ref); | ||
1388 | n->next = NULL; | ||
1389 | |||
1390 | *p = n; | ||
1391 | p = &n->next; | ||
1392 | n_utask->depth++; | ||
1393 | } | ||
1394 | |||
1395 | return 0; | ||
1396 | } | ||
1397 | |||
1398 | static void uprobe_warn(struct task_struct *t, const char *msg) | ||
1399 | { | ||
1400 | pr_warn("uprobe: %s:%d failed to %s\n", | ||
1401 | current->comm, current->pid, msg); | ||
1402 | } | ||
1403 | |||
1404 | static void dup_xol_work(struct callback_head *work) | ||
1405 | { | ||
1406 | kfree(work); | ||
1407 | |||
1408 | if (current->flags & PF_EXITING) | ||
1409 | return; | ||
1410 | |||
1411 | if (!__create_xol_area(current->utask->vaddr)) | ||
1412 | uprobe_warn(current, "dup xol area"); | ||
1413 | } | ||
1414 | |||
1415 | /* | ||
1416 | * Called in context of a new clone/fork from copy_process. | ||
1417 | */ | ||
1418 | void uprobe_copy_process(struct task_struct *t, unsigned long flags) | ||
1419 | { | ||
1420 | struct uprobe_task *utask = current->utask; | ||
1421 | struct mm_struct *mm = current->mm; | ||
1422 | struct callback_head *work; | ||
1423 | struct xol_area *area; | ||
1424 | |||
1425 | t->utask = NULL; | ||
1426 | |||
1427 | if (!utask || !utask->return_instances) | ||
1428 | return; | ||
1429 | |||
1430 | if (mm == t->mm && !(flags & CLONE_VFORK)) | ||
1431 | return; | ||
1432 | |||
1433 | if (dup_utask(t, utask)) | ||
1434 | return uprobe_warn(t, "dup ret instances"); | ||
1435 | |||
1436 | /* The task can fork() after dup_xol_work() fails */ | ||
1437 | area = mm->uprobes_state.xol_area; | ||
1438 | if (!area) | ||
1439 | return uprobe_warn(t, "dup xol area"); | ||
1440 | |||
1441 | if (mm == t->mm) | ||
1442 | return; | ||
1443 | |||
1444 | /* TODO: move it into the union in uprobe_task */ | ||
1445 | work = kmalloc(sizeof(*work), GFP_KERNEL); | ||
1446 | if (!work) | ||
1447 | return uprobe_warn(t, "dup xol area"); | ||
1448 | |||
1449 | t->utask->vaddr = area->vaddr; | ||
1450 | init_task_work(work, dup_xol_work); | ||
1451 | task_work_add(t, work, true); | ||
1452 | } | ||
1453 | |||
1370 | /* | 1454 | /* |
1371 | * Current area->vaddr notion assumes the trampoline address is always | 1455 |
1372 | * equal area->vaddr. | 1456 | * equal area->vaddr. |
@@ -1857,9 +1941,4 @@ static int __init init_uprobes(void) | |||
1857 | 1941 | ||
1858 | return register_die_notifier(&uprobe_exception_nb); | 1942 | return register_die_notifier(&uprobe_exception_nb); |
1859 | } | 1943 | } |
1860 | module_init(init_uprobes); | 1944 | __initcall(init_uprobes); |
1861 | |||
1862 | static void __exit exit_uprobes(void) | ||
1863 | { | ||
1864 | } | ||
1865 | module_exit(exit_uprobes); | ||
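copy_insn() above is restructured into a loop that copies the probed instruction in chunks which never straddle a page boundary. A standalone sketch of just that chunking arithmetic (offsets and sizes are invented for the example):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long long offs = 4090; /* file offset of the instruction */
        int size = 16;                  /* bytes still wanted */

        while (size) {
                /* Bytes left in the current page, capped at what we still need. */
                int len = (int)(PAGE_SIZE - (offs & ~PAGE_MASK));
                if (len > size)
                        len = size;

                printf("copy %2d bytes at offset %llu\n", len, offs);

                offs += len;
                size -= len;
        }
        return 0;
}

With these inputs the first chunk takes the 6 bytes up to the page boundary and the second takes the remaining 10 from the next page, matching the min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)) step in the patch.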
diff --git a/kernel/extable.c b/kernel/extable.c index 832cb28105bb..763faf037ec1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) | |||
61 | static inline int init_kernel_text(unsigned long addr) | 61 | static inline int init_kernel_text(unsigned long addr) |
62 | { | 62 | { |
63 | if (addr >= (unsigned long)_sinittext && | 63 | if (addr >= (unsigned long)_sinittext && |
64 | addr <= (unsigned long)_einittext) | 64 | addr < (unsigned long)_einittext) |
65 | return 1; | 65 | return 1; |
66 | return 0; | 66 | return 0; |
67 | } | 67 | } |
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr) | |||
69 | int core_kernel_text(unsigned long addr) | 69 | int core_kernel_text(unsigned long addr) |
70 | { | 70 | { |
71 | if (addr >= (unsigned long)_stext && | 71 | if (addr >= (unsigned long)_stext && |
72 | addr <= (unsigned long)_etext) | 72 | addr < (unsigned long)_etext) |
73 | return 1; | 73 | return 1; |
74 | 74 | ||
75 | if (system_state == SYSTEM_BOOTING && | 75 | if (system_state == SYSTEM_BOOTING && |
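The extable.c hunks tighten the section checks from "addr <= end" to "addr < end": the _etext/_einittext symbols mark the first byte after their sections, so the valid range is half-open. A tiny illustration with made-up addresses:

#include <stdio.h>

static int in_text(unsigned long addr, unsigned long start, unsigned long end)
{
        return addr >= start && addr < end;     /* half-open: [start, end) */
}

int main(void)
{
        unsigned long start = 0x1000, end = 0x2000; /* end is one past the section */

        printf("%d\n", in_text(0x1fff, start, end)); /* 1: last byte inside */
        printf("%d\n", in_text(0x2000, start, end)); /* 0: first byte outside */
        return 0;
}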
diff --git a/kernel/fork.c b/kernel/fork.c index 086fe73ad6bd..dfa736c98d17 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -532,11 +532,12 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) | |||
532 | mm->flags = (current->mm) ? | 532 | mm->flags = (current->mm) ? |
533 | (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; | 533 | (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; |
534 | mm->core_state = NULL; | 534 | mm->core_state = NULL; |
535 | mm->nr_ptes = 0; | 535 | atomic_long_set(&mm->nr_ptes, 0); |
536 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); | 536 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); |
537 | spin_lock_init(&mm->page_table_lock); | 537 | spin_lock_init(&mm->page_table_lock); |
538 | mm_init_aio(mm); | 538 | mm_init_aio(mm); |
539 | mm_init_owner(mm, p); | 539 | mm_init_owner(mm, p); |
540 | clear_tlb_flush_pending(mm); | ||
540 | 541 | ||
541 | if (likely(!mm_alloc_pgd(mm))) { | 542 | if (likely(!mm_alloc_pgd(mm))) { |
542 | mm->def_flags = 0; | 543 | mm->def_flags = 0; |
@@ -560,7 +561,7 @@ static void check_mm(struct mm_struct *mm) | |||
560 | "mm:%p idx:%d val:%ld\n", mm, i, x); | 561 | "mm:%p idx:%d val:%ld\n", mm, i, x); |
561 | } | 562 | } |
562 | 563 | ||
563 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 564 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
564 | VM_BUG_ON(mm->pmd_huge_pte); | 565 | VM_BUG_ON(mm->pmd_huge_pte); |
565 | #endif | 566 | #endif |
566 | } | 567 | } |
@@ -814,12 +815,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
814 | memcpy(mm, oldmm, sizeof(*mm)); | 815 | memcpy(mm, oldmm, sizeof(*mm)); |
815 | mm_init_cpumask(mm); | 816 | mm_init_cpumask(mm); |
816 | 817 | ||
817 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 818 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
818 | mm->pmd_huge_pte = NULL; | 819 | mm->pmd_huge_pte = NULL; |
819 | #endif | 820 | #endif |
820 | #ifdef CONFIG_NUMA_BALANCING | ||
821 | mm->first_nid = NUMA_PTE_SCAN_INIT; | ||
822 | #endif | ||
823 | if (!mm_init(mm, tsk)) | 821 | if (!mm_init(mm, tsk)) |
824 | goto fail_nomem; | 822 | goto fail_nomem; |
825 | 823 | ||
@@ -1174,7 +1172,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1174 | * do not allow it to share a thread group or signal handlers or | 1172 | * do not allow it to share a thread group or signal handlers or |
1175 | * parent with the forking task. | 1173 | * parent with the forking task. |
1176 | */ | 1174 | */ |
1177 | if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { | 1175 | if (clone_flags & CLONE_SIGHAND) { |
1178 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || | 1176 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || |
1179 | (task_active_pid_ns(current) != | 1177 | (task_active_pid_ns(current) != |
1180 | current->nsproxy->pid_ns_for_children)) | 1178 | current->nsproxy->pid_ns_for_children)) |
@@ -1313,7 +1311,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1313 | #endif | 1311 | #endif |
1314 | 1312 | ||
1315 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1313 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
1316 | sched_fork(p); | 1314 | sched_fork(clone_flags, p); |
1317 | 1315 | ||
1318 | retval = perf_event_init_task(p); | 1316 | retval = perf_event_init_task(p); |
1319 | if (retval) | 1317 | if (retval) |
@@ -1373,7 +1371,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1373 | INIT_LIST_HEAD(&p->pi_state_list); | 1371 | INIT_LIST_HEAD(&p->pi_state_list); |
1374 | p->pi_state_cache = NULL; | 1372 | p->pi_state_cache = NULL; |
1375 | #endif | 1373 | #endif |
1376 | uprobe_copy_process(p); | ||
1377 | /* | 1374 | /* |
1378 | * sigaltstack should be cleared when sharing the same VM | 1375 | * sigaltstack should be cleared when sharing the same VM |
1379 | */ | 1376 | */ |
@@ -1490,6 +1487,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1490 | perf_event_fork(p); | 1487 | perf_event_fork(p); |
1491 | 1488 | ||
1492 | trace_task_newtask(p, clone_flags); | 1489 | trace_task_newtask(p, clone_flags); |
1490 | uprobe_copy_process(p, clone_flags); | ||
1493 | 1491 | ||
1494 | return p; | 1492 | return p; |
1495 | 1493 | ||
diff --git a/kernel/freezer.c b/kernel/freezer.c index b462fa197517..aa6a8aadb911 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt); | |||
19 | bool pm_freezing; | 19 | bool pm_freezing; |
20 | bool pm_nosig_freezing; | 20 | bool pm_nosig_freezing; |
21 | 21 | ||
22 | /* | ||
23 | * Temporary export for the deadlock workaround in ata_scsi_hotplug(). | ||
24 | * Remove once the hack becomes unnecessary. | ||
25 | */ | ||
26 | EXPORT_SYMBOL_GPL(pm_freezing); | ||
27 | |||
22 | /* protects freezing and frozen transitions */ | 28 | /* protects freezing and frozen transitions */ |
23 | static DEFINE_SPINLOCK(freezer_lock); | 29 | static DEFINE_SPINLOCK(freezer_lock); |
24 | 30 | ||
diff --git a/kernel/futex.c b/kernel/futex.c index c3a1a55a5214..f6ff0191ecf7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -66,7 +66,7 @@ | |||
66 | 66 | ||
67 | #include <asm/futex.h> | 67 | #include <asm/futex.h> |
68 | 68 | ||
69 | #include "rtmutex_common.h" | 69 | #include "locking/rtmutex_common.h" |
70 | 70 | ||
71 | int __read_mostly futex_cmpxchg_enabled; | 71 | int __read_mostly futex_cmpxchg_enabled; |
72 | 72 | ||
@@ -251,6 +251,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) | |||
251 | return -EINVAL; | 251 | return -EINVAL; |
252 | address -= key->both.offset; | 252 | address -= key->both.offset; |
253 | 253 | ||
254 | if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) | ||
255 | return -EFAULT; | ||
256 | |||
254 | /* | 257 | /* |
255 | * PROCESS_PRIVATE futexes are fast. | 258 | * PROCESS_PRIVATE futexes are fast. |
256 | * As the mm cannot disappear under us and the 'key' only needs | 259 | * As the mm cannot disappear under us and the 'key' only needs |
@@ -259,8 +262,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) | |||
259 | * but access_ok() should be faster than find_vma() | 262 | * but access_ok() should be faster than find_vma() |
260 | */ | 263 | */ |
261 | if (!fshared) { | 264 | if (!fshared) { |
262 | if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) | ||
263 | return -EFAULT; | ||
264 | key->private.mm = mm; | 265 | key->private.mm = mm; |
265 | key->private.address = address; | 266 | key->private.address = address; |
266 | get_futex_key_refs(key); | 267 | get_futex_key_refs(key); |
@@ -288,7 +289,7 @@ again: | |||
288 | put_page(page); | 289 | put_page(page); |
289 | /* serialize against __split_huge_page_splitting() */ | 290 | /* serialize against __split_huge_page_splitting() */ |
290 | local_irq_disable(); | 291 | local_irq_disable(); |
291 | if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { | 292 | if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { |
292 | page_head = compound_head(page); | 293 | page_head = compound_head(page); |
293 | /* | 294 | /* |
294 | * page_head is valid pointer but we must pin | 295 | * page_head is valid pointer but we must pin |
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index d4da55d1fb65..d04ce8ac4399 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
@@ -46,4 +46,34 @@ config GCOV_PROFILE_ALL | |||
46 | larger and run slower. Also be sure to exclude files from profiling | 46 | larger and run slower. Also be sure to exclude files from profiling |
47 | which are not linked to the kernel image to prevent linker errors. | 47 | which are not linked to the kernel image to prevent linker errors. |
48 | 48 | ||
49 | choice | ||
50 | prompt "Specify GCOV format" | ||
51 | depends on GCOV_KERNEL | ||
52 | default GCOV_FORMAT_AUTODETECT | ||
53 | ---help--- | ||
54 | The gcov format is usually determined by the GCC version, but there are | ||
55 | exceptions where format changes are integrated in lower-version GCCs. | ||
56 | In such a case use this option to adjust the format used in the kernel | ||
57 | accordingly. | ||
58 | |||
59 | If unsure, choose "Autodetect". | ||
60 | |||
61 | config GCOV_FORMAT_AUTODETECT | ||
62 | bool "Autodetect" | ||
63 | ---help--- | ||
64 | Select this option to use the format that corresponds to your GCC | ||
65 | version. | ||
66 | |||
67 | config GCOV_FORMAT_3_4 | ||
68 | bool "GCC 3.4 format" | ||
69 | ---help--- | ||
70 | Select this option to use the format defined by GCC 3.4. | ||
71 | |||
72 | config GCOV_FORMAT_4_7 | ||
73 | bool "GCC 4.7 format" | ||
74 | ---help--- | ||
75 | Select this option to use the format defined by GCC 4.7. | ||
76 | |||
77 | endchoice | ||
78 | |||
49 | endmenu | 79 | endmenu |
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index e97ca59e2520..52aa7e8de927 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile | |||
@@ -1,3 +1,33 @@ | |||
1 | ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' | 1 | ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' |
2 | 2 | ||
3 | obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o | 3 | # if-lt |
4 | # Usage VAR := $(call if-lt, $(a), $(b)) | ||
5 | # Returns 1 if (a < b) | ||
6 | if-lt = $(shell [ $(1) -lt $(2) ] && echo 1) | ||
7 | |||
8 | ifeq ($(CONFIG_GCOV_FORMAT_3_4),y) | ||
9 | cc-ver := 0304 | ||
10 | else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y) | ||
11 | cc-ver := 0407 | ||
12 | else | ||
13 | # Use cc-version if available, otherwise set 0 | ||
14 | # | ||
15 | # scripts/Kbuild.include, which contains cc-version function, is not included | ||
16 | # during make clean "make -f scripts/Makefile.clean obj=kernel/gcov" | ||
17 | # Meaning cc-ver is empty, causing the if-lt test to fail with the | ||
18 | # "/bin/sh: line 0: [: -lt: unary operator expected" error message. | ||
19 | # This has no effect on the clean phase, but the error message could be | ||
20 | # confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version | ||
21 | # is not available. We could probably either move if-lt to Kbuild.include, so it | ||
22 | # is also not defined during clean, or include Kbuild.include in | ||
23 | # scripts/Makefile.clean. But the following workaround seems least invasive. | ||
24 | cc-ver := $(if $(call cc-version),$(call cc-version),0) | ||
25 | endif | ||
26 | |||
27 | obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o | ||
28 | |||
29 | ifeq ($(call if-lt, $(cc-ver), 0407),1) | ||
30 | obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o | ||
31 | else | ||
32 | obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o | ||
33 | endif | ||
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9b22d03cc581..f45b75b713c0 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c | |||
@@ -20,7 +20,6 @@ | |||
20 | #include <linux/mutex.h> | 20 | #include <linux/mutex.h> |
21 | #include "gcov.h" | 21 | #include "gcov.h" |
22 | 22 | ||
23 | static struct gcov_info *gcov_info_head; | ||
24 | static int gcov_events_enabled; | 23 | static int gcov_events_enabled; |
25 | static DEFINE_MUTEX(gcov_lock); | 24 | static DEFINE_MUTEX(gcov_lock); |
26 | 25 | ||
@@ -34,7 +33,7 @@ void __gcov_init(struct gcov_info *info) | |||
34 | 33 | ||
35 | mutex_lock(&gcov_lock); | 34 | mutex_lock(&gcov_lock); |
36 | if (gcov_version == 0) { | 35 | if (gcov_version == 0) { |
37 | gcov_version = info->version; | 36 | gcov_version = gcov_info_version(info); |
38 | /* | 37 | /* |
39 | * Printing gcc's version magic may prove useful for debugging | 38 | * Printing gcc's version magic may prove useful for debugging |
40 | * incompatibility reports. | 39 | * incompatibility reports. |
@@ -45,8 +44,7 @@ void __gcov_init(struct gcov_info *info) | |||
45 | * Add new profiling data structure to list and inform event | 44 | * Add new profiling data structure to list and inform event |
46 | * listener. | 45 | * listener. |
47 | */ | 46 | */ |
48 | info->next = gcov_info_head; | 47 | gcov_info_link(info); |
49 | gcov_info_head = info; | ||
50 | if (gcov_events_enabled) | 48 | if (gcov_events_enabled) |
51 | gcov_event(GCOV_ADD, info); | 49 | gcov_event(GCOV_ADD, info); |
52 | mutex_unlock(&gcov_lock); | 50 | mutex_unlock(&gcov_lock); |
@@ -81,6 +79,12 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) | |||
81 | } | 79 | } |
82 | EXPORT_SYMBOL(__gcov_merge_delta); | 80 | EXPORT_SYMBOL(__gcov_merge_delta); |
83 | 81 | ||
82 | void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) | ||
83 | { | ||
84 | /* Unused. */ | ||
85 | } | ||
86 | EXPORT_SYMBOL(__gcov_merge_ior); | ||
87 | |||
84 | /** | 88 | /** |
85 | * gcov_enable_events - enable event reporting through gcov_event() | 89 | * gcov_enable_events - enable event reporting through gcov_event() |
86 | * | 90 | * |
@@ -91,13 +95,15 @@ EXPORT_SYMBOL(__gcov_merge_delta); | |||
91 | */ | 95 | */ |
92 | void gcov_enable_events(void) | 96 | void gcov_enable_events(void) |
93 | { | 97 | { |
94 | struct gcov_info *info; | 98 | struct gcov_info *info = NULL; |
95 | 99 | ||
96 | mutex_lock(&gcov_lock); | 100 | mutex_lock(&gcov_lock); |
97 | gcov_events_enabled = 1; | 101 | gcov_events_enabled = 1; |
102 | |||
98 | /* Perform event callback for previously registered entries. */ | 103 | /* Perform event callback for previously registered entries. */ |
99 | for (info = gcov_info_head; info; info = info->next) | 104 | while ((info = gcov_info_next(info))) |
100 | gcov_event(GCOV_ADD, info); | 105 | gcov_event(GCOV_ADD, info); |
106 | |||
101 | mutex_unlock(&gcov_lock); | 107 | mutex_unlock(&gcov_lock); |
102 | } | 108 | } |
103 | 109 | ||
@@ -112,25 +118,23 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, | |||
112 | void *data) | 118 | void *data) |
113 | { | 119 | { |
114 | struct module *mod = data; | 120 | struct module *mod = data; |
115 | struct gcov_info *info; | 121 | struct gcov_info *info = NULL; |
116 | struct gcov_info *prev; | 122 | struct gcov_info *prev = NULL; |
117 | 123 | ||
118 | if (event != MODULE_STATE_GOING) | 124 | if (event != MODULE_STATE_GOING) |
119 | return NOTIFY_OK; | 125 | return NOTIFY_OK; |
120 | mutex_lock(&gcov_lock); | 126 | mutex_lock(&gcov_lock); |
121 | prev = NULL; | 127 | |
122 | /* Remove entries located in module from linked list. */ | 128 | /* Remove entries located in module from linked list. */ |
123 | for (info = gcov_info_head; info; info = info->next) { | 129 | while ((info = gcov_info_next(info))) { |
124 | if (within(info, mod->module_core, mod->core_size)) { | 130 | if (within(info, mod->module_core, mod->core_size)) { |
125 | if (prev) | 131 | gcov_info_unlink(prev, info); |
126 | prev->next = info->next; | ||
127 | else | ||
128 | gcov_info_head = info->next; | ||
129 | if (gcov_events_enabled) | 132 | if (gcov_events_enabled) |
130 | gcov_event(GCOV_REMOVE, info); | 133 | gcov_event(GCOV_REMOVE, info); |
131 | } else | 134 | } else |
132 | prev = info; | 135 | prev = info; |
133 | } | 136 | } |
137 | |||
134 | mutex_unlock(&gcov_lock); | 138 | mutex_unlock(&gcov_lock); |
135 | 139 | ||
136 | return NOTIFY_OK; | 140 | return NOTIFY_OK; |
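base.c now touches the gcov_info list only through gcov_info_next(), gcov_info_link() and gcov_info_unlink(). The idiom is to seed the walk with NULL and to carry a prev pointer so removal works for both the head and interior nodes. A self-contained userspace sketch of the same pattern (node names and functions are illustrative, not kernel code):

    #include <stdio.h>
    #include <string.h>

    struct node { const char *name; struct node *next; };

    static struct node *head;

    /* Return the node after @n, or the list head when @n is NULL. */
    static struct node *node_next(struct node *n) { return n ? n->next : head; }

    static void node_link(struct node *n) { n->next = head; head = n; }

    static void node_unlink(struct node *prev, struct node *n)
    {
        if (prev)
            prev->next = n->next;
        else
            head = n->next;
    }

    int main(void)
    {
        struct node a = { "a", NULL }, b = { "b", NULL }, c = { "c", NULL };
        struct node *n = NULL, *prev = NULL;

        node_link(&a); node_link(&b); node_link(&c);   /* list: c -> b -> a */

        /* Walk and drop "b", tracking the predecessor as the notifier does. */
        while ((n = node_next(n))) {
            if (strcmp(n->name, "b") == 0)
                node_unlink(prev, n);
            else
                prev = n;
        }

        for (n = node_next(NULL); n; n = n->next)
            printf("%s ", n->name);                    /* prints: c a */
        putchar('\n');
        return 0;
    }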
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 7a7d2ee96d42..15ff01a76379 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
@@ -75,7 +75,7 @@ static int __init gcov_persist_setup(char *str) | |||
75 | unsigned long val; | 75 | unsigned long val; |
76 | 76 | ||
77 | if (kstrtoul(str, 0, &val)) { | 77 | if (kstrtoul(str, 0, &val)) { |
78 | pr_warning("invalid gcov_persist parameter '%s'\n", str); | 78 | pr_warn("invalid gcov_persist parameter '%s'\n", str); |
79 | return 0; | 79 | return 0; |
80 | } | 80 | } |
81 | gcov_persist = val; | 81 | gcov_persist = val; |
@@ -242,7 +242,7 @@ static struct gcov_node *get_node_by_name(const char *name) | |||
242 | 242 | ||
243 | list_for_each_entry(node, &all_head, all) { | 243 | list_for_each_entry(node, &all_head, all) { |
244 | info = get_node_info(node); | 244 | info = get_node_info(node); |
245 | if (info && (strcmp(info->filename, name) == 0)) | 245 | if (info && (strcmp(gcov_info_filename(info), name) == 0)) |
246 | return node; | 246 | return node; |
247 | } | 247 | } |
248 | 248 | ||
@@ -279,7 +279,7 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, | |||
279 | seq = file->private_data; | 279 | seq = file->private_data; |
280 | info = gcov_iter_get_info(seq->private); | 280 | info = gcov_iter_get_info(seq->private); |
281 | mutex_lock(&node_lock); | 281 | mutex_lock(&node_lock); |
282 | node = get_node_by_name(info->filename); | 282 | node = get_node_by_name(gcov_info_filename(info)); |
283 | if (node) { | 283 | if (node) { |
284 | /* Reset counts or remove node for unloaded modules. */ | 284 | /* Reset counts or remove node for unloaded modules. */ |
285 | if (node->num_loaded == 0) | 285 | if (node->num_loaded == 0) |
@@ -365,7 +365,7 @@ static const char *deskew(const char *basename) | |||
365 | */ | 365 | */ |
366 | static void add_links(struct gcov_node *node, struct dentry *parent) | 366 | static void add_links(struct gcov_node *node, struct dentry *parent) |
367 | { | 367 | { |
368 | char *basename; | 368 | const char *basename; |
369 | char *target; | 369 | char *target; |
370 | int num; | 370 | int num; |
371 | int i; | 371 | int i; |
@@ -376,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent) | |||
376 | if (!node->links) | 376 | if (!node->links) |
377 | return; | 377 | return; |
378 | for (i = 0; i < num; i++) { | 378 | for (i = 0; i < num; i++) { |
379 | target = get_link_target(get_node_info(node)->filename, | 379 | target = get_link_target( |
380 | &gcov_link[i]); | 380 | gcov_info_filename(get_node_info(node)), |
381 | &gcov_link[i]); | ||
381 | if (!target) | 382 | if (!target) |
382 | goto out_err; | 383 | goto out_err; |
383 | basename = strrchr(target, '/'); | 384 | basename = kbasename(target); |
384 | if (!basename) | 385 | if (basename == target) |
385 | goto out_err; | 386 | goto out_err; |
386 | basename++; | ||
387 | node->links[i] = debugfs_create_symlink(deskew(basename), | 387 | node->links[i] = debugfs_create_symlink(deskew(basename), |
388 | parent, target); | 388 | parent, target); |
389 | if (!node->links[i]) | 389 | if (!node->links[i]) |
@@ -450,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent, | |||
450 | } else | 450 | } else |
451 | node->dentry = debugfs_create_dir(node->name, parent->dentry); | 451 | node->dentry = debugfs_create_dir(node->name, parent->dentry); |
452 | if (!node->dentry) { | 452 | if (!node->dentry) { |
453 | pr_warning("could not create file\n"); | 453 | pr_warn("could not create file\n"); |
454 | kfree(node); | 454 | kfree(node); |
455 | return NULL; | 455 | return NULL; |
456 | } | 456 | } |
@@ -463,7 +463,7 @@ static struct gcov_node *new_node(struct gcov_node *parent, | |||
463 | 463 | ||
464 | err_nomem: | 464 | err_nomem: |
465 | kfree(node); | 465 | kfree(node); |
466 | pr_warning("out of memory\n"); | 466 | pr_warn("out of memory\n"); |
467 | return NULL; | 467 | return NULL; |
468 | } | 468 | } |
469 | 469 | ||
@@ -576,7 +576,7 @@ static void add_node(struct gcov_info *info) | |||
576 | struct gcov_node *parent; | 576 | struct gcov_node *parent; |
577 | struct gcov_node *node; | 577 | struct gcov_node *node; |
578 | 578 | ||
579 | filename = kstrdup(info->filename, GFP_KERNEL); | 579 | filename = kstrdup(gcov_info_filename(info), GFP_KERNEL); |
580 | if (!filename) | 580 | if (!filename) |
581 | return; | 581 | return; |
582 | parent = &root_node; | 582 | parent = &root_node; |
@@ -630,8 +630,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) | |||
630 | */ | 630 | */ |
631 | loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); | 631 | loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); |
632 | if (!loaded_info) { | 632 | if (!loaded_info) { |
633 | pr_warning("could not add '%s' (out of memory)\n", | 633 | pr_warn("could not add '%s' (out of memory)\n", |
634 | info->filename); | 634 | gcov_info_filename(info)); |
635 | return; | 635 | return; |
636 | } | 636 | } |
637 | memcpy(loaded_info, node->loaded_info, | 637 | memcpy(loaded_info, node->loaded_info, |
@@ -644,8 +644,9 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) | |||
644 | * data set replaces the copy of the last one. | 644 | * data set replaces the copy of the last one. |
645 | */ | 645 | */ |
646 | if (!gcov_info_is_compatible(node->unloaded_info, info)) { | 646 | if (!gcov_info_is_compatible(node->unloaded_info, info)) { |
647 | pr_warning("discarding saved data for %s " | 647 | pr_warn("discarding saved data for %s " |
648 | "(incompatible version)\n", info->filename); | 648 | "(incompatible version)\n", |
649 | gcov_info_filename(info)); | ||
649 | gcov_info_free(node->unloaded_info); | 650 | gcov_info_free(node->unloaded_info); |
650 | node->unloaded_info = NULL; | 651 | node->unloaded_info = NULL; |
651 | } | 652 | } |
@@ -655,8 +656,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) | |||
655 | * The initial one takes precedence. | 656 | * The initial one takes precedence. |
656 | */ | 657 | */ |
657 | if (!gcov_info_is_compatible(node->loaded_info[0], info)) { | 658 | if (!gcov_info_is_compatible(node->loaded_info[0], info)) { |
658 | pr_warning("could not add '%s' (incompatible " | 659 | pr_warn("could not add '%s' (incompatible " |
659 | "version)\n", info->filename); | 660 | "version)\n", gcov_info_filename(info)); |
660 | kfree(loaded_info); | 661 | kfree(loaded_info); |
661 | return; | 662 | return; |
662 | } | 663 | } |
@@ -691,8 +692,9 @@ static void save_info(struct gcov_node *node, struct gcov_info *info) | |||
691 | else { | 692 | else { |
692 | node->unloaded_info = gcov_info_dup(info); | 693 | node->unloaded_info = gcov_info_dup(info); |
693 | if (!node->unloaded_info) { | 694 | if (!node->unloaded_info) { |
694 | pr_warning("could not save data for '%s' " | 695 | pr_warn("could not save data for '%s' " |
695 | "(out of memory)\n", info->filename); | 696 | "(out of memory)\n", |
697 | gcov_info_filename(info)); | ||
696 | } | 698 | } |
697 | } | 699 | } |
698 | } | 700 | } |
@@ -707,8 +709,8 @@ static void remove_info(struct gcov_node *node, struct gcov_info *info) | |||
707 | 709 | ||
708 | i = get_info_index(node, info); | 710 | i = get_info_index(node, info); |
709 | if (i < 0) { | 711 | if (i < 0) { |
710 | pr_warning("could not remove '%s' (not found)\n", | 712 | pr_warn("could not remove '%s' (not found)\n", |
711 | info->filename); | 713 | gcov_info_filename(info)); |
712 | return; | 714 | return; |
713 | } | 715 | } |
714 | if (gcov_persist) | 716 | if (gcov_persist) |
@@ -735,7 +737,7 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) | |||
735 | struct gcov_node *node; | 737 | struct gcov_node *node; |
736 | 738 | ||
737 | mutex_lock(&node_lock); | 739 | mutex_lock(&node_lock); |
738 | node = get_node_by_name(info->filename); | 740 | node = get_node_by_name(gcov_info_filename(info)); |
739 | switch (action) { | 741 | switch (action) { |
740 | case GCOV_ADD: | 742 | case GCOV_ADD: |
741 | if (node) | 743 | if (node) |
@@ -747,8 +749,8 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) | |||
747 | if (node) | 749 | if (node) |
748 | remove_info(node, info); | 750 | remove_info(node, info); |
749 | else { | 751 | else { |
750 | pr_warning("could not remove '%s' (not found)\n", | 752 | pr_warn("could not remove '%s' (not found)\n", |
751 | info->filename); | 753 | gcov_info_filename(info)); |
752 | } | 754 | } |
753 | break; | 755 | break; |
754 | } | 756 | } |
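The add_links() change relies on the kbasename() contract: it returns the component after the last '/', or the whole string when no '/' is present, which is why the old NULL check becomes "basename == target". A userspace stand-in showing only that contract (my_basename is hypothetical, not the kernel helper):

    #include <stdio.h>
    #include <string.h>

    /* Stand-in for kbasename(): part after the last '/', or the input itself. */
    static const char *my_basename(const char *path)
    {
        const char *tail = strrchr(path, '/');
        return tail ? tail + 1 : path;
    }

    int main(void)
    {
        const char *with_dir = "/sys/kernel/debug/gcov/foo.gcda";
        const char *bare     = "foo.gcda";

        printf("%s\n", my_basename(with_dir));       /* foo.gcda */

        /* add_links() treats "no directory part" as an error: the return
         * value equals the input pointer exactly when there was no '/'. */
        printf("%d\n", my_basename(bare) == bare);   /* 1 */
        return 0;
    }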
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index ae5bb4260033..27bc88a35013 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c | |||
@@ -21,6 +21,121 @@ | |||
21 | #include <linux/vmalloc.h> | 21 | #include <linux/vmalloc.h> |
22 | #include "gcov.h" | 22 | #include "gcov.h" |
23 | 23 | ||
24 | #define GCOV_COUNTERS 5 | ||
25 | |||
26 | static struct gcov_info *gcov_info_head; | ||
27 | |||
28 | /** | ||
29 | * struct gcov_fn_info - profiling meta data per function | ||
30 | * @ident: object file-unique function identifier | ||
31 | * @checksum: function checksum | ||
32 | * @n_ctrs: number of values per counter type belonging to this function | ||
33 | * | ||
34 | * This data is generated by gcc during compilation and doesn't change | ||
35 | * at run-time. | ||
36 | */ | ||
37 | struct gcov_fn_info { | ||
38 | unsigned int ident; | ||
39 | unsigned int checksum; | ||
40 | unsigned int n_ctrs[0]; | ||
41 | }; | ||
42 | |||
43 | /** | ||
44 | * struct gcov_ctr_info - profiling data per counter type | ||
45 | * @num: number of counter values for this type | ||
46 | * @values: array of counter values for this type | ||
47 | * @merge: merge function for counter values of this type (unused) | ||
48 | * | ||
49 | * This data is generated by gcc during compilation and doesn't change | ||
50 | * at run-time with the exception of the values array. | ||
51 | */ | ||
52 | struct gcov_ctr_info { | ||
53 | unsigned int num; | ||
54 | gcov_type *values; | ||
55 | void (*merge)(gcov_type *, unsigned int); | ||
56 | }; | ||
57 | |||
58 | /** | ||
59 | * struct gcov_info - profiling data per object file | ||
60 | * @version: gcov version magic indicating the gcc version used for compilation | ||
61 | * @next: list head for a singly-linked list | ||
62 | * @stamp: time stamp | ||
63 | * @filename: name of the associated gcov data file | ||
64 | * @n_functions: number of instrumented functions | ||
65 | * @functions: function data | ||
66 | * @ctr_mask: mask specifying which counter types are active | ||
67 | * @counts: counter data per counter type | ||
68 | * | ||
69 | * This data is generated by gcc during compilation and doesn't change | ||
70 | * at run-time with the exception of the next pointer. | ||
71 | */ | ||
72 | struct gcov_info { | ||
73 | unsigned int version; | ||
74 | struct gcov_info *next; | ||
75 | unsigned int stamp; | ||
76 | const char *filename; | ||
77 | unsigned int n_functions; | ||
78 | const struct gcov_fn_info *functions; | ||
79 | unsigned int ctr_mask; | ||
80 | struct gcov_ctr_info counts[0]; | ||
81 | }; | ||
82 | |||
83 | /** | ||
84 | * gcov_info_filename - return info filename | ||
85 | * @info: profiling data set | ||
86 | */ | ||
87 | const char *gcov_info_filename(struct gcov_info *info) | ||
88 | { | ||
89 | return info->filename; | ||
90 | } | ||
91 | |||
92 | /** | ||
93 | * gcov_info_version - return info version | ||
94 | * @info: profiling data set | ||
95 | */ | ||
96 | unsigned int gcov_info_version(struct gcov_info *info) | ||
97 | { | ||
98 | return info->version; | ||
99 | } | ||
100 | |||
101 | /** | ||
102 | * gcov_info_next - return next profiling data set | ||
103 | * @info: profiling data set | ||
104 | * | ||
105 | * Returns next gcov_info following @info or first gcov_info in the chain if | ||
106 | * @info is %NULL. | ||
107 | */ | ||
108 | struct gcov_info *gcov_info_next(struct gcov_info *info) | ||
109 | { | ||
110 | if (!info) | ||
111 | return gcov_info_head; | ||
112 | |||
113 | return info->next; | ||
114 | } | ||
115 | |||
116 | /** | ||
117 | * gcov_info_link - link/add profiling data set to the list | ||
118 | * @info: profiling data set | ||
119 | */ | ||
120 | void gcov_info_link(struct gcov_info *info) | ||
121 | { | ||
122 | info->next = gcov_info_head; | ||
123 | gcov_info_head = info; | ||
124 | } | ||
125 | |||
126 | /** | ||
127 | * gcov_info_unlink - unlink/remove profiling data set from the list | ||
128 | * @prev: previous profiling data set | ||
129 | * @info: profiling data set | ||
130 | */ | ||
131 | void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) | ||
132 | { | ||
133 | if (prev) | ||
134 | prev->next = info->next; | ||
135 | else | ||
136 | gcov_info_head = info->next; | ||
137 | } | ||
138 | |||
24 | /* Symbolic links to be created for each profiling data file. */ | 139 | /* Symbolic links to be created for each profiling data file. */ |
25 | const struct gcov_link gcov_link[] = { | 140 | const struct gcov_link gcov_link[] = { |
26 | { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ | 141 | { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ |
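Both gcov_fn_info layouts above (and the ctrs[0] member in gcc_4_7.c below) use the trailing-array idiom: a fixed header immediately followed by an array whose length is known only at allocation time. A small standalone illustration with made-up names:

    #include <stdio.h>
    #include <stdlib.h>

    /* Fixed header plus a counter array sized at allocation time. */
    struct fn_counts {
        unsigned int ident;
        unsigned int n_ctrs;
        unsigned long long ctrs[];   /* C99 flexible array member ([0] in older gcc) */
    };

    static struct fn_counts *fn_counts_alloc(unsigned int ident, unsigned int n)
    {
        struct fn_counts *f = calloc(1, sizeof(*f) + n * sizeof(f->ctrs[0]));

        if (!f)
            return NULL;
        f->ident = ident;
        f->n_ctrs = n;
        return f;
    }

    int main(void)
    {
        struct fn_counts *f = fn_counts_alloc(42, 3);

        if (!f)
            return 1;
        f->ctrs[0] = 7;
        printf("ident=%u n=%u first=%llu\n", f->ident, f->n_ctrs, f->ctrs[0]);
        free(f);
        return 0;
    }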
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c new file mode 100644 index 000000000000..2c6e4631c814 --- /dev/null +++ b/kernel/gcov/gcc_4_7.c | |||
@@ -0,0 +1,560 @@ | |||
1 | /* | ||
2 | * This code provides functions to handle gcc's profiling data format | ||
3 | * introduced with gcc 4.7. | ||
4 | * | ||
5 | * This file is based heavily on gcc_3_4.c file. | ||
6 | * | ||
7 | * For a better understanding, refer to gcc source: | ||
8 | * gcc/gcov-io.h | ||
9 | * libgcc/libgcov.c | ||
10 | * | ||
11 | * Uses gcc-internal data definitions. | ||
12 | */ | ||
13 | |||
14 | #include <linux/errno.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/seq_file.h> | ||
18 | #include <linux/vmalloc.h> | ||
19 | #include "gcov.h" | ||
20 | |||
21 | #define GCOV_COUNTERS 8 | ||
22 | #define GCOV_TAG_FUNCTION_LENGTH 3 | ||
23 | |||
24 | static struct gcov_info *gcov_info_head; | ||
25 | |||
26 | /** | ||
27 | * struct gcov_ctr_info - information about counters for a single function | ||
28 | * @num: number of counter values for this type | ||
29 | * @values: array of counter values for this type | ||
30 | * | ||
31 | * This data is generated by gcc during compilation and doesn't change | ||
32 | * at run-time with the exception of the values array. | ||
33 | */ | ||
34 | struct gcov_ctr_info { | ||
35 | unsigned int num; | ||
36 | gcov_type *values; | ||
37 | }; | ||
38 | |||
39 | /** | ||
40 | * struct gcov_fn_info - profiling meta data per function | ||
41 | * @key: comdat key | ||
42 | * @ident: unique ident of function | ||
43 | * @lineno_checksum: function lineno_checksum | ||
44 | * @cfg_checksum: function cfg checksum | ||
45 | * @ctrs: instrumented counters | ||
46 | * | ||
47 | * This data is generated by gcc during compilation and doesn't change | ||
48 | * at run-time. | ||
49 | * | ||
50 | * Information about a single function. This uses the trailing array | ||
51 | * idiom. The number of counters is determined from the merge pointer | ||
52 | * array in gcov_info. The key is used to detect which of a set of | ||
53 | * comdat functions was selected -- it points to the gcov_info object | ||
54 | * of the object file containing the selected comdat function. | ||
55 | */ | ||
56 | struct gcov_fn_info { | ||
57 | const struct gcov_info *key; | ||
58 | unsigned int ident; | ||
59 | unsigned int lineno_checksum; | ||
60 | unsigned int cfg_checksum; | ||
61 | struct gcov_ctr_info ctrs[0]; | ||
62 | }; | ||
63 | |||
64 | /** | ||
65 | * struct gcov_info - profiling data per object file | ||
66 | * @version: gcov version magic indicating the gcc version used for compilation | ||
67 | * @next: list head for a singly-linked list | ||
68 | * @stamp: uniquifying time stamp | ||
69 | * @filename: name of the associated gcov data file | ||
70 | * @merge: merge functions (null for unused counter type) | ||
71 | * @n_functions: number of instrumented functions | ||
72 | * @functions: pointer to pointers to function information | ||
73 | * | ||
74 | * This data is generated by gcc during compilation and doesn't change | ||
75 | * at run-time with the exception of the next pointer. | ||
76 | */ | ||
77 | struct gcov_info { | ||
78 | unsigned int version; | ||
79 | struct gcov_info *next; | ||
80 | unsigned int stamp; | ||
81 | const char *filename; | ||
82 | void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int); | ||
83 | unsigned int n_functions; | ||
84 | struct gcov_fn_info **functions; | ||
85 | }; | ||
86 | |||
87 | /** | ||
88 | * gcov_info_filename - return info filename | ||
89 | * @info: profiling data set | ||
90 | */ | ||
91 | const char *gcov_info_filename(struct gcov_info *info) | ||
92 | { | ||
93 | return info->filename; | ||
94 | } | ||
95 | |||
96 | /** | ||
97 | * gcov_info_version - return info version | ||
98 | * @info: profiling data set | ||
99 | */ | ||
100 | unsigned int gcov_info_version(struct gcov_info *info) | ||
101 | { | ||
102 | return info->version; | ||
103 | } | ||
104 | |||
105 | /** | ||
106 | * gcov_info_next - return next profiling data set | ||
107 | * @info: profiling data set | ||
108 | * | ||
109 | * Returns next gcov_info following @info or first gcov_info in the chain if | ||
110 | * @info is %NULL. | ||
111 | */ | ||
112 | struct gcov_info *gcov_info_next(struct gcov_info *info) | ||
113 | { | ||
114 | if (!info) | ||
115 | return gcov_info_head; | ||
116 | |||
117 | return info->next; | ||
118 | } | ||
119 | |||
120 | /** | ||
121 | * gcov_info_link - link/add profiling data set to the list | ||
122 | * @info: profiling data set | ||
123 | */ | ||
124 | void gcov_info_link(struct gcov_info *info) | ||
125 | { | ||
126 | info->next = gcov_info_head; | ||
127 | gcov_info_head = info; | ||
128 | } | ||
129 | |||
130 | /** | ||
131 | * gcov_info_unlink - unlink/remove profiling data set from the list | ||
132 | * @prev: previous profiling data set | ||
133 | * @info: profiling data set | ||
134 | */ | ||
135 | void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) | ||
136 | { | ||
137 | if (prev) | ||
138 | prev->next = info->next; | ||
139 | else | ||
140 | gcov_info_head = info->next; | ||
141 | } | ||
142 | |||
143 | /* Symbolic links to be created for each profiling data file. */ | ||
144 | const struct gcov_link gcov_link[] = { | ||
145 | { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ | ||
146 | { 0, NULL}, | ||
147 | }; | ||
148 | |||
149 | /* | ||
150 | * Determine whether a counter is active. Doesn't change at run-time. | ||
151 | */ | ||
152 | static int counter_active(struct gcov_info *info, unsigned int type) | ||
153 | { | ||
154 | return info->merge[type] ? 1 : 0; | ||
155 | } | ||
156 | |||
157 | /* Determine number of active counters. Based on gcc magic. */ | ||
158 | static unsigned int num_counter_active(struct gcov_info *info) | ||
159 | { | ||
160 | unsigned int i; | ||
161 | unsigned int result = 0; | ||
162 | |||
163 | for (i = 0; i < GCOV_COUNTERS; i++) { | ||
164 | if (counter_active(info, i)) | ||
165 | result++; | ||
166 | } | ||
167 | return result; | ||
168 | } | ||
169 | |||
170 | /** | ||
171 | * gcov_info_reset - reset profiling data to zero | ||
172 | * @info: profiling data set | ||
173 | */ | ||
174 | void gcov_info_reset(struct gcov_info *info) | ||
175 | { | ||
176 | struct gcov_ctr_info *ci_ptr; | ||
177 | unsigned int fi_idx; | ||
178 | unsigned int ct_idx; | ||
179 | |||
180 | for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { | ||
181 | ci_ptr = info->functions[fi_idx]->ctrs; | ||
182 | |||
183 | for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { | ||
184 | if (!counter_active(info, ct_idx)) | ||
185 | continue; | ||
186 | |||
187 | memset(ci_ptr->values, 0, | ||
188 | sizeof(gcov_type) * ci_ptr->num); | ||
189 | ci_ptr++; | ||
190 | } | ||
191 | } | ||
192 | } | ||
193 | |||
194 | /** | ||
195 | * gcov_info_is_compatible - check if profiling data can be added | ||
196 | * @info1: first profiling data set | ||
197 | * @info2: second profiling data set | ||
198 | * | ||
199 | * Returns non-zero if profiling data can be added, zero otherwise. | ||
200 | */ | ||
201 | int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) | ||
202 | { | ||
203 | return (info1->stamp == info2->stamp); | ||
204 | } | ||
205 | |||
206 | /** | ||
207 | * gcov_info_add - add up profiling data | ||
208 | * @dest: profiling data set to which data is added | ||
209 | * @source: profiling data set which is added | ||
210 | * | ||
211 | * Adds profiling counts of @source to @dest. | ||
212 | */ | ||
213 | void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) | ||
214 | { | ||
215 | struct gcov_ctr_info *dci_ptr; | ||
216 | struct gcov_ctr_info *sci_ptr; | ||
217 | unsigned int fi_idx; | ||
218 | unsigned int ct_idx; | ||
219 | unsigned int val_idx; | ||
220 | |||
221 | for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) { | ||
222 | dci_ptr = dst->functions[fi_idx]->ctrs; | ||
223 | sci_ptr = src->functions[fi_idx]->ctrs; | ||
224 | |||
225 | for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { | ||
226 | if (!counter_active(src, ct_idx)) | ||
227 | continue; | ||
228 | |||
229 | for (val_idx = 0; val_idx < sci_ptr->num; val_idx++) | ||
230 | dci_ptr->values[val_idx] += | ||
231 | sci_ptr->values[val_idx]; | ||
232 | |||
233 | dci_ptr++; | ||
234 | sci_ptr++; | ||
235 | } | ||
236 | } | ||
237 | } | ||
238 | |||
239 | /** | ||
240 | * gcov_info_dup - duplicate profiling data set | ||
241 | * @info: profiling data set to duplicate | ||
242 | * | ||
243 | * Return newly allocated duplicate on success, %NULL on error. | ||
244 | */ | ||
245 | struct gcov_info *gcov_info_dup(struct gcov_info *info) | ||
246 | { | ||
247 | struct gcov_info *dup; | ||
248 | struct gcov_ctr_info *dci_ptr; /* dst counter info */ | ||
249 | struct gcov_ctr_info *sci_ptr; /* src counter info */ | ||
250 | unsigned int active; | ||
251 | unsigned int fi_idx; /* function info idx */ | ||
252 | unsigned int ct_idx; /* counter type idx */ | ||
253 | size_t fi_size; /* function info size */ | ||
254 | size_t cv_size; /* counter values size */ | ||
255 | |||
256 | dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); | ||
257 | if (!dup) | ||
258 | return NULL; | ||
259 | |||
260 | dup->next = NULL; | ||
261 | dup->filename = NULL; | ||
262 | dup->functions = NULL; | ||
263 | |||
264 | dup->filename = kstrdup(info->filename, GFP_KERNEL); | ||
265 | if (!dup->filename) | ||
266 | goto err_free; | ||
267 | |||
268 | dup->functions = kcalloc(info->n_functions, | ||
269 | sizeof(struct gcov_fn_info *), GFP_KERNEL); | ||
270 | if (!dup->functions) | ||
271 | goto err_free; | ||
272 | |||
273 | active = num_counter_active(info); | ||
274 | fi_size = sizeof(struct gcov_fn_info); | ||
275 | fi_size += sizeof(struct gcov_ctr_info) * active; | ||
276 | |||
277 | for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { | ||
278 | dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL); | ||
279 | if (!dup->functions[fi_idx]) | ||
280 | goto err_free; | ||
281 | |||
282 | *(dup->functions[fi_idx]) = *(info->functions[fi_idx]); | ||
283 | |||
284 | sci_ptr = info->functions[fi_idx]->ctrs; | ||
285 | dci_ptr = dup->functions[fi_idx]->ctrs; | ||
286 | |||
287 | for (ct_idx = 0; ct_idx < active; ct_idx++) { | ||
288 | |||
289 | cv_size = sizeof(gcov_type) * sci_ptr->num; | ||
290 | |||
291 | dci_ptr->values = vmalloc(cv_size); | ||
292 | |||
293 | if (!dci_ptr->values) | ||
294 | goto err_free; | ||
295 | |||
296 | dci_ptr->num = sci_ptr->num; | ||
297 | memcpy(dci_ptr->values, sci_ptr->values, cv_size); | ||
298 | |||
299 | sci_ptr++; | ||
300 | dci_ptr++; | ||
301 | } | ||
302 | } | ||
303 | |||
304 | return dup; | ||
305 | err_free: | ||
306 | gcov_info_free(dup); | ||
307 | return NULL; | ||
308 | } | ||
309 | |||
310 | /** | ||
311 | * gcov_info_free - release memory for profiling data set duplicate | ||
312 | * @info: profiling data set duplicate to free | ||
313 | */ | ||
314 | void gcov_info_free(struct gcov_info *info) | ||
315 | { | ||
316 | unsigned int active; | ||
317 | unsigned int fi_idx; | ||
318 | unsigned int ct_idx; | ||
319 | struct gcov_ctr_info *ci_ptr; | ||
320 | |||
321 | if (!info->functions) | ||
322 | goto free_info; | ||
323 | |||
324 | active = num_counter_active(info); | ||
325 | |||
326 | for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { | ||
327 | if (!info->functions[fi_idx]) | ||
328 | continue; | ||
329 | |||
330 | ci_ptr = info->functions[fi_idx]->ctrs; | ||
331 | |||
332 | for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++) | ||
333 | vfree(ci_ptr->values); | ||
334 | |||
335 | kfree(info->functions[fi_idx]); | ||
336 | } | ||
337 | |||
338 | free_info: | ||
339 | kfree(info->functions); | ||
340 | kfree(info->filename); | ||
341 | kfree(info); | ||
342 | } | ||
343 | |||
344 | #define ITER_STRIDE PAGE_SIZE | ||
345 | |||
346 | /** | ||
347 | * struct gcov_iterator - specifies current file position in logical records | ||
348 | * @info: associated profiling data | ||
349 | * @buffer: buffer containing file data | ||
350 | * @size: size of buffer | ||
351 | * @pos: current position in file | ||
352 | */ | ||
353 | struct gcov_iterator { | ||
354 | struct gcov_info *info; | ||
355 | void *buffer; | ||
356 | size_t size; | ||
357 | loff_t pos; | ||
358 | }; | ||
359 | |||
360 | /** | ||
361 | * store_gcov_u32 - store 32 bit number in gcov format to buffer | ||
362 | * @buffer: target buffer or NULL | ||
363 | * @off: offset into the buffer | ||
364 | * @v: value to be stored | ||
365 | * | ||
366 | * Number format defined by gcc: numbers are recorded in the 32 bit | ||
367 | * unsigned binary form of the endianness of the machine generating the | ||
368 | * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't | ||
369 | * store anything. | ||
370 | */ | ||
371 | static size_t store_gcov_u32(void *buffer, size_t off, u32 v) | ||
372 | { | ||
373 | u32 *data; | ||
374 | |||
375 | if (buffer) { | ||
376 | data = buffer + off; | ||
377 | *data = v; | ||
378 | } | ||
379 | |||
380 | return sizeof(*data); | ||
381 | } | ||
382 | |||
383 | /** | ||
384 | * store_gcov_u64 - store 64 bit number in gcov format to buffer | ||
385 | * @buffer: target buffer or NULL | ||
386 | * @off: offset into the buffer | ||
387 | * @v: value to be stored | ||
388 | * | ||
389 | * Number format defined by gcc: numbers are recorded in the 32 bit | ||
390 | * unsigned binary form of the endianness of the machine generating the | ||
391 | * file. 64 bit numbers are stored as two 32 bit numbers, the low part | ||
392 | * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store | ||
393 | * anything. | ||
394 | */ | ||
395 | static size_t store_gcov_u64(void *buffer, size_t off, u64 v) | ||
396 | { | ||
397 | u32 *data; | ||
398 | |||
399 | if (buffer) { | ||
400 | data = buffer + off; | ||
401 | |||
402 | data[0] = (v & 0xffffffffUL); | ||
403 | data[1] = (v >> 32); | ||
404 | } | ||
405 | |||
406 | return sizeof(*data) * 2; | ||
407 | } | ||
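store_gcov_u64() writes a 64-bit counter as two native-endian 32-bit words, low half first, as the number format comment above describes. A userspace check of that split (put_u64 is only a stand-in for the kernel helper):

    #include <inttypes.h>
    #include <stdio.h>
    #include <string.h>

    /* Emit a 64-bit value as two native-endian 32-bit words, low half first. */
    static size_t put_u64(unsigned char *buf, size_t off, uint64_t v)
    {
        uint32_t lo = (uint32_t)(v & 0xffffffffu);
        uint32_t hi = (uint32_t)(v >> 32);

        if (buf) {
            memcpy(buf + off, &lo, sizeof(lo));
            memcpy(buf + off + sizeof(lo), &hi, sizeof(hi));
        }
        return 2 * sizeof(uint32_t);
    }

    int main(void)
    {
        unsigned char buf[8];
        uint32_t lo, hi;

        put_u64(buf, 0, 0x1122334455667788ULL);
        memcpy(&lo, buf, 4);
        memcpy(&hi, buf + 4, 4);
        /* prints lo=0x55667788 hi=0x11223344 */
        printf("lo=0x%" PRIx32 " hi=0x%" PRIx32 "\n", lo, hi);
        return 0;
    }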
408 | |||
409 | /** | ||
410 | * convert_to_gcda - convert profiling data set to gcda file format | ||
411 | * @buffer: the buffer to store file data or %NULL if no data should be stored | ||
412 | * @info: profiling data set to be converted | ||
413 | * | ||
414 | * Returns the number of bytes that were/would have been stored into the buffer. | ||
415 | */ | ||
416 | static size_t convert_to_gcda(char *buffer, struct gcov_info *info) | ||
417 | { | ||
418 | struct gcov_fn_info *fi_ptr; | ||
419 | struct gcov_ctr_info *ci_ptr; | ||
420 | unsigned int fi_idx; | ||
421 | unsigned int ct_idx; | ||
422 | unsigned int cv_idx; | ||
423 | size_t pos = 0; | ||
424 | |||
425 | /* File header. */ | ||
426 | pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); | ||
427 | pos += store_gcov_u32(buffer, pos, info->version); | ||
428 | pos += store_gcov_u32(buffer, pos, info->stamp); | ||
429 | |||
430 | for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { | ||
431 | fi_ptr = info->functions[fi_idx]; | ||
432 | |||
433 | /* Function record. */ | ||
434 | pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); | ||
435 | pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH); | ||
436 | pos += store_gcov_u32(buffer, pos, fi_ptr->ident); | ||
437 | pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum); | ||
438 | pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); | ||
439 | |||
440 | ci_ptr = fi_ptr->ctrs; | ||
441 | |||
442 | for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { | ||
443 | if (!counter_active(info, ct_idx)) | ||
444 | continue; | ||
445 | |||
446 | /* Counter record. */ | ||
447 | pos += store_gcov_u32(buffer, pos, | ||
448 | GCOV_TAG_FOR_COUNTER(ct_idx)); | ||
449 | pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2); | ||
450 | |||
451 | for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) { | ||
452 | pos += store_gcov_u64(buffer, pos, | ||
453 | ci_ptr->values[cv_idx]); | ||
454 | } | ||
455 | |||
456 | ci_ptr++; | ||
457 | } | ||
458 | } | ||
459 | |||
460 | return pos; | ||
461 | } | ||
462 | |||
463 | /** | ||
464 | * gcov_iter_new - allocate and initialize profiling data iterator | ||
465 | * @info: profiling data set to be iterated | ||
466 | * | ||
467 | * Return file iterator on success, %NULL otherwise. | ||
468 | */ | ||
469 | struct gcov_iterator *gcov_iter_new(struct gcov_info *info) | ||
470 | { | ||
471 | struct gcov_iterator *iter; | ||
472 | |||
473 | iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); | ||
474 | if (!iter) | ||
475 | goto err_free; | ||
476 | |||
477 | iter->info = info; | ||
478 | /* Dry-run to get the actual buffer size. */ | ||
479 | iter->size = convert_to_gcda(NULL, info); | ||
480 | iter->buffer = vmalloc(iter->size); | ||
481 | if (!iter->buffer) | ||
482 | goto err_free; | ||
483 | |||
484 | convert_to_gcda(iter->buffer, info); | ||
485 | |||
486 | return iter; | ||
487 | |||
488 | err_free: | ||
489 | kfree(iter); | ||
490 | return NULL; | ||
491 | } | ||
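gcov_iter_new() calls convert_to_gcda() twice: a dry run with a NULL buffer to learn the size, then a second pass into the allocated buffer. The same two-pass, measure-then-fill pattern in a tiny standalone example (serialize() is illustrative, not the kernel function):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Returns the bytes written, or that would have been written when buf is
     * NULL, mirroring the convert_to_gcda() calling convention. */
    static size_t serialize(char *buf, const char **items, size_t n)
    {
        size_t pos = 0, i;

        for (i = 0; i < n; i++) {
            size_t len = strlen(items[i]) + 1;

            if (buf)
                memcpy(buf + pos, items[i], len);
            pos += len;
        }
        return pos;
    }

    int main(void)
    {
        const char *items[] = { "file.gcda", "counters", "done" };
        size_t size = serialize(NULL, items, 3);   /* pass 1: measure */
        char *buf = malloc(size);

        if (!buf)
            return 1;
        serialize(buf, items, 3);                  /* pass 2: fill */
        printf("%zu bytes, first record: %s\n", size, buf);
        free(buf);
        return 0;
    }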
492 | |||
493 | |||
494 | /** | ||
495 | * gcov_iter_free - free iterator | ||
496 | * @iter: file iterator | ||
497 | */ | ||
498 | void gcov_iter_free(struct gcov_iterator *iter) | ||
499 | { | ||
500 | vfree(iter->buffer); | ||
501 | kfree(iter); | ||
502 | } | ||
503 | |||
504 | /** | ||
505 | * gcov_iter_get_info - return profiling data set for given file iterator | ||
506 | * @iter: file iterator | ||
507 | */ | ||
508 | struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) | ||
509 | { | ||
510 | return iter->info; | ||
511 | } | ||
512 | |||
513 | /** | ||
514 | * gcov_iter_start - reset file iterator to starting position | ||
515 | * @iter: file iterator | ||
516 | */ | ||
517 | void gcov_iter_start(struct gcov_iterator *iter) | ||
518 | { | ||
519 | iter->pos = 0; | ||
520 | } | ||
521 | |||
522 | /** | ||
523 | * gcov_iter_next - advance file iterator to next logical record | ||
524 | * @iter: file iterator | ||
525 | * | ||
526 | * Return zero if new position is valid, non-zero if iterator has reached end. | ||
527 | */ | ||
528 | int gcov_iter_next(struct gcov_iterator *iter) | ||
529 | { | ||
530 | if (iter->pos < iter->size) | ||
531 | iter->pos += ITER_STRIDE; | ||
532 | |||
533 | if (iter->pos >= iter->size) | ||
534 | return -EINVAL; | ||
535 | |||
536 | return 0; | ||
537 | } | ||
538 | |||
539 | /** | ||
540 | * gcov_iter_write - write data for current pos to seq_file | ||
541 | * @iter: file iterator | ||
542 | * @seq: seq_file handle | ||
543 | * | ||
544 | * Return zero on success, non-zero otherwise. | ||
545 | */ | ||
546 | int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) | ||
547 | { | ||
548 | size_t len; | ||
549 | |||
550 | if (iter->pos >= iter->size) | ||
551 | return -EINVAL; | ||
552 | |||
553 | len = ITER_STRIDE; | ||
554 | if (iter->pos + len > iter->size) | ||
555 | len = iter->size - iter->pos; | ||
556 | |||
557 | seq_write(seq, iter->buffer + iter->pos, len); | ||
558 | |||
559 | return 0; | ||
560 | } | ||
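gcov_iter_next() and gcov_iter_write() walk the pre-rendered gcda image in fixed ITER_STRIDE (PAGE_SIZE) steps and clamp the final chunk. The same loop shape in userspace, with a tiny stride so the chunking is visible (names are illustrative):

    #include <stdio.h>
    #include <string.h>

    #define STRIDE 4   /* stands in for ITER_STRIDE (PAGE_SIZE in the kernel) */

    /* Emit buf in fixed-size chunks, clamping the last one to the buffer end. */
    static void dump(const char *buf, size_t size)
    {
        size_t pos;

        for (pos = 0; pos < size; pos += STRIDE) {
            size_t len = STRIDE;

            if (pos + len > size)
                len = size - pos;
            printf("[%.*s]", (int)len, buf + pos);
        }
        putchar('\n');
    }

    int main(void)
    {
        const char data[] = "0123456789";
        dump(data, strlen(data));          /* prints [0123][4567][89] */
        return 0;
    }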
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 060073ebf7a6..92c8e22a29ed 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h | |||
@@ -21,7 +21,6 @@ | |||
21 | * gcc and need to be kept as close to the original definition as possible to | 21 | * gcc and need to be kept as close to the original definition as possible to |
22 | * remain compatible. | 22 | * remain compatible. |
23 | */ | 23 | */ |
24 | #define GCOV_COUNTERS 5 | ||
25 | #define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) | 24 | #define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) |
26 | #define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) | 25 | #define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) |
27 | #define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) | 26 | #define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) |
@@ -34,60 +33,18 @@ typedef long gcov_type; | |||
34 | typedef long long gcov_type; | 33 | typedef long long gcov_type; |
35 | #endif | 34 | #endif |
36 | 35 | ||
37 | /** | 36 | /* Opaque gcov_info. The gcov structures can change, as they did in gcc 4.7, so |
38 | * struct gcov_fn_info - profiling meta data per function | 37 | * we cannot use the full definition here; it has to live in the gcc-specific |
39 | * @ident: object file-unique function identifier | 38 | * gcov implementation. This also means generic code must not access the members |
40 | * @checksum: function checksum | 39 | * directly and must use the interface below. */
41 | * @n_ctrs: number of values per counter type belonging to this function | 40 | struct gcov_info; |
42 | * | ||
43 | * This data is generated by gcc during compilation and doesn't change | ||
44 | * at run-time. | ||
45 | */ | ||
46 | struct gcov_fn_info { | ||
47 | unsigned int ident; | ||
48 | unsigned int checksum; | ||
49 | unsigned int n_ctrs[0]; | ||
50 | }; | ||
51 | |||
52 | /** | ||
53 | * struct gcov_ctr_info - profiling data per counter type | ||
54 | * @num: number of counter values for this type | ||
55 | * @values: array of counter values for this type | ||
56 | * @merge: merge function for counter values of this type (unused) | ||
57 | * | ||
58 | * This data is generated by gcc during compilation and doesn't change | ||
59 | * at run-time with the exception of the values array. | ||
60 | */ | ||
61 | struct gcov_ctr_info { | ||
62 | unsigned int num; | ||
63 | gcov_type *values; | ||
64 | void (*merge)(gcov_type *, unsigned int); | ||
65 | }; | ||
66 | 41 | ||
67 | /** | 42 | /* Interface to access gcov_info data */ |
68 | * struct gcov_info - profiling data per object file | 43 | const char *gcov_info_filename(struct gcov_info *info); |
69 | * @version: gcov version magic indicating the gcc version used for compilation | 44 | unsigned int gcov_info_version(struct gcov_info *info); |
70 | * @next: list head for a singly-linked list | 45 | struct gcov_info *gcov_info_next(struct gcov_info *info); |
71 | * @stamp: time stamp | 46 | void gcov_info_link(struct gcov_info *info); |
72 | * @filename: name of the associated gcov data file | 47 | void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); |
73 | * @n_functions: number of instrumented functions | ||
74 | * @functions: function data | ||
75 | * @ctr_mask: mask specifying which counter types are active | ||
76 | * @counts: counter data per counter type | ||
77 | * | ||
78 | * This data is generated by gcc during compilation and doesn't change | ||
79 | * at run-time with the exception of the next pointer. | ||
80 | */ | ||
81 | struct gcov_info { | ||
82 | unsigned int version; | ||
83 | struct gcov_info *next; | ||
84 | unsigned int stamp; | ||
85 | const char *filename; | ||
86 | unsigned int n_functions; | ||
87 | const struct gcov_fn_info *functions; | ||
88 | unsigned int ctr_mask; | ||
89 | struct gcov_ctr_info counts[0]; | ||
90 | }; | ||
91 | 48 | ||
92 | /* Base interface. */ | 49 | /* Base interface. */ |
93 | enum gcov_action { | 50 | enum gcov_action { |
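gcov.h now exposes struct gcov_info only as a forward declaration plus accessor functions, so each gcc-format backend can define its own layout. The general opaque-handle pattern, condensed into one illustrative file (in the kernel the definition lives in gcc_3_4.c or gcc_4_7.c, not next to the header; all names below are invented):

    #include <stdio.h>

    /* What generic code sees: a forward declaration and an accessor interface. */
    struct counter_set;                                  /* opaque */
    const char *counter_set_name(const struct counter_set *s);
    unsigned int counter_set_version(const struct counter_set *s);

    /* --- normally in a separate, format-specific .c file ------------------ */
    struct counter_set {
        const char *name;
        unsigned int version;
    };

    const char *counter_set_name(const struct counter_set *s) { return s->name; }
    unsigned int counter_set_version(const struct counter_set *s) { return s->version; }
    /* ----------------------------------------------------------------------- */

    int main(void)
    {
        struct counter_set s = { "kernel/fork.gcda", 407 };   /* illustrative */

        printf("%s (version %u)\n", counter_set_name(&s), counter_set_version(&s));
        return 0;
    }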
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 3e97fb126e6b..9328b80eaf14 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -16,11 +16,12 @@ | |||
16 | #include <linux/export.h> | 16 | #include <linux/export.h> |
17 | #include <linux/sysctl.h> | 17 | #include <linux/sysctl.h> |
18 | #include <linux/utsname.h> | 18 | #include <linux/utsname.h> |
19 | #include <trace/events/sched.h> | ||
19 | 20 | ||
20 | /* | 21 | /* |
21 | * The number of tasks checked: | 22 | * The number of tasks checked: |
22 | */ | 23 | */ |
23 | unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; | 24 | int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; |
24 | 25 | ||
25 | /* | 26 | /* |
26 | * Limit number of tasks checked in a batch. | 27 | * Limit number of tasks checked in a batch. |
@@ -92,6 +93,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
92 | t->last_switch_count = switch_count; | 93 | t->last_switch_count = switch_count; |
93 | return; | 94 | return; |
94 | } | 95 | } |
96 | |||
97 | trace_sched_process_hang(t); | ||
98 | |||
95 | if (!sysctl_hung_task_warnings) | 99 | if (!sysctl_hung_task_warnings) |
96 | return; | 100 | return; |
97 | sysctl_hung_task_warnings--; | 101 | sysctl_hung_task_warnings--; |
@@ -203,6 +207,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, | |||
203 | return ret; | 207 | return ret; |
204 | } | 208 | } |
205 | 209 | ||
210 | static atomic_t reset_hung_task = ATOMIC_INIT(0); | ||
211 | |||
212 | void reset_hung_task_detector(void) | ||
213 | { | ||
214 | atomic_set(&reset_hung_task, 1); | ||
215 | } | ||
216 | EXPORT_SYMBOL_GPL(reset_hung_task_detector); | ||
217 | |||
206 | /* | 218 | /* |
207 | * kthread which checks for tasks stuck in D state | 219 | * kthread which checks for tasks stuck in D state |
208 | */ | 220 | */ |
@@ -216,6 +228,9 @@ static int watchdog(void *dummy) | |||
216 | while (schedule_timeout_interruptible(timeout_jiffies(timeout))) | 228 | while (schedule_timeout_interruptible(timeout_jiffies(timeout))) |
217 | timeout = sysctl_hung_task_timeout_secs; | 229 | timeout = sysctl_hung_task_timeout_secs; |
218 | 230 | ||
231 | if (atomic_xchg(&reset_hung_task, 0)) | ||
232 | continue; | ||
233 | |||
219 | check_hung_uninterruptible_tasks(timeout); | 234 | check_hung_uninterruptible_tasks(timeout); |
220 | } | 235 | } |
221 | 236 | ||
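reset_hung_task_detector() and the watchdog cooperate through a one-shot atomic flag: any context may set it, and the watchdog consumes it with an exchange, so one reset request skips exactly one sweep. A C11 stdatomic sketch of that handshake (function names and output are illustrative, not kernel code):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int reset_requested;

    static void request_reset(void)          /* any context may call this */
    {
        atomic_store(&reset_requested, 1);
    }

    static void watchdog_iteration(int n)
    {
        /* Consume the flag atomically: at most one sweep is skipped per request. */
        if (atomic_exchange(&reset_requested, 0)) {
            printf("sweep %d: skipped (reset requested)\n", n);
            return;
        }
        printf("sweep %d: checking tasks\n", n);
    }

    int main(void)
    {
        watchdog_iteration(1);
        request_reset();
        watchdog_iteration(2);   /* skipped */
        watchdog_iteration(3);   /* back to normal */
        return 0;
    }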
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3bb14fbe5c6..dc04c166c54d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -214,7 +214,7 @@ void irq_enable(struct irq_desc *desc) | |||
214 | } | 214 | } |
215 | 215 | ||
216 | /** | 216 | /** |
217 | * irq_disable - Mark interupt disabled | 217 | * irq_disable - Mark interrupt disabled |
218 | * @desc: irq descriptor which should be disabled | 218 | * @desc: irq descriptor which should be disabled |
219 | * | 219 | * |
220 | * If the chip does not implement the irq_disable callback, we | 220 | * If the chip does not implement the irq_disable callback, we |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 706724e9835d..cf68bb36fe58 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -465,27 +465,26 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, | |||
465 | } | 465 | } |
466 | EXPORT_SYMBOL_GPL(irq_create_strict_mappings); | 466 | EXPORT_SYMBOL_GPL(irq_create_strict_mappings); |
467 | 467 | ||
468 | unsigned int irq_create_of_mapping(struct device_node *controller, | 468 | unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) |
469 | const u32 *intspec, unsigned int intsize) | ||
470 | { | 469 | { |
471 | struct irq_domain *domain; | 470 | struct irq_domain *domain; |
472 | irq_hw_number_t hwirq; | 471 | irq_hw_number_t hwirq; |
473 | unsigned int type = IRQ_TYPE_NONE; | 472 | unsigned int type = IRQ_TYPE_NONE; |
474 | unsigned int virq; | 473 | unsigned int virq; |
475 | 474 | ||
476 | domain = controller ? irq_find_host(controller) : irq_default_domain; | 475 | domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; |
477 | if (!domain) { | 476 | if (!domain) { |
478 | pr_warn("no irq domain found for %s !\n", | 477 | pr_warn("no irq domain found for %s !\n", |
479 | of_node_full_name(controller)); | 478 | of_node_full_name(irq_data->np)); |
480 | return 0; | 479 | return 0; |
481 | } | 480 | } |
482 | 481 | ||
483 | /* If domain has no translation, then we assume interrupt line */ | 482 | /* If domain has no translation, then we assume interrupt line */ |
484 | if (domain->ops->xlate == NULL) | 483 | if (domain->ops->xlate == NULL) |
485 | hwirq = intspec[0]; | 484 | hwirq = irq_data->args[0]; |
486 | else { | 485 | else { |
487 | if (domain->ops->xlate(domain, controller, intspec, intsize, | 486 | if (domain->ops->xlate(domain, irq_data->np, irq_data->args, |
488 | &hwirq, &type)) | 487 | irq_data->args_count, &hwirq, &type)) |
489 | return 0; | 488 | return 0; |
490 | } | 489 | } |
491 | 490 | ||
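The irq_create_of_mapping() change replaces three loose parameters (node, intspec, intsize) with one of_phandle_args bundle. A rough sketch of what bundling an interrupt specifier like this looks like; struct irq_spec and map_irq are invented for illustration and are not kernel API:

    #include <stdio.h>

    /* Bundle the source node and its argument cells, roughly like of_phandle_args. */
    struct irq_spec {
        const char *node_name;       /* stands in for the device_node pointer */
        unsigned int args[4];
        int args_count;
    };

    static unsigned int map_irq(const struct irq_spec *spec)
    {
        if (spec->args_count < 1)
            return 0;
        /* "no translation" case from the diff: the first cell is the hw line. */
        return spec->args[0];
    }

    int main(void)
    {
        struct irq_spec spec = { "gpio-controller", { 17, 4 }, 2 };

        printf("hwirq %u from %s\n", map_irq(&spec), spec.node_name);
        return 0;
    }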
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 514bcfd855a8..481a13c43b17 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -786,7 +786,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) | |||
786 | } | 786 | } |
787 | 787 | ||
788 | /* | 788 | /* |
789 | * Interrupts explicitely requested as threaded interupts want to be | 789 | * Interrupts explicitly requested as threaded interrupts want to be |
790 | * preemptible - many of them need to sleep and wait for slow busses to | 790 | * preemptible - many of them need to sleep and wait for slow busses to |
791 | * complete. | 791 | * complete. |
792 | */ | 792 | */ |
@@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
956 | goto out_mput; | 956 | goto out_mput; |
957 | } | 957 | } |
958 | 958 | ||
959 | sched_setscheduler(t, SCHED_FIFO, ¶m); | 959 | sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); |
960 | 960 | ||
961 | /* | 961 | /* |
962 | * We keep the reference to the task struct even if | 962 | * We keep the reference to the task struct even if |
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cb228bf21760..abcd6ca86cb7 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -50,7 +50,7 @@ static void resume_irqs(bool want_early) | |||
50 | bool is_early = desc->action && | 50 | bool is_early = desc->action && |
51 | desc->action->flags & IRQF_EARLY_RESUME; | 51 | desc->action->flags & IRQF_EARLY_RESUME; |
52 | 52 | ||
53 | if (is_early != want_early) | 53 | if (!is_early && want_early) |
54 | continue; | 54 | continue; |
55 | 55 | ||
56 | raw_spin_lock_irqsave(&desc->lock, flags); | 56 | raw_spin_lock_irqsave(&desc->lock, flags); |
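The resume_irqs() condition change is easiest to see as a truth table: the old test skipped whenever the IRQF_EARLY_RESUME flag and the pass disagreed, while the new test only skips non-early interrupts during the early pass, so the normal pass now covers early interrupts as well. A few lines of C that print both predicates side by side:

    #include <stdio.h>
    #include <stdbool.h>

    /* Old: skip whenever the flag and the pass disagree. */
    static bool skip_old(bool is_early, bool want_early) { return is_early != want_early; }
    /* New: only skip non-early interrupts during the early pass. */
    static bool skip_new(bool is_early, bool want_early) { return !is_early && want_early; }

    int main(void)
    {
        int is, want;

        for (want = 0; want <= 1; want++)
            for (is = 0; is <= 1; is++)
                printf("want_early=%d is_early=%d  old:%s new:%s\n",
                       want, is,
                       skip_old(is, want) ? "skip" : "resume",
                       skip_new(is, want) ? "skip" : "resume");
        return 0;
    }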
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 1162f1030f18..3320b84cc60f 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h | |||
@@ -14,6 +14,7 @@ enum { | |||
14 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, | 14 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, |
15 | _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, | 15 | _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, |
16 | _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, | 16 | _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, |
17 | _IRQ_IS_POLLED = IRQ_IS_POLLED, | ||
17 | _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, | 18 | _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, |
18 | }; | 19 | }; |
19 | 20 | ||
@@ -26,6 +27,7 @@ enum { | |||
26 | #define IRQ_NOAUTOEN GOT_YOU_MORON | 27 | #define IRQ_NOAUTOEN GOT_YOU_MORON |
27 | #define IRQ_NESTED_THREAD GOT_YOU_MORON | 28 | #define IRQ_NESTED_THREAD GOT_YOU_MORON |
28 | #define IRQ_PER_CPU_DEVID GOT_YOU_MORON | 29 | #define IRQ_PER_CPU_DEVID GOT_YOU_MORON |
30 | #define IRQ_IS_POLLED GOT_YOU_MORON | ||
29 | #undef IRQF_MODIFY_MASK | 31 | #undef IRQF_MODIFY_MASK |
30 | #define IRQF_MODIFY_MASK GOT_YOU_MORON | 32 | #define IRQF_MODIFY_MASK GOT_YOU_MORON |
31 | 33 | ||
@@ -147,3 +149,8 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) | |||
147 | { | 149 | { |
148 | return desc->status_use_accessors & _IRQ_NESTED_THREAD; | 150 | return desc->status_use_accessors & _IRQ_NESTED_THREAD; |
149 | } | 151 | } |
152 | |||
153 | static inline bool irq_settings_is_polled(struct irq_desc *desc) | ||
154 | { | ||
155 | return desc->status_use_accessors & _IRQ_IS_POLLED; | ||
156 | } | ||
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7b5f012bde9d..a1d8cc63b56e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -67,8 +67,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) | |||
67 | 67 | ||
68 | raw_spin_lock(&desc->lock); | 68 | raw_spin_lock(&desc->lock); |
69 | 69 | ||
70 | /* PER_CPU and nested thread interrupts are never polled */ | 70 | /* |
71 | if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) | 71 | * PER_CPU, nested thread interrupts and interrupts explicitly |
72 | * marked polled are excluded from polling. | ||
73 | */ | ||
74 | if (irq_settings_is_per_cpu(desc) || | ||
75 | irq_settings_is_nested_thread(desc) || | ||
76 | irq_settings_is_polled(desc)) | ||
72 | goto out; | 77 | goto out; |
73 | 78 | ||
74 | /* | 79 | /* |
@@ -268,7 +273,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, | |||
268 | void note_interrupt(unsigned int irq, struct irq_desc *desc, | 273 | void note_interrupt(unsigned int irq, struct irq_desc *desc, |
269 | irqreturn_t action_ret) | 274 | irqreturn_t action_ret) |
270 | { | 275 | { |
271 | if (desc->istate & IRQS_POLL_INPROGRESS) | 276 | if (desc->istate & IRQS_POLL_INPROGRESS || |
277 | irq_settings_is_polled(desc)) | ||
272 | return; | 278 | return; |
273 | 279 | ||
274 | /* we get here again via the threaded handler */ | 280 | /* we get here again via the threaded handler */ |
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 297a9247a3b3..9019f15deab2 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -58,6 +58,7 @@ static void jump_label_update(struct static_key *key, int enable); | |||
58 | 58 | ||
59 | void static_key_slow_inc(struct static_key *key) | 59 | void static_key_slow_inc(struct static_key *key) |
60 | { | 60 | { |
61 | STATIC_KEY_CHECK_USE(); | ||
61 | if (atomic_inc_not_zero(&key->enabled)) | 62 | if (atomic_inc_not_zero(&key->enabled)) |
62 | return; | 63 | return; |
63 | 64 | ||
@@ -103,12 +104,14 @@ static void jump_label_update_timeout(struct work_struct *work) | |||
103 | 104 | ||
104 | void static_key_slow_dec(struct static_key *key) | 105 | void static_key_slow_dec(struct static_key *key) |
105 | { | 106 | { |
107 | STATIC_KEY_CHECK_USE(); | ||
106 | __static_key_slow_dec(key, 0, NULL); | 108 | __static_key_slow_dec(key, 0, NULL); |
107 | } | 109 | } |
108 | EXPORT_SYMBOL_GPL(static_key_slow_dec); | 110 | EXPORT_SYMBOL_GPL(static_key_slow_dec); |
109 | 111 | ||
110 | void static_key_slow_dec_deferred(struct static_key_deferred *key) | 112 | void static_key_slow_dec_deferred(struct static_key_deferred *key) |
111 | { | 113 | { |
114 | STATIC_KEY_CHECK_USE(); | ||
112 | __static_key_slow_dec(&key->key, key->timeout, &key->work); | 115 | __static_key_slow_dec(&key->key, key->timeout, &key->work); |
113 | } | 116 | } |
114 | EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); | 117 | EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); |
@@ -116,6 +119,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); | |||
116 | void jump_label_rate_limit(struct static_key_deferred *key, | 119 | void jump_label_rate_limit(struct static_key_deferred *key, |
117 | unsigned long rl) | 120 | unsigned long rl) |
118 | { | 121 | { |
122 | STATIC_KEY_CHECK_USE(); | ||
119 | key->timeout = rl; | 123 | key->timeout = rl; |
120 | INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); | 124 | INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); |
121 | } | 125 | } |
@@ -212,6 +216,7 @@ void __init jump_label_init(void) | |||
212 | key->next = NULL; | 216 | key->next = NULL; |
213 | #endif | 217 | #endif |
214 | } | 218 | } |
219 | static_key_initialized = true; | ||
215 | jump_label_unlock(); | 220 | jump_label_unlock(); |
216 | } | 221 | } |
217 | 222 | ||
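STATIC_KEY_CHECK_USE() pairs with the new static_key_initialized flag set at the end of jump_label_init(); as far as this diff shows, the intent is to complain when the static-key API is used before the facility is ready. A hedged userspace sketch of that guard pattern, where the macro name and warning text are assumptions rather than the kernel's:

    #include <stdio.h>
    #include <stdbool.h>

    static bool static_key_initialized;   /* set once init has run */

    /* Assumed behaviour: warn loudly on use before initialization. */
    #define CHECK_USE()                                                          \
        do {                                                                     \
            if (!static_key_initialized)                                         \
                fprintf(stderr, "%s used before jump_label_init()\n", __func__); \
        } while (0)

    static int key_enabled;

    static void key_slow_inc(void)
    {
        CHECK_USE();
        key_enabled++;
    }

    int main(void)
    {
        key_slow_inc();                 /* warns: not initialized yet */
        static_key_initialized = true;  /* what jump_label_init() does at the end */
        key_slow_inc();                 /* silent */
        printf("enabled=%d\n", key_enabled);
        return 0;
    }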
diff --git a/kernel/kexec.c b/kernel/kexec.c index 2a74f307c5ec..9c970167e402 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; | |||
47 | size_t vmcoreinfo_size; | 47 | size_t vmcoreinfo_size; |
48 | size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); | 48 | size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); |
49 | 49 | ||
50 | /* Flag to indicate we are going to kexec a new kernel */ | ||
51 | bool kexec_in_progress = false; | ||
52 | |||
50 | /* Location of the reserved area for the crash kernel */ | 53 | /* Location of the reserved area for the crash kernel */ |
51 | struct resource crashk_res = { | 54 | struct resource crashk_res = { |
52 | .name = "Crash kernel", | 55 | .name = "Crash kernel", |
@@ -921,7 +924,7 @@ static int kimage_load_segment(struct kimage *image, | |||
921 | * reinitialize them. | 924 | * reinitialize them. |
922 | * | 925 | * |
923 | * - A machine specific part that includes the syscall number | 926 | * - A machine specific part that includes the syscall number |
924 | * and the copies the image to it's final destination. And | 927 | * and then copies the image to its final destination. And |
925 | * jumps into the image at entry. | 928 | * jumps into the image at entry. |
926 | * | 929 | * |
927 | * kexec does not sync, or unmount filesystems so if you need | 930 | * kexec does not sync, or unmount filesystems so if you need |
@@ -1675,7 +1678,9 @@ int kernel_kexec(void) | |||
1675 | } else | 1678 | } else |
1676 | #endif | 1679 | #endif |
1677 | { | 1680 | { |
1681 | kexec_in_progress = true; | ||
1678 | kernel_restart_prepare(NULL); | 1682 | kernel_restart_prepare(NULL); |
1683 | migrate_to_reboot_cpu(); | ||
1679 | printk(KERN_EMERG "Starting new kernel\n"); | 1684 | printk(KERN_EMERG "Starting new kernel\n"); |
1680 | machine_shutdown(); | 1685 | machine_shutdown(); |
1681 | } | 1686 | } |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a0d367a49122..ceeadfcabb76 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -2066,7 +2066,7 @@ static int __init init_kprobes(void) | |||
2066 | { | 2066 | { |
2067 | int i, err = 0; | 2067 | int i, err = 0; |
2068 | unsigned long offset = 0, size = 0; | 2068 | unsigned long offset = 0, size = 0; |
2069 | char *modname, namebuf[128]; | 2069 | char *modname, namebuf[KSYM_NAME_LEN]; |
2070 | const char *symbol_name; | 2070 | const char *symbol_name; |
2071 | void *addr; | 2071 | void *addr; |
2072 | struct kprobe_blackpoint *kb; | 2072 | struct kprobe_blackpoint *kb; |
@@ -2192,7 +2192,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | |||
2192 | const char *sym = NULL; | 2192 | const char *sym = NULL; |
2193 | unsigned int i = *(loff_t *) v; | 2193 | unsigned int i = *(loff_t *) v; |
2194 | unsigned long offset = 0; | 2194 | unsigned long offset = 0; |
2195 | char *modname, namebuf[128]; | 2195 | char *modname, namebuf[KSYM_NAME_LEN]; |
2196 | 2196 | ||
2197 | head = &kprobe_table[i]; | 2197 | head = &kprobe_table[i]; |
2198 | preempt_disable(); | 2198 | preempt_disable(); |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 760e86df8c20..b5ae3ee860a9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -33,7 +33,7 @@ struct kthread_create_info | |||
33 | 33 | ||
34 | /* Result passed back to kthread_create() from kthreadd. */ | 34 | /* Result passed back to kthread_create() from kthreadd. */ |
35 | struct task_struct *result; | 35 | struct task_struct *result; |
36 | struct completion done; | 36 | struct completion *done; |
37 | 37 | ||
38 | struct list_head list; | 38 | struct list_head list; |
39 | }; | 39 | }; |
@@ -178,6 +178,7 @@ static int kthread(void *_create) | |||
178 | struct kthread_create_info *create = _create; | 178 | struct kthread_create_info *create = _create; |
179 | int (*threadfn)(void *data) = create->threadfn; | 179 | int (*threadfn)(void *data) = create->threadfn; |
180 | void *data = create->data; | 180 | void *data = create->data; |
181 | struct completion *done; | ||
181 | struct kthread self; | 182 | struct kthread self; |
182 | int ret; | 183 | int ret; |
183 | 184 | ||
@@ -187,10 +188,16 @@ static int kthread(void *_create) | |||
187 | init_completion(&self.parked); | 188 | init_completion(&self.parked); |
188 | current->vfork_done = &self.exited; | 189 | current->vfork_done = &self.exited; |
189 | 190 | ||
191 | /* If user was SIGKILLed, I release the structure. */ | ||
192 | done = xchg(&create->done, NULL); | ||
193 | if (!done) { | ||
194 | kfree(create); | ||
195 | do_exit(-EINTR); | ||
196 | } | ||
190 | /* OK, tell user we're spawned, wait for stop or wakeup */ | 197 | /* OK, tell user we're spawned, wait for stop or wakeup */ |
191 | __set_current_state(TASK_UNINTERRUPTIBLE); | 198 | __set_current_state(TASK_UNINTERRUPTIBLE); |
192 | create->result = current; | 199 | create->result = current; |
193 | complete(&create->done); | 200 | complete(done); |
194 | schedule(); | 201 | schedule(); |
195 | 202 | ||
196 | ret = -EINTR; | 203 | ret = -EINTR; |
@@ -223,8 +230,15 @@ static void create_kthread(struct kthread_create_info *create) | |||
223 | /* We want our own signal handler (we take no signals by default). */ | 230 | /* We want our own signal handler (we take no signals by default). */ |
224 | pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); | 231 | pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); |
225 | if (pid < 0) { | 232 | if (pid < 0) { |
233 | /* If user was SIGKILLed, I release the structure. */ | ||
234 | struct completion *done = xchg(&create->done, NULL); | ||
235 | |||
236 | if (!done) { | ||
237 | kfree(create); | ||
238 | return; | ||
239 | } | ||
226 | create->result = ERR_PTR(pid); | 240 | create->result = ERR_PTR(pid); |
227 | complete(&create->done); | 241 | complete(done); |
228 | } | 242 | } |
229 | } | 243 | } |
230 | 244 | ||
@@ -255,36 +269,59 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
255 | const char namefmt[], | 269 | const char namefmt[], |
256 | ...) | 270 | ...) |
257 | { | 271 | { |
258 | struct kthread_create_info create; | 272 | DECLARE_COMPLETION_ONSTACK(done); |
259 | 273 | struct task_struct *task; | |
260 | create.threadfn = threadfn; | 274 | struct kthread_create_info *create = kmalloc(sizeof(*create), |
261 | create.data = data; | 275 | GFP_KERNEL); |
262 | create.node = node; | 276 | |
263 | init_completion(&create.done); | 277 | if (!create) |
278 | return ERR_PTR(-ENOMEM); | ||
279 | create->threadfn = threadfn; | ||
280 | create->data = data; | ||
281 | create->node = node; | ||
282 | create->done = &done; | ||
264 | 283 | ||
265 | spin_lock(&kthread_create_lock); | 284 | spin_lock(&kthread_create_lock); |
266 | list_add_tail(&create.list, &kthread_create_list); | 285 | list_add_tail(&create->list, &kthread_create_list); |
267 | spin_unlock(&kthread_create_lock); | 286 | spin_unlock(&kthread_create_lock); |
268 | 287 | ||
269 | wake_up_process(kthreadd_task); | 288 | wake_up_process(kthreadd_task); |
270 | wait_for_completion(&create.done); | 289 | /* |
271 | 290 | * Wait for completion in killable state, for I might be chosen by | |
272 | if (!IS_ERR(create.result)) { | 291 | * the OOM killer while kthreadd is trying to allocate memory for |
292 | * new kernel thread. | ||
293 | */ | ||
294 | if (unlikely(wait_for_completion_killable(&done))) { | ||
295 | /* | ||
296 | * If I was SIGKILLed before kthreadd (or new kernel thread) | ||
297 | * calls complete(), leave the cleanup of this structure to | ||
298 | * that thread. | ||
299 | */ | ||
300 | if (xchg(&create->done, NULL)) | ||
301 | return ERR_PTR(-ENOMEM); | ||
302 | /* | ||
303 | * kthreadd (or new kernel thread) will call complete() | ||
304 | * shortly. | ||
305 | */ | ||
306 | wait_for_completion(&done); | ||
307 | } | ||
308 | task = create->result; | ||
309 | if (!IS_ERR(task)) { | ||
273 | static const struct sched_param param = { .sched_priority = 0 }; | 310 | static const struct sched_param param = { .sched_priority = 0 }; |
274 | va_list args; | 311 | va_list args; |
275 | 312 | ||
276 | va_start(args, namefmt); | 313 | va_start(args, namefmt); |
277 | vsnprintf(create.result->comm, sizeof(create.result->comm), | 314 | vsnprintf(task->comm, sizeof(task->comm), namefmt, args); |
278 | namefmt, args); | ||
279 | va_end(args); | 315 | va_end(args); |
280 | /* | 316 | /* |
281 | * root may have changed our (kthreadd's) priority or CPU mask. | 317 | * root may have changed our (kthreadd's) priority or CPU mask. |
282 | * The kernel thread should not inherit these properties. | 318 | * The kernel thread should not inherit these properties. |
283 | */ | 319 | */ |
284 | sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); | 320 | sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); |
285 | set_cpus_allowed_ptr(create.result, cpu_all_mask); | 321 | set_cpus_allowed_ptr(task, cpu_all_mask); |
286 | } | 322 | } |
287 | return create.result; | 323 | kfree(create); |
324 | return task; | ||
288 | } | 325 | } |
289 | EXPORT_SYMBOL(kthread_create_on_node); | 326 | EXPORT_SYMBOL(kthread_create_on_node); |
290 | 327 | ||
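The kthread.c change above closes a use-after-free window: kthread_create_on_node() now waits killably, the request is heap-allocated, and the completion pointer is handed off with xchg() so that exactly one side ends up owning (and freeing) the structure. The self-contained user-space sketch below shows the same handoff pattern; pthreads, a POSIX semaphore and malloc stand in for kthreadd, struct completion and kthread_create_info, so none of it is kernel code.

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <pthread.h>
	#include <semaphore.h>

	struct create_info {
		_Atomic(sem_t *) done;		/* plays the role of create->done */
		int data;
	};

	static void *worker(void *arg)		/* plays the role of kthread() */
	{
		struct create_info *create = arg;
		sem_t *done = atomic_exchange(&create->done, (sem_t *)NULL);

		if (!done) {			/* requester disowned the request */
			free(create);
			return NULL;
		}
		printf("worker: handling data=%d\n", create->data);
		sem_post(done);			/* like complete(done) */
		return NULL;
	}

	int main(void)
	{
		sem_t done;			/* like DECLARE_COMPLETION_ONSTACK(done) */
		struct create_info *create = malloc(sizeof(*create));
		pthread_t t;

		if (!create)
			return 1;
		sem_init(&done, 0, 0);
		create->data = 42;
		atomic_store(&create->done, &done);
		pthread_create(&t, NULL, worker, create);

		/* Pretend wait_for_completion_killable() was interrupted: try to
		 * take the token back.  Exactly one side wins the exchange. */
		if (atomic_exchange(&create->done, (sem_t *)NULL) == NULL) {
			/* The worker won: it will post 'done' shortly; wait for
			 * it, then the request is ours to use and free. */
			sem_wait(&done);
			printf("requester: got result, data=%d\n", create->data);
			free(create);
		}
		/* else: we won, so the worker reads NULL and frees 'create'. */

		pthread_join(t, NULL);
		sem_destroy(&done);
		return 0;
	}

The invariant is the one the kernel patch relies on: whichever side reads NULL from the exchange performs the free, so the structure is released exactly once even when the requester gives up early.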
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile new file mode 100644 index 000000000000..baab8e5e7f66 --- /dev/null +++ b/kernel/locking/Makefile | |||
@@ -0,0 +1,25 @@ | |||
1 | |||
2 | obj-y += mutex.o semaphore.o rwsem.o lglock.o | ||
3 | |||
4 | ifdef CONFIG_FUNCTION_TRACER | ||
5 | CFLAGS_REMOVE_lockdep.o = -pg | ||
6 | CFLAGS_REMOVE_lockdep_proc.o = -pg | ||
7 | CFLAGS_REMOVE_mutex-debug.o = -pg | ||
8 | CFLAGS_REMOVE_rtmutex-debug.o = -pg | ||
9 | endif | ||
10 | |||
11 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o | ||
12 | obj-$(CONFIG_LOCKDEP) += lockdep.o | ||
13 | ifeq ($(CONFIG_PROC_FS),y) | ||
14 | obj-$(CONFIG_LOCKDEP) += lockdep_proc.o | ||
15 | endif | ||
16 | obj-$(CONFIG_SMP) += spinlock.o | ||
17 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | ||
18 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | ||
19 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | ||
20 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | ||
21 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | ||
22 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o | ||
23 | obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o | ||
24 | obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o | ||
25 | obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o | ||
diff --git a/kernel/lglock.c b/kernel/locking/lglock.c index 86ae2aebf004..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/locking/lglock.c | |||
diff --git a/kernel/lockdep.c b/kernel/locking/lockdep.c index e16c45b9ee77..576ba756a32d 100644 --- a/kernel/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -1232,7 +1232,7 @@ static int noop_count(struct lock_list *entry, void *data) | |||
1232 | return 0; | 1232 | return 0; |
1233 | } | 1233 | } |
1234 | 1234 | ||
1235 | unsigned long __lockdep_count_forward_deps(struct lock_list *this) | 1235 | static unsigned long __lockdep_count_forward_deps(struct lock_list *this) |
1236 | { | 1236 | { |
1237 | unsigned long count = 0; | 1237 | unsigned long count = 0; |
1238 | struct lock_list *uninitialized_var(target_entry); | 1238 | struct lock_list *uninitialized_var(target_entry); |
@@ -1258,7 +1258,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) | |||
1258 | return ret; | 1258 | return ret; |
1259 | } | 1259 | } |
1260 | 1260 | ||
1261 | unsigned long __lockdep_count_backward_deps(struct lock_list *this) | 1261 | static unsigned long __lockdep_count_backward_deps(struct lock_list *this) |
1262 | { | 1262 | { |
1263 | unsigned long count = 0; | 1263 | unsigned long count = 0; |
1264 | struct lock_list *uninitialized_var(target_entry); | 1264 | struct lock_list *uninitialized_var(target_entry); |
@@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
4224 | printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", | 4224 | printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", |
4225 | !rcu_lockdep_current_cpu_online() | 4225 | !rcu_lockdep_current_cpu_online() |
4226 | ? "RCU used illegally from offline CPU!\n" | 4226 | ? "RCU used illegally from offline CPU!\n" |
4227 | : rcu_is_cpu_idle() | 4227 | : !rcu_is_watching() |
4228 | ? "RCU used illegally from idle CPU!\n" | 4228 | ? "RCU used illegally from idle CPU!\n" |
4229 | : "", | 4229 | : "", |
4230 | rcu_scheduler_active, debug_locks); | 4230 | rcu_scheduler_active, debug_locks); |
@@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
4247 | * So complain bitterly if someone does call rcu_read_lock(), | 4247 | * So complain bitterly if someone does call rcu_read_lock(), |
4248 | * rcu_read_lock_bh() and so on from extended quiescent states. | 4248 | * rcu_read_lock_bh() and so on from extended quiescent states. |
4249 | */ | 4249 | */ |
4250 | if (rcu_is_cpu_idle()) | 4250 | if (!rcu_is_watching()) |
4251 | printk("RCU used illegally from extended quiescent state!\n"); | 4251 | printk("RCU used illegally from extended quiescent state!\n"); |
4252 | 4252 | ||
4253 | lockdep_print_held_locks(curr); | 4253 | lockdep_print_held_locks(curr); |
diff --git a/kernel/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 4f560cfedc8f..4f560cfedc8f 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h | |||
diff --git a/kernel/lockdep_proc.c b/kernel/locking/lockdep_proc.c index b2c71c5873e4..ef43ac4bafb5 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c | |||
@@ -421,6 +421,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) | |||
421 | seq_time(m, lt->min); | 421 | seq_time(m, lt->min); |
422 | seq_time(m, lt->max); | 422 | seq_time(m, lt->max); |
423 | seq_time(m, lt->total); | 423 | seq_time(m, lt->total); |
424 | seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0); | ||
424 | } | 425 | } |
425 | 426 | ||
426 | static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | 427 | static void seq_stats(struct seq_file *m, struct lock_stat_data *data) |
@@ -518,20 +519,20 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
518 | } | 519 | } |
519 | if (i) { | 520 | if (i) { |
520 | seq_puts(m, "\n"); | 521 | seq_puts(m, "\n"); |
521 | seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); | 522 | seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1)); |
522 | seq_puts(m, "\n"); | 523 | seq_puts(m, "\n"); |
523 | } | 524 | } |
524 | } | 525 | } |
525 | 526 | ||
526 | static void seq_header(struct seq_file *m) | 527 | static void seq_header(struct seq_file *m) |
527 | { | 528 | { |
528 | seq_printf(m, "lock_stat version 0.3\n"); | 529 | seq_puts(m, "lock_stat version 0.4\n"); |
529 | 530 | ||
530 | if (unlikely(!debug_locks)) | 531 | if (unlikely(!debug_locks)) |
531 | seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); | 532 | seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); |
532 | 533 | ||
533 | seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); | 534 | seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); |
534 | seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " | 535 | seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s " |
535 | "%14s %14s\n", | 536 | "%14s %14s\n", |
536 | "class name", | 537 | "class name", |
537 | "con-bounces", | 538 | "con-bounces", |
@@ -539,12 +540,14 @@ static void seq_header(struct seq_file *m) | |||
539 | "waittime-min", | 540 | "waittime-min", |
540 | "waittime-max", | 541 | "waittime-max", |
541 | "waittime-total", | 542 | "waittime-total", |
543 | "waittime-avg", | ||
542 | "acq-bounces", | 544 | "acq-bounces", |
543 | "acquisitions", | 545 | "acquisitions", |
544 | "holdtime-min", | 546 | "holdtime-min", |
545 | "holdtime-max", | 547 | "holdtime-max", |
546 | "holdtime-total"); | 548 | "holdtime-total", |
547 | seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); | 549 | "holdtime-avg"); |
550 | seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); | ||
548 | seq_printf(m, "\n"); | 551 | seq_printf(m, "\n"); |
549 | } | 552 | } |
550 | 553 | ||
diff --git a/kernel/lockdep_states.h b/kernel/locking/lockdep_states.h index 995b0cc2b84c..995b0cc2b84c 100644 --- a/kernel/lockdep_states.h +++ b/kernel/locking/lockdep_states.h | |||
diff --git a/kernel/mutex-debug.c b/kernel/locking/mutex-debug.c index 7e3443fe1f48..7e3443fe1f48 100644 --- a/kernel/mutex-debug.c +++ b/kernel/locking/mutex-debug.c | |||
diff --git a/kernel/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/locking/mutex-debug.h | |||
diff --git a/kernel/mutex.c b/kernel/locking/mutex.c index d24105b1b794..4dd6e4c219de 100644 --- a/kernel/mutex.c +++ b/kernel/locking/mutex.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/mutex.c | 2 | * kernel/locking/mutex.c |
3 | * | 3 | * |
4 | * Mutexes: blocking mutual exclusion locks | 4 | * Mutexes: blocking mutual exclusion locks |
5 | * | 5 | * |
diff --git a/kernel/mutex.h b/kernel/locking/mutex.h index 4115fbf83b12..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/locking/mutex.h | |||
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c new file mode 100644 index 000000000000..652a8ee8efe9 --- /dev/null +++ b/kernel/locking/percpu-rwsem.c | |||
@@ -0,0 +1,165 @@ | |||
1 | #include <linux/atomic.h> | ||
2 | #include <linux/rwsem.h> | ||
3 | #include <linux/percpu.h> | ||
4 | #include <linux/wait.h> | ||
5 | #include <linux/lockdep.h> | ||
6 | #include <linux/percpu-rwsem.h> | ||
7 | #include <linux/rcupdate.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/errno.h> | ||
10 | |||
11 | int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, | ||
12 | const char *name, struct lock_class_key *rwsem_key) | ||
13 | { | ||
14 | brw->fast_read_ctr = alloc_percpu(int); | ||
15 | if (unlikely(!brw->fast_read_ctr)) | ||
16 | return -ENOMEM; | ||
17 | |||
18 | /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ | ||
19 | __init_rwsem(&brw->rw_sem, name, rwsem_key); | ||
20 | atomic_set(&brw->write_ctr, 0); | ||
21 | atomic_set(&brw->slow_read_ctr, 0); | ||
22 | init_waitqueue_head(&brw->write_waitq); | ||
23 | return 0; | ||
24 | } | ||
25 | |||
26 | void percpu_free_rwsem(struct percpu_rw_semaphore *brw) | ||
27 | { | ||
28 | free_percpu(brw->fast_read_ctr); | ||
29 | brw->fast_read_ctr = NULL; /* catch use after free bugs */ | ||
30 | } | ||
31 | |||
32 | /* | ||
33 | * This is the fast-path for down_read/up_read, it only needs to ensure | ||
34 | * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the | ||
35 | * fast per-cpu counter. The writer uses synchronize_sched_expedited() to | ||
36 | * serialize with the preempt-disabled section below. | ||
37 | * | ||
38 | * The nontrivial part is that we should guarantee acquire/release semantics | ||
39 | * in case when | ||
40 | * | ||
41 | * R_W: down_write() comes after up_read(), the writer should see all | ||
42 | * changes done by the reader | ||
43 | * or | ||
44 | * W_R: down_read() comes after up_write(), the reader should see all | ||
45 | * changes done by the writer | ||
46 | * | ||
47 | * If this helper fails the callers rely on the normal rw_semaphore and | ||
48 | * atomic_dec_and_test(), so in this case we have the necessary barriers. | ||
49 | * | ||
50 | * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or | ||
51 | * __this_cpu_add() below can be reordered with any LOAD/STORE done by the | ||
52 | * reader inside the critical section. See the comments in down_write and | ||
53 | * up_write below. | ||
54 | */ | ||
55 | static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) | ||
56 | { | ||
57 | bool success = false; | ||
58 | |||
59 | preempt_disable(); | ||
60 | if (likely(!atomic_read(&brw->write_ctr))) { | ||
61 | __this_cpu_add(*brw->fast_read_ctr, val); | ||
62 | success = true; | ||
63 | } | ||
64 | preempt_enable(); | ||
65 | |||
66 | return success; | ||
67 | } | ||
68 | |||
69 | /* | ||
70 | * Like the normal down_read() this is not recursive, the writer can | ||
71 | * come after the first percpu_down_read() and create the deadlock. | ||
72 | * | ||
73 | * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, | ||
74 | * percpu_up_read() does rwsem_release(). This pairs with the usage | ||
75 | * of ->rw_sem in percpu_down/up_write(). | ||
76 | */ | ||
77 | void percpu_down_read(struct percpu_rw_semaphore *brw) | ||
78 | { | ||
79 | might_sleep(); | ||
80 | if (likely(update_fast_ctr(brw, +1))) { | ||
81 | rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); | ||
82 | return; | ||
83 | } | ||
84 | |||
85 | down_read(&brw->rw_sem); | ||
86 | atomic_inc(&brw->slow_read_ctr); | ||
87 | /* avoid up_read()->rwsem_release() */ | ||
88 | __up_read(&brw->rw_sem); | ||
89 | } | ||
90 | |||
91 | void percpu_up_read(struct percpu_rw_semaphore *brw) | ||
92 | { | ||
93 | rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); | ||
94 | |||
95 | if (likely(update_fast_ctr(brw, -1))) | ||
96 | return; | ||
97 | |||
98 | /* false-positive is possible but harmless */ | ||
99 | if (atomic_dec_and_test(&brw->slow_read_ctr)) | ||
100 | wake_up_all(&brw->write_waitq); | ||
101 | } | ||
102 | |||
103 | static int clear_fast_ctr(struct percpu_rw_semaphore *brw) | ||
104 | { | ||
105 | unsigned int sum = 0; | ||
106 | int cpu; | ||
107 | |||
108 | for_each_possible_cpu(cpu) { | ||
109 | sum += per_cpu(*brw->fast_read_ctr, cpu); | ||
110 | per_cpu(*brw->fast_read_ctr, cpu) = 0; | ||
111 | } | ||
112 | |||
113 | return sum; | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * A writer increments ->write_ctr to force the readers to switch to the | ||
118 | * slow mode, note the atomic_read() check in update_fast_ctr(). | ||
119 | * | ||
120 | * After that the readers can only inc/dec the slow ->slow_read_ctr counter, | ||
121 | * ->fast_read_ctr is stable. Once the writer moves its sum into the slow | ||
122 | * counter it represents the number of active readers. | ||
123 | * | ||
124 | * Finally the writer takes ->rw_sem for writing and blocks the new readers, | ||
125 | * then waits until the slow counter becomes zero. | ||
126 | */ | ||
127 | void percpu_down_write(struct percpu_rw_semaphore *brw) | ||
128 | { | ||
129 | /* tell update_fast_ctr() there is a pending writer */ | ||
130 | atomic_inc(&brw->write_ctr); | ||
131 | /* | ||
132 | * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read | ||
133 | * so that update_fast_ctr() can't succeed. | ||
134 | * | ||
135 | * 2. Ensures we see the result of every previous this_cpu_add() in | ||
136 | * update_fast_ctr(). | ||
137 | * | ||
138 | * 3. Ensures that if any reader has exited its critical section via | ||
139 | * fast-path, it executes a full memory barrier before we return. | ||
140 | * See R_W case in the comment above update_fast_ctr(). | ||
141 | */ | ||
142 | synchronize_sched_expedited(); | ||
143 | |||
144 | /* exclude other writers, and block the new readers completely */ | ||
145 | down_write(&brw->rw_sem); | ||
146 | |||
147 | /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ | ||
148 | atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); | ||
149 | |||
150 | /* wait for all readers to complete their percpu_up_read() */ | ||
151 | wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); | ||
152 | } | ||
153 | |||
154 | void percpu_up_write(struct percpu_rw_semaphore *brw) | ||
155 | { | ||
156 | /* release the lock, but the readers can't use the fast-path */ | ||
157 | up_write(&brw->rw_sem); | ||
158 | /* | ||
159 | * Insert the barrier before the next fast-path in down_read, | ||
160 | * see W_R case in the comment above update_fast_ctr(). | ||
161 | */ | ||
162 | synchronize_sched_expedited(); | ||
163 | /* the last writer unblocks update_fast_ctr() */ | ||
164 | atomic_dec(&brw->write_ctr); | ||
165 | } | ||
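The comments in percpu-rwsem.c above spell out the read-mostly trade-off: readers normally touch only a per-cpu counter, while each writer pays for two synchronize_sched_expedited() calls plus an exclusive rwsem. A hedged caller-side sketch is below; the my_* names are invented, and percpu_init_rwsem() is assumed to be the init wrapper from include/linux/percpu-rwsem.h.

	#include <linux/percpu-rwsem.h>

	static struct percpu_rw_semaphore my_state_sem;	/* hypothetical */

	static int __init my_subsys_init(void)
	{
		return percpu_init_rwsem(&my_state_sem);	/* may return -ENOMEM */
	}

	/* Hot path: many concurrent readers, normally just a per-cpu increment. */
	static void my_read_side(void)
	{
		percpu_down_read(&my_state_sem);
		/* ... read shared state ... */
		percpu_up_read(&my_state_sem);
	}

	/* Cold path: writers are expensive and should be rare. */
	static void my_write_side(void)
	{
		percpu_down_write(&my_state_sem);
		/* ... mutate shared state; all readers are excluded here ... */
		percpu_up_write(&my_state_sem);
	}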
diff --git a/kernel/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 13b243a323fa..13b243a323fa 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c | |||
diff --git a/kernel/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 14193d596d78..14193d596d78 100644 --- a/kernel/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h | |||
diff --git a/kernel/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c index 1d96dd0d93c1..1d96dd0d93c1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/locking/rtmutex-tester.c | |||
diff --git a/kernel/rtmutex.c b/kernel/locking/rtmutex.c index 0dd6aec1cb6a..0dd6aec1cb6a 100644 --- a/kernel/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
diff --git a/kernel/rtmutex.h b/kernel/locking/rtmutex.h index a1a1dd06421d..a1a1dd06421d 100644 --- a/kernel/rtmutex.h +++ b/kernel/locking/rtmutex.h | |||
diff --git a/kernel/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 53a66c85261b..53a66c85261b 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h | |||
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c new file mode 100644 index 000000000000..9be8a9144978 --- /dev/null +++ b/kernel/locking/rwsem-spinlock.c | |||
@@ -0,0 +1,296 @@ | |||
1 | /* rwsem-spinlock.c: R/W semaphores: contention handling functions for | ||
2 | * generic spinlock implementation | ||
3 | * | ||
4 | * Copyright (c) 2001 David Howells (dhowells@redhat.com). | ||
5 | * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> | ||
6 | * - Derived also from comments by Linus | ||
7 | */ | ||
8 | #include <linux/rwsem.h> | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/export.h> | ||
11 | |||
12 | enum rwsem_waiter_type { | ||
13 | RWSEM_WAITING_FOR_WRITE, | ||
14 | RWSEM_WAITING_FOR_READ | ||
15 | }; | ||
16 | |||
17 | struct rwsem_waiter { | ||
18 | struct list_head list; | ||
19 | struct task_struct *task; | ||
20 | enum rwsem_waiter_type type; | ||
21 | }; | ||
22 | |||
23 | int rwsem_is_locked(struct rw_semaphore *sem) | ||
24 | { | ||
25 | int ret = 1; | ||
26 | unsigned long flags; | ||
27 | |||
28 | if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { | ||
29 | ret = (sem->activity != 0); | ||
30 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
31 | } | ||
32 | return ret; | ||
33 | } | ||
34 | EXPORT_SYMBOL(rwsem_is_locked); | ||
35 | |||
36 | /* | ||
37 | * initialise the semaphore | ||
38 | */ | ||
39 | void __init_rwsem(struct rw_semaphore *sem, const char *name, | ||
40 | struct lock_class_key *key) | ||
41 | { | ||
42 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
43 | /* | ||
44 | * Make sure we are not reinitializing a held semaphore: | ||
45 | */ | ||
46 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); | ||
47 | lockdep_init_map(&sem->dep_map, name, key, 0); | ||
48 | #endif | ||
49 | sem->activity = 0; | ||
50 | raw_spin_lock_init(&sem->wait_lock); | ||
51 | INIT_LIST_HEAD(&sem->wait_list); | ||
52 | } | ||
53 | EXPORT_SYMBOL(__init_rwsem); | ||
54 | |||
55 | /* | ||
56 | * handle the lock release when processes blocked on it that can now run | ||
57 | * - if we come here, then: | ||
58 | * - the 'active count' _reached_ zero | ||
59 | * - the 'waiting count' is non-zero | ||
60 | * - the spinlock must be held by the caller | ||
61 | * - woken process blocks are discarded from the list after having task zeroed | ||
62 | * - writers are only woken if wakewrite is non-zero | ||
63 | */ | ||
64 | static inline struct rw_semaphore * | ||
65 | __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) | ||
66 | { | ||
67 | struct rwsem_waiter *waiter; | ||
68 | struct task_struct *tsk; | ||
69 | int woken; | ||
70 | |||
71 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | ||
72 | |||
73 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { | ||
74 | if (wakewrite) | ||
75 | /* Wake up a writer. Note that we do not grant it the | ||
76 | * lock - it will have to acquire it when it runs. */ | ||
77 | wake_up_process(waiter->task); | ||
78 | goto out; | ||
79 | } | ||
80 | |||
81 | /* grant an infinite number of read locks to the front of the queue */ | ||
82 | woken = 0; | ||
83 | do { | ||
84 | struct list_head *next = waiter->list.next; | ||
85 | |||
86 | list_del(&waiter->list); | ||
87 | tsk = waiter->task; | ||
88 | smp_mb(); | ||
89 | waiter->task = NULL; | ||
90 | wake_up_process(tsk); | ||
91 | put_task_struct(tsk); | ||
92 | woken++; | ||
93 | if (next == &sem->wait_list) | ||
94 | break; | ||
95 | waiter = list_entry(next, struct rwsem_waiter, list); | ||
96 | } while (waiter->type != RWSEM_WAITING_FOR_WRITE); | ||
97 | |||
98 | sem->activity += woken; | ||
99 | |||
100 | out: | ||
101 | return sem; | ||
102 | } | ||
103 | |||
104 | /* | ||
105 | * wake a single writer | ||
106 | */ | ||
107 | static inline struct rw_semaphore * | ||
108 | __rwsem_wake_one_writer(struct rw_semaphore *sem) | ||
109 | { | ||
110 | struct rwsem_waiter *waiter; | ||
111 | |||
112 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | ||
113 | wake_up_process(waiter->task); | ||
114 | |||
115 | return sem; | ||
116 | } | ||
117 | |||
118 | /* | ||
119 | * get a read lock on the semaphore | ||
120 | */ | ||
121 | void __sched __down_read(struct rw_semaphore *sem) | ||
122 | { | ||
123 | struct rwsem_waiter waiter; | ||
124 | struct task_struct *tsk; | ||
125 | unsigned long flags; | ||
126 | |||
127 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
128 | |||
129 | if (sem->activity >= 0 && list_empty(&sem->wait_list)) { | ||
130 | /* granted */ | ||
131 | sem->activity++; | ||
132 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
133 | goto out; | ||
134 | } | ||
135 | |||
136 | tsk = current; | ||
137 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | ||
138 | |||
139 | /* set up my own style of waitqueue */ | ||
140 | waiter.task = tsk; | ||
141 | waiter.type = RWSEM_WAITING_FOR_READ; | ||
142 | get_task_struct(tsk); | ||
143 | |||
144 | list_add_tail(&waiter.list, &sem->wait_list); | ||
145 | |||
146 | /* we don't need to touch the semaphore struct anymore */ | ||
147 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
148 | |||
149 | /* wait to be given the lock */ | ||
150 | for (;;) { | ||
151 | if (!waiter.task) | ||
152 | break; | ||
153 | schedule(); | ||
154 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | ||
155 | } | ||
156 | |||
157 | tsk->state = TASK_RUNNING; | ||
158 | out: | ||
159 | ; | ||
160 | } | ||
161 | |||
162 | /* | ||
163 | * trylock for reading -- returns 1 if successful, 0 if contention | ||
164 | */ | ||
165 | int __down_read_trylock(struct rw_semaphore *sem) | ||
166 | { | ||
167 | unsigned long flags; | ||
168 | int ret = 0; | ||
169 | |||
170 | |||
171 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
172 | |||
173 | if (sem->activity >= 0 && list_empty(&sem->wait_list)) { | ||
174 | /* granted */ | ||
175 | sem->activity++; | ||
176 | ret = 1; | ||
177 | } | ||
178 | |||
179 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
180 | |||
181 | return ret; | ||
182 | } | ||
183 | |||
184 | /* | ||
185 | * get a write lock on the semaphore | ||
186 | */ | ||
187 | void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) | ||
188 | { | ||
189 | struct rwsem_waiter waiter; | ||
190 | struct task_struct *tsk; | ||
191 | unsigned long flags; | ||
192 | |||
193 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
194 | |||
195 | /* set up my own style of waitqueue */ | ||
196 | tsk = current; | ||
197 | waiter.task = tsk; | ||
198 | waiter.type = RWSEM_WAITING_FOR_WRITE; | ||
199 | list_add_tail(&waiter.list, &sem->wait_list); | ||
200 | |||
201 | /* wait for someone to release the lock */ | ||
202 | for (;;) { | ||
203 | /* | ||
204 | * That is the key to support write lock stealing: allows the | ||
205 | * task already on CPU to get the lock soon rather than put | ||
206 | * itself into sleep and waiting for system woke it or someone | ||
207 | * else in the head of the wait list up. | ||
208 | */ | ||
209 | if (sem->activity == 0) | ||
210 | break; | ||
211 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | ||
212 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
213 | schedule(); | ||
214 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
215 | } | ||
216 | /* got the lock */ | ||
217 | sem->activity = -1; | ||
218 | list_del(&waiter.list); | ||
219 | |||
220 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
221 | } | ||
222 | |||
223 | void __sched __down_write(struct rw_semaphore *sem) | ||
224 | { | ||
225 | __down_write_nested(sem, 0); | ||
226 | } | ||
227 | |||
228 | /* | ||
229 | * trylock for writing -- returns 1 if successful, 0 if contention | ||
230 | */ | ||
231 | int __down_write_trylock(struct rw_semaphore *sem) | ||
232 | { | ||
233 | unsigned long flags; | ||
234 | int ret = 0; | ||
235 | |||
236 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
237 | |||
238 | if (sem->activity == 0) { | ||
239 | /* got the lock */ | ||
240 | sem->activity = -1; | ||
241 | ret = 1; | ||
242 | } | ||
243 | |||
244 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
245 | |||
246 | return ret; | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * release a read lock on the semaphore | ||
251 | */ | ||
252 | void __up_read(struct rw_semaphore *sem) | ||
253 | { | ||
254 | unsigned long flags; | ||
255 | |||
256 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
257 | |||
258 | if (--sem->activity == 0 && !list_empty(&sem->wait_list)) | ||
259 | sem = __rwsem_wake_one_writer(sem); | ||
260 | |||
261 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
262 | } | ||
263 | |||
264 | /* | ||
265 | * release a write lock on the semaphore | ||
266 | */ | ||
267 | void __up_write(struct rw_semaphore *sem) | ||
268 | { | ||
269 | unsigned long flags; | ||
270 | |||
271 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
272 | |||
273 | sem->activity = 0; | ||
274 | if (!list_empty(&sem->wait_list)) | ||
275 | sem = __rwsem_do_wake(sem, 1); | ||
276 | |||
277 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
278 | } | ||
279 | |||
280 | /* | ||
281 | * downgrade a write lock into a read lock | ||
282 | * - just wake up any readers at the front of the queue | ||
283 | */ | ||
284 | void __downgrade_write(struct rw_semaphore *sem) | ||
285 | { | ||
286 | unsigned long flags; | ||
287 | |||
288 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
289 | |||
290 | sem->activity = 1; | ||
291 | if (!list_empty(&sem->wait_list)) | ||
292 | sem = __rwsem_do_wake(sem, 0); | ||
293 | |||
294 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
295 | } | ||
296 | |||
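The spinlock-based rwsem above encodes its whole state in sem->activity: 0 means unlocked, a positive value counts active readers, and -1 marks a single writer, which is also what makes the write-lock stealing loop in __down_write_nested() work. The tiny user-space model below only illustrates that encoding; it deliberately ignores the wait list and the wait_lock that the real code also checks.

	#include <stdio.h>

	static int activity;	/* mirrors sem->activity */

	static int model_try_read(void)
	{
		if (activity >= 0) {	/* no writer holds it */
			activity++;
			return 1;
		}
		return 0;
	}

	static int model_try_write(void)
	{
		if (activity == 0) {	/* completely idle: steal it */
			activity = -1;
			return 1;
		}
		return 0;
	}

	int main(void)
	{
		printf("reader: %d (activity=%d)\n", model_try_read(), activity);
		printf("reader: %d (activity=%d)\n", model_try_read(), activity);
		printf("writer: %d (activity=%d)\n", model_try_write(), activity); /* refused */
		activity = 0;	/* both readers released */
		printf("writer: %d (activity=%d)\n", model_try_write(), activity); /* steals */
		return 0;
	}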
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c new file mode 100644 index 000000000000..19c5fa95e0b4 --- /dev/null +++ b/kernel/locking/rwsem-xadd.c | |||
@@ -0,0 +1,293 @@ | |||
1 | /* rwsem.c: R/W semaphores: contention handling functions | ||
2 | * | ||
3 | * Written by David Howells (dhowells@redhat.com). | ||
4 | * Derived from arch/i386/kernel/semaphore.c | ||
5 | * | ||
6 | * Writer lock-stealing by Alex Shi <alex.shi@intel.com> | ||
7 | * and Michel Lespinasse <walken@google.com> | ||
8 | */ | ||
9 | #include <linux/rwsem.h> | ||
10 | #include <linux/sched.h> | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/export.h> | ||
13 | |||
14 | /* | ||
15 | * Initialize an rwsem: | ||
16 | */ | ||
17 | void __init_rwsem(struct rw_semaphore *sem, const char *name, | ||
18 | struct lock_class_key *key) | ||
19 | { | ||
20 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
21 | /* | ||
22 | * Make sure we are not reinitializing a held semaphore: | ||
23 | */ | ||
24 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); | ||
25 | lockdep_init_map(&sem->dep_map, name, key, 0); | ||
26 | #endif | ||
27 | sem->count = RWSEM_UNLOCKED_VALUE; | ||
28 | raw_spin_lock_init(&sem->wait_lock); | ||
29 | INIT_LIST_HEAD(&sem->wait_list); | ||
30 | } | ||
31 | |||
32 | EXPORT_SYMBOL(__init_rwsem); | ||
33 | |||
34 | enum rwsem_waiter_type { | ||
35 | RWSEM_WAITING_FOR_WRITE, | ||
36 | RWSEM_WAITING_FOR_READ | ||
37 | }; | ||
38 | |||
39 | struct rwsem_waiter { | ||
40 | struct list_head list; | ||
41 | struct task_struct *task; | ||
42 | enum rwsem_waiter_type type; | ||
43 | }; | ||
44 | |||
45 | enum rwsem_wake_type { | ||
46 | RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ | ||
47 | RWSEM_WAKE_READERS, /* Wake readers only */ | ||
48 | RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ | ||
49 | }; | ||
50 | |||
51 | /* | ||
52 | * handle the lock release when processes blocked on it that can now run | ||
53 | * - if we come here from up_xxxx(), then: | ||
54 | * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) | ||
55 | * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) | ||
56 | * - there must be someone on the queue | ||
57 | * - the spinlock must be held by the caller | ||
58 | * - woken process blocks are discarded from the list after having task zeroed | ||
59 | * - writers are only woken if downgrading is false | ||
60 | */ | ||
61 | static struct rw_semaphore * | ||
62 | __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | ||
63 | { | ||
64 | struct rwsem_waiter *waiter; | ||
65 | struct task_struct *tsk; | ||
66 | struct list_head *next; | ||
67 | long oldcount, woken, loop, adjustment; | ||
68 | |||
69 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | ||
70 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { | ||
71 | if (wake_type == RWSEM_WAKE_ANY) | ||
72 | /* Wake writer at the front of the queue, but do not | ||
73 | * grant it the lock yet as we want other writers | ||
74 | * to be able to steal it. Readers, on the other hand, | ||
75 | * will block as they will notice the queued writer. | ||
76 | */ | ||
77 | wake_up_process(waiter->task); | ||
78 | goto out; | ||
79 | } | ||
80 | |||
81 | /* Writers might steal the lock before we grant it to the next reader. | ||
82 | * We prefer to do the first reader grant before counting readers | ||
83 | * so we can bail out early if a writer stole the lock. | ||
84 | */ | ||
85 | adjustment = 0; | ||
86 | if (wake_type != RWSEM_WAKE_READ_OWNED) { | ||
87 | adjustment = RWSEM_ACTIVE_READ_BIAS; | ||
88 | try_reader_grant: | ||
89 | oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; | ||
90 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { | ||
91 | /* A writer stole the lock. Undo our reader grant. */ | ||
92 | if (rwsem_atomic_update(-adjustment, sem) & | ||
93 | RWSEM_ACTIVE_MASK) | ||
94 | goto out; | ||
95 | /* Last active locker left. Retry waking readers. */ | ||
96 | goto try_reader_grant; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | /* Grant an infinite number of read locks to the readers at the front | ||
101 | * of the queue. Note we increment the 'active part' of the count by | ||
102 | * the number of readers before waking any processes up. | ||
103 | */ | ||
104 | woken = 0; | ||
105 | do { | ||
106 | woken++; | ||
107 | |||
108 | if (waiter->list.next == &sem->wait_list) | ||
109 | break; | ||
110 | |||
111 | waiter = list_entry(waiter->list.next, | ||
112 | struct rwsem_waiter, list); | ||
113 | |||
114 | } while (waiter->type != RWSEM_WAITING_FOR_WRITE); | ||
115 | |||
116 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; | ||
117 | if (waiter->type != RWSEM_WAITING_FOR_WRITE) | ||
118 | /* hit end of list above */ | ||
119 | adjustment -= RWSEM_WAITING_BIAS; | ||
120 | |||
121 | if (adjustment) | ||
122 | rwsem_atomic_add(adjustment, sem); | ||
123 | |||
124 | next = sem->wait_list.next; | ||
125 | loop = woken; | ||
126 | do { | ||
127 | waiter = list_entry(next, struct rwsem_waiter, list); | ||
128 | next = waiter->list.next; | ||
129 | tsk = waiter->task; | ||
130 | smp_mb(); | ||
131 | waiter->task = NULL; | ||
132 | wake_up_process(tsk); | ||
133 | put_task_struct(tsk); | ||
134 | } while (--loop); | ||
135 | |||
136 | sem->wait_list.next = next; | ||
137 | next->prev = &sem->wait_list; | ||
138 | |||
139 | out: | ||
140 | return sem; | ||
141 | } | ||
142 | |||
143 | /* | ||
144 | * wait for the read lock to be granted | ||
145 | */ | ||
146 | struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | ||
147 | { | ||
148 | long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; | ||
149 | struct rwsem_waiter waiter; | ||
150 | struct task_struct *tsk = current; | ||
151 | |||
152 | /* set up my own style of waitqueue */ | ||
153 | waiter.task = tsk; | ||
154 | waiter.type = RWSEM_WAITING_FOR_READ; | ||
155 | get_task_struct(tsk); | ||
156 | |||
157 | raw_spin_lock_irq(&sem->wait_lock); | ||
158 | if (list_empty(&sem->wait_list)) | ||
159 | adjustment += RWSEM_WAITING_BIAS; | ||
160 | list_add_tail(&waiter.list, &sem->wait_list); | ||
161 | |||
162 | /* we're now waiting on the lock, but no longer actively locking */ | ||
163 | count = rwsem_atomic_update(adjustment, sem); | ||
164 | |||
165 | /* If there are no active locks, wake the front queued process(es). | ||
166 | * | ||
167 | * If there are no writers and we are first in the queue, | ||
168 | * wake our own waiter to join the existing active readers ! | ||
169 | */ | ||
170 | if (count == RWSEM_WAITING_BIAS || | ||
171 | (count > RWSEM_WAITING_BIAS && | ||
172 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) | ||
173 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); | ||
174 | |||
175 | raw_spin_unlock_irq(&sem->wait_lock); | ||
176 | |||
177 | /* wait to be given the lock */ | ||
178 | while (true) { | ||
179 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | ||
180 | if (!waiter.task) | ||
181 | break; | ||
182 | schedule(); | ||
183 | } | ||
184 | |||
185 | tsk->state = TASK_RUNNING; | ||
186 | |||
187 | return sem; | ||
188 | } | ||
189 | |||
190 | /* | ||
191 | * wait until we successfully acquire the write lock | ||
192 | */ | ||
193 | struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) | ||
194 | { | ||
195 | long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; | ||
196 | struct rwsem_waiter waiter; | ||
197 | struct task_struct *tsk = current; | ||
198 | |||
199 | /* set up my own style of waitqueue */ | ||
200 | waiter.task = tsk; | ||
201 | waiter.type = RWSEM_WAITING_FOR_WRITE; | ||
202 | |||
203 | raw_spin_lock_irq(&sem->wait_lock); | ||
204 | if (list_empty(&sem->wait_list)) | ||
205 | adjustment += RWSEM_WAITING_BIAS; | ||
206 | list_add_tail(&waiter.list, &sem->wait_list); | ||
207 | |||
208 | /* we're now waiting on the lock, but no longer actively locking */ | ||
209 | count = rwsem_atomic_update(adjustment, sem); | ||
210 | |||
211 | /* If there were already threads queued before us and there are no | ||
212 | * active writers, the lock must be read owned; so we try to wake | ||
213 | * any read locks that were queued ahead of us. */ | ||
214 | if (count > RWSEM_WAITING_BIAS && | ||
215 | adjustment == -RWSEM_ACTIVE_WRITE_BIAS) | ||
216 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); | ||
217 | |||
218 | /* wait until we successfully acquire the lock */ | ||
219 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | ||
220 | while (true) { | ||
221 | if (!(count & RWSEM_ACTIVE_MASK)) { | ||
222 | /* Try acquiring the write lock. */ | ||
223 | count = RWSEM_ACTIVE_WRITE_BIAS; | ||
224 | if (!list_is_singular(&sem->wait_list)) | ||
225 | count += RWSEM_WAITING_BIAS; | ||
226 | |||
227 | if (sem->count == RWSEM_WAITING_BIAS && | ||
228 | cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == | ||
229 | RWSEM_WAITING_BIAS) | ||
230 | break; | ||
231 | } | ||
232 | |||
233 | raw_spin_unlock_irq(&sem->wait_lock); | ||
234 | |||
235 | /* Block until there are no active lockers. */ | ||
236 | do { | ||
237 | schedule(); | ||
238 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | ||
239 | } while ((count = sem->count) & RWSEM_ACTIVE_MASK); | ||
240 | |||
241 | raw_spin_lock_irq(&sem->wait_lock); | ||
242 | } | ||
243 | |||
244 | list_del(&waiter.list); | ||
245 | raw_spin_unlock_irq(&sem->wait_lock); | ||
246 | tsk->state = TASK_RUNNING; | ||
247 | |||
248 | return sem; | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * handle waking up a waiter on the semaphore | ||
253 | * - up_read/up_write has decremented the active part of count if we come here | ||
254 | */ | ||
255 | struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | ||
256 | { | ||
257 | unsigned long flags; | ||
258 | |||
259 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
260 | |||
261 | /* do nothing if list empty */ | ||
262 | if (!list_empty(&sem->wait_list)) | ||
263 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); | ||
264 | |||
265 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
266 | |||
267 | return sem; | ||
268 | } | ||
269 | |||
270 | /* | ||
271 | * downgrade a write lock into a read lock | ||
272 | * - caller incremented waiting part of count and discovered it still negative | ||
273 | * - just wake up any readers at the front of the queue | ||
274 | */ | ||
275 | struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) | ||
276 | { | ||
277 | unsigned long flags; | ||
278 | |||
279 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
280 | |||
281 | /* do nothing if list empty */ | ||
282 | if (!list_empty(&sem->wait_list)) | ||
283 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); | ||
284 | |||
285 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
286 | |||
287 | return sem; | ||
288 | } | ||
289 | |||
290 | EXPORT_SYMBOL(rwsem_down_read_failed); | ||
291 | EXPORT_SYMBOL(rwsem_down_write_failed); | ||
292 | EXPORT_SYMBOL(rwsem_wake); | ||
293 | EXPORT_SYMBOL(rwsem_downgrade_wake); | ||
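rwsem-xadd.c relies on a packed count word: a low "active part" (readers plus at most one writer) and a high "waiting part" that drives the whole word negative while anyone is queued, which is what the RWSEM_ACTIVE_MASK / RWSEM_WAITING_BIAS arithmetic above manipulates. The standalone model below uses locally defined 64-bit bias values patterned after asm-generic/rwsem.h; treat the exact constants as an assumption rather than a quotation, the point is only how common states look.

	#include <stdio.h>

	/* Assumed 64-bit layout, modelled after asm-generic/rwsem.h. */
	#define RWSEM_UNLOCKED_VALUE	0L
	#define RWSEM_ACTIVE_BIAS	1L
	#define RWSEM_ACTIVE_MASK	0xffffffffL
	#define RWSEM_WAITING_BIAS	(-RWSEM_ACTIVE_MASK - 1)
	#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
	#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

	static void show(const char *what, long count)
	{
		printf("%-30s count=%#lx active=%ld waiters=%s\n",
		       what, (unsigned long)count, count & RWSEM_ACTIVE_MASK,
		       count < 0 ? "yes" : "no");
	}

	int main(void)
	{
		long count = RWSEM_UNLOCKED_VALUE;

		show("unlocked", count);
		count += RWSEM_ACTIVE_READ_BIAS;	/* down_read fast path */
		show("one reader", count);
		count += RWSEM_ACTIVE_READ_BIAS;
		show("two readers", count);
		count += RWSEM_WAITING_BIAS;		/* a writer queues up */
		show("two readers + queued writer", count);
		count -= 2 * RWSEM_ACTIVE_READ_BIAS;	/* both readers drop out */
		show("only the queued writer left", count); /* == RWSEM_WAITING_BIAS */
		return 0;
	}

The last state is the one rwsem_down_read_failed() tests for with "count == RWSEM_WAITING_BIAS": no active lockers, someone queued, so the front of the queue should be woken.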
diff --git a/kernel/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdfb..cfff1435bdfb 100644 --- a/kernel/rwsem.c +++ b/kernel/locking/rwsem.c | |||
diff --git a/kernel/semaphore.c b/kernel/locking/semaphore.c index 6815171a4fff..6815171a4fff 100644 --- a/kernel/semaphore.c +++ b/kernel/locking/semaphore.c | |||
diff --git a/kernel/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..4b082b5cac9e 100644 --- a/kernel/spinlock.c +++ b/kernel/locking/spinlock.c | |||
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c new file mode 100644 index 000000000000..0374a596cffa --- /dev/null +++ b/kernel/locking/spinlock_debug.c | |||
@@ -0,0 +1,302 @@ | |||
1 | /* | ||
2 | * Copyright 2005, Red Hat, Inc., Ingo Molnar | ||
3 | * Released under the General Public License (GPL). | ||
4 | * | ||
5 | * This file contains the spinlock/rwlock implementations for | ||
6 | * DEBUG_SPINLOCK. | ||
7 | */ | ||
8 | |||
9 | #include <linux/spinlock.h> | ||
10 | #include <linux/nmi.h> | ||
11 | #include <linux/interrupt.h> | ||
12 | #include <linux/debug_locks.h> | ||
13 | #include <linux/delay.h> | ||
14 | #include <linux/export.h> | ||
15 | |||
16 | void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, | ||
17 | struct lock_class_key *key) | ||
18 | { | ||
19 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
20 | /* | ||
21 | * Make sure we are not reinitializing a held lock: | ||
22 | */ | ||
23 | debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | ||
24 | lockdep_init_map(&lock->dep_map, name, key, 0); | ||
25 | #endif | ||
26 | lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
27 | lock->magic = SPINLOCK_MAGIC; | ||
28 | lock->owner = SPINLOCK_OWNER_INIT; | ||
29 | lock->owner_cpu = -1; | ||
30 | } | ||
31 | |||
32 | EXPORT_SYMBOL(__raw_spin_lock_init); | ||
33 | |||
34 | void __rwlock_init(rwlock_t *lock, const char *name, | ||
35 | struct lock_class_key *key) | ||
36 | { | ||
37 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
38 | /* | ||
39 | * Make sure we are not reinitializing a held lock: | ||
40 | */ | ||
41 | debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | ||
42 | lockdep_init_map(&lock->dep_map, name, key, 0); | ||
43 | #endif | ||
44 | lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; | ||
45 | lock->magic = RWLOCK_MAGIC; | ||
46 | lock->owner = SPINLOCK_OWNER_INIT; | ||
47 | lock->owner_cpu = -1; | ||
48 | } | ||
49 | |||
50 | EXPORT_SYMBOL(__rwlock_init); | ||
51 | |||
52 | static void spin_dump(raw_spinlock_t *lock, const char *msg) | ||
53 | { | ||
54 | struct task_struct *owner = NULL; | ||
55 | |||
56 | if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) | ||
57 | owner = lock->owner; | ||
58 | printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", | ||
59 | msg, raw_smp_processor_id(), | ||
60 | current->comm, task_pid_nr(current)); | ||
61 | printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " | ||
62 | ".owner_cpu: %d\n", | ||
63 | lock, lock->magic, | ||
64 | owner ? owner->comm : "<none>", | ||
65 | owner ? task_pid_nr(owner) : -1, | ||
66 | lock->owner_cpu); | ||
67 | dump_stack(); | ||
68 | } | ||
69 | |||
70 | static void spin_bug(raw_spinlock_t *lock, const char *msg) | ||
71 | { | ||
72 | if (!debug_locks_off()) | ||
73 | return; | ||
74 | |||
75 | spin_dump(lock, msg); | ||
76 | } | ||
77 | |||
78 | #define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) | ||
79 | |||
80 | static inline void | ||
81 | debug_spin_lock_before(raw_spinlock_t *lock) | ||
82 | { | ||
83 | SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); | ||
84 | SPIN_BUG_ON(lock->owner == current, lock, "recursion"); | ||
85 | SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), | ||
86 | lock, "cpu recursion"); | ||
87 | } | ||
88 | |||
89 | static inline void debug_spin_lock_after(raw_spinlock_t *lock) | ||
90 | { | ||
91 | lock->owner_cpu = raw_smp_processor_id(); | ||
92 | lock->owner = current; | ||
93 | } | ||
94 | |||
95 | static inline void debug_spin_unlock(raw_spinlock_t *lock) | ||
96 | { | ||
97 | SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); | ||
98 | SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked"); | ||
99 | SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); | ||
100 | SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), | ||
101 | lock, "wrong CPU"); | ||
102 | lock->owner = SPINLOCK_OWNER_INIT; | ||
103 | lock->owner_cpu = -1; | ||
104 | } | ||
105 | |||
106 | static void __spin_lock_debug(raw_spinlock_t *lock) | ||
107 | { | ||
108 | u64 i; | ||
109 | u64 loops = loops_per_jiffy * HZ; | ||
110 | |||
111 | for (i = 0; i < loops; i++) { | ||
112 | if (arch_spin_trylock(&lock->raw_lock)) | ||
113 | return; | ||
114 | __delay(1); | ||
115 | } | ||
116 | /* lockup suspected: */ | ||
117 | spin_dump(lock, "lockup suspected"); | ||
118 | #ifdef CONFIG_SMP | ||
119 | trigger_all_cpu_backtrace(); | ||
120 | #endif | ||
121 | |||
122 | /* | ||
123 | * The trylock above was causing a livelock. Give the lower level arch | ||
124 | * specific lock code a chance to acquire the lock. We have already | ||
125 | * printed a warning/backtrace at this point. The non-debug arch | ||
126 | * specific code might actually succeed in acquiring the lock. If it is | ||
127 | * not successful, the end-result is the same - there is no forward | ||
128 | * progress. | ||
129 | */ | ||
130 | arch_spin_lock(&lock->raw_lock); | ||
131 | } | ||
132 | |||
133 | void do_raw_spin_lock(raw_spinlock_t *lock) | ||
134 | { | ||
135 | debug_spin_lock_before(lock); | ||
136 | if (unlikely(!arch_spin_trylock(&lock->raw_lock))) | ||
137 | __spin_lock_debug(lock); | ||
138 | debug_spin_lock_after(lock); | ||
139 | } | ||
140 | |||
141 | int do_raw_spin_trylock(raw_spinlock_t *lock) | ||
142 | { | ||
143 | int ret = arch_spin_trylock(&lock->raw_lock); | ||
144 | |||
145 | if (ret) | ||
146 | debug_spin_lock_after(lock); | ||
147 | #ifndef CONFIG_SMP | ||
148 | /* | ||
149 | * Must not happen on UP: | ||
150 | */ | ||
151 | SPIN_BUG_ON(!ret, lock, "trylock failure on UP"); | ||
152 | #endif | ||
153 | return ret; | ||
154 | } | ||
155 | |||
156 | void do_raw_spin_unlock(raw_spinlock_t *lock) | ||
157 | { | ||
158 | debug_spin_unlock(lock); | ||
159 | arch_spin_unlock(&lock->raw_lock); | ||
160 | } | ||
161 | |||
162 | static void rwlock_bug(rwlock_t *lock, const char *msg) | ||
163 | { | ||
164 | if (!debug_locks_off()) | ||
165 | return; | ||
166 | |||
167 | printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", | ||
168 | msg, raw_smp_processor_id(), current->comm, | ||
169 | task_pid_nr(current), lock); | ||
170 | dump_stack(); | ||
171 | } | ||
172 | |||
173 | #define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) | ||
174 | |||
175 | #if 0 /* __write_lock_debug() can lock up - maybe this can too? */ | ||
176 | static void __read_lock_debug(rwlock_t *lock) | ||
177 | { | ||
178 | u64 i; | ||
179 | u64 loops = loops_per_jiffy * HZ; | ||
180 | int print_once = 1; | ||
181 | |||
182 | for (;;) { | ||
183 | for (i = 0; i < loops; i++) { | ||
184 | if (arch_read_trylock(&lock->raw_lock)) | ||
185 | return; | ||
186 | __delay(1); | ||
187 | } | ||
188 | /* lockup suspected: */ | ||
189 | if (print_once) { | ||
190 | print_once = 0; | ||
191 | printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " | ||
192 | "%s/%d, %p\n", | ||
193 | raw_smp_processor_id(), current->comm, | ||
194 | current->pid, lock); | ||
195 | dump_stack(); | ||
196 | } | ||
197 | } | ||
198 | } | ||
199 | #endif | ||
200 | |||
201 | void do_raw_read_lock(rwlock_t *lock) | ||
202 | { | ||
203 | RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); | ||
204 | arch_read_lock(&lock->raw_lock); | ||
205 | } | ||
206 | |||
207 | int do_raw_read_trylock(rwlock_t *lock) | ||
208 | { | ||
209 | int ret = arch_read_trylock(&lock->raw_lock); | ||
210 | |||
211 | #ifndef CONFIG_SMP | ||
212 | /* | ||
213 | * Must not happen on UP: | ||
214 | */ | ||
215 | RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); | ||
216 | #endif | ||
217 | return ret; | ||
218 | } | ||
219 | |||
220 | void do_raw_read_unlock(rwlock_t *lock) | ||
221 | { | ||
222 | RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); | ||
223 | arch_read_unlock(&lock->raw_lock); | ||
224 | } | ||
225 | |||
226 | static inline void debug_write_lock_before(rwlock_t *lock) | ||
227 | { | ||
228 | RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); | ||
229 | RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); | ||
230 | RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), | ||
231 | lock, "cpu recursion"); | ||
232 | } | ||
233 | |||
234 | static inline void debug_write_lock_after(rwlock_t *lock) | ||
235 | { | ||
236 | lock->owner_cpu = raw_smp_processor_id(); | ||
237 | lock->owner = current; | ||
238 | } | ||
239 | |||
240 | static inline void debug_write_unlock(rwlock_t *lock) | ||
241 | { | ||
242 | RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); | ||
243 | RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); | ||
244 | RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), | ||
245 | lock, "wrong CPU"); | ||
246 | lock->owner = SPINLOCK_OWNER_INIT; | ||
247 | lock->owner_cpu = -1; | ||
248 | } | ||
249 | |||
250 | #if 0 /* This can cause lockups */ | ||
251 | static void __write_lock_debug(rwlock_t *lock) | ||
252 | { | ||
253 | u64 i; | ||
254 | u64 loops = loops_per_jiffy * HZ; | ||
255 | int print_once = 1; | ||
256 | |||
257 | for (;;) { | ||
258 | for (i = 0; i < loops; i++) { | ||
259 | if (arch_write_trylock(&lock->raw_lock)) | ||
260 | return; | ||
261 | __delay(1); | ||
262 | } | ||
263 | /* lockup suspected: */ | ||
264 | if (print_once) { | ||
265 | print_once = 0; | ||
266 | printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " | ||
267 | "%s/%d, %p\n", | ||
268 | raw_smp_processor_id(), current->comm, | ||
269 | current->pid, lock); | ||
270 | dump_stack(); | ||
271 | } | ||
272 | } | ||
273 | } | ||
274 | #endif | ||
275 | |||
276 | void do_raw_write_lock(rwlock_t *lock) | ||
277 | { | ||
278 | debug_write_lock_before(lock); | ||
279 | arch_write_lock(&lock->raw_lock); | ||
280 | debug_write_lock_after(lock); | ||
281 | } | ||
282 | |||
283 | int do_raw_write_trylock(rwlock_t *lock) | ||
284 | { | ||
285 | int ret = arch_write_trylock(&lock->raw_lock); | ||
286 | |||
287 | if (ret) | ||
288 | debug_write_lock_after(lock); | ||
289 | #ifndef CONFIG_SMP | ||
290 | /* | ||
291 | * Must not happen on UP: | ||
292 | */ | ||
293 | RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); | ||
294 | #endif | ||
295 | return ret; | ||
296 | } | ||
297 | |||
298 | void do_raw_write_unlock(rwlock_t *lock) | ||
299 | { | ||
300 | debug_write_unlock(lock); | ||
301 | arch_write_unlock(&lock->raw_lock); | ||
302 | } | ||
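__spin_lock_debug() above is essentially a watchdog: spin with trylock for roughly a second's worth of iterations, report a suspected lockup once, then fall back to the real arch lock so forward progress is still possible if the lock ever frees up. The user-space sketch below shows the same bounded-trylock idea with C11 atomics; the fixed loop bound is arbitrary, unlike the loops_per_jiffy * HZ estimate in the kernel.

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_flag lock = ATOMIC_FLAG_INIT;

	static void debug_spin_lock(void)
	{
		/* Arbitrary bound standing in for loops_per_jiffy * HZ. */
		for (unsigned long i = 0; i < 100000000UL; i++) {
			if (!atomic_flag_test_and_set_explicit(&lock,
							       memory_order_acquire))
				return;		/* got the lock */
		}

		/* Lockup suspected: report once, then keep trying forever,
		 * which is what falling back to arch_spin_lock() amounts to. */
		fprintf(stderr, "suspected lockup spinning on %p\n", (void *)&lock);
		while (atomic_flag_test_and_set_explicit(&lock, memory_order_acquire))
			;
	}

	static void debug_spin_unlock(void)
	{
		atomic_flag_clear_explicit(&lock, memory_order_release);
	}

	int main(void)
	{
		debug_spin_lock();
		puts("critical section");
		debug_spin_unlock();
		return 0;
	}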
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S deleted file mode 100644 index 4a9a86d12c8b..000000000000 --- a/kernel/modsign_certificate.S +++ /dev/null | |||
@@ -1,12 +0,0 @@ | |||
1 | #include <linux/export.h> | ||
2 | |||
3 | #define GLOBAL(name) \ | ||
4 | .globl VMLINUX_SYMBOL(name); \ | ||
5 | VMLINUX_SYMBOL(name): | ||
6 | |||
7 | .section ".init.data","aw" | ||
8 | |||
9 | GLOBAL(modsign_certificate_list) | ||
10 | .incbin "signing_key.x509" | ||
11 | .incbin "extra_certificates" | ||
12 | GLOBAL(modsign_certificate_list_end) | ||
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c deleted file mode 100644 index 7cbd4507a7e6..000000000000 --- a/kernel/modsign_pubkey.c +++ /dev/null | |||
@@ -1,104 +0,0 @@ | |||
1 | /* Public keys for module signature verification | ||
2 | * | ||
3 | * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/cred.h> | ||
15 | #include <linux/err.h> | ||
16 | #include <keys/asymmetric-type.h> | ||
17 | #include "module-internal.h" | ||
18 | |||
19 | struct key *modsign_keyring; | ||
20 | |||
21 | extern __initconst const u8 modsign_certificate_list[]; | ||
22 | extern __initconst const u8 modsign_certificate_list_end[]; | ||
23 | |||
24 | /* | ||
25 | * We need to make sure ccache doesn't cache the .o file as it doesn't notice | ||
26 | * if modsign.pub changes. | ||
27 | */ | ||
28 | static __initconst const char annoy_ccache[] = __TIME__ "foo"; | ||
29 | |||
30 | /* | ||
31 | * Load the compiled-in keys | ||
32 | */ | ||
33 | static __init int module_verify_init(void) | ||
34 | { | ||
35 | pr_notice("Initialise module verification\n"); | ||
36 | |||
37 | modsign_keyring = keyring_alloc(".module_sign", | ||
38 | KUIDT_INIT(0), KGIDT_INIT(0), | ||
39 | current_cred(), | ||
40 | ((KEY_POS_ALL & ~KEY_POS_SETATTR) | | ||
41 | KEY_USR_VIEW | KEY_USR_READ), | ||
42 | KEY_ALLOC_NOT_IN_QUOTA, NULL); | ||
43 | if (IS_ERR(modsign_keyring)) | ||
44 | panic("Can't allocate module signing keyring\n"); | ||
45 | |||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | /* | ||
50 | * Must be initialised before we try and load the keys into the keyring. | ||
51 | */ | ||
52 | device_initcall(module_verify_init); | ||
53 | |||
54 | /* | ||
55 | * Load the compiled-in keys | ||
56 | */ | ||
57 | static __init int load_module_signing_keys(void) | ||
58 | { | ||
59 | key_ref_t key; | ||
60 | const u8 *p, *end; | ||
61 | size_t plen; | ||
62 | |||
63 | pr_notice("Loading module verification certificates\n"); | ||
64 | |||
65 | end = modsign_certificate_list_end; | ||
66 | p = modsign_certificate_list; | ||
67 | while (p < end) { | ||
68 | /* Each cert begins with an ASN.1 SEQUENCE tag and must be more | ||
69 | * than 256 bytes in size. | ||
70 | */ | ||
71 | if (end - p < 4) | ||
72 | goto dodgy_cert; | ||
73 | if (p[0] != 0x30 && | ||
74 | p[1] != 0x82) | ||
75 | goto dodgy_cert; | ||
76 | plen = (p[2] << 8) | p[3]; | ||
77 | plen += 4; | ||
78 | if (plen > end - p) | ||
79 | goto dodgy_cert; | ||
80 | |||
81 | key = key_create_or_update(make_key_ref(modsign_keyring, 1), | ||
82 | "asymmetric", | ||
83 | NULL, | ||
84 | p, | ||
85 | plen, | ||
86 | (KEY_POS_ALL & ~KEY_POS_SETATTR) | | ||
87 | KEY_USR_VIEW, | ||
88 | KEY_ALLOC_NOT_IN_QUOTA); | ||
89 | if (IS_ERR(key)) | ||
90 | pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", | ||
91 | PTR_ERR(key)); | ||
92 | else | ||
93 | pr_notice("MODSIGN: Loaded cert '%s'\n", | ||
94 | key_ref_to_ptr(key)->description); | ||
95 | p += plen; | ||
96 | } | ||
97 | |||
98 | return 0; | ||
99 | |||
100 | dodgy_cert: | ||
101 | pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); | ||
102 | return 0; | ||
103 | } | ||
104 | late_initcall(load_module_signing_keys); | ||
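The loop being deleted here (its job moves to the shared system trusted keyring, see the module_signing.c change below) splits a concatenated DER blob by reading each certificate's ASN.1 SEQUENCE header: tag byte 0x30, long-form length byte 0x82, then a two-byte big-endian length, so each certificate occupies that length plus the four header bytes. A standalone userspace sketch of the same walk, assuming the same >=256-byte, 0x30 0x82 framing the kernel comment relies on:

#include <stddef.h>
#include <stdio.h>

/* Walk a buffer of concatenated DER certificates and report each one's size.
 * Mirrors the removed loop: 0x30 0x82 <len_hi> <len_lo> header, then the
 * certificate body; the total size is the 2-byte length plus 4 header bytes. */
static int walk_cert_list(const unsigned char *p, const unsigned char *end)
{
        while (p < end) {
                size_t plen;

                if (end - p < 4)
                        return -1;              /* truncated header */
                if (p[0] != 0x30 || p[1] != 0x82)
                        return -1;              /* not the expected framing */
                plen = (((size_t)p[2] << 8) | p[3]) + 4;
                if (plen > (size_t)(end - p))
                        return -1;              /* length overruns the blob */
                printf("certificate of %zu bytes\n", plen);
                p += plen;
        }
        return 0;
}

int main(void)
{
        /* Fake 260-byte "certificate": 0x30 0x82 0x01 0x00 header + 256-byte body. */
        unsigned char blob[260] = { 0x30, 0x82, 0x01, 0x00 };

        return walk_cert_list(blob, blob + sizeof(blob)) ? 1 : 0;
}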
diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 24f9247b7d02..915e123a430f 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h | |||
@@ -9,6 +9,4 @@ | |||
9 | * 2 of the Licence, or (at your option) any later version. | 9 | * 2 of the Licence, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | extern struct key *modsign_keyring; | ||
13 | |||
14 | extern int mod_verify_sig(const void *mod, unsigned long *_modlen); | 12 | extern int mod_verify_sig(const void *mod, unsigned long *_modlen); |
diff --git a/kernel/module.c b/kernel/module.c index dc582749fa13..f5a3b1e8ec51 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -378,23 +378,21 @@ static bool check_symbol(const struct symsearch *syms, | |||
378 | if (syms->licence == GPL_ONLY) | 378 | if (syms->licence == GPL_ONLY) |
379 | return false; | 379 | return false; |
380 | if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { | 380 | if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { |
381 | printk(KERN_WARNING "Symbol %s is being used " | 381 | pr_warn("Symbol %s is being used by a non-GPL module, " |
382 | "by a non-GPL module, which will not " | 382 | "which will not be allowed in the future\n", |
383 | "be allowed in the future\n", fsa->name); | 383 | fsa->name); |
384 | } | 384 | } |
385 | } | 385 | } |
386 | 386 | ||
387 | #ifdef CONFIG_UNUSED_SYMBOLS | 387 | #ifdef CONFIG_UNUSED_SYMBOLS |
388 | if (syms->unused && fsa->warn) { | 388 | if (syms->unused && fsa->warn) { |
389 | printk(KERN_WARNING "Symbol %s is marked as UNUSED, " | 389 | pr_warn("Symbol %s is marked as UNUSED, however this module is " |
390 | "however this module is using it.\n", fsa->name); | 390 | "using it.\n", fsa->name); |
391 | printk(KERN_WARNING | 391 | pr_warn("This symbol will go away in the future.\n"); |
392 | "This symbol will go away in the future.\n"); | 392 | pr_warn("Please evalute if this is the right api to use and if " |
393 | printk(KERN_WARNING | 393 | "it really is, submit a report the linux kernel " |
394 | "Please evalute if this is the right api to use and if " | 394 | "mailinglist together with submitting your code for " |
395 | "it really is, submit a report the linux kernel " | 395 | "inclusion.\n"); |
396 | "mailinglist together with submitting your code for " | ||
397 | "inclusion.\n"); | ||
398 | } | 396 | } |
399 | #endif | 397 | #endif |
400 | 398 | ||
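Most of the module.c churn in this hunk and the ones that follow is a mechanical conversion from printk(KERN_WARNING ...) to pr_warn(). The two spellings are equivalent: pr_warn() is printk() with the KERN_WARNING level folded in, optionally prefixed by a per-file pr_fmt(). A small userspace mock of that macro layering, where the "mymod: " prefix and the "<4>" level marker are illustrative stand-ins:

#include <stdio.h>

/* Userspace mock of the macro layering: a per-file pr_fmt() prefix that every
 * pr_*() call picks up, so call sites stay short. */
#define pr_fmt(fmt)        "mymod: " fmt
#define pr_warn(fmt, ...)  fprintf(stderr, "<4>" pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
        const char *name = "foo";

        /* Behaves like printk(KERN_WARNING "mymod: Symbol %s ...\n", name); */
        pr_warn("Symbol %s is being used by a non-GPL module\n", name);
        return 0;
}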
@@ -492,16 +490,15 @@ static int percpu_modalloc(struct module *mod, struct load_info *info) | |||
492 | return 0; | 490 | return 0; |
493 | 491 | ||
494 | if (align > PAGE_SIZE) { | 492 | if (align > PAGE_SIZE) { |
495 | printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", | 493 | pr_warn("%s: per-cpu alignment %li > %li\n", |
496 | mod->name, align, PAGE_SIZE); | 494 | mod->name, align, PAGE_SIZE); |
497 | align = PAGE_SIZE; | 495 | align = PAGE_SIZE; |
498 | } | 496 | } |
499 | 497 | ||
500 | mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); | 498 | mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); |
501 | if (!mod->percpu) { | 499 | if (!mod->percpu) { |
502 | printk(KERN_WARNING | 500 | pr_warn("%s: Could not allocate %lu bytes percpu data\n", |
503 | "%s: Could not allocate %lu bytes percpu data\n", | 501 | mod->name, (unsigned long)pcpusec->sh_size); |
504 | mod->name, (unsigned long)pcpusec->sh_size); | ||
505 | return -ENOMEM; | 502 | return -ENOMEM; |
506 | } | 503 | } |
507 | mod->percpu_size = pcpusec->sh_size; | 504 | mod->percpu_size = pcpusec->sh_size; |
@@ -644,8 +641,6 @@ static int module_unload_init(struct module *mod) | |||
644 | 641 | ||
645 | /* Hold reference count during initialization. */ | 642 | /* Hold reference count during initialization. */ |
646 | __this_cpu_write(mod->refptr->incs, 1); | 643 | __this_cpu_write(mod->refptr->incs, 1); |
647 | /* Backwards compatibility macros put refcount during init. */ | ||
648 | mod->waiter = current; | ||
649 | 644 | ||
650 | return 0; | 645 | return 0; |
651 | } | 646 | } |
@@ -679,7 +674,7 @@ static int add_module_usage(struct module *a, struct module *b) | |||
679 | pr_debug("Allocating new usage for %s.\n", a->name); | 674 | pr_debug("Allocating new usage for %s.\n", a->name); |
680 | use = kmalloc(sizeof(*use), GFP_ATOMIC); | 675 | use = kmalloc(sizeof(*use), GFP_ATOMIC); |
681 | if (!use) { | 676 | if (!use) { |
682 | printk(KERN_WARNING "%s: out of memory loading\n", a->name); | 677 | pr_warn("%s: out of memory loading\n", a->name); |
683 | return -ENOMEM; | 678 | return -ENOMEM; |
684 | } | 679 | } |
685 | 680 | ||
@@ -771,16 +766,9 @@ static int __try_stop_module(void *_sref) | |||
771 | 766 | ||
772 | static int try_stop_module(struct module *mod, int flags, int *forced) | 767 | static int try_stop_module(struct module *mod, int flags, int *forced) |
773 | { | 768 | { |
774 | if (flags & O_NONBLOCK) { | 769 | struct stopref sref = { mod, flags, forced }; |
775 | struct stopref sref = { mod, flags, forced }; | ||
776 | 770 | ||
777 | return stop_machine(__try_stop_module, &sref, NULL); | 771 | return stop_machine(__try_stop_module, &sref, NULL); |
778 | } else { | ||
779 | /* We don't need to stop the machine for this. */ | ||
780 | mod->state = MODULE_STATE_GOING; | ||
781 | synchronize_sched(); | ||
782 | return 0; | ||
783 | } | ||
784 | } | 772 | } |
785 | 773 | ||
786 | unsigned long module_refcount(struct module *mod) | 774 | unsigned long module_refcount(struct module *mod) |
@@ -813,21 +801,6 @@ EXPORT_SYMBOL(module_refcount); | |||
813 | /* This exists whether we can unload or not */ | 801 | /* This exists whether we can unload or not */ |
814 | static void free_module(struct module *mod); | 802 | static void free_module(struct module *mod); |
815 | 803 | ||
816 | static void wait_for_zero_refcount(struct module *mod) | ||
817 | { | ||
818 | /* Since we might sleep for some time, release the mutex first */ | ||
819 | mutex_unlock(&module_mutex); | ||
820 | for (;;) { | ||
821 | pr_debug("Looking at refcount...\n"); | ||
822 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
823 | if (module_refcount(mod) == 0) | ||
824 | break; | ||
825 | schedule(); | ||
826 | } | ||
827 | current->state = TASK_RUNNING; | ||
828 | mutex_lock(&module_mutex); | ||
829 | } | ||
830 | |||
831 | SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | 804 | SYSCALL_DEFINE2(delete_module, const char __user *, name_user, |
832 | unsigned int, flags) | 805 | unsigned int, flags) |
833 | { | 806 | { |
@@ -842,6 +815,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | |||
842 | return -EFAULT; | 815 | return -EFAULT; |
843 | name[MODULE_NAME_LEN-1] = '\0'; | 816 | name[MODULE_NAME_LEN-1] = '\0'; |
844 | 817 | ||
818 | if (!(flags & O_NONBLOCK)) { | ||
819 | printk(KERN_WARNING | ||
820 | "waiting module removal not supported: please upgrade"); | ||
821 | } | ||
822 | |||
845 | if (mutex_lock_interruptible(&module_mutex) != 0) | 823 | if (mutex_lock_interruptible(&module_mutex) != 0) |
846 | return -EINTR; | 824 | return -EINTR; |
847 | 825 | ||
@@ -859,8 +837,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | |||
859 | 837 | ||
860 | /* Doing init or already dying? */ | 838 | /* Doing init or already dying? */ |
861 | if (mod->state != MODULE_STATE_LIVE) { | 839 | if (mod->state != MODULE_STATE_LIVE) { |
862 | /* FIXME: if (force), slam module count and wake up | 840 | /* FIXME: if (force), slam module count damn the torpedoes */ |
863 | waiter --RR */ | ||
864 | pr_debug("%s already dying\n", mod->name); | 841 | pr_debug("%s already dying\n", mod->name); |
865 | ret = -EBUSY; | 842 | ret = -EBUSY; |
866 | goto out; | 843 | goto out; |
@@ -876,18 +853,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | |||
876 | } | 853 | } |
877 | } | 854 | } |
878 | 855 | ||
879 | /* Set this up before setting mod->state */ | ||
880 | mod->waiter = current; | ||
881 | |||
882 | /* Stop the machine so refcounts can't move and disable module. */ | 856 | /* Stop the machine so refcounts can't move and disable module. */ |
883 | ret = try_stop_module(mod, flags, &forced); | 857 | ret = try_stop_module(mod, flags, &forced); |
884 | if (ret != 0) | 858 | if (ret != 0) |
885 | goto out; | 859 | goto out; |
886 | 860 | ||
887 | /* Never wait if forced. */ | ||
888 | if (!forced && module_refcount(mod) != 0) | ||
889 | wait_for_zero_refcount(mod); | ||
890 | |||
891 | mutex_unlock(&module_mutex); | 861 | mutex_unlock(&module_mutex); |
892 | /* Final destruction now no one is using it. */ | 862 | /* Final destruction now no one is using it. */ |
893 | if (mod->exit != NULL) | 863 | if (mod->exit != NULL) |
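Taken together, these hunks remove the blocking flavour of module removal: delete_module() now always goes through stop_machine(), fails immediately if the module is still in use, and merely warns when userspace omits O_NONBLOCK. A minimal caller exercising the remaining non-blocking path; the module name "dummy_mod" is a placeholder, and the call needs CAP_SYS_MODULE to succeed:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        const char *name = "dummy_mod";         /* placeholder module name */

        /* With this merge the kernel never waits for the refcount to drop;
         * removing an in-use module simply fails, so O_NONBLOCK is the only
         * mode that still makes sense. */
        if (syscall(SYS_delete_module, name, O_NONBLOCK) != 0)
                fprintf(stderr, "delete_module(%s): %s\n", name, strerror(errno));
        return 0;
}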
@@ -1005,9 +975,6 @@ void module_put(struct module *module) | |||
1005 | __this_cpu_inc(module->refptr->decs); | 975 | __this_cpu_inc(module->refptr->decs); |
1006 | 976 | ||
1007 | trace_module_put(module, _RET_IP_); | 977 | trace_module_put(module, _RET_IP_); |
1008 | /* Maybe they're waiting for us to drop reference? */ | ||
1009 | if (unlikely(!module_is_live(module))) | ||
1010 | wake_up_process(module->waiter); | ||
1011 | preempt_enable(); | 978 | preempt_enable(); |
1012 | } | 979 | } |
1013 | } | 980 | } |
@@ -1145,8 +1112,7 @@ static int try_to_force_load(struct module *mod, const char *reason) | |||
1145 | { | 1112 | { |
1146 | #ifdef CONFIG_MODULE_FORCE_LOAD | 1113 | #ifdef CONFIG_MODULE_FORCE_LOAD |
1147 | if (!test_taint(TAINT_FORCED_MODULE)) | 1114 | if (!test_taint(TAINT_FORCED_MODULE)) |
1148 | printk(KERN_WARNING "%s: %s: kernel tainted.\n", | 1115 | pr_warn("%s: %s: kernel tainted.\n", mod->name, reason); |
1149 | mod->name, reason); | ||
1150 | add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); | 1116 | add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); |
1151 | return 0; | 1117 | return 0; |
1152 | #else | 1118 | #else |
@@ -1199,8 +1165,7 @@ static int check_version(Elf_Shdr *sechdrs, | |||
1199 | goto bad_version; | 1165 | goto bad_version; |
1200 | } | 1166 | } |
1201 | 1167 | ||
1202 | printk(KERN_WARNING "%s: no symbol version for %s\n", | 1168 | pr_warn("%s: no symbol version for %s\n", mod->name, symname); |
1203 | mod->name, symname); | ||
1204 | return 0; | 1169 | return 0; |
1205 | 1170 | ||
1206 | bad_version: | 1171 | bad_version: |
@@ -1309,8 +1274,8 @@ resolve_symbol_wait(struct module *mod, | |||
1309 | !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) | 1274 | !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) |
1310 | || PTR_ERR(ksym) != -EBUSY, | 1275 | || PTR_ERR(ksym) != -EBUSY, |
1311 | 30 * HZ) <= 0) { | 1276 | 30 * HZ) <= 0) { |
1312 | printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", | 1277 | pr_warn("%s: gave up waiting for init of module %s.\n", |
1313 | mod->name, owner); | 1278 | mod->name, owner); |
1314 | } | 1279 | } |
1315 | return ksym; | 1280 | return ksym; |
1316 | } | 1281 | } |
@@ -1626,15 +1591,14 @@ static int mod_sysfs_init(struct module *mod) | |||
1626 | struct kobject *kobj; | 1591 | struct kobject *kobj; |
1627 | 1592 | ||
1628 | if (!module_sysfs_initialized) { | 1593 | if (!module_sysfs_initialized) { |
1629 | printk(KERN_ERR "%s: module sysfs not initialized\n", | 1594 | pr_err("%s: module sysfs not initialized\n", mod->name); |
1630 | mod->name); | ||
1631 | err = -EINVAL; | 1595 | err = -EINVAL; |
1632 | goto out; | 1596 | goto out; |
1633 | } | 1597 | } |
1634 | 1598 | ||
1635 | kobj = kset_find_obj(module_kset, mod->name); | 1599 | kobj = kset_find_obj(module_kset, mod->name); |
1636 | if (kobj) { | 1600 | if (kobj) { |
1637 | printk(KERN_ERR "%s: module is already loaded\n", mod->name); | 1601 | pr_err("%s: module is already loaded\n", mod->name); |
1638 | kobject_put(kobj); | 1602 | kobject_put(kobj); |
1639 | err = -EINVAL; | 1603 | err = -EINVAL; |
1640 | goto out; | 1604 | goto out; |
@@ -1961,8 +1925,7 @@ static int verify_export_symbols(struct module *mod) | |||
1961 | for (i = 0; i < ARRAY_SIZE(arr); i++) { | 1925 | for (i = 0; i < ARRAY_SIZE(arr); i++) { |
1962 | for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { | 1926 | for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { |
1963 | if (find_symbol(s->name, &owner, NULL, true, false)) { | 1927 | if (find_symbol(s->name, &owner, NULL, true, false)) { |
1964 | printk(KERN_ERR | 1928 | pr_err("%s: exports duplicate symbol %s" |
1965 | "%s: exports duplicate symbol %s" | ||
1966 | " (owned by %s)\n", | 1929 | " (owned by %s)\n", |
1967 | mod->name, s->name, module_name(owner)); | 1930 | mod->name, s->name, module_name(owner)); |
1968 | return -ENOEXEC; | 1931 | return -ENOEXEC; |
@@ -2013,8 +1976,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) | |||
2013 | if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) | 1976 | if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) |
2014 | break; | 1977 | break; |
2015 | 1978 | ||
2016 | printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", | 1979 | pr_warn("%s: Unknown symbol %s (err %li)\n", |
2017 | mod->name, name, PTR_ERR(ksym)); | 1980 | mod->name, name, PTR_ERR(ksym)); |
2018 | ret = PTR_ERR(ksym) ?: -ENOENT; | 1981 | ret = PTR_ERR(ksym) ?: -ENOENT; |
2019 | break; | 1982 | break; |
2020 | 1983 | ||
@@ -2168,8 +2131,8 @@ static void set_license(struct module *mod, const char *license) | |||
2168 | 2131 | ||
2169 | if (!license_is_gpl_compatible(license)) { | 2132 | if (!license_is_gpl_compatible(license)) { |
2170 | if (!test_taint(TAINT_PROPRIETARY_MODULE)) | 2133 | if (!test_taint(TAINT_PROPRIETARY_MODULE)) |
2171 | printk(KERN_WARNING "%s: module license '%s' taints " | 2134 | pr_warn("%s: module license '%s' taints kernel.\n", |
2172 | "kernel.\n", mod->name, license); | 2135 | mod->name, license); |
2173 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE, | 2136 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE, |
2174 | LOCKDEP_NOW_UNRELIABLE); | 2137 | LOCKDEP_NOW_UNRELIABLE); |
2175 | } | 2138 | } |
@@ -2405,8 +2368,8 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) | |||
2405 | return; | 2368 | return; |
2406 | #ifdef CONFIG_DYNAMIC_DEBUG | 2369 | #ifdef CONFIG_DYNAMIC_DEBUG |
2407 | if (ddebug_add_module(debug, num, debug->modname)) | 2370 | if (ddebug_add_module(debug, num, debug->modname)) |
2408 | printk(KERN_ERR "dynamic debug error adding module: %s\n", | 2371 | pr_err("dynamic debug error adding module: %s\n", |
2409 | debug->modname); | 2372 | debug->modname); |
2410 | #endif | 2373 | #endif |
2411 | } | 2374 | } |
2412 | 2375 | ||
@@ -2619,8 +2582,7 @@ static int rewrite_section_headers(struct load_info *info, int flags) | |||
2619 | Elf_Shdr *shdr = &info->sechdrs[i]; | 2582 | Elf_Shdr *shdr = &info->sechdrs[i]; |
2620 | if (shdr->sh_type != SHT_NOBITS | 2583 | if (shdr->sh_type != SHT_NOBITS |
2621 | && info->len < shdr->sh_offset + shdr->sh_size) { | 2584 | && info->len < shdr->sh_offset + shdr->sh_size) { |
2622 | printk(KERN_ERR "Module len %lu truncated\n", | 2585 | pr_err("Module len %lu truncated\n", info->len); |
2623 | info->len); | ||
2624 | return -ENOEXEC; | 2586 | return -ENOEXEC; |
2625 | } | 2587 | } |
2626 | 2588 | ||
@@ -2682,15 +2644,14 @@ static struct module *setup_load_info(struct load_info *info, int flags) | |||
2682 | 2644 | ||
2683 | info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); | 2645 | info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); |
2684 | if (!info->index.mod) { | 2646 | if (!info->index.mod) { |
2685 | printk(KERN_WARNING "No module found in object\n"); | 2647 | pr_warn("No module found in object\n"); |
2686 | return ERR_PTR(-ENOEXEC); | 2648 | return ERR_PTR(-ENOEXEC); |
2687 | } | 2649 | } |
2688 | /* This is temporary: point mod into copy of data. */ | 2650 | /* This is temporary: point mod into copy of data. */ |
2689 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; | 2651 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; |
2690 | 2652 | ||
2691 | if (info->index.sym == 0) { | 2653 | if (info->index.sym == 0) { |
2692 | printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", | 2654 | pr_warn("%s: module has no symbols (stripped?)\n", mod->name); |
2693 | mod->name); | ||
2694 | return ERR_PTR(-ENOEXEC); | 2655 | return ERR_PTR(-ENOEXEC); |
2695 | } | 2656 | } |
2696 | 2657 | ||
@@ -2717,7 +2678,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) | |||
2717 | if (err) | 2678 | if (err) |
2718 | return err; | 2679 | return err; |
2719 | } else if (!same_magic(modmagic, vermagic, info->index.vers)) { | 2680 | } else if (!same_magic(modmagic, vermagic, info->index.vers)) { |
2720 | printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", | 2681 | pr_err("%s: version magic '%s' should be '%s'\n", |
2721 | mod->name, modmagic, vermagic); | 2682 | mod->name, modmagic, vermagic); |
2722 | return -ENOEXEC; | 2683 | return -ENOEXEC; |
2723 | } | 2684 | } |
@@ -2727,9 +2688,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) | |||
2727 | 2688 | ||
2728 | if (get_modinfo(info, "staging")) { | 2689 | if (get_modinfo(info, "staging")) { |
2729 | add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); | 2690 | add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); |
2730 | printk(KERN_WARNING "%s: module is from the staging directory," | 2691 | pr_warn("%s: module is from the staging directory, the quality " |
2731 | " the quality is unknown, you have been warned.\n", | 2692 | "is unknown, you have been warned.\n", mod->name); |
2732 | mod->name); | ||
2733 | } | 2693 | } |
2734 | 2694 | ||
2735 | /* Set up license info based on the info section */ | 2695 | /* Set up license info based on the info section */ |
@@ -2738,7 +2698,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) | |||
2738 | return 0; | 2698 | return 0; |
2739 | } | 2699 | } |
2740 | 2700 | ||
2741 | static void find_module_sections(struct module *mod, struct load_info *info) | 2701 | static int find_module_sections(struct module *mod, struct load_info *info) |
2742 | { | 2702 | { |
2743 | mod->kp = section_objs(info, "__param", | 2703 | mod->kp = section_objs(info, "__param", |
2744 | sizeof(*mod->kp), &mod->num_kp); | 2704 | sizeof(*mod->kp), &mod->num_kp); |
@@ -2768,6 +2728,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
2768 | #ifdef CONFIG_CONSTRUCTORS | 2728 | #ifdef CONFIG_CONSTRUCTORS |
2769 | mod->ctors = section_objs(info, ".ctors", | 2729 | mod->ctors = section_objs(info, ".ctors", |
2770 | sizeof(*mod->ctors), &mod->num_ctors); | 2730 | sizeof(*mod->ctors), &mod->num_ctors); |
2731 | if (!mod->ctors) | ||
2732 | mod->ctors = section_objs(info, ".init_array", | ||
2733 | sizeof(*mod->ctors), &mod->num_ctors); | ||
2734 | else if (find_sec(info, ".init_array")) { | ||
2735 | /* | ||
2736 | * This shouldn't happen with same compiler and binutils | ||
2737 | * building all parts of the module. | ||
2738 | */ | ||
2739 | printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", | ||
2740 | mod->name); | ||
2741 | return -EINVAL; | ||
2742 | } | ||
2771 | #endif | 2743 | #endif |
2772 | 2744 | ||
2773 | #ifdef CONFIG_TRACEPOINTS | 2745 | #ifdef CONFIG_TRACEPOINTS |
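The new branch above exists because current GCC/binutils place module constructors in .init_array rather than the legacy .ctors section; the loader now accepts either, but rejects an object that somehow carries both. A userspace illustration of where such constructors come from (which section the pointer lands in depends on the toolchain and can be checked with readelf -S on the resulting binary):

#include <stdio.h>

/* With a current GCC or Clang on an ELF target this pointer typically lands
 * in .init_array (older toolchains used .ctors); either way the function runs
 * before main(), which is the behaviour CONFIG_CONSTRUCTORS relies on. */
__attribute__((constructor))
static void example_ctor(void)
{
        puts("constructor ran before main()");
}

int main(void)
{
        puts("main()");
        return 0;
}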
@@ -2801,11 +2773,12 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
2801 | sizeof(*mod->extable), &mod->num_exentries); | 2773 | sizeof(*mod->extable), &mod->num_exentries); |
2802 | 2774 | ||
2803 | if (section_addr(info, "__obsparm")) | 2775 | if (section_addr(info, "__obsparm")) |
2804 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", | 2776 | pr_warn("%s: Ignoring obsolete parameters\n", mod->name); |
2805 | mod->name); | ||
2806 | 2777 | ||
2807 | info->debug = section_objs(info, "__verbose", | 2778 | info->debug = section_objs(info, "__verbose", |
2808 | sizeof(*info->debug), &info->num_debug); | 2779 | sizeof(*info->debug), &info->num_debug); |
2780 | |||
2781 | return 0; | ||
2809 | } | 2782 | } |
2810 | 2783 | ||
2811 | static int move_module(struct module *mod, struct load_info *info) | 2784 | static int move_module(struct module *mod, struct load_info *info) |
@@ -3078,11 +3051,10 @@ static int do_init_module(struct module *mod) | |||
3078 | return ret; | 3051 | return ret; |
3079 | } | 3052 | } |
3080 | if (ret > 0) { | 3053 | if (ret > 0) { |
3081 | printk(KERN_WARNING | 3054 | pr_warn("%s: '%s'->init suspiciously returned %d, it should " |
3082 | "%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" | 3055 | "follow 0/-E convention\n" |
3083 | "%s: loading module anyway...\n", | 3056 | "%s: loading module anyway...\n", |
3084 | __func__, mod->name, ret, | 3057 | __func__, mod->name, ret, __func__); |
3085 | __func__); | ||
3086 | dump_stack(); | 3058 | dump_stack(); |
3087 | } | 3059 | } |
3088 | 3060 | ||
@@ -3205,10 +3177,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname) | |||
3205 | { | 3177 | { |
3206 | /* Check for magic 'dyndbg' arg */ | 3178 | /* Check for magic 'dyndbg' arg */ |
3207 | int ret = ddebug_dyndbg_module_param_cb(param, val, modname); | 3179 | int ret = ddebug_dyndbg_module_param_cb(param, val, modname); |
3208 | if (ret != 0) { | 3180 | if (ret != 0) |
3209 | printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", | 3181 | pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); |
3210 | modname, param); | ||
3211 | } | ||
3212 | return 0; | 3182 | return 0; |
3213 | } | 3183 | } |
3214 | 3184 | ||
@@ -3243,10 +3213,9 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3243 | #ifdef CONFIG_MODULE_SIG | 3213 | #ifdef CONFIG_MODULE_SIG |
3244 | mod->sig_ok = info->sig_ok; | 3214 | mod->sig_ok = info->sig_ok; |
3245 | if (!mod->sig_ok) { | 3215 | if (!mod->sig_ok) { |
3246 | printk_once(KERN_NOTICE | 3216 | pr_notice_once("%s: module verification failed: signature " |
3247 | "%s: module verification failed: signature and/or" | 3217 | "and/or required key missing - tainting " |
3248 | " required key missing - tainting kernel\n", | 3218 | "kernel\n", mod->name); |
3249 | mod->name); | ||
3250 | add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); | 3219 | add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); |
3251 | } | 3220 | } |
3252 | #endif | 3221 | #endif |
@@ -3263,7 +3232,9 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3263 | 3232 | ||
3264 | /* Now we've got everything in the final locations, we can | 3233 | /* Now we've got everything in the final locations, we can |
3265 | * find optional sections. */ | 3234 | * find optional sections. */ |
3266 | find_module_sections(mod, info); | 3235 | err = find_module_sections(mod, info); |
3236 | if (err) | ||
3237 | goto free_unload; | ||
3267 | 3238 | ||
3268 | err = check_module_license_and_versions(mod); | 3239 | err = check_module_license_and_versions(mod); |
3269 | if (err) | 3240 | if (err) |
diff --git a/kernel/module_signing.c b/kernel/module_signing.c index f2970bddc5ea..be5b8fac4bd0 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <crypto/public_key.h> | 14 | #include <crypto/public_key.h> |
15 | #include <crypto/hash.h> | 15 | #include <crypto/hash.h> |
16 | #include <keys/asymmetric-type.h> | 16 | #include <keys/asymmetric-type.h> |
17 | #include <keys/system_keyring.h> | ||
17 | #include "module-internal.h" | 18 | #include "module-internal.h" |
18 | 19 | ||
19 | /* | 20 | /* |
@@ -28,7 +29,7 @@ | |||
28 | */ | 29 | */ |
29 | struct module_signature { | 30 | struct module_signature { |
30 | u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ | 31 | u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ |
31 | u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ | 32 | u8 hash; /* Digest algorithm [enum hash_algo] */ |
32 | u8 id_type; /* Key identifier type [enum pkey_id_type] */ | 33 | u8 id_type; /* Key identifier type [enum pkey_id_type] */ |
33 | u8 signer_len; /* Length of signer's name */ | 34 | u8 signer_len; /* Length of signer's name */ |
34 | u8 key_id_len; /* Length of key identifier */ | 35 | u8 key_id_len; /* Length of key identifier */ |
@@ -39,7 +40,7 @@ struct module_signature { | |||
39 | /* | 40 | /* |
40 | * Digest the module contents. | 41 | * Digest the module contents. |
41 | */ | 42 | */ |
42 | static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, | 43 | static struct public_key_signature *mod_make_digest(enum hash_algo hash, |
43 | const void *mod, | 44 | const void *mod, |
44 | unsigned long modlen) | 45 | unsigned long modlen) |
45 | { | 46 | { |
@@ -54,7 +55,7 @@ static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, | |||
54 | /* Allocate the hashing algorithm we're going to need and find out how | 55 | /* Allocate the hashing algorithm we're going to need and find out how |
55 | * big the hash operational data will be. | 56 | * big the hash operational data will be. |
56 | */ | 57 | */ |
57 | tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); | 58 | tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); |
58 | if (IS_ERR(tfm)) | 59 | if (IS_ERR(tfm)) |
59 | return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); | 60 | return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); |
60 | 61 | ||
@@ -157,7 +158,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len, | |||
157 | 158 | ||
158 | pr_debug("Look up: \"%s\"\n", id); | 159 | pr_debug("Look up: \"%s\"\n", id); |
159 | 160 | ||
160 | key = keyring_search(make_key_ref(modsign_keyring, 1), | 161 | key = keyring_search(make_key_ref(system_trusted_keyring, 1), |
161 | &key_type_asymmetric, id); | 162 | &key_type_asymmetric, id); |
162 | if (IS_ERR(key)) | 163 | if (IS_ERR(key)) |
163 | pr_warn("Request for unknown module key '%s' err %ld\n", | 164 | pr_warn("Request for unknown module key '%s' err %ld\n", |
@@ -217,7 +218,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) | |||
217 | return -ENOPKG; | 218 | return -ENOPKG; |
218 | 219 | ||
219 | if (ms.hash >= PKEY_HASH__LAST || | 220 | if (ms.hash >= PKEY_HASH__LAST || |
220 | !pkey_hash_algo[ms.hash]) | 221 | !hash_algo_name[ms.hash]) |
221 | return -ENOPKG; | 222 | return -ENOPKG; |
222 | 223 | ||
223 | key = request_asymmetric_key(sig, ms.signer_len, | 224 | key = request_asymmetric_key(sig, ms.signer_len, |
diff --git a/kernel/padata.c b/kernel/padata.c index 07af2c95dcfe..2abd25d79cc8 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) | |||
46 | 46 | ||
47 | static int padata_cpu_hash(struct parallel_data *pd) | 47 | static int padata_cpu_hash(struct parallel_data *pd) |
48 | { | 48 | { |
49 | unsigned int seq_nr; | ||
49 | int cpu_index; | 50 | int cpu_index; |
50 | 51 | ||
51 | /* | 52 | /* |
@@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd) | |||
53 | * seq_nr mod. number of cpus in use. | 54 | * seq_nr mod. number of cpus in use. |
54 | */ | 55 | */ |
55 | 56 | ||
56 | spin_lock(&pd->seq_lock); | 57 | seq_nr = atomic_inc_return(&pd->seq_nr); |
57 | cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); | 58 | cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); |
58 | pd->seq_nr++; | ||
59 | spin_unlock(&pd->seq_lock); | ||
60 | 59 | ||
61 | return padata_index_to_cpu(pd, cpu_index); | 60 | return padata_index_to_cpu(pd, cpu_index); |
62 | } | 61 | } |
@@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, | |||
429 | padata_init_pqueues(pd); | 428 | padata_init_pqueues(pd); |
430 | padata_init_squeues(pd); | 429 | padata_init_squeues(pd); |
431 | setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); | 430 | setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); |
432 | pd->seq_nr = 0; | 431 | atomic_set(&pd->seq_nr, -1); |
433 | atomic_set(&pd->reorder_objects, 0); | 432 | atomic_set(&pd->reorder_objects, 0); |
434 | atomic_set(&pd->refcnt, 0); | 433 | atomic_set(&pd->refcnt, 0); |
435 | pd->pinst = pinst; | 434 | pd->pinst = pinst; |
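The padata change trades a spinlock-protected counter for an atomic one: atomic_inc_return() hands each submitter a unique sequence number, and the modulo over the active CPU mask keeps the round-robin spread without any locking; starting the counter at -1 makes the first increment yield 0. A userspace sketch of the same selection logic with C11 atomics, using a fixed NR_CPUS of 4 for illustration:

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4                       /* illustrative CPU count */

static atomic_uint seq_nr;              /* zero-initialised */

/* Lock-free equivalent of the new padata_cpu_hash(): every caller gets the
 * next sequence number and maps it onto the CPUs round-robin.  C11's
 * atomic_fetch_add() returns the pre-increment value, so the first caller
 * gets index 0, just like the kernel's atomic_set(&pd->seq_nr, -1) followed
 * by atomic_inc_return(). */
static unsigned int pick_cpu(void)
{
        return atomic_fetch_add(&seq_nr, 1) % NR_CPUS;
}

int main(void)
{
        for (int i = 0; i < 8; i++)
                printf("job %d -> cpu %u\n", i, pick_cpu());
        return 0;
}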
diff --git a/kernel/panic.c b/kernel/panic.c index b6c482ccc5db..c00b4ceb39e8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -233,7 +233,7 @@ static const struct tnt tnts[] = { | |||
233 | */ | 233 | */ |
234 | const char *print_tainted(void) | 234 | const char *print_tainted(void) |
235 | { | 235 | { |
236 | static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; | 236 | static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")]; |
237 | 237 | ||
238 | if (tainted_mask) { | 238 | if (tainted_mask) { |
239 | char *s; | 239 | char *s; |
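The one-byte shrink in print_tainted()'s buffer works because sizeof on a string literal already counts the terminating NUL: "Tainted: " is nine visible characters plus '\0', so ARRAY_SIZE(tnts) + sizeof("Tainted: ") holds the prefix, one flag character per entry, and the terminator exactly, and the old + 1 was a spare byte. A compile-time check of that accounting, with a stand-in flag count of 16:

#include <assert.h>
#include <string.h>

#define NR_TAINT_FLAGS 16               /* stand-in for ARRAY_SIZE(tnts) */

static char buf[NR_TAINT_FLAGS + sizeof("Tainted: ")];

/* 9 prefix characters + one character per flag + 1 terminating NUL fit exactly. */
static_assert(sizeof(buf) == 9 + NR_TAINT_FLAGS + 1, "buffer accounting");

int main(void)
{
        strcpy(buf, "Tainted: ");
        memset(buf + 9, 'G', NR_TAINT_FLAGS);
        buf[9 + NR_TAINT_FLAGS] = '\0';
        return 0;
}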
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 42086551a24a..06c62de9c711 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -132,6 +132,12 @@ out: | |||
132 | return ERR_PTR(err); | 132 | return ERR_PTR(err); |
133 | } | 133 | } |
134 | 134 | ||
135 | static void delayed_free_pidns(struct rcu_head *p) | ||
136 | { | ||
137 | kmem_cache_free(pid_ns_cachep, | ||
138 | container_of(p, struct pid_namespace, rcu)); | ||
139 | } | ||
140 | |||
135 | static void destroy_pid_namespace(struct pid_namespace *ns) | 141 | static void destroy_pid_namespace(struct pid_namespace *ns) |
136 | { | 142 | { |
137 | int i; | 143 | int i; |
@@ -140,7 +146,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) | |||
140 | for (i = 0; i < PIDMAP_ENTRIES; i++) | 146 | for (i = 0; i < PIDMAP_ENTRIES; i++) |
141 | kfree(ns->pidmap[i].page); | 147 | kfree(ns->pidmap[i].page); |
142 | put_user_ns(ns->user_ns); | 148 | put_user_ns(ns->user_ns); |
143 | kmem_cache_free(pid_ns_cachep, ns); | 149 | call_rcu(&ns->rcu, delayed_free_pidns); |
144 | } | 150 | } |
145 | 151 | ||
146 | struct pid_namespace *copy_pid_ns(unsigned long flags, | 152 | struct pid_namespace *copy_pid_ns(unsigned long flags, |
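destroy_pid_namespace() now defers the kmem_cache_free() through call_rcu(): the callback is handed only the address of the rcu_head embedded in the namespace and uses container_of() to get back to the enclosing object before freeing it. A userspace illustration of that container_of() step; the stub structure and field names are invented for the example:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head_stub { void *next; };   /* stand-in for struct rcu_head */

struct pidns_stub {                     /* stand-in for struct pid_namespace */
        int level;
        struct rcu_head_stub rcu;
};

/* The deferred-free callback receives only &ns->rcu; container_of() recovers
 * the enclosing object so it can be handed back to the allocator. */
static void delayed_free(struct rcu_head_stub *p)
{
        struct pidns_stub *ns = container_of(p, struct pidns_stub, rcu);

        printf("freeing namespace at level %d\n", ns->level);
}

int main(void)
{
        struct pidns_stub ns = { .level = 1 };

        delayed_free(&ns.rcu);
        return 0;
}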
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index d444c4e834f4..2fac9cc79b3d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -178,6 +178,22 @@ config PM_SLEEP_DEBUG | |||
178 | def_bool y | 178 | def_bool y |
179 | depends on PM_DEBUG && PM_SLEEP | 179 | depends on PM_DEBUG && PM_SLEEP |
180 | 180 | ||
181 | config DPM_WATCHDOG | ||
182 | bool "Device suspend/resume watchdog" | ||
183 | depends on PM_DEBUG && PSTORE | ||
184 | ---help--- | ||
185 | Sets up a watchdog timer to capture drivers that are | ||
186 | locked up attempting to suspend/resume a device. | ||
187 | A detected lockup causes system panic with message | ||
188 | captured in pstore device for inspection in subsequent | ||
189 | boot session. | ||
190 | |||
191 | config DPM_WATCHDOG_TIMEOUT | ||
192 | int "Watchdog timeout in seconds" | ||
193 | range 1 120 | ||
194 | default 12 | ||
195 | depends on DPM_WATCHDOG | ||
196 | |||
181 | config PM_TRACE | 197 | config PM_TRACE |
182 | bool | 198 | bool |
183 | help | 199 | help |
diff --git a/kernel/power/console.c b/kernel/power/console.c index 463aa6736751..eacb8bd8cab4 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c | |||
@@ -81,6 +81,7 @@ void pm_vt_switch_unregister(struct device *dev) | |||
81 | list_for_each_entry(tmp, &pm_vt_switch_list, head) { | 81 | list_for_each_entry(tmp, &pm_vt_switch_list, head) { |
82 | if (tmp->dev == dev) { | 82 | if (tmp->dev == dev) { |
83 | list_del(&tmp->head); | 83 | list_del(&tmp->head); |
84 | kfree(tmp); | ||
84 | break; | 85 | break; |
85 | } | 86 | } |
86 | } | 87 | } |
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index a394297f8b2f..8dff9b48075a 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
@@ -558,30 +558,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
558 | if (count == sizeof(s32)) { | 558 | if (count == sizeof(s32)) { |
559 | if (copy_from_user(&value, buf, sizeof(s32))) | 559 | if (copy_from_user(&value, buf, sizeof(s32))) |
560 | return -EFAULT; | 560 | return -EFAULT; |
561 | } else if (count <= 11) { /* ASCII perhaps? */ | 561 | } else { |
562 | char ascii_value[11]; | ||
563 | unsigned long int ulval; | ||
564 | int ret; | 562 | int ret; |
565 | 563 | ||
566 | if (copy_from_user(ascii_value, buf, count)) | 564 | ret = kstrtos32_from_user(buf, count, 16, &value); |
567 | return -EFAULT; | 565 | if (ret) |
568 | 566 | return ret; | |
569 | if (count > 10) { | ||
570 | if (ascii_value[10] == '\n') | ||
571 | ascii_value[10] = '\0'; | ||
572 | else | ||
573 | return -EINVAL; | ||
574 | } else { | ||
575 | ascii_value[count] = '\0'; | ||
576 | } | ||
577 | ret = kstrtoul(ascii_value, 16, &ulval); | ||
578 | if (ret) { | ||
579 | pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); | ||
580 | return -EINVAL; | ||
581 | } | ||
582 | value = (s32)lower_32_bits(ulval); | ||
583 | } else { | ||
584 | return -EINVAL; | ||
585 | } | 567 | } |
586 | 568 | ||
587 | req = filp->private_data; | 569 | req = filp->private_data; |
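The pm_qos write path drops its hand-rolled ASCII handling in favour of kstrtos32_from_user(), which copies the user buffer, tolerates a trailing newline, converts in the requested base and range-checks the result in one call. A rough userspace approximation of that helper built on strtol, using base 16 to match the call above:

#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Rough stand-in for kstrtos32(): base-16 parse with error and range checking,
 * accepting an optional trailing newline the way the kernel helper does. */
static int parse_s32_hex(const char *s, int32_t *out)
{
        char *end;
        long val;

        errno = 0;
        val = strtol(s, &end, 16);
        if (errno || end == s)
                return -EINVAL;
        if (*end == '\n')
                end++;
        if (*end != '\0')
                return -EINVAL;
        if (val < INT32_MIN || val > INT32_MAX)
                return -ERANGE;
        *out = (int32_t)val;
        return 0;
}

int main(void)
{
        int32_t value;

        if (parse_s32_hex("7fffffff\n", &value) == 0)
                printf("parsed %d\n", value);
        return 0;
}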
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 98c3b34a4cff..b38109e204af 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -792,7 +792,8 @@ void free_basic_memory_bitmaps(void) | |||
792 | { | 792 | { |
793 | struct memory_bitmap *bm1, *bm2; | 793 | struct memory_bitmap *bm1, *bm2; |
794 | 794 | ||
795 | BUG_ON(!(forbidden_pages_map && free_pages_map)); | 795 | if (WARN_ON(!(forbidden_pages_map && free_pages_map))) |
796 | return; | ||
796 | 797 | ||
797 | bm1 = forbidden_pages_map; | 798 | bm1 = forbidden_pages_map; |
798 | bm2 = free_pages_map; | 799 | bm2 = free_pages_map; |
@@ -1402,7 +1403,11 @@ int hibernate_preallocate_memory(void) | |||
1402 | * highmem and non-highmem zones separately. | 1403 | * highmem and non-highmem zones separately. |
1403 | */ | 1404 | */ |
1404 | pages_highmem = preallocate_image_highmem(highmem / 2); | 1405 | pages_highmem = preallocate_image_highmem(highmem / 2); |
1405 | alloc = (count - max_size) - pages_highmem; | 1406 | alloc = count - max_size; |
1407 | if (alloc > pages_highmem) | ||
1408 | alloc -= pages_highmem; | ||
1409 | else | ||
1410 | alloc = 0; | ||
1406 | pages = preallocate_image_memory(alloc, avail_normal); | 1411 | pages = preallocate_image_memory(alloc, avail_normal); |
1407 | if (pages < alloc) { | 1412 | if (pages < alloc) { |
1408 | /* We have exhausted non-highmem pages, try highmem. */ | 1413 | /* We have exhausted non-highmem pages, try highmem. */ |
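The preallocation fix guards a subtraction of unsigned page counts: when pages_highmem already covers what count - max_size asked for, the old expression wrapped around to an enormous value instead of reaching zero, so the new code only subtracts while the result stays non-negative. The hazard in isolation, with made-up page counts:

#include <stdio.h>

int main(void)
{
        unsigned long count = 1000, max_size = 900, pages_highmem = 150;
        unsigned long alloc;

        /* Old expression: wraps around to a huge value whenever pages_highmem
         * exceeds the remainder. */
        alloc = (count - max_size) - pages_highmem;
        printf("unclamped: %lu\n", alloc);

        /* New logic: clamp at zero before using the value as an allocation count. */
        alloc = count - max_size;
        if (alloc > pages_highmem)
                alloc -= pages_highmem;
        else
                alloc = 0;
        printf("clamped:   %lu\n", alloc);
        return 0;
}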
diff --git a/kernel/power/user.c b/kernel/power/user.c index 957f06164ad1..98d357584cd6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -36,9 +36,9 @@ static struct snapshot_data { | |||
36 | struct snapshot_handle handle; | 36 | struct snapshot_handle handle; |
37 | int swap; | 37 | int swap; |
38 | int mode; | 38 | int mode; |
39 | char frozen; | 39 | bool frozen; |
40 | char ready; | 40 | bool ready; |
41 | char platform_support; | 41 | bool platform_support; |
42 | bool free_bitmaps; | 42 | bool free_bitmaps; |
43 | } snapshot_state; | 43 | } snapshot_state; |
44 | 44 | ||
@@ -70,6 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
70 | data->swap = swsusp_resume_device ? | 70 | data->swap = swsusp_resume_device ? |
71 | swap_type_of(swsusp_resume_device, 0, NULL) : -1; | 71 | swap_type_of(swsusp_resume_device, 0, NULL) : -1; |
72 | data->mode = O_RDONLY; | 72 | data->mode = O_RDONLY; |
73 | data->free_bitmaps = false; | ||
73 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); | 74 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); |
74 | if (error) | 75 | if (error) |
75 | pm_notifier_call_chain(PM_POST_HIBERNATION); | 76 | pm_notifier_call_chain(PM_POST_HIBERNATION); |
@@ -93,9 +94,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
93 | if (error) | 94 | if (error) |
94 | atomic_inc(&snapshot_device_available); | 95 | atomic_inc(&snapshot_device_available); |
95 | 96 | ||
96 | data->frozen = 0; | 97 | data->frozen = false; |
97 | data->ready = 0; | 98 | data->ready = false; |
98 | data->platform_support = 0; | 99 | data->platform_support = false; |
99 | 100 | ||
100 | Unlock: | 101 | Unlock: |
101 | unlock_system_sleep(); | 102 | unlock_system_sleep(); |
@@ -229,7 +230,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
229 | if (error) | 230 | if (error) |
230 | thaw_processes(); | 231 | thaw_processes(); |
231 | else | 232 | else |
232 | data->frozen = 1; | 233 | data->frozen = true; |
233 | 234 | ||
234 | break; | 235 | break; |
235 | 236 | ||
@@ -240,7 +241,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
240 | free_basic_memory_bitmaps(); | 241 | free_basic_memory_bitmaps(); |
241 | data->free_bitmaps = false; | 242 | data->free_bitmaps = false; |
242 | thaw_processes(); | 243 | thaw_processes(); |
243 | data->frozen = 0; | 244 | data->frozen = false; |
244 | break; | 245 | break; |
245 | 246 | ||
246 | case SNAPSHOT_CREATE_IMAGE: | 247 | case SNAPSHOT_CREATE_IMAGE: |
@@ -270,7 +271,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
270 | case SNAPSHOT_FREE: | 271 | case SNAPSHOT_FREE: |
271 | swsusp_free(); | 272 | swsusp_free(); |
272 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); | 273 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); |
273 | data->ready = 0; | 274 | data->ready = false; |
274 | /* | 275 | /* |
275 | * It is necessary to thaw kernel threads here, because | 276 | * It is necessary to thaw kernel threads here, because |
276 | * SNAPSHOT_CREATE_IMAGE may be invoked directly after | 277 | * SNAPSHOT_CREATE_IMAGE may be invoked directly after |
@@ -334,7 +335,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
334 | * PM_HIBERNATION_PREPARE | 335 | * PM_HIBERNATION_PREPARE |
335 | */ | 336 | */ |
336 | error = suspend_devices_and_enter(PM_SUSPEND_MEM); | 337 | error = suspend_devices_and_enter(PM_SUSPEND_MEM); |
337 | data->ready = 0; | 338 | data->ready = false; |
338 | break; | 339 | break; |
339 | 340 | ||
340 | case SNAPSHOT_PLATFORM_SUPPORT: | 341 | case SNAPSHOT_PLATFORM_SUPPORT: |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b4e8500afdb3..be7c86bae576 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -705,9 +705,9 @@ const struct file_operations kmsg_fops = { | |||
705 | 705 | ||
706 | #ifdef CONFIG_KEXEC | 706 | #ifdef CONFIG_KEXEC |
707 | /* | 707 | /* |
708 | * This appends the listed symbols to /proc/vmcoreinfo | 708 | * This appends the listed symbols to /proc/vmcore |
709 | * | 709 | * |
710 | * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to | 710 | * /proc/vmcore is used by various utilities, like crash and makedumpfile to |
711 | * obtain access to symbols that are otherwise very difficult to locate. These | 711 | * obtain access to symbols that are otherwise very difficult to locate. These |
712 | * symbols are specifically used so that utilities can access and extract the | 712 | * symbols are specifically used so that utilities can access and extract the |
713 | * dmesg log from a vmcore file after a crash. | 713 | * dmesg log from a vmcore file after a crash. |
@@ -791,7 +791,7 @@ static bool __read_mostly ignore_loglevel; | |||
791 | static int __init ignore_loglevel_setup(char *str) | 791 | static int __init ignore_loglevel_setup(char *str) |
792 | { | 792 | { |
793 | ignore_loglevel = 1; | 793 | ignore_loglevel = 1; |
794 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); | 794 | pr_info("debug: ignoring loglevel setting.\n"); |
795 | 795 | ||
796 | return 0; | 796 | return 0; |
797 | } | 797 | } |
@@ -820,9 +820,9 @@ static int __init boot_delay_setup(char *str) | |||
820 | pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " | 820 | pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " |
821 | "HZ: %d, loops_per_msec: %llu\n", | 821 | "HZ: %d, loops_per_msec: %llu\n", |
822 | boot_delay, preset_lpj, lpj, HZ, loops_per_msec); | 822 | boot_delay, preset_lpj, lpj, HZ, loops_per_msec); |
823 | return 1; | 823 | return 0; |
824 | } | 824 | } |
825 | __setup("boot_delay=", boot_delay_setup); | 825 | early_param("boot_delay", boot_delay_setup); |
826 | 826 | ||
827 | static void boot_delay_msec(int level) | 827 | static void boot_delay_msec(int level) |
828 | { | 828 | { |
@@ -2193,7 +2193,7 @@ static int __read_mostly keep_bootcon; | |||
2193 | static int __init keep_bootcon_setup(char *str) | 2193 | static int __init keep_bootcon_setup(char *str) |
2194 | { | 2194 | { |
2195 | keep_bootcon = 1; | 2195 | keep_bootcon = 1; |
2196 | printk(KERN_INFO "debug: skip boot console de-registration.\n"); | 2196 | pr_info("debug: skip boot console de-registration.\n"); |
2197 | 2197 | ||
2198 | return 0; | 2198 | return 0; |
2199 | } | 2199 | } |
@@ -2241,7 +2241,7 @@ void register_console(struct console *newcon) | |||
2241 | /* find the last or real console */ | 2241 | /* find the last or real console */ |
2242 | for_each_console(bcon) { | 2242 | for_each_console(bcon) { |
2243 | if (!(bcon->flags & CON_BOOT)) { | 2243 | if (!(bcon->flags & CON_BOOT)) { |
2244 | printk(KERN_INFO "Too late to register bootconsole %s%d\n", | 2244 | pr_info("Too late to register bootconsole %s%d\n", |
2245 | newcon->name, newcon->index); | 2245 | newcon->name, newcon->index); |
2246 | return; | 2246 | return; |
2247 | } | 2247 | } |
@@ -2358,21 +2358,18 @@ void register_console(struct console *newcon) | |||
2358 | * users know there might be something in the kernel's log buffer that | 2358 | * users know there might be something in the kernel's log buffer that |
2359 | * went to the bootconsole (that they do not see on the real console) | 2359 | * went to the bootconsole (that they do not see on the real console) |
2360 | */ | 2360 | */ |
2361 | pr_info("%sconsole [%s%d] enabled\n", | ||
2362 | (newcon->flags & CON_BOOT) ? "boot" : "" , | ||
2363 | newcon->name, newcon->index); | ||
2361 | if (bcon && | 2364 | if (bcon && |
2362 | ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && | 2365 | ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && |
2363 | !keep_bootcon) { | 2366 | !keep_bootcon) { |
2364 | /* we need to iterate through twice, to make sure we print | 2367 | /* We need to iterate through all boot consoles, to make |
2365 | * everything out, before we unregister the console(s) | 2368 | * sure we print everything out, before we unregister them. |
2366 | */ | 2369 | */ |
2367 | printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", | ||
2368 | newcon->name, newcon->index); | ||
2369 | for_each_console(bcon) | 2370 | for_each_console(bcon) |
2370 | if (bcon->flags & CON_BOOT) | 2371 | if (bcon->flags & CON_BOOT) |
2371 | unregister_console(bcon); | 2372 | unregister_console(bcon); |
2372 | } else { | ||
2373 | printk(KERN_INFO "%sconsole [%s%d] enabled\n", | ||
2374 | (newcon->flags & CON_BOOT) ? "boot" : "" , | ||
2375 | newcon->name, newcon->index); | ||
2376 | } | 2373 | } |
2377 | } | 2374 | } |
2378 | EXPORT_SYMBOL(register_console); | 2375 | EXPORT_SYMBOL(register_console); |
@@ -2382,6 +2379,10 @@ int unregister_console(struct console *console) | |||
2382 | struct console *a, *b; | 2379 | struct console *a, *b; |
2383 | int res; | 2380 | int res; |
2384 | 2381 | ||
2382 | pr_info("%sconsole [%s%d] disabled\n", | ||
2383 | (console->flags & CON_BOOT) ? "boot" : "" , | ||
2384 | console->name, console->index); | ||
2385 | |||
2385 | res = _braille_unregister_console(console); | 2386 | res = _braille_unregister_console(console); |
2386 | if (res) | 2387 | if (res) |
2387 | return res; | 2388 | return res; |
@@ -2421,8 +2422,6 @@ static int __init printk_late_init(void) | |||
2421 | 2422 | ||
2422 | for_each_console(con) { | 2423 | for_each_console(con) { |
2423 | if (!keep_bootcon && con->flags & CON_BOOT) { | 2424 | if (!keep_bootcon && con->flags & CON_BOOT) { |
2424 | printk(KERN_INFO "turn off boot console %s%d\n", | ||
2425 | con->name, con->index); | ||
2426 | unregister_console(con); | 2425 | unregister_console(con); |
2427 | } | 2426 | } |
2428 | } | 2427 | } |
@@ -2449,7 +2448,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) | |||
2449 | 2448 | ||
2450 | if (pending & PRINTK_PENDING_SCHED) { | 2449 | if (pending & PRINTK_PENDING_SCHED) { |
2451 | char *buf = __get_cpu_var(printk_sched_buf); | 2450 | char *buf = __get_cpu_var(printk_sched_buf); |
2452 | printk(KERN_WARNING "[sched_delayed] %s", buf); | 2451 | pr_warn("[sched_delayed] %s", buf); |
2453 | } | 2452 | } |
2454 | 2453 | ||
2455 | if (pending & PRINTK_PENDING_WAKEUP) | 2454 | if (pending & PRINTK_PENDING_WAKEUP) |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index dd562e9aa2c8..1f4bcb3cc21c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -257,7 +257,8 @@ ok: | |||
257 | if (task->mm) | 257 | if (task->mm) |
258 | dumpable = get_dumpable(task->mm); | 258 | dumpable = get_dumpable(task->mm); |
259 | rcu_read_lock(); | 259 | rcu_read_lock(); |
260 | if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { | 260 | if (dumpable != SUID_DUMP_USER && |
261 | !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { | ||
261 | rcu_read_unlock(); | 262 | rcu_read_unlock(); |
262 | return -EPERM; | 263 | return -EPERM; |
263 | } | 264 | } |
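The ptrace_may_access() tightening matters because get_dumpable() is not a boolean: it reports SUID_DUMP_DISABLE (0), SUID_DUMP_USER (1) or SUID_DUMP_ROOT (2), and the old !dumpable test let the root-only value 2 through without a capability check. Comparing against SUID_DUMP_USER makes every non-default state require CAP_SYS_PTRACE. The difference in miniature (enum values as used by the kernel):

#include <stdbool.h>
#include <stdio.h>

enum suid_dumpable {                    /* values as used by get_dumpable() */
        SUID_DUMP_DISABLE = 0,
        SUID_DUMP_USER    = 1,
        SUID_DUMP_ROOT    = 2,
};

/* Old check: only the DISABLE case demanded the capability. */
static bool needs_cap_old(int dumpable) { return !dumpable; }

/* New check: anything other than the ordinary USER case does. */
static bool needs_cap_new(int dumpable) { return dumpable != SUID_DUMP_USER; }

int main(void)
{
        for (int d = SUID_DUMP_DISABLE; d <= SUID_DUMP_ROOT; d++)
                printf("dumpable=%d old=%d new=%d\n",
                       d, needs_cap_old(d), needs_cap_new(d));
        return 0;
}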
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile new file mode 100644 index 000000000000..01e9ec37a3e3 --- /dev/null +++ b/kernel/rcu/Makefile | |||
@@ -0,0 +1,6 @@ | |||
1 | obj-y += update.o srcu.o | ||
2 | obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o | ||
3 | obj-$(CONFIG_TREE_RCU) += tree.o | ||
4 | obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o | ||
5 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o | ||
6 | obj-$(CONFIG_TINY_RCU) += tiny.o | ||
diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h index 77131966c4ad..7859a0a3951e 100644 --- a/kernel/rcu.h +++ b/kernel/rcu/rcu.h | |||
@@ -122,4 +122,11 @@ int rcu_jiffies_till_stall_check(void); | |||
122 | 122 | ||
123 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | 123 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ |
124 | 124 | ||
125 | /* | ||
126 | * Strings used in tracepoints need to be exported via the | ||
127 | * tracing system such that tools like perf and trace-cmd can | ||
128 | * translate the string address pointers to actual text. | ||
129 | */ | ||
130 | #define TPS(x) tracepoint_string(x) | ||
131 | |||
125 | #endif /* __LINUX_RCU_H */ | 132 | #endif /* __LINUX_RCU_H */ |
diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c index 01d5ccb8bfe3..01d5ccb8bfe3 100644 --- a/kernel/srcu.c +++ b/kernel/rcu/srcu.c | |||
diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c index 9ed6075dc562..1254f312d024 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcu/tiny.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/time.h> | 35 | #include <linux/time.h> |
36 | #include <linux/cpu.h> | 36 | #include <linux/cpu.h> |
37 | #include <linux/prefetch.h> | 37 | #include <linux/prefetch.h> |
38 | #include <linux/ftrace_event.h> | ||
38 | 39 | ||
39 | #ifdef CONFIG_RCU_TRACE | 40 | #ifdef CONFIG_RCU_TRACE |
40 | #include <trace/events/rcu.h> | 41 | #include <trace/events/rcu.h> |
@@ -42,7 +43,7 @@ | |||
42 | 43 | ||
43 | #include "rcu.h" | 44 | #include "rcu.h" |
44 | 45 | ||
45 | /* Forward declarations for rcutiny_plugin.h. */ | 46 | /* Forward declarations for tiny_plugin.h. */ |
46 | struct rcu_ctrlblk; | 47 | struct rcu_ctrlblk; |
47 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 48 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); |
48 | static void rcu_process_callbacks(struct softirq_action *unused); | 49 | static void rcu_process_callbacks(struct softirq_action *unused); |
@@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head, | |||
52 | 53 | ||
53 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 54 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
54 | 55 | ||
55 | #include "rcutiny_plugin.h" | 56 | #include "tiny_plugin.h" |
56 | 57 | ||
57 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ | 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ |
58 | static void rcu_idle_enter_common(long long newval) | 59 | static void rcu_idle_enter_common(long long newval) |
59 | { | 60 | { |
60 | if (newval) { | 61 | if (newval) { |
61 | RCU_TRACE(trace_rcu_dyntick("--=", | 62 | RCU_TRACE(trace_rcu_dyntick(TPS("--="), |
62 | rcu_dynticks_nesting, newval)); | 63 | rcu_dynticks_nesting, newval)); |
63 | rcu_dynticks_nesting = newval; | 64 | rcu_dynticks_nesting = newval; |
64 | return; | 65 | return; |
65 | } | 66 | } |
66 | RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); | 67 | RCU_TRACE(trace_rcu_dyntick(TPS("Start"), |
68 | rcu_dynticks_nesting, newval)); | ||
67 | if (!is_idle_task(current)) { | 69 | if (!is_idle_task(current)) { |
68 | struct task_struct *idle = idle_task(smp_processor_id()); | 70 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); |
69 | 71 | ||
70 | RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", | 72 | RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), |
71 | rcu_dynticks_nesting, newval)); | 73 | rcu_dynticks_nesting, newval)); |
72 | ftrace_dump(DUMP_ALL); | 74 | ftrace_dump(DUMP_ALL); |
73 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 75 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
@@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit); | |||
120 | static void rcu_idle_exit_common(long long oldval) | 122 | static void rcu_idle_exit_common(long long oldval) |
121 | { | 123 | { |
122 | if (oldval) { | 124 | if (oldval) { |
123 | RCU_TRACE(trace_rcu_dyntick("++=", | 125 | RCU_TRACE(trace_rcu_dyntick(TPS("++="), |
124 | oldval, rcu_dynticks_nesting)); | 126 | oldval, rcu_dynticks_nesting)); |
125 | return; | 127 | return; |
126 | } | 128 | } |
127 | RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); | 129 | RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); |
128 | if (!is_idle_task(current)) { | 130 | if (!is_idle_task(current)) { |
129 | struct task_struct *idle = idle_task(smp_processor_id()); | 131 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); |
130 | 132 | ||
131 | RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", | 133 | RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), |
132 | oldval, rcu_dynticks_nesting)); | 134 | oldval, rcu_dynticks_nesting)); |
133 | ftrace_dump(DUMP_ALL); | 135 | ftrace_dump(DUMP_ALL); |
134 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 136 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
@@ -174,18 +176,18 @@ void rcu_irq_enter(void) | |||
174 | } | 176 | } |
175 | EXPORT_SYMBOL_GPL(rcu_irq_enter); | 177 | EXPORT_SYMBOL_GPL(rcu_irq_enter); |
176 | 178 | ||
177 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 179 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) |
178 | 180 | ||
179 | /* | 181 | /* |
180 | * Test whether RCU thinks that the current CPU is idle. | 182 | * Test whether RCU thinks that the current CPU is idle. |
181 | */ | 183 | */ |
182 | int rcu_is_cpu_idle(void) | 184 | bool notrace __rcu_is_watching(void) |
183 | { | 185 | { |
184 | return !rcu_dynticks_nesting; | 186 | return rcu_dynticks_nesting; |
185 | } | 187 | } |
186 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 188 | EXPORT_SYMBOL(__rcu_is_watching); |
187 | 189 | ||
188 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 190 | #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ |
189 | 191 | ||
190 | /* | 192 | /* |
191 | * Test whether the current CPU was interrupted from idle. Nested | 193 | * Test whether the current CPU was interrupted from idle. Nested |
@@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
273 | if (&rcp->rcucblist == rcp->donetail) { | 275 | if (&rcp->rcucblist == rcp->donetail) { |
274 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); | 276 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); |
275 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, | 277 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, |
276 | ACCESS_ONCE(rcp->rcucblist), | 278 | !!ACCESS_ONCE(rcp->rcucblist), |
277 | need_resched(), | 279 | need_resched(), |
278 | is_idle_task(current), | 280 | is_idle_task(current), |
279 | false)); | 281 | false)); |
@@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
304 | RCU_TRACE(cb_count++); | 306 | RCU_TRACE(cb_count++); |
305 | } | 307 | } |
306 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | 308 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); |
307 | RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), | 309 | RCU_TRACE(trace_rcu_batch_end(rcp->name, |
310 | cb_count, 0, need_resched(), | ||
308 | is_idle_task(current), | 311 | is_idle_task(current), |
309 | false)); | 312 | false)); |
310 | } | 313 | } |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h index 280d06cae352..280d06cae352 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h | |||
diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c index be63101c6175..3929cd451511 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcu/torture.c | |||
@@ -52,6 +52,12 @@ | |||
52 | MODULE_LICENSE("GPL"); | 52 | MODULE_LICENSE("GPL"); |
53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); | 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); |
54 | 54 | ||
55 | MODULE_ALIAS("rcutorture"); | ||
56 | #ifdef MODULE_PARAM_PREFIX | ||
57 | #undef MODULE_PARAM_PREFIX | ||
58 | #endif | ||
59 | #define MODULE_PARAM_PREFIX "rcutorture." | ||
60 | |||
55 | static int fqs_duration; | 61 | static int fqs_duration; |
56 | module_param(fqs_duration, int, 0444); | 62 | module_param(fqs_duration, int, 0444); |
57 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); | 63 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); |
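Redefining MODULE_PARAM_PREFIX keeps the user-visible parameter names stable after the move into kernel/rcu/: the module_param() declarations in this file still surface under "rcutorture.", so existing boot lines and sysfs paths keep working. For example (the value 100 is only illustrative):

    rcutorture.fqs_duration=100                        kernel command line, built-in case
    /sys/module/rcutorture/parameters/fqs_duration     sysfs path when built as a module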
diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c index 32618b3fe4e6..dd081987a8ec 100644 --- a/kernel/rcutree.c +++ b/kernel/rcu/tree.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/export.h> | 41 | #include <linux/export.h> |
42 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
43 | #include <linux/moduleparam.h> | 43 | #include <linux/moduleparam.h> |
44 | #include <linux/module.h> | ||
44 | #include <linux/percpu.h> | 45 | #include <linux/percpu.h> |
45 | #include <linux/notifier.h> | 46 | #include <linux/notifier.h> |
46 | #include <linux/cpu.h> | 47 | #include <linux/cpu.h> |
@@ -56,17 +57,16 @@ | |||
56 | #include <linux/ftrace_event.h> | 57 | #include <linux/ftrace_event.h> |
57 | #include <linux/suspend.h> | 58 | #include <linux/suspend.h> |
58 | 59 | ||
59 | #include "rcutree.h" | 60 | #include "tree.h" |
60 | #include <trace/events/rcu.h> | 61 | #include <trace/events/rcu.h> |
61 | 62 | ||
62 | #include "rcu.h" | 63 | #include "rcu.h" |
63 | 64 | ||
64 | /* | 65 | MODULE_ALIAS("rcutree"); |
65 | * Strings used in tracepoints need to be exported via the | 66 | #ifdef MODULE_PARAM_PREFIX |
66 | * tracing system such that tools like perf and trace-cmd can | 67 | #undef MODULE_PARAM_PREFIX |
67 | * translate the string address pointers to actual text. | 68 | #endif |
68 | */ | 69 | #define MODULE_PARAM_PREFIX "rcutree." |
69 | #define TPS(x) tracepoint_string(x) | ||
70 | 70 | ||
71 | /* Data structures. */ | 71 | /* Data structures. */ |
72 | 72 | ||
@@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu) | |||
222 | } | 222 | } |
223 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 223 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
224 | 224 | ||
225 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 225 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
226 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 226 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
227 | .dynticks = ATOMIC_INIT(1), | 227 | .dynticks = ATOMIC_INIT(1), |
228 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 228 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
@@ -371,7 +371,8 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | |||
371 | { | 371 | { |
372 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); | 372 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); |
373 | if (!user && !is_idle_task(current)) { | 373 | if (!user && !is_idle_task(current)) { |
374 | struct task_struct *idle = idle_task(smp_processor_id()); | 374 | struct task_struct *idle __maybe_unused = |
375 | idle_task(smp_processor_id()); | ||
375 | 376 | ||
376 | trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); | 377 | trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); |
377 | ftrace_dump(DUMP_ORIG); | 378 | ftrace_dump(DUMP_ORIG); |
@@ -407,7 +408,7 @@ static void rcu_eqs_enter(bool user) | |||
407 | long long oldval; | 408 | long long oldval; |
408 | struct rcu_dynticks *rdtp; | 409 | struct rcu_dynticks *rdtp; |
409 | 410 | ||
410 | rdtp = &__get_cpu_var(rcu_dynticks); | 411 | rdtp = this_cpu_ptr(&rcu_dynticks); |
411 | oldval = rdtp->dynticks_nesting; | 412 | oldval = rdtp->dynticks_nesting; |
412 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); | 413 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); |
413 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) | 414 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) |
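The rdtp assignment above is one instance of a conversion repeated throughout this file: the deprecated __get_cpu_var() lvalue is replaced by this_cpu_ptr() for pointers and __this_cpu_read() for plain scalar loads. A minimal sketch of the two forms, using a hypothetical per-CPU variable rather than rcu_dynticks:

#include <linux/percpu.h>

struct example_state {
        long long nesting;
};
static DEFINE_PER_CPU(struct example_state, example_state);

/* Caller is expected to have preemption (or interrupts) disabled. */
static void example_accessors(void)
{
        struct example_state *p;
        long long n;

        /* Old form: take the address of the dereferenced per-CPU lvalue. */
        /*      p = &__get_cpu_var(example_state);                        */

        /* New form: pass the address and let the macro relocate it.      */
        p = this_cpu_ptr(&example_state);

        /* For a single scalar load, a read accessor is sufficient.       */
        n = __this_cpu_read(example_state.nesting);

        (void)p;
        (void)n;
}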
@@ -435,7 +436,7 @@ void rcu_idle_enter(void) | |||
435 | 436 | ||
436 | local_irq_save(flags); | 437 | local_irq_save(flags); |
437 | rcu_eqs_enter(false); | 438 | rcu_eqs_enter(false); |
438 | rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); | 439 | rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); |
439 | local_irq_restore(flags); | 440 | local_irq_restore(flags); |
440 | } | 441 | } |
441 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 442 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
@@ -478,7 +479,7 @@ void rcu_irq_exit(void) | |||
478 | struct rcu_dynticks *rdtp; | 479 | struct rcu_dynticks *rdtp; |
479 | 480 | ||
480 | local_irq_save(flags); | 481 | local_irq_save(flags); |
481 | rdtp = &__get_cpu_var(rcu_dynticks); | 482 | rdtp = this_cpu_ptr(&rcu_dynticks); |
482 | oldval = rdtp->dynticks_nesting; | 483 | oldval = rdtp->dynticks_nesting; |
483 | rdtp->dynticks_nesting--; | 484 | rdtp->dynticks_nesting--; |
484 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); | 485 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); |
@@ -508,7 +509,8 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | |||
508 | rcu_cleanup_after_idle(smp_processor_id()); | 509 | rcu_cleanup_after_idle(smp_processor_id()); |
509 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); | 510 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); |
510 | if (!user && !is_idle_task(current)) { | 511 | if (!user && !is_idle_task(current)) { |
511 | struct task_struct *idle = idle_task(smp_processor_id()); | 512 | struct task_struct *idle __maybe_unused = |
513 | idle_task(smp_processor_id()); | ||
512 | 514 | ||
513 | trace_rcu_dyntick(TPS("Error on exit: not idle task"), | 515 | trace_rcu_dyntick(TPS("Error on exit: not idle task"), |
514 | oldval, rdtp->dynticks_nesting); | 516 | oldval, rdtp->dynticks_nesting); |
@@ -528,7 +530,7 @@ static void rcu_eqs_exit(bool user) | |||
528 | struct rcu_dynticks *rdtp; | 530 | struct rcu_dynticks *rdtp; |
529 | long long oldval; | 531 | long long oldval; |
530 | 532 | ||
531 | rdtp = &__get_cpu_var(rcu_dynticks); | 533 | rdtp = this_cpu_ptr(&rcu_dynticks); |
532 | oldval = rdtp->dynticks_nesting; | 534 | oldval = rdtp->dynticks_nesting; |
533 | WARN_ON_ONCE(oldval < 0); | 535 | WARN_ON_ONCE(oldval < 0); |
534 | if (oldval & DYNTICK_TASK_NEST_MASK) | 536 | if (oldval & DYNTICK_TASK_NEST_MASK) |
@@ -555,7 +557,7 @@ void rcu_idle_exit(void) | |||
555 | 557 | ||
556 | local_irq_save(flags); | 558 | local_irq_save(flags); |
557 | rcu_eqs_exit(false); | 559 | rcu_eqs_exit(false); |
558 | rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); | 560 | rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); |
559 | local_irq_restore(flags); | 561 | local_irq_restore(flags); |
560 | } | 562 | } |
561 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 563 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
@@ -599,7 +601,7 @@ void rcu_irq_enter(void) | |||
599 | long long oldval; | 601 | long long oldval; |
600 | 602 | ||
601 | local_irq_save(flags); | 603 | local_irq_save(flags); |
602 | rdtp = &__get_cpu_var(rcu_dynticks); | 604 | rdtp = this_cpu_ptr(&rcu_dynticks); |
603 | oldval = rdtp->dynticks_nesting; | 605 | oldval = rdtp->dynticks_nesting; |
604 | rdtp->dynticks_nesting++; | 606 | rdtp->dynticks_nesting++; |
605 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); | 607 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); |
@@ -620,7 +622,7 @@ void rcu_irq_enter(void) | |||
620 | */ | 622 | */ |
621 | void rcu_nmi_enter(void) | 623 | void rcu_nmi_enter(void) |
622 | { | 624 | { |
623 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 625 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
624 | 626 | ||
625 | if (rdtp->dynticks_nmi_nesting == 0 && | 627 | if (rdtp->dynticks_nmi_nesting == 0 && |
626 | (atomic_read(&rdtp->dynticks) & 0x1)) | 628 | (atomic_read(&rdtp->dynticks) & 0x1)) |
@@ -642,7 +644,7 @@ void rcu_nmi_enter(void) | |||
642 | */ | 644 | */ |
643 | void rcu_nmi_exit(void) | 645 | void rcu_nmi_exit(void) |
644 | { | 646 | { |
645 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 647 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
646 | 648 | ||
647 | if (rdtp->dynticks_nmi_nesting == 0 || | 649 | if (rdtp->dynticks_nmi_nesting == 0 || |
648 | --rdtp->dynticks_nmi_nesting != 0) | 650 | --rdtp->dynticks_nmi_nesting != 0) |
@@ -655,21 +657,34 @@ void rcu_nmi_exit(void) | |||
655 | } | 657 | } |
656 | 658 | ||
657 | /** | 659 | /** |
658 | * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle | 660 | * __rcu_is_watching - are RCU read-side critical sections safe? |
661 | * | ||
662 | * Return true if RCU is watching the running CPU, which means that | ||
663 | * this CPU can safely enter RCU read-side critical sections. Unlike | ||
664 | * rcu_is_watching(), the caller of __rcu_is_watching() must have at | ||
665 | * least disabled preemption. | ||
666 | */ | ||
667 | bool notrace __rcu_is_watching(void) | ||
668 | { | ||
669 | return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; | ||
670 | } | ||
671 | |||
672 | /** | ||
673 | * rcu_is_watching - see if RCU thinks that the current CPU is idle | ||
659 | * | 674 | * |
660 | * If the current CPU is in its idle loop and is neither in an interrupt | 675 | * If the current CPU is in its idle loop and is neither in an interrupt |
661 | * nor an NMI handler, return true. | 676 | * nor an NMI handler, return true. |
662 | */ | 677 | */ |
663 | int rcu_is_cpu_idle(void) | 678 | bool notrace rcu_is_watching(void) |
664 | { | 679 | { |
665 | int ret; | 680 | int ret; |
666 | 681 | ||
667 | preempt_disable(); | 682 | preempt_disable(); |
668 | ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; | 683 | ret = __rcu_is_watching(); |
669 | preempt_enable(); | 684 | preempt_enable(); |
670 | return ret; | 685 | return ret; |
671 | } | 686 | } |
672 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 687 | EXPORT_SYMBOL_GPL(rcu_is_watching); |
673 | 688 | ||
674 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) | 689 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) |
675 | 690 | ||
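The hunk above renames rcu_is_cpu_idle() to rcu_is_watching() and inverts its sense (it now reports that RCU *is* paying attention), adding a lock-free __rcu_is_watching() for callers that already have preemption disabled. A hedged sketch of a caller adapting to the rename; the function below is hypothetical:

static void example_rcu_deref(void)
{
        /* Before this patch the guard read: if (rcu_is_cpu_idle()) return; */
        if (!rcu_is_watching())
                return;         /* RCU not watching: rcu_read_lock() unsafe here */

        rcu_read_lock();
        /* ... dereference RCU-protected data here ... */
        rcu_read_unlock();
}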
@@ -703,7 +718,7 @@ bool rcu_lockdep_current_cpu_online(void) | |||
703 | if (in_nmi()) | 718 | if (in_nmi()) |
704 | return 1; | 719 | return 1; |
705 | preempt_disable(); | 720 | preempt_disable(); |
706 | rdp = &__get_cpu_var(rcu_sched_data); | 721 | rdp = this_cpu_ptr(&rcu_sched_data); |
707 | rnp = rdp->mynode; | 722 | rnp = rdp->mynode; |
708 | ret = (rdp->grpmask & rnp->qsmaskinit) || | 723 | ret = (rdp->grpmask & rnp->qsmaskinit) || |
709 | !rcu_scheduler_fully_active; | 724 | !rcu_scheduler_fully_active; |
@@ -723,7 +738,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); | |||
723 | */ | 738 | */ |
724 | static int rcu_is_cpu_rrupt_from_idle(void) | 739 | static int rcu_is_cpu_rrupt_from_idle(void) |
725 | { | 740 | { |
726 | return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; | 741 | return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; |
727 | } | 742 | } |
728 | 743 | ||
729 | /* | 744 | /* |
@@ -802,8 +817,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
802 | 817 | ||
803 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 818 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
804 | { | 819 | { |
805 | rsp->gp_start = jiffies; | 820 | unsigned long j = ACCESS_ONCE(jiffies); |
806 | rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | 821 | |
822 | rsp->gp_start = j; | ||
823 | smp_wmb(); /* Record start time before stall time. */ | ||
824 | rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); | ||
807 | } | 825 | } |
808 | 826 | ||
809 | /* | 827 | /* |
@@ -898,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
898 | force_quiescent_state(rsp); /* Kick them all. */ | 916 | force_quiescent_state(rsp); /* Kick them all. */ |
899 | } | 917 | } |
900 | 918 | ||
919 | /* | ||
920 | * This function really isn't for public consumption, but RCU is special in | ||
921 | * that context switches can allow the state machine to make progress. | ||
922 | */ | ||
923 | extern void resched_cpu(int cpu); | ||
924 | |||
901 | static void print_cpu_stall(struct rcu_state *rsp) | 925 | static void print_cpu_stall(struct rcu_state *rsp) |
902 | { | 926 | { |
903 | int cpu; | 927 | int cpu; |
@@ -927,22 +951,60 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
927 | 3 * rcu_jiffies_till_stall_check() + 3; | 951 | 3 * rcu_jiffies_till_stall_check() + 3; |
928 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 952 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
929 | 953 | ||
930 | set_need_resched(); /* kick ourselves to get things going. */ | 954 | /* |
955 | * Attempt to revive the RCU machinery by forcing a context switch. | ||
956 | * | ||
957 | * A context switch would normally allow the RCU state machine to make | ||
958 | * progress and it could be we're stuck in kernel space without context | ||
959 | * switches for an entirely unreasonable amount of time. | ||
960 | */ | ||
961 | resched_cpu(smp_processor_id()); | ||
931 | } | 962 | } |
932 | 963 | ||
933 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | 964 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) |
934 | { | 965 | { |
966 | unsigned long completed; | ||
967 | unsigned long gpnum; | ||
968 | unsigned long gps; | ||
935 | unsigned long j; | 969 | unsigned long j; |
936 | unsigned long js; | 970 | unsigned long js; |
937 | struct rcu_node *rnp; | 971 | struct rcu_node *rnp; |
938 | 972 | ||
939 | if (rcu_cpu_stall_suppress) | 973 | if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) |
940 | return; | 974 | return; |
941 | j = ACCESS_ONCE(jiffies); | 975 | j = ACCESS_ONCE(jiffies); |
976 | |||
977 | /* | ||
978 | * Lots of memory barriers to reject false positives. | ||
979 | * | ||
980 | * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, | ||
981 | * then rsp->gp_start, and finally rsp->completed. These values | ||
982 | * are updated in the opposite order with memory barriers (or | ||
983 | * equivalent) during grace-period initialization and cleanup. | ||
984 | * Now, a false positive can occur if we get a new value of | ||
985 | * rsp->gp_start and an old value of rsp->jiffies_stall. But given | ||
986 | * the memory barriers, the only way that this can happen is if one | ||
987 | * grace period ends and another starts between these two fetches. | ||
988 | * Detect this by comparing rsp->completed with the previous fetch | ||
989 | * from rsp->gpnum. | ||
990 | * | ||
991 | * Given this check, comparisons of jiffies, rsp->jiffies_stall, | ||
992 | * and rsp->gp_start suffice to forestall false positives. | ||
993 | */ | ||
994 | gpnum = ACCESS_ONCE(rsp->gpnum); | ||
995 | smp_rmb(); /* Pick up ->gpnum first... */ | ||
942 | js = ACCESS_ONCE(rsp->jiffies_stall); | 996 | js = ACCESS_ONCE(rsp->jiffies_stall); |
997 | smp_rmb(); /* ...then ->jiffies_stall before the rest... */ | ||
998 | gps = ACCESS_ONCE(rsp->gp_start); | ||
999 | smp_rmb(); /* ...and finally ->gp_start before ->completed. */ | ||
1000 | completed = ACCESS_ONCE(rsp->completed); | ||
1001 | if (ULONG_CMP_GE(completed, gpnum) || | ||
1002 | ULONG_CMP_LT(j, js) || | ||
1003 | ULONG_CMP_GE(gps, js)) | ||
1004 | return; /* No stall or GP completed since entering function. */ | ||
943 | rnp = rdp->mynode; | 1005 | rnp = rdp->mynode; |
944 | if (rcu_gp_in_progress(rsp) && | 1006 | if (rcu_gp_in_progress(rsp) && |
945 | (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { | 1007 | (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { |
946 | 1008 | ||
947 | /* We haven't checked in, so go dump stack. */ | 1009 | /* We haven't checked in, so go dump stack. */ |
948 | print_cpu_stall(rsp); | 1010 | print_cpu_stall(rsp); |
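The ordering argument in the comment above pairs the smp_wmb() calls added to record_gp_stall_check_time() and rcu_gp_init() with the smp_rmb() chain in check_cpu_stall(). Below is a condensed model of that pairing on a hypothetical structure; the real fields live in struct rcu_state, the stall timeout is a placeholder for rcu_jiffies_till_stall_check(), and the helpers (smp_wmb, smp_rmb, ACCESS_ONCE, ULONG_CMP_*) are the usual kernel ones:

struct stall_model {
        unsigned long gpnum;
        unsigned long completed;
        unsigned long gp_start;
        unsigned long jiffies_stall;
};

/* Writer side, as in record_gp_stall_check_time() plus rcu_gp_init(). */
static void stall_model_start_gp(struct stall_model *s, unsigned long j)
{
        s->gp_start = j;
        smp_wmb();      /* Record start time before stall time. */
        s->jiffies_stall = j + 21 * HZ;         /* placeholder stall timeout */
        smp_wmb();      /* Record GP times before starting GP. */
        s->gpnum++;
}

/* Reader side, fetching in the opposite order as in check_cpu_stall(). */
static bool stall_model_maybe_stalled(struct stall_model *s, unsigned long j)
{
        unsigned long gpnum, js, gps, completed;

        gpnum = ACCESS_ONCE(s->gpnum);
        smp_rmb();      /* Pick up ->gpnum first... */
        js = ACCESS_ONCE(s->jiffies_stall);
        smp_rmb();      /* ...then ->jiffies_stall before the rest... */
        gps = ACCESS_ONCE(s->gp_start);
        smp_rmb();      /* ...and finally ->gp_start before ->completed. */
        completed = ACCESS_ONCE(s->completed);

        /* Mixed old/new values imply a GP boundary raced with us: bail out. */
        if (ULONG_CMP_GE(completed, gpnum) ||
            ULONG_CMP_LT(j, js) ||
            ULONG_CMP_GE(gps, js))
                return false;
        return true;
}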
@@ -1297,7 +1359,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1297 | } | 1359 | } |
1298 | 1360 | ||
1299 | /* | 1361 | /* |
1300 | * Initialize a new grace period. | 1362 | * Initialize a new grace period. Return 0 if no grace period required. |
1301 | */ | 1363 | */ |
1302 | static int rcu_gp_init(struct rcu_state *rsp) | 1364 | static int rcu_gp_init(struct rcu_state *rsp) |
1303 | { | 1365 | { |
@@ -1306,18 +1368,27 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1306 | 1368 | ||
1307 | rcu_bind_gp_kthread(); | 1369 | rcu_bind_gp_kthread(); |
1308 | raw_spin_lock_irq(&rnp->lock); | 1370 | raw_spin_lock_irq(&rnp->lock); |
1371 | if (rsp->gp_flags == 0) { | ||
1372 | /* Spurious wakeup, tell caller to go back to sleep. */ | ||
1373 | raw_spin_unlock_irq(&rnp->lock); | ||
1374 | return 0; | ||
1375 | } | ||
1309 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ | 1376 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ |
1310 | 1377 | ||
1311 | if (rcu_gp_in_progress(rsp)) { | 1378 | if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { |
1312 | /* Grace period already in progress, don't start another. */ | 1379 | /* |
1380 | * Grace period already in progress, don't start another. | ||
1381 | * Not supposed to be able to happen. | ||
1382 | */ | ||
1313 | raw_spin_unlock_irq(&rnp->lock); | 1383 | raw_spin_unlock_irq(&rnp->lock); |
1314 | return 0; | 1384 | return 0; |
1315 | } | 1385 | } |
1316 | 1386 | ||
1317 | /* Advance to a new grace period and initialize state. */ | 1387 | /* Advance to a new grace period and initialize state. */ |
1388 | record_gp_stall_check_time(rsp); | ||
1389 | smp_wmb(); /* Record GP times before starting GP. */ | ||
1318 | rsp->gpnum++; | 1390 | rsp->gpnum++; |
1319 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); | 1391 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); |
1320 | record_gp_stall_check_time(rsp); | ||
1321 | raw_spin_unlock_irq(&rnp->lock); | 1392 | raw_spin_unlock_irq(&rnp->lock); |
1322 | 1393 | ||
1323 | /* Exclude any concurrent CPU-hotplug operations. */ | 1394 | /* Exclude any concurrent CPU-hotplug operations. */ |
@@ -1366,7 +1437,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1366 | /* | 1437 | /* |
1367 | * Do one round of quiescent-state forcing. | 1438 | * Do one round of quiescent-state forcing. |
1368 | */ | 1439 | */ |
1369 | int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | 1440 | static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) |
1370 | { | 1441 | { |
1371 | int fqs_state = fqs_state_in; | 1442 | int fqs_state = fqs_state_in; |
1372 | bool isidle = false; | 1443 | bool isidle = false; |
@@ -1451,8 +1522,12 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1451 | rsp->fqs_state = RCU_GP_IDLE; | 1522 | rsp->fqs_state = RCU_GP_IDLE; |
1452 | rdp = this_cpu_ptr(rsp->rda); | 1523 | rdp = this_cpu_ptr(rsp->rda); |
1453 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ | 1524 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ |
1454 | if (cpu_needs_another_gp(rsp, rdp)) | 1525 | if (cpu_needs_another_gp(rsp, rdp)) { |
1455 | rsp->gp_flags = 1; | 1526 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
1527 | trace_rcu_grace_period(rsp->name, | ||
1528 | ACCESS_ONCE(rsp->gpnum), | ||
1529 | TPS("newreq")); | ||
1530 | } | ||
1456 | raw_spin_unlock_irq(&rnp->lock); | 1531 | raw_spin_unlock_irq(&rnp->lock); |
1457 | } | 1532 | } |
1458 | 1533 | ||
@@ -1462,6 +1537,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1462 | static int __noreturn rcu_gp_kthread(void *arg) | 1537 | static int __noreturn rcu_gp_kthread(void *arg) |
1463 | { | 1538 | { |
1464 | int fqs_state; | 1539 | int fqs_state; |
1540 | int gf; | ||
1465 | unsigned long j; | 1541 | unsigned long j; |
1466 | int ret; | 1542 | int ret; |
1467 | struct rcu_state *rsp = arg; | 1543 | struct rcu_state *rsp = arg; |
@@ -1471,14 +1547,19 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1471 | 1547 | ||
1472 | /* Handle grace-period start. */ | 1548 | /* Handle grace-period start. */ |
1473 | for (;;) { | 1549 | for (;;) { |
1550 | trace_rcu_grace_period(rsp->name, | ||
1551 | ACCESS_ONCE(rsp->gpnum), | ||
1552 | TPS("reqwait")); | ||
1474 | wait_event_interruptible(rsp->gp_wq, | 1553 | wait_event_interruptible(rsp->gp_wq, |
1475 | rsp->gp_flags & | 1554 | ACCESS_ONCE(rsp->gp_flags) & |
1476 | RCU_GP_FLAG_INIT); | 1555 | RCU_GP_FLAG_INIT); |
1477 | if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && | 1556 | if (rcu_gp_init(rsp)) |
1478 | rcu_gp_init(rsp)) | ||
1479 | break; | 1557 | break; |
1480 | cond_resched(); | 1558 | cond_resched(); |
1481 | flush_signals(current); | 1559 | flush_signals(current); |
1560 | trace_rcu_grace_period(rsp->name, | ||
1561 | ACCESS_ONCE(rsp->gpnum), | ||
1562 | TPS("reqwaitsig")); | ||
1482 | } | 1563 | } |
1483 | 1564 | ||
1484 | /* Handle quiescent-state forcing. */ | 1565 | /* Handle quiescent-state forcing. */ |
@@ -1488,10 +1569,16 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1488 | j = HZ; | 1569 | j = HZ; |
1489 | jiffies_till_first_fqs = HZ; | 1570 | jiffies_till_first_fqs = HZ; |
1490 | } | 1571 | } |
1572 | ret = 0; | ||
1491 | for (;;) { | 1573 | for (;;) { |
1492 | rsp->jiffies_force_qs = jiffies + j; | 1574 | if (!ret) |
1575 | rsp->jiffies_force_qs = jiffies + j; | ||
1576 | trace_rcu_grace_period(rsp->name, | ||
1577 | ACCESS_ONCE(rsp->gpnum), | ||
1578 | TPS("fqswait")); | ||
1493 | ret = wait_event_interruptible_timeout(rsp->gp_wq, | 1579 | ret = wait_event_interruptible_timeout(rsp->gp_wq, |
1494 | (rsp->gp_flags & RCU_GP_FLAG_FQS) || | 1580 | ((gf = ACCESS_ONCE(rsp->gp_flags)) & |
1581 | RCU_GP_FLAG_FQS) || | ||
1495 | (!ACCESS_ONCE(rnp->qsmask) && | 1582 | (!ACCESS_ONCE(rnp->qsmask) && |
1496 | !rcu_preempt_blocked_readers_cgp(rnp)), | 1583 | !rcu_preempt_blocked_readers_cgp(rnp)), |
1497 | j); | 1584 | j); |
@@ -1500,13 +1587,23 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1500 | !rcu_preempt_blocked_readers_cgp(rnp)) | 1587 | !rcu_preempt_blocked_readers_cgp(rnp)) |
1501 | break; | 1588 | break; |
1502 | /* If time for quiescent-state forcing, do it. */ | 1589 | /* If time for quiescent-state forcing, do it. */ |
1503 | if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { | 1590 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || |
1591 | (gf & RCU_GP_FLAG_FQS)) { | ||
1592 | trace_rcu_grace_period(rsp->name, | ||
1593 | ACCESS_ONCE(rsp->gpnum), | ||
1594 | TPS("fqsstart")); | ||
1504 | fqs_state = rcu_gp_fqs(rsp, fqs_state); | 1595 | fqs_state = rcu_gp_fqs(rsp, fqs_state); |
1596 | trace_rcu_grace_period(rsp->name, | ||
1597 | ACCESS_ONCE(rsp->gpnum), | ||
1598 | TPS("fqsend")); | ||
1505 | cond_resched(); | 1599 | cond_resched(); |
1506 | } else { | 1600 | } else { |
1507 | /* Deal with stray signal. */ | 1601 | /* Deal with stray signal. */ |
1508 | cond_resched(); | 1602 | cond_resched(); |
1509 | flush_signals(current); | 1603 | flush_signals(current); |
1604 | trace_rcu_grace_period(rsp->name, | ||
1605 | ACCESS_ONCE(rsp->gpnum), | ||
1606 | TPS("fqswaitsig")); | ||
1510 | } | 1607 | } |
1511 | j = jiffies_till_next_fqs; | 1608 | j = jiffies_till_next_fqs; |
1512 | if (j > HZ) { | 1609 | if (j > HZ) { |
@@ -1554,6 +1651,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1554 | return; | 1651 | return; |
1555 | } | 1652 | } |
1556 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1653 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
1654 | trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), | ||
1655 | TPS("newreq")); | ||
1557 | 1656 | ||
1558 | /* | 1657 | /* |
1559 | * We can't do wakeups while holding the rnp->lock, as that | 1658 | * We can't do wakeups while holding the rnp->lock, as that |
@@ -2255,7 +2354,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2255 | * If called from an extended quiescent state, invoke the RCU | 2354 | * If called from an extended quiescent state, invoke the RCU |
2256 | * core in order to force a re-evaluation of RCU's idleness. | 2355 | * core in order to force a re-evaluation of RCU's idleness. |
2257 | */ | 2356 | */ |
2258 | if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) | 2357 | if (!rcu_is_watching() && cpu_online(smp_processor_id())) |
2259 | invoke_rcu_core(); | 2358 | invoke_rcu_core(); |
2260 | 2359 | ||
2261 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ | 2360 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ |
@@ -2725,10 +2824,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | |||
2725 | 2824 | ||
2726 | for_each_rcu_flavor(rsp) { | 2825 | for_each_rcu_flavor(rsp) { |
2727 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2826 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2728 | if (rdp->qlen != rdp->qlen_lazy) | 2827 | if (!rdp->nxtlist) |
2828 | continue; | ||
2829 | hc = true; | ||
2830 | if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { | ||
2729 | al = false; | 2831 | al = false; |
2730 | if (rdp->nxtlist) | 2832 | break; |
2731 | hc = true; | 2833 | } |
2732 | } | 2834 | } |
2733 | if (all_lazy) | 2835 | if (all_lazy) |
2734 | *all_lazy = al; | 2836 | *all_lazy = al; |
@@ -3216,7 +3318,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3216 | 3318 | ||
3217 | /* | 3319 | /* |
3218 | * Compute the rcu_node tree geometry from kernel parameters. This cannot | 3320 | * Compute the rcu_node tree geometry from kernel parameters. This cannot |
3219 | * replace the definitions in rcutree.h because those are needed to size | 3321 | * replace the definitions in tree.h because those are needed to size |
3220 | * the ->node array in the rcu_state structure. | 3322 | * the ->node array in the rcu_state structure. |
3221 | */ | 3323 | */ |
3222 | static void __init rcu_init_geometry(void) | 3324 | static void __init rcu_init_geometry(void) |
@@ -3295,8 +3397,8 @@ void __init rcu_init(void) | |||
3295 | 3397 | ||
3296 | rcu_bootup_announce(); | 3398 | rcu_bootup_announce(); |
3297 | rcu_init_geometry(); | 3399 | rcu_init_geometry(); |
3298 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | ||
3299 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 3400 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
3401 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | ||
3300 | __rcu_init_preempt(); | 3402 | __rcu_init_preempt(); |
3301 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 3403 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
3302 | 3404 | ||
@@ -3311,4 +3413,4 @@ void __init rcu_init(void) | |||
3311 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 3413 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
3312 | } | 3414 | } |
3313 | 3415 | ||
3314 | #include "rcutree_plugin.h" | 3416 | #include "tree_plugin.h" |
diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h index 5f97eab602cd..52be957c9fe2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcu/tree.h | |||
@@ -104,6 +104,8 @@ struct rcu_dynticks { | |||
104 | /* idle-period nonlazy_posted snapshot. */ | 104 | /* idle-period nonlazy_posted snapshot. */ |
105 | unsigned long last_accelerate; | 105 | unsigned long last_accelerate; |
106 | /* Last jiffy CBs were accelerated. */ | 106 | /* Last jiffy CBs were accelerated. */ |
107 | unsigned long last_advance_all; | ||
108 | /* Last jiffy CBs were all advanced. */ | ||
107 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ | 109 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ |
108 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 110 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
109 | }; | 111 | }; |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h index 130c97b027f2..08a765232432 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -28,7 +28,7 @@ | |||
28 | #include <linux/gfp.h> | 28 | #include <linux/gfp.h> |
29 | #include <linux/oom.h> | 29 | #include <linux/oom.h> |
30 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
31 | #include "time/tick-internal.h" | 31 | #include "../time/tick-internal.h" |
32 | 32 | ||
33 | #define RCU_KTHREAD_PRIO 1 | 33 | #define RCU_KTHREAD_PRIO 1 |
34 | 34 | ||
@@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void) | |||
96 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | 96 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ |
97 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | 97 | #ifdef CONFIG_RCU_NOCB_CPU_ALL |
98 | pr_info("\tOffload RCU callbacks from all CPUs\n"); | 98 | pr_info("\tOffload RCU callbacks from all CPUs\n"); |
99 | cpumask_setall(rcu_nocb_mask); | 99 | cpumask_copy(rcu_nocb_mask, cpu_possible_mask); |
100 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | 100 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ |
101 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | 101 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ |
102 | if (have_rcu_nocb_mask) { | 102 | if (have_rcu_nocb_mask) { |
103 | if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { | ||
104 | pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); | ||
105 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | ||
106 | rcu_nocb_mask); | ||
107 | } | ||
103 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | 108 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); |
104 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); | 109 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); |
105 | if (rcu_nocb_poll) | 110 | if (rcu_nocb_poll) |
@@ -660,7 +665,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
660 | 665 | ||
661 | static void rcu_preempt_do_callbacks(void) | 666 | static void rcu_preempt_do_callbacks(void) |
662 | { | 667 | { |
663 | rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); | 668 | rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); |
664 | } | 669 | } |
665 | 670 | ||
666 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 671 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
@@ -1128,7 +1133,7 @@ void exit_rcu(void) | |||
1128 | 1133 | ||
1129 | #ifdef CONFIG_RCU_BOOST | 1134 | #ifdef CONFIG_RCU_BOOST |
1130 | 1135 | ||
1131 | #include "rtmutex_common.h" | 1136 | #include "../locking/rtmutex_common.h" |
1132 | 1137 | ||
1133 | #ifdef CONFIG_RCU_TRACE | 1138 | #ifdef CONFIG_RCU_TRACE |
1134 | 1139 | ||
@@ -1332,7 +1337,7 @@ static void invoke_rcu_callbacks_kthread(void) | |||
1332 | */ | 1337 | */ |
1333 | static bool rcu_is_callbacks_kthread(void) | 1338 | static bool rcu_is_callbacks_kthread(void) |
1334 | { | 1339 | { |
1335 | return __get_cpu_var(rcu_cpu_kthread_task) == current; | 1340 | return __this_cpu_read(rcu_cpu_kthread_task) == current; |
1336 | } | 1341 | } |
1337 | 1342 | ||
1338 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | 1343 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) |
@@ -1382,8 +1387,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
1382 | 1387 | ||
1383 | static void rcu_kthread_do_work(void) | 1388 | static void rcu_kthread_do_work(void) |
1384 | { | 1389 | { |
1385 | rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); | 1390 | rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); |
1386 | rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | 1391 | rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); |
1387 | rcu_preempt_do_callbacks(); | 1392 | rcu_preempt_do_callbacks(); |
1388 | } | 1393 | } |
1389 | 1394 | ||
@@ -1402,7 +1407,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu) | |||
1402 | 1407 | ||
1403 | static int rcu_cpu_kthread_should_run(unsigned int cpu) | 1408 | static int rcu_cpu_kthread_should_run(unsigned int cpu) |
1404 | { | 1409 | { |
1405 | return __get_cpu_var(rcu_cpu_has_work); | 1410 | return __this_cpu_read(rcu_cpu_has_work); |
1406 | } | 1411 | } |
1407 | 1412 | ||
1408 | /* | 1413 | /* |
@@ -1412,8 +1417,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) | |||
1412 | */ | 1417 | */ |
1413 | static void rcu_cpu_kthread(unsigned int cpu) | 1418 | static void rcu_cpu_kthread(unsigned int cpu) |
1414 | { | 1419 | { |
1415 | unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); | 1420 | unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); |
1416 | char work, *workp = &__get_cpu_var(rcu_cpu_has_work); | 1421 | char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); |
1417 | int spincnt; | 1422 | int spincnt; |
1418 | 1423 | ||
1419 | for (spincnt = 0; spincnt < 10; spincnt++) { | 1424 | for (spincnt = 0; spincnt < 10; spincnt++) { |
@@ -1627,20 +1632,26 @@ module_param(rcu_idle_gp_delay, int, 0644); | |||
1627 | static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; | 1632 | static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; |
1628 | module_param(rcu_idle_lazy_gp_delay, int, 0644); | 1633 | module_param(rcu_idle_lazy_gp_delay, int, 0644); |
1629 | 1634 | ||
1630 | extern int tick_nohz_enabled; | 1635 | extern int tick_nohz_active; |
1631 | 1636 | ||
1632 | /* | 1637 | /* |
1633 | * Try to advance callbacks for all flavors of RCU on the current CPU. | 1638 | * Try to advance callbacks for all flavors of RCU on the current CPU, but |
1634 | * Afterwards, if there are any callbacks ready for immediate invocation, | 1639 | * only if it has been awhile since the last time we did so. Afterwards, |
1635 | * return true. | 1640 | * if there are any callbacks ready for immediate invocation, return true. |
1636 | */ | 1641 | */ |
1637 | static bool rcu_try_advance_all_cbs(void) | 1642 | static bool rcu_try_advance_all_cbs(void) |
1638 | { | 1643 | { |
1639 | bool cbs_ready = false; | 1644 | bool cbs_ready = false; |
1640 | struct rcu_data *rdp; | 1645 | struct rcu_data *rdp; |
1646 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
1641 | struct rcu_node *rnp; | 1647 | struct rcu_node *rnp; |
1642 | struct rcu_state *rsp; | 1648 | struct rcu_state *rsp; |
1643 | 1649 | ||
1650 | /* Exit early if we advanced recently. */ | ||
1651 | if (jiffies == rdtp->last_advance_all) | ||
1652 | return 0; | ||
1653 | rdtp->last_advance_all = jiffies; | ||
1654 | |||
1644 | for_each_rcu_flavor(rsp) { | 1655 | for_each_rcu_flavor(rsp) { |
1645 | rdp = this_cpu_ptr(rsp->rda); | 1656 | rdp = this_cpu_ptr(rsp->rda); |
1646 | rnp = rdp->mynode; | 1657 | rnp = rdp->mynode; |
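The early exit added above (->last_advance_all) caps the callback-advancing work at once per jiffy per CPU. The same idiom in isolation, with a hypothetical helper and a plain static variable standing in for the per-CPU field:

static unsigned long example_last_pass;         /* per-CPU in the real code */

static bool example_do_rate_limited_work(void)
{
        /* Exit early if we already ran during the current jiffy. */
        if (jiffies == example_last_pass)
                return false;
        example_last_pass = jiffies;

        /* ... potentially expensive scan goes here ... */
        return true;
}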
@@ -1718,7 +1729,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
1718 | int tne; | 1729 | int tne; |
1719 | 1730 | ||
1720 | /* Handle nohz enablement switches conservatively. */ | 1731 | /* Handle nohz enablement switches conservatively. */ |
1721 | tne = ACCESS_ONCE(tick_nohz_enabled); | 1732 | tne = ACCESS_ONCE(tick_nohz_active); |
1722 | if (tne != rdtp->tick_nohz_enabled_snap) { | 1733 | if (tne != rdtp->tick_nohz_enabled_snap) { |
1723 | if (rcu_cpu_has_callbacks(cpu, NULL)) | 1734 | if (rcu_cpu_has_callbacks(cpu, NULL)) |
1724 | invoke_rcu_core(); /* force nohz to see update. */ | 1735 | invoke_rcu_core(); /* force nohz to see update. */ |
@@ -1739,6 +1750,8 @@ static void rcu_prepare_for_idle(int cpu) | |||
1739 | */ | 1750 | */ |
1740 | if (rdtp->all_lazy && | 1751 | if (rdtp->all_lazy && |
1741 | rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { | 1752 | rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { |
1753 | rdtp->all_lazy = false; | ||
1754 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
1742 | invoke_rcu_core(); | 1755 | invoke_rcu_core(); |
1743 | return; | 1756 | return; |
1744 | } | 1757 | } |
@@ -1768,17 +1781,11 @@ static void rcu_prepare_for_idle(int cpu) | |||
1768 | */ | 1781 | */ |
1769 | static void rcu_cleanup_after_idle(int cpu) | 1782 | static void rcu_cleanup_after_idle(int cpu) |
1770 | { | 1783 | { |
1771 | struct rcu_data *rdp; | ||
1772 | struct rcu_state *rsp; | ||
1773 | 1784 | ||
1774 | if (rcu_is_nocb_cpu(cpu)) | 1785 | if (rcu_is_nocb_cpu(cpu)) |
1775 | return; | 1786 | return; |
1776 | rcu_try_advance_all_cbs(); | 1787 | if (rcu_try_advance_all_cbs()) |
1777 | for_each_rcu_flavor(rsp) { | 1788 | invoke_rcu_core(); |
1778 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
1779 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | ||
1780 | invoke_rcu_core(); | ||
1781 | } | ||
1782 | } | 1789 | } |
1783 | 1790 | ||
1784 | /* | 1791 | /* |
@@ -2108,15 +2115,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
2108 | 2115 | ||
2109 | /* If we are not being polled and there is a kthread, awaken it ... */ | 2116 | /* If we are not being polled and there is a kthread, awaken it ... */ |
2110 | t = ACCESS_ONCE(rdp->nocb_kthread); | 2117 | t = ACCESS_ONCE(rdp->nocb_kthread); |
2111 | if (rcu_nocb_poll | !t) | 2118 | if (rcu_nocb_poll || !t) { |
2119 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2120 | TPS("WakeNotPoll")); | ||
2112 | return; | 2121 | return; |
2122 | } | ||
2113 | len = atomic_long_read(&rdp->nocb_q_count); | 2123 | len = atomic_long_read(&rdp->nocb_q_count); |
2114 | if (old_rhpp == &rdp->nocb_head) { | 2124 | if (old_rhpp == &rdp->nocb_head) { |
2115 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ | 2125 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ |
2116 | rdp->qlen_last_fqs_check = 0; | 2126 | rdp->qlen_last_fqs_check = 0; |
2127 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); | ||
2117 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | 2128 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { |
2118 | wake_up_process(t); /* ... or if many callbacks queued. */ | 2129 | wake_up_process(t); /* ... or if many callbacks queued. */ |
2119 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | 2130 | rdp->qlen_last_fqs_check = LONG_MAX / 2; |
2131 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); | ||
2132 | } else { | ||
2133 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); | ||
2120 | } | 2134 | } |
2121 | return; | 2135 | return; |
2122 | } | 2136 | } |
@@ -2140,10 +2154,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
2140 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | 2154 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) |
2141 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, | 2155 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, |
2142 | (unsigned long)rhp->func, | 2156 | (unsigned long)rhp->func, |
2143 | rdp->qlen_lazy, rdp->qlen); | 2157 | -atomic_long_read(&rdp->nocb_q_count_lazy), |
2158 | -atomic_long_read(&rdp->nocb_q_count)); | ||
2144 | else | 2159 | else |
2145 | trace_rcu_callback(rdp->rsp->name, rhp, | 2160 | trace_rcu_callback(rdp->rsp->name, rhp, |
2146 | rdp->qlen_lazy, rdp->qlen); | 2161 | -atomic_long_read(&rdp->nocb_q_count_lazy), |
2162 | -atomic_long_read(&rdp->nocb_q_count)); | ||
2147 | return 1; | 2163 | return 1; |
2148 | } | 2164 | } |
2149 | 2165 | ||
@@ -2221,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2221 | static int rcu_nocb_kthread(void *arg) | 2237 | static int rcu_nocb_kthread(void *arg) |
2222 | { | 2238 | { |
2223 | int c, cl; | 2239 | int c, cl; |
2240 | bool firsttime = 1; | ||
2224 | struct rcu_head *list; | 2241 | struct rcu_head *list; |
2225 | struct rcu_head *next; | 2242 | struct rcu_head *next; |
2226 | struct rcu_head **tail; | 2243 | struct rcu_head **tail; |
@@ -2229,14 +2246,27 @@ static int rcu_nocb_kthread(void *arg) | |||
2229 | /* Each pass through this loop invokes one batch of callbacks */ | 2246 | /* Each pass through this loop invokes one batch of callbacks */ |
2230 | for (;;) { | 2247 | for (;;) { |
2231 | /* If not polling, wait for next batch of callbacks. */ | 2248 | /* If not polling, wait for next batch of callbacks. */ |
2232 | if (!rcu_nocb_poll) | 2249 | if (!rcu_nocb_poll) { |
2250 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2251 | TPS("Sleep")); | ||
2233 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); | 2252 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); |
2253 | } else if (firsttime) { | ||
2254 | firsttime = 0; | ||
2255 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2256 | TPS("Poll")); | ||
2257 | } | ||
2234 | list = ACCESS_ONCE(rdp->nocb_head); | 2258 | list = ACCESS_ONCE(rdp->nocb_head); |
2235 | if (!list) { | 2259 | if (!list) { |
2260 | if (!rcu_nocb_poll) | ||
2261 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2262 | TPS("WokeEmpty")); | ||
2236 | schedule_timeout_interruptible(1); | 2263 | schedule_timeout_interruptible(1); |
2237 | flush_signals(current); | 2264 | flush_signals(current); |
2238 | continue; | 2265 | continue; |
2239 | } | 2266 | } |
2267 | firsttime = 1; | ||
2268 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2269 | TPS("WokeNonEmpty")); | ||
2240 | 2270 | ||
2241 | /* | 2271 | /* |
2242 | * Extract queued callbacks, update counts, and wait | 2272 | * Extract queued callbacks, update counts, and wait |
@@ -2257,7 +2287,11 @@ static int rcu_nocb_kthread(void *arg) | |||
2257 | next = list->next; | 2287 | next = list->next; |
2258 | /* Wait for enqueuing to complete, if needed. */ | 2288 | /* Wait for enqueuing to complete, if needed. */ |
2259 | while (next == NULL && &list->next != tail) { | 2289 | while (next == NULL && &list->next != tail) { |
2290 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2291 | TPS("WaitQueue")); | ||
2260 | schedule_timeout_interruptible(1); | 2292 | schedule_timeout_interruptible(1); |
2293 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2294 | TPS("WokeQueue")); | ||
2261 | next = list->next; | 2295 | next = list->next; |
2262 | } | 2296 | } |
2263 | debug_rcu_head_unqueue(list); | 2297 | debug_rcu_head_unqueue(list); |
diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c index cf6c17412932..3596797b7e46 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
@@ -44,7 +44,7 @@ | |||
44 | #include <linux/seq_file.h> | 44 | #include <linux/seq_file.h> |
45 | 45 | ||
46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
47 | #include "rcutree.h" | 47 | #include "tree.h" |
48 | 48 | ||
49 | static int r_open(struct inode *inode, struct file *file, | 49 | static int r_open(struct inode *inode, struct file *file, |
50 | const struct seq_operations *op) | 50 | const struct seq_operations *op) |
diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c index b02a339836b4..6cb3dff89e2b 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcu/update.c | |||
@@ -53,6 +53,12 @@ | |||
53 | 53 | ||
54 | #include "rcu.h" | 54 | #include "rcu.h" |
55 | 55 | ||
56 | MODULE_ALIAS("rcupdate"); | ||
57 | #ifdef MODULE_PARAM_PREFIX | ||
58 | #undef MODULE_PARAM_PREFIX | ||
59 | #endif | ||
60 | #define MODULE_PARAM_PREFIX "rcupdate." | ||
61 | |||
56 | module_param(rcu_expedited, int, 0); | 62 | module_param(rcu_expedited, int, 0); |
57 | 63 | ||
58 | #ifdef CONFIG_PREEMPT_RCU | 64 | #ifdef CONFIG_PREEMPT_RCU |
@@ -148,7 +154,7 @@ int rcu_read_lock_bh_held(void) | |||
148 | { | 154 | { |
149 | if (!debug_lockdep_rcu_enabled()) | 155 | if (!debug_lockdep_rcu_enabled()) |
150 | return 1; | 156 | return 1; |
151 | if (rcu_is_cpu_idle()) | 157 | if (!rcu_is_watching()) |
152 | return 0; | 158 | return 0; |
153 | if (!rcu_lockdep_current_cpu_online()) | 159 | if (!rcu_lockdep_current_cpu_online()) |
154 | return 0; | 160 | return 0; |
@@ -298,7 +304,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); | |||
298 | #endif | 304 | #endif |
299 | 305 | ||
300 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | 306 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ |
301 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | 307 | static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; |
302 | 308 | ||
303 | module_param(rcu_cpu_stall_suppress, int, 0644); | 309 | module_param(rcu_cpu_stall_suppress, int, 0644); |
304 | module_param(rcu_cpu_stall_timeout, int, 0644); | 310 | module_param(rcu_cpu_stall_timeout, int, 0644); |
diff --git a/kernel/reboot.c b/kernel/reboot.c index f813b3474646..662c83fc16b7 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
@@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb) | |||
104 | } | 104 | } |
105 | EXPORT_SYMBOL(unregister_reboot_notifier); | 105 | EXPORT_SYMBOL(unregister_reboot_notifier); |
106 | 106 | ||
107 | static void migrate_to_reboot_cpu(void) | 107 | void migrate_to_reboot_cpu(void) |
108 | { | 108 | { |
109 | /* The boot cpu is always logical cpu 0 */ | 109 | /* The boot cpu is always logical cpu 0 */ |
110 | int cpu = reboot_cpu; | 110 | int cpu = reboot_cpu; |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 54adcf35f495..7b621409cf15 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | |||
12 | endif | 12 | endif |
13 | 13 | ||
14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o |
15 | obj-y += wait.o completion.o | ||
15 | obj-$(CONFIG_SMP) += cpupri.o | 16 | obj-$(CONFIG_SMP) += cpupri.o |
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 17 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 18 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c new file mode 100644 index 000000000000..a63f4dc27909 --- /dev/null +++ b/kernel/sched/completion.c | |||
@@ -0,0 +1,299 @@ | |||
1 | /* | ||
2 | * Generic wait-for-completion handler; | ||
3 | * | ||
4 | * It differs from semaphores in that the default case is the opposite: | ||
5 | * wait_for_completion() blocks by default, whereas a semaphore does not. The | ||
6 | * interface also makes it easy to 'complete' multiple waiting threads, | ||
7 | * something which isn't entirely natural for semaphores. | ||
8 | * | ||
9 | * But more importantly, the primitive documents the usage. Semaphores would | ||
10 | * typically be used for exclusion which gives rise to priority inversion. | ||
11 | * Waiting for a completion is typically a synchronization point, but not an exclusion point. | ||
12 | */ | ||
13 | |||
14 | #include <linux/sched.h> | ||
15 | #include <linux/completion.h> | ||
16 | |||
17 | /** | ||
18 | * complete: - signals a single thread waiting on this completion | ||
19 | * @x: holds the state of this particular completion | ||
20 | * | ||
21 | * This will wake up a single thread waiting on this completion. Threads will be | ||
22 | * awakened in the same order in which they were queued. | ||
23 | * | ||
24 | * See also complete_all(), wait_for_completion() and related routines. | ||
25 | * | ||
26 | * It may be assumed that this function implies a write memory barrier before | ||
27 | * changing the task state if and only if any tasks are woken up. | ||
28 | */ | ||
29 | void complete(struct completion *x) | ||
30 | { | ||
31 | unsigned long flags; | ||
32 | |||
33 | spin_lock_irqsave(&x->wait.lock, flags); | ||
34 | x->done++; | ||
35 | __wake_up_locked(&x->wait, TASK_NORMAL, 1); | ||
36 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
37 | } | ||
38 | EXPORT_SYMBOL(complete); | ||
39 | |||
40 | /** | ||
41 | * complete_all: - signals all threads waiting on this completion | ||
42 | * @x: holds the state of this particular completion | ||
43 | * | ||
44 | * This will wake up all threads waiting on this particular completion event. | ||
45 | * | ||
46 | * It may be assumed that this function implies a write memory barrier before | ||
47 | * changing the task state if and only if any tasks are woken up. | ||
48 | */ | ||
49 | void complete_all(struct completion *x) | ||
50 | { | ||
51 | unsigned long flags; | ||
52 | |||
53 | spin_lock_irqsave(&x->wait.lock, flags); | ||
54 | x->done += UINT_MAX/2; | ||
55 | __wake_up_locked(&x->wait, TASK_NORMAL, 0); | ||
56 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
57 | } | ||
58 | EXPORT_SYMBOL(complete_all); | ||
59 | |||
60 | static inline long __sched | ||
61 | do_wait_for_common(struct completion *x, | ||
62 | long (*action)(long), long timeout, int state) | ||
63 | { | ||
64 | if (!x->done) { | ||
65 | DECLARE_WAITQUEUE(wait, current); | ||
66 | |||
67 | __add_wait_queue_tail_exclusive(&x->wait, &wait); | ||
68 | do { | ||
69 | if (signal_pending_state(state, current)) { | ||
70 | timeout = -ERESTARTSYS; | ||
71 | break; | ||
72 | } | ||
73 | __set_current_state(state); | ||
74 | spin_unlock_irq(&x->wait.lock); | ||
75 | timeout = action(timeout); | ||
76 | spin_lock_irq(&x->wait.lock); | ||
77 | } while (!x->done && timeout); | ||
78 | __remove_wait_queue(&x->wait, &wait); | ||
79 | if (!x->done) | ||
80 | return timeout; | ||
81 | } | ||
82 | x->done--; | ||
83 | return timeout ?: 1; | ||
84 | } | ||
85 | |||
86 | static inline long __sched | ||
87 | __wait_for_common(struct completion *x, | ||
88 | long (*action)(long), long timeout, int state) | ||
89 | { | ||
90 | might_sleep(); | ||
91 | |||
92 | spin_lock_irq(&x->wait.lock); | ||
93 | timeout = do_wait_for_common(x, action, timeout, state); | ||
94 | spin_unlock_irq(&x->wait.lock); | ||
95 | return timeout; | ||
96 | } | ||
97 | |||
98 | static long __sched | ||
99 | wait_for_common(struct completion *x, long timeout, int state) | ||
100 | { | ||
101 | return __wait_for_common(x, schedule_timeout, timeout, state); | ||
102 | } | ||
103 | |||
104 | static long __sched | ||
105 | wait_for_common_io(struct completion *x, long timeout, int state) | ||
106 | { | ||
107 | return __wait_for_common(x, io_schedule_timeout, timeout, state); | ||
108 | } | ||
109 | |||
110 | /** | ||
111 | * wait_for_completion: - waits for completion of a task | ||
112 | * @x: holds the state of this particular completion | ||
113 | * | ||
114 | * This waits to be signaled for completion of a specific task. It is NOT | ||
115 | * interruptible and there is no timeout. | ||
116 | * | ||
117 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | ||
118 | * and interrupt capability. Also see complete(). | ||
119 | */ | ||
120 | void __sched wait_for_completion(struct completion *x) | ||
121 | { | ||
122 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
123 | } | ||
124 | EXPORT_SYMBOL(wait_for_completion); | ||
125 | |||
126 | /** | ||
127 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | ||
128 | * @x: holds the state of this particular completion | ||
129 | * @timeout: timeout value in jiffies | ||
130 | * | ||
131 | * This waits for either a completion of a specific task to be signaled or for a | ||
132 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
133 | * interruptible. | ||
134 | * | ||
135 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | ||
136 | * till timeout) if completed. | ||
137 | */ | ||
138 | unsigned long __sched | ||
139 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | ||
140 | { | ||
141 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); | ||
142 | } | ||
143 | EXPORT_SYMBOL(wait_for_completion_timeout); | ||
144 | |||
145 | /** | ||
146 | * wait_for_completion_io: - waits for completion of a task | ||
147 | * @x: holds the state of this particular completion | ||
148 | * | ||
149 | * This waits to be signaled for completion of a specific task. It is NOT | ||
150 | * interruptible and there is no timeout. The caller is accounted as waiting | ||
151 | * for IO. | ||
152 | */ | ||
153 | void __sched wait_for_completion_io(struct completion *x) | ||
154 | { | ||
155 | wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
156 | } | ||
157 | EXPORT_SYMBOL(wait_for_completion_io); | ||
158 | |||
159 | /** | ||
160 | * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) | ||
161 | * @x: holds the state of this particular completion | ||
162 | * @timeout: timeout value in jiffies | ||
163 | * | ||
164 | * This waits for either a completion of a specific task to be signaled or for a | ||
165 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
166 | * interruptible. The caller is accounted as waiting for IO. | ||
167 | * | ||
168 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | ||
169 | * till timeout) if completed. | ||
170 | */ | ||
171 | unsigned long __sched | ||
172 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) | ||
173 | { | ||
174 | return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); | ||
175 | } | ||
176 | EXPORT_SYMBOL(wait_for_completion_io_timeout); | ||
177 | |||
178 | /** | ||
179 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | ||
180 | * @x: holds the state of this particular completion | ||
181 | * | ||
182 | * This waits for completion of a specific task to be signaled. It is | ||
183 | * interruptible. | ||
184 | * | ||
185 | * Return: -ERESTARTSYS if interrupted, 0 if completed. | ||
186 | */ | ||
187 | int __sched wait_for_completion_interruptible(struct completion *x) | ||
188 | { | ||
189 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | ||
190 | if (t == -ERESTARTSYS) | ||
191 | return t; | ||
192 | return 0; | ||
193 | } | ||
194 | EXPORT_SYMBOL(wait_for_completion_interruptible); | ||
195 | |||
196 | /** | ||
197 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | ||
198 | * @x: holds the state of this particular completion | ||
199 | * @timeout: timeout value in jiffies | ||
200 | * | ||
201 | * This waits for either a completion of a specific task to be signaled or for a | ||
202 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | ||
203 | * | ||
204 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, | ||
205 | * or number of jiffies left till timeout) if completed. | ||
206 | */ | ||
207 | long __sched | ||
208 | wait_for_completion_interruptible_timeout(struct completion *x, | ||
209 | unsigned long timeout) | ||
210 | { | ||
211 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); | ||
212 | } | ||
213 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | ||
214 | |||
215 | /** | ||
216 | * wait_for_completion_killable: - waits for completion of a task (killable) | ||
217 | * @x: holds the state of this particular completion | ||
218 | * | ||
219 | * This waits to be signaled for completion of a specific task. It can be | ||
220 | * interrupted by a kill signal. | ||
221 | * | ||
222 | * Return: -ERESTARTSYS if interrupted, 0 if completed. | ||
223 | */ | ||
224 | int __sched wait_for_completion_killable(struct completion *x) | ||
225 | { | ||
226 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | ||
227 | if (t == -ERESTARTSYS) | ||
228 | return t; | ||
229 | return 0; | ||
230 | } | ||
231 | EXPORT_SYMBOL(wait_for_completion_killable); | ||
232 | |||
233 | /** | ||
234 | * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) | ||
235 | * @x: holds the state of this particular completion | ||
236 | * @timeout: timeout value in jiffies | ||
237 | * | ||
238 | * This waits for either a completion of a specific task to be | ||
239 | * signaled or for a specified timeout to expire. It can be | ||
240 | * interrupted by a kill signal. The timeout is in jiffies. | ||
241 | * | ||
242 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, | ||
243 | * or number of jiffies left till timeout) if completed. | ||
244 | */ | ||
245 | long __sched | ||
246 | wait_for_completion_killable_timeout(struct completion *x, | ||
247 | unsigned long timeout) | ||
248 | { | ||
249 | return wait_for_common(x, timeout, TASK_KILLABLE); | ||
250 | } | ||
251 | EXPORT_SYMBOL(wait_for_completion_killable_timeout); | ||
252 | |||
253 | /** | ||
254 | * try_wait_for_completion - try to decrement a completion without blocking | ||
255 | * @x: completion structure | ||
256 | * | ||
257 | * Return: 0 if a decrement cannot be done without blocking | ||
258 | * 1 if a decrement succeeded. | ||
259 | * | ||
260 | * If a completion is being used as a counting completion, | ||
261 | * attempt to decrement the counter without blocking. This | ||
262 | * enables us to avoid waiting if the resource the completion | ||
263 | * is protecting is not available. | ||
264 | */ | ||
265 | bool try_wait_for_completion(struct completion *x) | ||
266 | { | ||
267 | unsigned long flags; | ||
268 | int ret = 1; | ||
269 | |||
270 | spin_lock_irqsave(&x->wait.lock, flags); | ||
271 | if (!x->done) | ||
272 | ret = 0; | ||
273 | else | ||
274 | x->done--; | ||
275 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
276 | return ret; | ||
277 | } | ||
278 | EXPORT_SYMBOL(try_wait_for_completion); | ||
279 | |||
280 | /** | ||
281 | * completion_done - Test to see if a completion has any waiters | ||
282 | * @x: completion structure | ||
283 | * | ||
284 | * Return: 0 if there are waiters (wait_for_completion() in progress) | ||
285 | * 1 if there are no waiters. | ||
286 | * | ||
287 | */ | ||
288 | bool completion_done(struct completion *x) | ||
289 | { | ||
290 | unsigned long flags; | ||
291 | int ret = 1; | ||
292 | |||
293 | spin_lock_irqsave(&x->wait.lock, flags); | ||
294 | if (!x->done) | ||
295 | ret = 0; | ||
296 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
297 | return ret; | ||
298 | } | ||
299 | EXPORT_SYMBOL(completion_done); | ||
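The new kernel/sched/completion.c above carries the generic wait-for-completion implementation. For reference, a minimal usage sketch of the interface it exports; every example_* name below is hypothetical and not part of the patch:

#include <linux/completion.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(example_done);

static int example_worker(void *unused)
{
        /* ... perform the setup the waiter depends on ... */
        complete_all(&example_done);    /* release current and future waiters */
        return 0;
}

static int example_wait_for_setup(void)
{
        struct task_struct *t;

        t = kthread_run(example_worker, NULL, "example_worker");
        if (IS_ERR(t))
                return PTR_ERR(t);

        /* Blocks uninterruptibly, with no timeout, until complete_all() runs. */
        wait_for_completion(&example_done);

        /* Bounded variant: returns 0 on timeout, else the jiffies remaining. */
        if (!wait_for_completion_timeout(&example_done, HZ))
                return -ETIMEDOUT;

        return 0;
}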
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ac63c9a995a..a88f4a485c5e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -513,12 +513,11 @@ static inline void init_hrtick(void) | |||
513 | * might also involve a cross-CPU call to trigger the scheduler on | 513 | * might also involve a cross-CPU call to trigger the scheduler on |
514 | * the target CPU. | 514 | * the target CPU. |
515 | */ | 515 | */ |
516 | #ifdef CONFIG_SMP | ||
517 | void resched_task(struct task_struct *p) | 516 | void resched_task(struct task_struct *p) |
518 | { | 517 | { |
519 | int cpu; | 518 | int cpu; |
520 | 519 | ||
521 | assert_raw_spin_locked(&task_rq(p)->lock); | 520 | lockdep_assert_held(&task_rq(p)->lock); |
522 | 521 | ||
523 | if (test_tsk_need_resched(p)) | 522 | if (test_tsk_need_resched(p)) |
524 | return; | 523 | return; |
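Besides folding the UP variant into the common resched_task(), the hunk above swaps assert_raw_spin_locked() for lockdep_assert_held(), which verifies that the *current* context holds the lock when lockdep is enabled and costs nothing otherwise. A small illustration on a hypothetical lock and counter:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);
static int example_count;

/* Callers must hold example_lock; lockdep checks it is held by *this* context. */
static void example_bump_locked(void)
{
        lockdep_assert_held(&example_lock);
        example_count++;
}

static void example_bump(void)
{
        raw_spin_lock(&example_lock);
        example_bump_locked();
        raw_spin_unlock(&example_lock);
}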
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p) | |||
526 | set_tsk_need_resched(p); | 525 | set_tsk_need_resched(p); |
527 | 526 | ||
528 | cpu = task_cpu(p); | 527 | cpu = task_cpu(p); |
529 | if (cpu == smp_processor_id()) | 528 | if (cpu == smp_processor_id()) { |
529 | set_preempt_need_resched(); | ||
530 | return; | 530 | return; |
531 | } | ||
531 | 532 | ||
532 | /* NEED_RESCHED must be visible before we test polling */ | 533 | /* NEED_RESCHED must be visible before we test polling */ |
533 | smp_mb(); | 534 | smp_mb(); |
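The smp_mb() kept here pairs the need_resched store against the later polling check: the idle side is expected to clear its polling flag and then re-check need_resched, and the two barriers make it impossible for both sides to miss each other. A user-space C11 analogue of that store-buffering pairing (all names are stand-ins for the kernel flags; this sketch does not touch kernel state):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_int need_resched = 0;	/* no resched request pending yet */
static atomic_int polling = 1;		/* the "idle CPU" starts out polling */

static void *waker(void *unused)
{
	atomic_store_explicit(&need_resched, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() */
	if (!atomic_load_explicit(&polling, memory_order_relaxed))
		puts("waker: target stopped polling, sending IPI");
	else
		puts("waker: target still polling, no IPI needed");
	return NULL;
}

static void *idler(void *unused)
{
	/* Leaving the polling idle loop. */
	atomic_store_explicit(&polling, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* pairs with the above */
	if (atomic_load_explicit(&need_resched, memory_order_relaxed))
		puts("idler: resched request seen, not halting");
	else
		puts("idler: nothing pending, halting");
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, waker, NULL);
	pthread_create(&b, NULL, idler, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

With both fences in place the lost-wakeup interleaving (the waker printing "no IPI needed" while the idler prints "halting") is forbidden; drop either fence and a weakly ordered machine may produce exactly that pair.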
@@ -546,6 +547,7 @@ void resched_cpu(int cpu) | |||
546 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 547 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
547 | } | 548 | } |
548 | 549 | ||
550 | #ifdef CONFIG_SMP | ||
549 | #ifdef CONFIG_NO_HZ_COMMON | 551 | #ifdef CONFIG_NO_HZ_COMMON |
550 | /* | 552 | /* |
551 | * In the semi idle case, use the nearest busy cpu for migrating timers | 553 | * In the semi idle case, use the nearest busy cpu for migrating timers |
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq) | |||
693 | } | 695 | } |
694 | } | 696 | } |
695 | 697 | ||
696 | #else /* !CONFIG_SMP */ | ||
697 | void resched_task(struct task_struct *p) | ||
698 | { | ||
699 | assert_raw_spin_locked(&task_rq(p)->lock); | ||
700 | set_tsk_need_resched(p); | ||
701 | } | ||
702 | #endif /* CONFIG_SMP */ | 698 | #endif /* CONFIG_SMP */ |
703 | 699 | ||
704 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ | 700 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p) | |||
767 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 763 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
768 | { | 764 | { |
769 | update_rq_clock(rq); | 765 | update_rq_clock(rq); |
770 | sched_info_queued(p); | 766 | sched_info_queued(rq, p); |
771 | p->sched_class->enqueue_task(rq, p, flags); | 767 | p->sched_class->enqueue_task(rq, p, flags); |
772 | } | 768 | } |
773 | 769 | ||
774 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 770 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
775 | { | 771 | { |
776 | update_rq_clock(rq); | 772 | update_rq_clock(rq); |
777 | sched_info_dequeued(p); | 773 | sched_info_dequeued(rq, p); |
778 | p->sched_class->dequeue_task(rq, p, flags); | 774 | p->sched_class->dequeue_task(rq, p, flags); |
779 | } | 775 | } |
780 | 776 | ||
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
987 | * ttwu() will sort out the placement. | 983 | * ttwu() will sort out the placement. |
988 | */ | 984 | */ |
989 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 985 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
990 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 986 | !(task_preempt_count(p) & PREEMPT_ACTIVE)); |
991 | 987 | ||
992 | #ifdef CONFIG_LOCKDEP | 988 | #ifdef CONFIG_LOCKDEP |
993 | /* | 989 | /* |
@@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1017 | __set_task_cpu(p, new_cpu); | 1013 | __set_task_cpu(p, new_cpu); |
1018 | } | 1014 | } |
1019 | 1015 | ||
1016 | static void __migrate_swap_task(struct task_struct *p, int cpu) | ||
1017 | { | ||
1018 | if (p->on_rq) { | ||
1019 | struct rq *src_rq, *dst_rq; | ||
1020 | |||
1021 | src_rq = task_rq(p); | ||
1022 | dst_rq = cpu_rq(cpu); | ||
1023 | |||
1024 | deactivate_task(src_rq, p, 0); | ||
1025 | set_task_cpu(p, cpu); | ||
1026 | activate_task(dst_rq, p, 0); | ||
1027 | check_preempt_curr(dst_rq, p, 0); | ||
1028 | } else { | ||
1029 | /* | ||
1030 | * Task isn't running anymore; make it appear like we migrated | ||
1031 | * it before it went to sleep. This means on wakeup we make the | ||
1032 | * previous cpu our target instead of where it really is. | ||
1033 | */ | ||
1034 | p->wake_cpu = cpu; | ||
1035 | } | ||
1036 | } | ||
1037 | |||
1038 | struct migration_swap_arg { | ||
1039 | struct task_struct *src_task, *dst_task; | ||
1040 | int src_cpu, dst_cpu; | ||
1041 | }; | ||
1042 | |||
1043 | static int migrate_swap_stop(void *data) | ||
1044 | { | ||
1045 | struct migration_swap_arg *arg = data; | ||
1046 | struct rq *src_rq, *dst_rq; | ||
1047 | int ret = -EAGAIN; | ||
1048 | |||
1049 | src_rq = cpu_rq(arg->src_cpu); | ||
1050 | dst_rq = cpu_rq(arg->dst_cpu); | ||
1051 | |||
1052 | double_raw_lock(&arg->src_task->pi_lock, | ||
1053 | &arg->dst_task->pi_lock); | ||
1054 | double_rq_lock(src_rq, dst_rq); | ||
1055 | if (task_cpu(arg->dst_task) != arg->dst_cpu) | ||
1056 | goto unlock; | ||
1057 | |||
1058 | if (task_cpu(arg->src_task) != arg->src_cpu) | ||
1059 | goto unlock; | ||
1060 | |||
1061 | if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) | ||
1062 | goto unlock; | ||
1063 | |||
1064 | if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) | ||
1065 | goto unlock; | ||
1066 | |||
1067 | __migrate_swap_task(arg->src_task, arg->dst_cpu); | ||
1068 | __migrate_swap_task(arg->dst_task, arg->src_cpu); | ||
1069 | |||
1070 | ret = 0; | ||
1071 | |||
1072 | unlock: | ||
1073 | double_rq_unlock(src_rq, dst_rq); | ||
1074 | raw_spin_unlock(&arg->dst_task->pi_lock); | ||
1075 | raw_spin_unlock(&arg->src_task->pi_lock); | ||
1076 | |||
1077 | return ret; | ||
1078 | } | ||
1079 | |||
1080 | /* | ||
1081 | * Cross migrate two tasks | ||
1082 | */ | ||
1083 | int migrate_swap(struct task_struct *cur, struct task_struct *p) | ||
1084 | { | ||
1085 | struct migration_swap_arg arg; | ||
1086 | int ret = -EINVAL; | ||
1087 | |||
1088 | arg = (struct migration_swap_arg){ | ||
1089 | .src_task = cur, | ||
1090 | .src_cpu = task_cpu(cur), | ||
1091 | .dst_task = p, | ||
1092 | .dst_cpu = task_cpu(p), | ||
1093 | }; | ||
1094 | |||
1095 | if (arg.src_cpu == arg.dst_cpu) | ||
1096 | goto out; | ||
1097 | |||
1098 | /* | ||
1099 | * These three tests are all lockless; this is OK since all of them | ||
1100 | * will be re-checked with proper locks held further down the line. | ||
1101 | */ | ||
1102 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) | ||
1103 | goto out; | ||
1104 | |||
1105 | if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) | ||
1106 | goto out; | ||
1107 | |||
1108 | if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) | ||
1109 | goto out; | ||
1110 | |||
1111 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); | ||
1112 | |||
1113 | out: | ||
1114 | return ret; | ||
1115 | } | ||
1116 | |||
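migrate_swap() above checks everything optimistically without locks and repeats the checks inside migrate_swap_stop() once both runqueues and both pi locks are held; only the locked checks are authoritative. A stripped-down user-space sketch of that optimistic-check/locked-recheck shape, with address-ordered locking standing in for double_rq_lock() (all names invented):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct cpu_slot {
	pthread_mutex_t lock;
	_Atomic int task;	/* which task currently sits on this "cpu" */
};

/* Take both locks in a stable (address) order, like double_rq_lock(). */
static void lock_both(struct cpu_slot *a, struct cpu_slot *b)
{
	if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static bool swap_tasks(struct cpu_slot *src, struct cpu_slot *dst,
		       int src_task, int dst_task)
{
	bool swapped = false;

	if (src == dst)
		return false;

	/* Lockless pre-checks: a cheap early exit, never a guarantee. */
	if (atomic_load(&src->task) != src_task ||
	    atomic_load(&dst->task) != dst_task)
		return false;

	lock_both(src, dst);
	/* Re-check under both locks; only this test decides anything. */
	if (atomic_load(&src->task) == src_task &&
	    atomic_load(&dst->task) == dst_task) {
		atomic_store(&src->task, dst_task);
		atomic_store(&dst->task, src_task);
		swapped = true;
	}
	pthread_mutex_unlock(&src->lock);
	pthread_mutex_unlock(&dst->lock);
	return swapped;
}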
1020 | struct migration_arg { | 1117 | struct migration_arg { |
1021 | struct task_struct *task; | 1118 | struct task_struct *task; |
1022 | int dest_cpu; | 1119 | int dest_cpu; |
@@ -1236,9 +1333,9 @@ out: | |||
1236 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. | 1333 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
1237 | */ | 1334 | */ |
1238 | static inline | 1335 | static inline |
1239 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | 1336 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) |
1240 | { | 1337 | { |
1241 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); | 1338 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); |
1242 | 1339 | ||
1243 | /* | 1340 | /* |
1244 | * In order not to call set_task_cpu() on a blocking task we need | 1341 | * In order not to call set_task_cpu() on a blocking task we need |
@@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | |||
1330 | 1427 | ||
1331 | if (rq->idle_stamp) { | 1428 | if (rq->idle_stamp) { |
1332 | u64 delta = rq_clock(rq) - rq->idle_stamp; | 1429 | u64 delta = rq_clock(rq) - rq->idle_stamp; |
1333 | u64 max = 2*sysctl_sched_migration_cost; | 1430 | u64 max = 2*rq->max_idle_balance_cost; |
1334 | 1431 | ||
1335 | if (delta > max) | 1432 | update_avg(&rq->avg_idle, delta); |
1433 | |||
1434 | if (rq->avg_idle > max) | ||
1336 | rq->avg_idle = max; | 1435 | rq->avg_idle = max; |
1337 | else | 1436 | |
1338 | update_avg(&rq->avg_idle, delta); | ||
1339 | rq->idle_stamp = 0; | 1437 | rq->idle_stamp = 0; |
1340 | } | 1438 | } |
1341 | #endif | 1439 | #endif |
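The reordering above feeds every idle period through update_avg() first and only afterwards clamps rq->avg_idle at twice rq->max_idle_balance_cost, so one long idle stretch nudges the average instead of snapping it straight to the cap. Assuming update_avg() is the usual 1/8-weight exponential average (an assumption; its body is not part of this hunk), the new flow is roughly:

#include <stdint.h>

/* Assumed shape of update_avg(): new = old + (sample - old) / 8. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)(sample - *avg);

	*avg += diff / 8;
}

/* New ordering: average every sample, then clamp the average itself. */
static void account_idle_period(uint64_t *avg_idle, uint64_t delta,
				uint64_t max_idle_balance_cost)
{
	uint64_t max = 2 * max_idle_balance_cost;

	update_avg(avg_idle, delta);
	if (*avg_idle > max)
		*avg_idle = max;
}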
@@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void) | |||
1396 | 1494 | ||
1397 | void scheduler_ipi(void) | 1495 | void scheduler_ipi(void) |
1398 | { | 1496 | { |
1497 | /* | ||
1498 | * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting | ||
1499 | * TIF_NEED_RESCHED remotely (for the first time) will also send | ||
1500 | * this IPI. | ||
1501 | */ | ||
1502 | if (tif_need_resched()) | ||
1503 | set_preempt_need_resched(); | ||
1504 | |||
1399 | if (llist_empty(&this_rq()->wake_list) | 1505 | if (llist_empty(&this_rq()->wake_list) |
1400 | && !tick_nohz_full_cpu(smp_processor_id()) | 1506 | && !tick_nohz_full_cpu(smp_processor_id()) |
1401 | && !got_nohz_idle_kick()) | 1507 | && !got_nohz_idle_kick()) |
@@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
1513 | if (p->sched_class->task_waking) | 1619 | if (p->sched_class->task_waking) |
1514 | p->sched_class->task_waking(p); | 1620 | p->sched_class->task_waking(p); |
1515 | 1621 | ||
1516 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | 1622 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); |
1517 | if (task_cpu(p) != cpu) { | 1623 | if (task_cpu(p) != cpu) { |
1518 | wake_flags |= WF_MIGRATED; | 1624 | wake_flags |= WF_MIGRATED; |
1519 | set_task_cpu(p, cpu); | 1625 | set_task_cpu(p, cpu); |
@@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
1595 | * | 1701 | * |
1596 | * __sched_fork() is basic setup used by init_idle() too: | 1702 | * __sched_fork() is basic setup used by init_idle() too: |
1597 | */ | 1703 | */ |
1598 | static void __sched_fork(struct task_struct *p) | 1704 | static void __sched_fork(unsigned long clone_flags, struct task_struct *p) |
1599 | { | 1705 | { |
1600 | p->on_rq = 0; | 1706 | p->on_rq = 0; |
1601 | 1707 | ||
@@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p) | |||
1619 | 1725 | ||
1620 | #ifdef CONFIG_NUMA_BALANCING | 1726 | #ifdef CONFIG_NUMA_BALANCING |
1621 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { | 1727 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { |
1622 | p->mm->numa_next_scan = jiffies; | 1728 | p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); |
1623 | p->mm->numa_next_reset = jiffies; | ||
1624 | p->mm->numa_scan_seq = 0; | 1729 | p->mm->numa_scan_seq = 0; |
1625 | } | 1730 | } |
1626 | 1731 | ||
1732 | if (clone_flags & CLONE_VM) | ||
1733 | p->numa_preferred_nid = current->numa_preferred_nid; | ||
1734 | else | ||
1735 | p->numa_preferred_nid = -1; | ||
1736 | |||
1627 | p->node_stamp = 0ULL; | 1737 | p->node_stamp = 0ULL; |
1628 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | 1738 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; |
1629 | p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; | ||
1630 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | 1739 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; |
1631 | p->numa_work.next = &p->numa_work; | 1740 | p->numa_work.next = &p->numa_work; |
1741 | p->numa_faults = NULL; | ||
1742 | p->numa_faults_buffer = NULL; | ||
1743 | |||
1744 | INIT_LIST_HEAD(&p->numa_entry); | ||
1745 | p->numa_group = NULL; | ||
1632 | #endif /* CONFIG_NUMA_BALANCING */ | 1746 | #endif /* CONFIG_NUMA_BALANCING */ |
1633 | } | 1747 | } |
1634 | 1748 | ||
@@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled) | |||
1654 | /* | 1768 | /* |
1655 | * fork()/clone()-time setup: | 1769 | * fork()/clone()-time setup: |
1656 | */ | 1770 | */ |
1657 | void sched_fork(struct task_struct *p) | 1771 | void sched_fork(unsigned long clone_flags, struct task_struct *p) |
1658 | { | 1772 | { |
1659 | unsigned long flags; | 1773 | unsigned long flags; |
1660 | int cpu = get_cpu(); | 1774 | int cpu = get_cpu(); |
1661 | 1775 | ||
1662 | __sched_fork(p); | 1776 | __sched_fork(clone_flags, p); |
1663 | /* | 1777 | /* |
1664 | * We mark the process as running here. This guarantees that | 1778 | * We mark the process as running here. This guarantees that |
1665 | * nobody will actually run it, and a signal or other external | 1779 | * nobody will actually run it, and a signal or other external |
@@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p) | |||
1717 | #if defined(CONFIG_SMP) | 1831 | #if defined(CONFIG_SMP) |
1718 | p->on_cpu = 0; | 1832 | p->on_cpu = 0; |
1719 | #endif | 1833 | #endif |
1720 | #ifdef CONFIG_PREEMPT_COUNT | 1834 | init_task_preempt_count(p); |
1721 | /* Want to start with kernel preemption disabled. */ | ||
1722 | task_thread_info(p)->preempt_count = 1; | ||
1723 | #endif | ||
1724 | #ifdef CONFIG_SMP | 1835 | #ifdef CONFIG_SMP |
1725 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 1836 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
1726 | #endif | 1837 | #endif |
@@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p) | |||
1747 | * - cpus_allowed can change in the fork path | 1858 | * - cpus_allowed can change in the fork path |
1748 | * - any previously selected cpu might disappear through hotplug | 1859 | * - any previously selected cpu might disappear through hotplug |
1749 | */ | 1860 | */ |
1750 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); | 1861 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
1751 | #endif | 1862 | #endif |
1752 | 1863 | ||
1753 | /* Initialize new task's runnable average */ | 1864 | /* Initialize new task's runnable average */ |
@@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
1838 | struct task_struct *next) | 1949 | struct task_struct *next) |
1839 | { | 1950 | { |
1840 | trace_sched_switch(prev, next); | 1951 | trace_sched_switch(prev, next); |
1841 | sched_info_switch(prev, next); | 1952 | sched_info_switch(rq, prev, next); |
1842 | perf_event_task_sched_out(prev, next); | 1953 | perf_event_task_sched_out(prev, next); |
1843 | fire_sched_out_preempt_notifiers(prev, next); | 1954 | fire_sched_out_preempt_notifiers(prev, next); |
1844 | prepare_lock_switch(rq, next); | 1955 | prepare_lock_switch(rq, next); |
@@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1890 | if (mm) | 2001 | if (mm) |
1891 | mmdrop(mm); | 2002 | mmdrop(mm); |
1892 | if (unlikely(prev_state == TASK_DEAD)) { | 2003 | if (unlikely(prev_state == TASK_DEAD)) { |
2004 | task_numa_free(prev); | ||
2005 | |||
1893 | /* | 2006 | /* |
1894 | * Remove function-return probe instances associated with this | 2007 | * Remove function-return probe instances associated with this |
1895 | * task and put them back on the free list. | 2008 | * task and put them back on the free list. |
@@ -2073,7 +2186,7 @@ void sched_exec(void) | |||
2073 | int dest_cpu; | 2186 | int dest_cpu; |
2074 | 2187 | ||
2075 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2188 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2076 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); | 2189 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); |
2077 | if (dest_cpu == smp_processor_id()) | 2190 | if (dest_cpu == smp_processor_id()) |
2078 | goto unlock; | 2191 | goto unlock; |
2079 | 2192 | ||
@@ -2140,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
2140 | struct rq *rq; | 2253 | struct rq *rq; |
2141 | u64 ns = 0; | 2254 | u64 ns = 0; |
2142 | 2255 | ||
2256 | #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) | ||
2257 | /* | ||
2258 | * 64-bit doesn't need locks to atomically read a 64-bit value. | ||
2259 | * So we have an optimization chance when the task's delta_exec is 0. | ||
2260 | * Reading ->on_cpu is racy, but this is ok. | ||
2261 | * | ||
2262 | * If we race with it leaving cpu, we'll take a lock. So we're correct. | ||
2263 | * If we race with it entering cpu, unaccounted time is 0. This is | ||
2264 | * indistinguishable from the read occurring a few cycles earlier. | ||
2265 | */ | ||
2266 | if (!p->on_cpu) | ||
2267 | return p->se.sum_exec_runtime; | ||
2268 | #endif | ||
2269 | |||
2143 | rq = task_rq_lock(p, &flags); | 2270 | rq = task_rq_lock(p, &flags); |
2144 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); | 2271 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); |
2145 | task_rq_unlock(rq, p, &flags); | 2272 | task_rq_unlock(rq, p, &flags); |
@@ -2215,7 +2342,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) | |||
2215 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 2342 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
2216 | defined(CONFIG_PREEMPT_TRACER)) | 2343 | defined(CONFIG_PREEMPT_TRACER)) |
2217 | 2344 | ||
2218 | void __kprobes add_preempt_count(int val) | 2345 | void __kprobes preempt_count_add(int val) |
2219 | { | 2346 | { |
2220 | #ifdef CONFIG_DEBUG_PREEMPT | 2347 | #ifdef CONFIG_DEBUG_PREEMPT |
2221 | /* | 2348 | /* |
@@ -2224,7 +2351,7 @@ void __kprobes add_preempt_count(int val) | |||
2224 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) | 2351 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
2225 | return; | 2352 | return; |
2226 | #endif | 2353 | #endif |
2227 | preempt_count() += val; | 2354 | __preempt_count_add(val); |
2228 | #ifdef CONFIG_DEBUG_PREEMPT | 2355 | #ifdef CONFIG_DEBUG_PREEMPT |
2229 | /* | 2356 | /* |
2230 | * Spinlock count overflowing soon? | 2357 | * Spinlock count overflowing soon? |
@@ -2235,9 +2362,9 @@ void __kprobes add_preempt_count(int val) | |||
2235 | if (preempt_count() == val) | 2362 | if (preempt_count() == val) |
2236 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 2363 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
2237 | } | 2364 | } |
2238 | EXPORT_SYMBOL(add_preempt_count); | 2365 | EXPORT_SYMBOL(preempt_count_add); |
2239 | 2366 | ||
2240 | void __kprobes sub_preempt_count(int val) | 2367 | void __kprobes preempt_count_sub(int val) |
2241 | { | 2368 | { |
2242 | #ifdef CONFIG_DEBUG_PREEMPT | 2369 | #ifdef CONFIG_DEBUG_PREEMPT |
2243 | /* | 2370 | /* |
@@ -2255,9 +2382,9 @@ void __kprobes sub_preempt_count(int val) | |||
2255 | 2382 | ||
2256 | if (preempt_count() == val) | 2383 | if (preempt_count() == val) |
2257 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 2384 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
2258 | preempt_count() -= val; | 2385 | __preempt_count_sub(val); |
2259 | } | 2386 | } |
2260 | EXPORT_SYMBOL(sub_preempt_count); | 2387 | EXPORT_SYMBOL(preempt_count_sub); |
2261 | 2388 | ||
2262 | #endif | 2389 | #endif |
2263 | 2390 | ||
@@ -2430,6 +2557,7 @@ need_resched: | |||
2430 | put_prev_task(rq, prev); | 2557 | put_prev_task(rq, prev); |
2431 | next = pick_next_task(rq); | 2558 | next = pick_next_task(rq); |
2432 | clear_tsk_need_resched(prev); | 2559 | clear_tsk_need_resched(prev); |
2560 | clear_preempt_need_resched(); | ||
2433 | rq->skip_clock_update = 0; | 2561 | rq->skip_clock_update = 0; |
2434 | 2562 | ||
2435 | if (likely(prev != next)) { | 2563 | if (likely(prev != next)) { |
@@ -2520,9 +2648,9 @@ asmlinkage void __sched notrace preempt_schedule(void) | |||
2520 | return; | 2648 | return; |
2521 | 2649 | ||
2522 | do { | 2650 | do { |
2523 | add_preempt_count_notrace(PREEMPT_ACTIVE); | 2651 | __preempt_count_add(PREEMPT_ACTIVE); |
2524 | __schedule(); | 2652 | __schedule(); |
2525 | sub_preempt_count_notrace(PREEMPT_ACTIVE); | 2653 | __preempt_count_sub(PREEMPT_ACTIVE); |
2526 | 2654 | ||
2527 | /* | 2655 | /* |
2528 | * Check again in case we missed a preemption opportunity | 2656 | * Check again in case we missed a preemption opportunity |
@@ -2532,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void) | |||
2532 | } while (need_resched()); | 2660 | } while (need_resched()); |
2533 | } | 2661 | } |
2534 | EXPORT_SYMBOL(preempt_schedule); | 2662 | EXPORT_SYMBOL(preempt_schedule); |
2663 | #endif /* CONFIG_PREEMPT */ | ||
2535 | 2664 | ||
2536 | /* | 2665 | /* |
2537 | * this is the entry point to schedule() from kernel preemption | 2666 | * this is the entry point to schedule() from kernel preemption |
@@ -2541,20 +2670,19 @@ EXPORT_SYMBOL(preempt_schedule); | |||
2541 | */ | 2670 | */ |
2542 | asmlinkage void __sched preempt_schedule_irq(void) | 2671 | asmlinkage void __sched preempt_schedule_irq(void) |
2543 | { | 2672 | { |
2544 | struct thread_info *ti = current_thread_info(); | ||
2545 | enum ctx_state prev_state; | 2673 | enum ctx_state prev_state; |
2546 | 2674 | ||
2547 | /* Catch callers which need to be fixed */ | 2675 | /* Catch callers which need to be fixed */ |
2548 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 2676 | BUG_ON(preempt_count() || !irqs_disabled()); |
2549 | 2677 | ||
2550 | prev_state = exception_enter(); | 2678 | prev_state = exception_enter(); |
2551 | 2679 | ||
2552 | do { | 2680 | do { |
2553 | add_preempt_count(PREEMPT_ACTIVE); | 2681 | __preempt_count_add(PREEMPT_ACTIVE); |
2554 | local_irq_enable(); | 2682 | local_irq_enable(); |
2555 | __schedule(); | 2683 | __schedule(); |
2556 | local_irq_disable(); | 2684 | local_irq_disable(); |
2557 | sub_preempt_count(PREEMPT_ACTIVE); | 2685 | __preempt_count_sub(PREEMPT_ACTIVE); |
2558 | 2686 | ||
2559 | /* | 2687 | /* |
2560 | * Check again in case we missed a preemption opportunity | 2688 | * Check again in case we missed a preemption opportunity |
@@ -2566,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
2566 | exception_exit(prev_state); | 2694 | exception_exit(prev_state); |
2567 | } | 2695 | } |
2568 | 2696 | ||
2569 | #endif /* CONFIG_PREEMPT */ | ||
2570 | |||
2571 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, | 2697 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, |
2572 | void *key) | 2698 | void *key) |
2573 | { | 2699 | { |
@@ -2575,393 +2701,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, | |||
2575 | } | 2701 | } |
2576 | EXPORT_SYMBOL(default_wake_function); | 2702 | EXPORT_SYMBOL(default_wake_function); |
2577 | 2703 | ||
2578 | /* | ||
2579 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | ||
2580 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | ||
2581 | * number) then we wake all the non-exclusive tasks and one exclusive task. | ||
2582 | * | ||
2583 | * There are circumstances in which we can try to wake a task which has already | ||
2584 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | ||
2585 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | ||
2586 | */ | ||
2587 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | ||
2588 | int nr_exclusive, int wake_flags, void *key) | ||
2589 | { | ||
2590 | wait_queue_t *curr, *next; | ||
2591 | |||
2592 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | ||
2593 | unsigned flags = curr->flags; | ||
2594 | |||
2595 | if (curr->func(curr, mode, wake_flags, key) && | ||
2596 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | ||
2597 | break; | ||
2598 | } | ||
2599 | } | ||
2600 | |||
2601 | /** | ||
2602 | * __wake_up - wake up threads blocked on a waitqueue. | ||
2603 | * @q: the waitqueue | ||
2604 | * @mode: which threads | ||
2605 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
2606 | * @key: is directly passed to the wakeup function | ||
2607 | * | ||
2608 | * It may be assumed that this function implies a write memory barrier before | ||
2609 | * changing the task state if and only if any tasks are woken up. | ||
2610 | */ | ||
2611 | void __wake_up(wait_queue_head_t *q, unsigned int mode, | ||
2612 | int nr_exclusive, void *key) | ||
2613 | { | ||
2614 | unsigned long flags; | ||
2615 | |||
2616 | spin_lock_irqsave(&q->lock, flags); | ||
2617 | __wake_up_common(q, mode, nr_exclusive, 0, key); | ||
2618 | spin_unlock_irqrestore(&q->lock, flags); | ||
2619 | } | ||
2620 | EXPORT_SYMBOL(__wake_up); | ||
2621 | |||
2622 | /* | ||
2623 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | ||
2624 | */ | ||
2625 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) | ||
2626 | { | ||
2627 | __wake_up_common(q, mode, nr, 0, NULL); | ||
2628 | } | ||
2629 | EXPORT_SYMBOL_GPL(__wake_up_locked); | ||
2630 | |||
2631 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | ||
2632 | { | ||
2633 | __wake_up_common(q, mode, 1, 0, key); | ||
2634 | } | ||
2635 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
2636 | |||
2637 | /** | ||
2638 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | ||
2639 | * @q: the waitqueue | ||
2640 | * @mode: which threads | ||
2641 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
2642 | * @key: opaque value to be passed to wakeup targets | ||
2643 | * | ||
2644 | * The sync wakeup differs in that the waker knows that it will schedule | ||
2645 | * away soon, so while the target thread will be woken up, it will not | ||
2646 | * be migrated to another CPU - ie. the two threads are 'synchronized' | ||
2647 | * with each other. This can prevent needless bouncing between CPUs. | ||
2648 | * | ||
2649 | * On UP it can prevent extra preemption. | ||
2650 | * | ||
2651 | * It may be assumed that this function implies a write memory barrier before | ||
2652 | * changing the task state if and only if any tasks are woken up. | ||
2653 | */ | ||
2654 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | ||
2655 | int nr_exclusive, void *key) | ||
2656 | { | ||
2657 | unsigned long flags; | ||
2658 | int wake_flags = WF_SYNC; | ||
2659 | |||
2660 | if (unlikely(!q)) | ||
2661 | return; | ||
2662 | |||
2663 | if (unlikely(nr_exclusive != 1)) | ||
2664 | wake_flags = 0; | ||
2665 | |||
2666 | spin_lock_irqsave(&q->lock, flags); | ||
2667 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); | ||
2668 | spin_unlock_irqrestore(&q->lock, flags); | ||
2669 | } | ||
2670 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | ||
2671 | |||
2672 | /* | ||
2673 | * __wake_up_sync - see __wake_up_sync_key() | ||
2674 | */ | ||
2675 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | ||
2676 | { | ||
2677 | __wake_up_sync_key(q, mode, nr_exclusive, NULL); | ||
2678 | } | ||
2679 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | ||
2680 | |||
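The block removed above is the generic waitqueue wake-up machinery behind wait_event() and wake_up(); in this series it is relocated out of core.c rather than dropped. For orientation, a minimal kernel-style consumer of that API could look like the sketch below; the queue, the flag and both helpers are made up for the example.

#include <linux/wait.h>
#include <linux/types.h>

static DECLARE_WAIT_QUEUE_HEAD(data_wq);
static bool data_ready;

/* Consumer: sleep until data_ready becomes true, or a signal arrives. */
static int consumer_wait(void)
{
	return wait_event_interruptible(data_wq, data_ready);
}

/* Producer: publish the condition, then wake every sleeper. */
static void producer_signal(void)
{
	data_ready = true;
	wake_up_all(&data_wq);	/* expands to __wake_up(..., TASK_NORMAL, 0, NULL) */
}

The nr_exclusive argument of __wake_up() only matters for waiters queued with the exclusive flag: wake_up() passes 1 so a single exclusive waiter runs, while wake_up_all() passes 0 and releases them all.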
2681 | /** | ||
2682 | * complete: - signals a single thread waiting on this completion | ||
2683 | * @x: holds the state of this particular completion | ||
2684 | * | ||
2685 | * This will wake up a single thread waiting on this completion. Threads will be | ||
2686 | * awakened in the same order in which they were queued. | ||
2687 | * | ||
2688 | * See also complete_all(), wait_for_completion() and related routines. | ||
2689 | * | ||
2690 | * It may be assumed that this function implies a write memory barrier before | ||
2691 | * changing the task state if and only if any tasks are woken up. | ||
2692 | */ | ||
2693 | void complete(struct completion *x) | ||
2694 | { | ||
2695 | unsigned long flags; | ||
2696 | |||
2697 | spin_lock_irqsave(&x->wait.lock, flags); | ||
2698 | x->done++; | ||
2699 | __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); | ||
2700 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
2701 | } | ||
2702 | EXPORT_SYMBOL(complete); | ||
2703 | |||
2704 | /** | ||
2705 | * complete_all: - signals all threads waiting on this completion | ||
2706 | * @x: holds the state of this particular completion | ||
2707 | * | ||
2708 | * This will wake up all threads waiting on this particular completion event. | ||
2709 | * | ||
2710 | * It may be assumed that this function implies a write memory barrier before | ||
2711 | * changing the task state if and only if any tasks are woken up. | ||
2712 | */ | ||
2713 | void complete_all(struct completion *x) | ||
2714 | { | ||
2715 | unsigned long flags; | ||
2716 | |||
2717 | spin_lock_irqsave(&x->wait.lock, flags); | ||
2718 | x->done += UINT_MAX/2; | ||
2719 | __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); | ||
2720 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
2721 | } | ||
2722 | EXPORT_SYMBOL(complete_all); | ||
2723 | |||
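Read together, the two kernel-doc blocks above draw the line between the two producers: complete() releases exactly one waiter in queue order, while complete_all() releases every present and future waiter by pushing done far above zero. A short hypothetical sketch of when each fits:

#include <linux/completion.h>

static DECLARE_COMPLETION(request_done);	/* one waiter per request */
static DECLARE_COMPLETION(setup_finished);	/* a one-shot barrier */

/* Hand a finished request back to the single thread waiting for it. */
static void finish_request(void)
{
	complete(&request_done);
}

/* Let everyone past the setup barrier; later waits return immediately. */
static void announce_setup_finished(void)
{
	complete_all(&setup_finished);
}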
2724 | static inline long __sched | ||
2725 | do_wait_for_common(struct completion *x, | ||
2726 | long (*action)(long), long timeout, int state) | ||
2727 | { | ||
2728 | if (!x->done) { | ||
2729 | DECLARE_WAITQUEUE(wait, current); | ||
2730 | |||
2731 | __add_wait_queue_tail_exclusive(&x->wait, &wait); | ||
2732 | do { | ||
2733 | if (signal_pending_state(state, current)) { | ||
2734 | timeout = -ERESTARTSYS; | ||
2735 | break; | ||
2736 | } | ||
2737 | __set_current_state(state); | ||
2738 | spin_unlock_irq(&x->wait.lock); | ||
2739 | timeout = action(timeout); | ||
2740 | spin_lock_irq(&x->wait.lock); | ||
2741 | } while (!x->done && timeout); | ||
2742 | __remove_wait_queue(&x->wait, &wait); | ||
2743 | if (!x->done) | ||
2744 | return timeout; | ||
2745 | } | ||
2746 | x->done--; | ||
2747 | return timeout ?: 1; | ||
2748 | } | ||
2749 | |||
2750 | static inline long __sched | ||
2751 | __wait_for_common(struct completion *x, | ||
2752 | long (*action)(long), long timeout, int state) | ||
2753 | { | ||
2754 | might_sleep(); | ||
2755 | |||
2756 | spin_lock_irq(&x->wait.lock); | ||
2757 | timeout = do_wait_for_common(x, action, timeout, state); | ||
2758 | spin_unlock_irq(&x->wait.lock); | ||
2759 | return timeout; | ||
2760 | } | ||
2761 | |||
2762 | static long __sched | ||
2763 | wait_for_common(struct completion *x, long timeout, int state) | ||
2764 | { | ||
2765 | return __wait_for_common(x, schedule_timeout, timeout, state); | ||
2766 | } | ||
2767 | |||
2768 | static long __sched | ||
2769 | wait_for_common_io(struct completion *x, long timeout, int state) | ||
2770 | { | ||
2771 | return __wait_for_common(x, io_schedule_timeout, timeout, state); | ||
2772 | } | ||
2773 | |||
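The two thin wrappers above show why do_wait_for_common() takes an action callback: the identical sleep loop serves plain waits via schedule_timeout() and IO-accounted waits via io_schedule_timeout(), and the "timeout ?: 1" return preserves "at least one jiffy left" on success. A toy user-space rendering of that parameterisation (nothing here is kernel code):

/* The pluggable "how do we actually sleep" step. */
typedef long (*wait_action_t)(long timeout);

static long plain_tick(long timeout)        { return timeout - 1; }
static long io_accounted_tick(long timeout) { return timeout - 1; }

/* One shared loop, two behaviours, selected by the caller. */
static long wait_common(const int *done, long timeout, wait_action_t action)
{
	while (!*done && timeout > 0)
		timeout = action(timeout);

	return *done ? (timeout ? timeout : 1) : timeout;
}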
2774 | /** | ||
2775 | * wait_for_completion: - waits for completion of a task | ||
2776 | * @x: holds the state of this particular completion | ||
2777 | * | ||
2778 | * This waits to be signaled for completion of a specific task. It is NOT | ||
2779 | * interruptible and there is no timeout. | ||
2780 | * | ||
2781 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | ||
2782 | * and interrupt capability. Also see complete(). | ||
2783 | */ | ||
2784 | void __sched wait_for_completion(struct completion *x) | ||
2785 | { | ||
2786 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
2787 | } | ||
2788 | EXPORT_SYMBOL(wait_for_completion); | ||
2789 | |||
2790 | /** | ||
2791 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | ||
2792 | * @x: holds the state of this particular completion | ||
2793 | * @timeout: timeout value in jiffies | ||
2794 | * | ||
2795 | * This waits for either a completion of a specific task to be signaled or for a | ||
2796 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
2797 | * interruptible. | ||
2798 | * | ||
2799 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | ||
2800 | * till timeout) if completed. | ||
2801 | */ | ||
2802 | unsigned long __sched | ||
2803 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | ||
2804 | { | ||
2805 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); | ||
2806 | } | ||
2807 | EXPORT_SYMBOL(wait_for_completion_timeout); | ||
2808 | |||
2809 | /** | ||
2810 | * wait_for_completion_io: - waits for completion of a task | ||
2811 | * @x: holds the state of this particular completion | ||
2812 | * | ||
2813 | * This waits to be signaled for completion of a specific task. It is NOT | ||
2814 | * interruptible and there is no timeout. The caller is accounted as waiting | ||
2815 | * for IO. | ||
2816 | */ | ||
2817 | void __sched wait_for_completion_io(struct completion *x) | ||
2818 | { | ||
2819 | wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
2820 | } | ||
2821 | EXPORT_SYMBOL(wait_for_completion_io); | ||
2822 | |||
2823 | /** | ||
2824 | * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) | ||
2825 | * @x: holds the state of this particular completion | ||
2826 | * @timeout: timeout value in jiffies | ||
2827 | * | ||
2828 | * This waits for either a completion of a specific task to be signaled or for a | ||
2829 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
2830 | * interruptible. The caller is accounted as waiting for IO. | ||
2831 | * | ||
2832 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | ||
2833 | * till timeout) if completed. | ||
2834 | */ | ||
2835 | unsigned long __sched | ||
2836 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) | ||
2837 | { | ||
2838 | return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); | ||
2839 | } | ||
2840 | EXPORT_SYMBOL(wait_for_completion_io_timeout); | ||
2841 | |||
2842 | /** | ||
2843 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | ||
2844 | * @x: holds the state of this particular completion | ||
2845 | * | ||
2846 | * This waits for completion of a specific task to be signaled. It is | ||
2847 | * interruptible. | ||
2848 | * | ||
2849 | * Return: -ERESTARTSYS if interrupted, 0 if completed. | ||
2850 | */ | ||
2851 | int __sched wait_for_completion_interruptible(struct completion *x) | ||
2852 | { | ||
2853 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | ||
2854 | if (t == -ERESTARTSYS) | ||
2855 | return t; | ||
2856 | return 0; | ||
2857 | } | ||
2858 | EXPORT_SYMBOL(wait_for_completion_interruptible); | ||
2859 | |||
2860 | /** | ||
2861 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | ||
2862 | * @x: holds the state of this particular completion | ||
2863 | * @timeout: timeout value in jiffies | ||
2864 | * | ||
2865 | * This waits for either a completion of a specific task to be signaled or for a | ||
2866 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | ||
2867 | * | ||
2868 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, | ||
2869 | * or number of jiffies left till timeout) if completed. | ||
2870 | */ | ||
2871 | long __sched | ||
2872 | wait_for_completion_interruptible_timeout(struct completion *x, | ||
2873 | unsigned long timeout) | ||
2874 | { | ||
2875 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); | ||
2876 | } | ||
2877 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | ||
2878 | |||
2879 | /** | ||
2880 | * wait_for_completion_killable: - waits for completion of a task (killable) | ||
2881 | * @x: holds the state of this particular completion | ||
2882 | * | ||
2883 | * This waits to be signaled for completion of a specific task. It can be | ||
2884 | * interrupted by a kill signal. | ||
2885 | * | ||
2886 | * Return: -ERESTARTSYS if interrupted, 0 if completed. | ||
2887 | */ | ||
2888 | int __sched wait_for_completion_killable(struct completion *x) | ||
2889 | { | ||
2890 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | ||
2891 | if (t == -ERESTARTSYS) | ||
2892 | return t; | ||
2893 | return 0; | ||
2894 | } | ||
2895 | EXPORT_SYMBOL(wait_for_completion_killable); | ||
2896 | |||
2897 | /** | ||
2898 | * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) | ||
2899 | * @x: holds the state of this particular completion | ||
2900 | * @timeout: timeout value in jiffies | ||
2901 | * | ||
2902 | * This waits for either a completion of a specific task to be | ||
2903 | * signaled or for a specified timeout to expire. It can be | ||
2904 | * interrupted by a kill signal. The timeout is in jiffies. | ||
2905 | * | ||
2906 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, | ||
2907 | * or number of jiffies left till timeout) if completed. | ||
2908 | */ | ||
2909 | long __sched | ||
2910 | wait_for_completion_killable_timeout(struct completion *x, | ||
2911 | unsigned long timeout) | ||
2912 | { | ||
2913 | return wait_for_common(x, timeout, TASK_KILLABLE); | ||
2914 | } | ||
2915 | EXPORT_SYMBOL(wait_for_completion_killable_timeout); | ||
2916 | |||
2917 | /** | ||
2918 | * try_wait_for_completion - try to decrement a completion without blocking | ||
2919 | * @x: completion structure | ||
2920 | * | ||
2921 | * Return: 0 if a decrement cannot be done without blocking | ||
2922 | * 1 if a decrement succeeded. | ||
2923 | * | ||
2924 | * If a completion is being used as a counting completion, | ||
2925 | * attempt to decrement the counter without blocking. This | ||
2926 | * enables us to avoid waiting if the resource the completion | ||
2927 | * is protecting is not available. | ||
2928 | */ | ||
2929 | bool try_wait_for_completion(struct completion *x) | ||
2930 | { | ||
2931 | unsigned long flags; | ||
2932 | int ret = 1; | ||
2933 | |||
2934 | spin_lock_irqsave(&x->wait.lock, flags); | ||
2935 | if (!x->done) | ||
2936 | ret = 0; | ||
2937 | else | ||
2938 | x->done--; | ||
2939 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
2940 | return ret; | ||
2941 | } | ||
2942 | EXPORT_SYMBOL(try_wait_for_completion); | ||
2943 | |||
2944 | /** | ||
2945 | * completion_done - Test to see if a completion has any waiters | ||
2946 | * @x: completion structure | ||
2947 | * | ||
2948 | * Return: 0 if there are waiters (wait_for_completion() in progress) | ||
2949 | * 1 if there are no waiters. | ||
2950 | * | ||
2951 | */ | ||
2952 | bool completion_done(struct completion *x) | ||
2953 | { | ||
2954 | unsigned long flags; | ||
2955 | int ret = 1; | ||
2956 | |||
2957 | spin_lock_irqsave(&x->wait.lock, flags); | ||
2958 | if (!x->done) | ||
2959 | ret = 0; | ||
2960 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
2961 | return ret; | ||
2962 | } | ||
2963 | EXPORT_SYMBOL(completion_done); | ||
2964 | |||
2965 | static long __sched | 2704 | static long __sched |
2966 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | 2705 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) |
2967 | { | 2706 | { |
@@ -3598,13 +3337,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
3598 | struct task_struct *p; | 3337 | struct task_struct *p; |
3599 | int retval; | 3338 | int retval; |
3600 | 3339 | ||
3601 | get_online_cpus(); | ||
3602 | rcu_read_lock(); | 3340 | rcu_read_lock(); |
3603 | 3341 | ||
3604 | p = find_process_by_pid(pid); | 3342 | p = find_process_by_pid(pid); |
3605 | if (!p) { | 3343 | if (!p) { |
3606 | rcu_read_unlock(); | 3344 | rcu_read_unlock(); |
3607 | put_online_cpus(); | ||
3608 | return -ESRCH; | 3345 | return -ESRCH; |
3609 | } | 3346 | } |
3610 | 3347 | ||
@@ -3661,7 +3398,6 @@ out_free_cpus_allowed: | |||
3661 | free_cpumask_var(cpus_allowed); | 3398 | free_cpumask_var(cpus_allowed); |
3662 | out_put_task: | 3399 | out_put_task: |
3663 | put_task_struct(p); | 3400 | put_task_struct(p); |
3664 | put_online_cpus(); | ||
3665 | return retval; | 3401 | return retval; |
3666 | } | 3402 | } |
3667 | 3403 | ||
@@ -3706,7 +3442,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
3706 | unsigned long flags; | 3442 | unsigned long flags; |
3707 | int retval; | 3443 | int retval; |
3708 | 3444 | ||
3709 | get_online_cpus(); | ||
3710 | rcu_read_lock(); | 3445 | rcu_read_lock(); |
3711 | 3446 | ||
3712 | retval = -ESRCH; | 3447 | retval = -ESRCH; |
@@ -3719,12 +3454,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
3719 | goto out_unlock; | 3454 | goto out_unlock; |
3720 | 3455 | ||
3721 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 3456 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
3722 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 3457 | cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); |
3723 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 3458 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3724 | 3459 | ||
3725 | out_unlock: | 3460 | out_unlock: |
3726 | rcu_read_unlock(); | 3461 | rcu_read_unlock(); |
3727 | put_online_cpus(); | ||
3728 | 3462 | ||
3729 | return retval; | 3463 | return retval; |
3730 | } | 3464 | } |
@@ -3794,16 +3528,11 @@ SYSCALL_DEFINE0(sched_yield) | |||
3794 | return 0; | 3528 | return 0; |
3795 | } | 3529 | } |
3796 | 3530 | ||
3797 | static inline int should_resched(void) | ||
3798 | { | ||
3799 | return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); | ||
3800 | } | ||
3801 | |||
3802 | static void __cond_resched(void) | 3531 | static void __cond_resched(void) |
3803 | { | 3532 | { |
3804 | add_preempt_count(PREEMPT_ACTIVE); | 3533 | __preempt_count_add(PREEMPT_ACTIVE); |
3805 | __schedule(); | 3534 | __schedule(); |
3806 | sub_preempt_count(PREEMPT_ACTIVE); | 3535 | __preempt_count_sub(PREEMPT_ACTIVE); |
3807 | } | 3536 | } |
3808 | 3537 | ||
3809 | int __sched _cond_resched(void) | 3538 | int __sched _cond_resched(void) |
@@ -4186,7 +3915,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4186 | 3915 | ||
4187 | raw_spin_lock_irqsave(&rq->lock, flags); | 3916 | raw_spin_lock_irqsave(&rq->lock, flags); |
4188 | 3917 | ||
4189 | __sched_fork(idle); | 3918 | __sched_fork(0, idle); |
4190 | idle->state = TASK_RUNNING; | 3919 | idle->state = TASK_RUNNING; |
4191 | idle->se.exec_start = sched_clock(); | 3920 | idle->se.exec_start = sched_clock(); |
4192 | 3921 | ||
@@ -4212,7 +3941,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4212 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 3941 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
4213 | 3942 | ||
4214 | /* Set the preempt count _outside_ the spinlocks! */ | 3943 | /* Set the preempt count _outside_ the spinlocks! */ |
4215 | task_thread_info(idle)->preempt_count = 0; | 3944 | init_idle_preempt_count(idle, cpu); |
4216 | 3945 | ||
4217 | /* | 3946 | /* |
4218 | * The idle tasks have their own, simple scheduling class: | 3947 | * The idle tasks have their own, simple scheduling class: |
@@ -4346,6 +4075,53 @@ fail: | |||
4346 | return ret; | 4075 | return ret; |
4347 | } | 4076 | } |
4348 | 4077 | ||
4078 | #ifdef CONFIG_NUMA_BALANCING | ||
4079 | /* Migrate current task p to target_cpu */ | ||
4080 | int migrate_task_to(struct task_struct *p, int target_cpu) | ||
4081 | { | ||
4082 | struct migration_arg arg = { p, target_cpu }; | ||
4083 | int curr_cpu = task_cpu(p); | ||
4084 | |||
4085 | if (curr_cpu == target_cpu) | ||
4086 | return 0; | ||
4087 | |||
4088 | if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) | ||
4089 | return -EINVAL; | ||
4090 | |||
4091 | /* TODO: This is not properly updating schedstats */ | ||
4092 | |||
4093 | return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); | ||
4094 | } | ||
4095 | |||
4096 | /* | ||
4097 | * Requeue a task on a given node and accurately track the number of NUMA | ||
4098 | * tasks on the runqueues | ||
4099 | */ | ||
4100 | void sched_setnuma(struct task_struct *p, int nid) | ||
4101 | { | ||
4102 | struct rq *rq; | ||
4103 | unsigned long flags; | ||
4104 | bool on_rq, running; | ||
4105 | |||
4106 | rq = task_rq_lock(p, &flags); | ||
4107 | on_rq = p->on_rq; | ||
4108 | running = task_current(rq, p); | ||
4109 | |||
4110 | if (on_rq) | ||
4111 | dequeue_task(rq, p, 0); | ||
4112 | if (running) | ||
4113 | p->sched_class->put_prev_task(rq, p); | ||
4114 | |||
4115 | p->numa_preferred_nid = nid; | ||
4116 | |||
4117 | if (running) | ||
4118 | p->sched_class->set_curr_task(rq); | ||
4119 | if (on_rq) | ||
4120 | enqueue_task(rq, p, 0); | ||
4121 | task_rq_unlock(rq, p, &flags); | ||
4122 | } | ||
4123 | #endif | ||
4124 | |||
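sched_setnuma() above applies the scheduler's standard recipe for touching a field the queueing hooks depend on: dequeue (and drop the task from the current-task slot), change the field, then enqueue again so the per-node counts stay right. The same discipline holds for any ordered container; a small user-space analogue with an invented sorted array (not scheduler code):

#include <string.h>

/* A sorted array standing in for a queue ordered by some key. */
struct sorted_set {
	int vals[16];
	int nr;		/* caller keeps nr < 16 */
};

static void set_remove(struct sorted_set *s, int idx)
{
	memmove(&s->vals[idx], &s->vals[idx + 1],
		(s->nr - idx - 1) * sizeof(s->vals[0]));
	s->nr--;
}

static void set_insert(struct sorted_set *s, int val)
{
	int i = 0;

	while (i < s->nr && s->vals[i] < val)
		i++;
	memmove(&s->vals[i + 1], &s->vals[i], (s->nr - i) * sizeof(s->vals[0]));
	s->vals[i] = val;
	s->nr++;
}

/* Editing the key in place would break the ordering invariant, so do it
 * the sched_setnuma() way: remove, modify, re-insert. */
static void set_change_key(struct sorted_set *s, int idx, int new_val)
{
	set_remove(s, idx);
	set_insert(s, new_val);
}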
4349 | /* | 4125 | /* |
4350 | * migration_cpu_stop - this will be executed by a highprio stopper thread | 4126 | * migration_cpu_stop - this will be executed by a highprio stopper thread |
4351 | * and performs thread migration by bumping thread off CPU then | 4127 | * and performs thread migration by bumping thread off CPU then |
@@ -4985,7 +4761,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
4985 | cpumask_clear_cpu(rq->cpu, old_rd->span); | 4761 | cpumask_clear_cpu(rq->cpu, old_rd->span); |
4986 | 4762 | ||
4987 | /* | 4763 | /* |
4988 | * If we don't want to free the old_rt yet then | 4764 | * If we don't want to free the old_rd yet then |
4989 | * set old_rd to NULL to skip the freeing later | 4765 | * set old_rd to NULL to skip the freeing later |
4990 | * in this function: | 4766 | * in this function: |
4991 | */ | 4767 | */ |
@@ -5119,10 +4895,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
5119 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 4895 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
5120 | DEFINE_PER_CPU(int, sd_llc_size); | 4896 | DEFINE_PER_CPU(int, sd_llc_size); |
5121 | DEFINE_PER_CPU(int, sd_llc_id); | 4897 | DEFINE_PER_CPU(int, sd_llc_id); |
4898 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | ||
4899 | DEFINE_PER_CPU(struct sched_domain *, sd_busy); | ||
4900 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | ||
5122 | 4901 | ||
5123 | static void update_top_cache_domain(int cpu) | 4902 | static void update_top_cache_domain(int cpu) |
5124 | { | 4903 | { |
5125 | struct sched_domain *sd; | 4904 | struct sched_domain *sd; |
4905 | struct sched_domain *busy_sd = NULL; | ||
5126 | int id = cpu; | 4906 | int id = cpu; |
5127 | int size = 1; | 4907 | int size = 1; |
5128 | 4908 | ||
@@ -5130,11 +4910,19 @@ static void update_top_cache_domain(int cpu) | |||
5130 | if (sd) { | 4910 | if (sd) { |
5131 | id = cpumask_first(sched_domain_span(sd)); | 4911 | id = cpumask_first(sched_domain_span(sd)); |
5132 | size = cpumask_weight(sched_domain_span(sd)); | 4912 | size = cpumask_weight(sched_domain_span(sd)); |
4913 | busy_sd = sd->parent; /* sd_busy */ | ||
5133 | } | 4914 | } |
4915 | rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); | ||
5134 | 4916 | ||
5135 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 4917 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
5136 | per_cpu(sd_llc_size, cpu) = size; | 4918 | per_cpu(sd_llc_size, cpu) = size; |
5137 | per_cpu(sd_llc_id, cpu) = id; | 4919 | per_cpu(sd_llc_id, cpu) = id; |
4920 | |||
4921 | sd = lowest_flag_domain(cpu, SD_NUMA); | ||
4922 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | ||
4923 | |||
4924 | sd = highest_flag_domain(cpu, SD_ASYM_PACKING); | ||
4925 | rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); | ||
5138 | } | 4926 | } |
5139 | 4927 | ||
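The new per-CPU pointers above are published with rcu_assign_pointer() so that the wakeup and load-balancing fast paths can read them locklessly with rcu_dereference() inside an RCU read-side section. A generic sketch of that publish/read pairing, with a placeholder structure rather than real sched_domain handling (freeing of the previous pointer via kfree_rcu() is omitted):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct cached_domain {
	int id;
};

static struct cached_domain __rcu *cached;

/* Writer: initialise the object completely, then publish it in one go. */
static int publish_domain(int id)
{
	struct cached_domain *nd = kmalloc(sizeof(*nd), GFP_KERNEL);

	if (!nd)
		return -ENOMEM;
	nd->id = id;
	rcu_assign_pointer(cached, nd);	/* orders the init before the store */
	return 0;
}

/* Reader: no lock beyond marking the RCU read-side critical section. */
static int read_domain_id(void)
{
	struct cached_domain *d;
	int id = -1;

	rcu_read_lock();
	d = rcu_dereference(cached);
	if (d)
		id = d->id;
	rcu_read_unlock();
	return id;
}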
5140 | /* | 4928 | /* |
@@ -5325,6 +5113,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5325 | * die on a /0 trap. | 5113 | * die on a /0 trap. |
5326 | */ | 5114 | */ |
5327 | sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); | 5115 | sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); |
5116 | sg->sgp->power_orig = sg->sgp->power; | ||
5328 | 5117 | ||
5329 | /* | 5118 | /* |
5330 | * Make sure the first group of this domain contains the | 5119 | * Make sure the first group of this domain contains the |
@@ -5654,6 +5443,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | |||
5654 | | 0*SD_SHARE_PKG_RESOURCES | 5443 | | 0*SD_SHARE_PKG_RESOURCES |
5655 | | 1*SD_SERIALIZE | 5444 | | 1*SD_SERIALIZE |
5656 | | 0*SD_PREFER_SIBLING | 5445 | | 0*SD_PREFER_SIBLING |
5446 | | 1*SD_NUMA | ||
5657 | | sd_local_flags(level) | 5447 | | sd_local_flags(level) |
5658 | , | 5448 | , |
5659 | .last_balance = jiffies, | 5449 | .last_balance = jiffies, |
@@ -6335,14 +6125,17 @@ void __init sched_init_smp(void) | |||
6335 | 6125 | ||
6336 | sched_init_numa(); | 6126 | sched_init_numa(); |
6337 | 6127 | ||
6338 | get_online_cpus(); | 6128 | /* |
6129 | * There's no userspace yet to cause hotplug operations; hence all the | ||
6130 | * cpu masks are stable and all blatant races in the below code cannot | ||
6131 | * happen. | ||
6132 | */ | ||
6339 | mutex_lock(&sched_domains_mutex); | 6133 | mutex_lock(&sched_domains_mutex); |
6340 | init_sched_domains(cpu_active_mask); | 6134 | init_sched_domains(cpu_active_mask); |
6341 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 6135 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
6342 | if (cpumask_empty(non_isolated_cpus)) | 6136 | if (cpumask_empty(non_isolated_cpus)) |
6343 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 6137 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
6344 | mutex_unlock(&sched_domains_mutex); | 6138 | mutex_unlock(&sched_domains_mutex); |
6345 | put_online_cpus(); | ||
6346 | 6139 | ||
6347 | hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); | 6140 | hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); |
6348 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); | 6141 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
@@ -6505,6 +6298,7 @@ void __init sched_init(void) | |||
6505 | rq->online = 0; | 6298 | rq->online = 0; |
6506 | rq->idle_stamp = 0; | 6299 | rq->idle_stamp = 0; |
6507 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 6300 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
6301 | rq->max_idle_balance_cost = sysctl_sched_migration_cost; | ||
6508 | 6302 | ||
6509 | INIT_LIST_HEAD(&rq->cfs_tasks); | 6303 | INIT_LIST_HEAD(&rq->cfs_tasks); |
6510 | 6304 | ||
@@ -7277,7 +7071,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
7277 | 7071 | ||
7278 | runtime_enabled = quota != RUNTIME_INF; | 7072 | runtime_enabled = quota != RUNTIME_INF; |
7279 | runtime_was_enabled = cfs_b->quota != RUNTIME_INF; | 7073 | runtime_was_enabled = cfs_b->quota != RUNTIME_INF; |
7280 | account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); | 7074 | /* |
7075 | * If we need to toggle cfs_bandwidth_used, off->on must occur | ||
7076 | * before making related changes, and on->off must occur afterwards | ||
7077 | */ | ||
7078 | if (runtime_enabled && !runtime_was_enabled) | ||
7079 | cfs_bandwidth_usage_inc(); | ||
7281 | raw_spin_lock_irq(&cfs_b->lock); | 7080 | raw_spin_lock_irq(&cfs_b->lock); |
7282 | cfs_b->period = ns_to_ktime(period); | 7081 | cfs_b->period = ns_to_ktime(period); |
7283 | cfs_b->quota = quota; | 7082 | cfs_b->quota = quota; |
@@ -7303,6 +7102,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
7303 | unthrottle_cfs_rq(cfs_rq); | 7102 | unthrottle_cfs_rq(cfs_rq); |
7304 | raw_spin_unlock_irq(&rq->lock); | 7103 | raw_spin_unlock_irq(&rq->lock); |
7305 | } | 7104 | } |
7105 | if (runtime_was_enabled && !runtime_enabled) | ||
7106 | cfs_bandwidth_usage_dec(); | ||
7306 | out_unlock: | 7107 | out_unlock: |
7307 | mutex_unlock(&cfs_constraints_mutex); | 7108 | mutex_unlock(&cfs_constraints_mutex); |
7308 | 7109 | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 196559994f7c..5c34d1817e8f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/seq_file.h> | 15 | #include <linux/seq_file.h> |
16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
18 | #include <linux/mempolicy.h> | ||
18 | 19 | ||
19 | #include "sched.h" | 20 | #include "sched.h" |
20 | 21 | ||
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
137 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", | 138 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
138 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
139 | #endif | 140 | #endif |
141 | #ifdef CONFIG_NUMA_BALANCING | ||
142 | SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); | ||
143 | #endif | ||
140 | #ifdef CONFIG_CGROUP_SCHED | 144 | #ifdef CONFIG_CGROUP_SCHED |
141 | SEQ_printf(m, " %s", task_group_path(task_group(p))); | 145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); |
142 | #endif | 146 | #endif |
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
159 | read_lock_irqsave(&tasklist_lock, flags); | 163 | read_lock_irqsave(&tasklist_lock, flags); |
160 | 164 | ||
161 | do_each_thread(g, p) { | 165 | do_each_thread(g, p) { |
162 | if (!p->on_rq || task_cpu(p) != rq_cpu) | 166 | if (task_cpu(p) != rq_cpu) |
163 | continue; | 167 | continue; |
164 | 168 | ||
165 | print_task(m, rq, p); | 169 | print_task(m, rq, p); |
@@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
225 | atomic_read(&cfs_rq->tg->runnable_avg)); | 229 | atomic_read(&cfs_rq->tg->runnable_avg)); |
226 | #endif | 230 | #endif |
227 | #endif | 231 | #endif |
232 | #ifdef CONFIG_CFS_BANDWIDTH | ||
233 | SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", | ||
234 | cfs_rq->tg->cfs_bandwidth.timer_active); | ||
235 | SEQ_printf(m, " .%-30s: %d\n", "throttled", | ||
236 | cfs_rq->throttled); | ||
237 | SEQ_printf(m, " .%-30s: %d\n", "throttle_count", | ||
238 | cfs_rq->throttle_count); | ||
239 | #endif | ||
228 | 240 | ||
229 | #ifdef CONFIG_FAIR_GROUP_SCHED | 241 | #ifdef CONFIG_FAIR_GROUP_SCHED |
230 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 242 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
@@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m) | |||
345 | cpu_clk = local_clock(); | 357 | cpu_clk = local_clock(); |
346 | local_irq_restore(flags); | 358 | local_irq_restore(flags); |
347 | 359 | ||
348 | SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", | 360 | SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n", |
349 | init_utsname()->release, | 361 | init_utsname()->release, |
350 | (int)strcspn(init_utsname()->version, " "), | 362 | (int)strcspn(init_utsname()->version, " "), |
351 | init_utsname()->version); | 363 | init_utsname()->version); |
@@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void) | |||
488 | 500 | ||
489 | __initcall(init_sched_debug_procfs); | 501 | __initcall(init_sched_debug_procfs); |
490 | 502 | ||
503 | #define __P(F) \ | ||
504 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | ||
505 | #define P(F) \ | ||
506 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | ||
507 | #define __PN(F) \ | ||
508 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
509 | #define PN(F) \ | ||
510 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | ||
511 | |||
512 | |||
513 | static void sched_show_numa(struct task_struct *p, struct seq_file *m) | ||
514 | { | ||
515 | #ifdef CONFIG_NUMA_BALANCING | ||
516 | struct mempolicy *pol; | ||
517 | int node, i; | ||
518 | |||
519 | if (p->mm) | ||
520 | P(mm->numa_scan_seq); | ||
521 | |||
522 | task_lock(p); | ||
523 | pol = p->mempolicy; | ||
524 | if (pol && !(pol->flags & MPOL_F_MORON)) | ||
525 | pol = NULL; | ||
526 | mpol_get(pol); | ||
527 | task_unlock(p); | ||
528 | |||
529 | SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); | ||
530 | |||
531 | for_each_online_node(node) { | ||
532 | for (i = 0; i < 2; i++) { | ||
533 | unsigned long nr_faults = -1; | ||
534 | int cpu_current, home_node; | ||
535 | |||
536 | if (p->numa_faults) | ||
537 | nr_faults = p->numa_faults[2*node + i]; | ||
538 | |||
539 | cpu_current = !i ? (task_node(p) == node) : | ||
540 | (pol && node_isset(node, pol->v.nodes)); | ||
541 | |||
542 | home_node = (p->numa_preferred_nid == node); | ||
543 | |||
544 | SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", | ||
545 | i, node, cpu_current, home_node, nr_faults); | ||
546 | } | ||
547 | } | ||
548 | |||
549 | mpol_put(pol); | ||
550 | #endif | ||
551 | } | ||
552 | |||
491 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | 553 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) |
492 | { | 554 | { |
493 | unsigned long nr_switches; | 555 | unsigned long nr_switches; |
@@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
591 | SEQ_printf(m, "%-45s:%21Ld\n", | 653 | SEQ_printf(m, "%-45s:%21Ld\n", |
592 | "clock-delta", (long long)(t1-t0)); | 654 | "clock-delta", (long long)(t1-t0)); |
593 | } | 655 | } |
656 | |||
657 | sched_show_numa(p, m); | ||
594 | } | 658 | } |
595 | 659 | ||
596 | void proc_sched_set_task(struct task_struct *p) | 660 | void proc_sched_set_task(struct task_struct *p) |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c70201fbc61..e64b0794060e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -178,59 +178,61 @@ void sched_init_granularity(void) | |||
178 | update_sysctl(); | 178 | update_sysctl(); |
179 | } | 179 | } |
180 | 180 | ||
181 | #if BITS_PER_LONG == 32 | 181 | #define WMULT_CONST (~0U) |
182 | # define WMULT_CONST (~0UL) | ||
183 | #else | ||
184 | # define WMULT_CONST (1UL << 32) | ||
185 | #endif | ||
186 | |||
187 | #define WMULT_SHIFT 32 | 182 | #define WMULT_SHIFT 32 |
188 | 183 | ||
189 | /* | 184 | static void __update_inv_weight(struct load_weight *lw) |
190 | * Shift right and round: | 185 | { |
191 | */ | 186 | unsigned long w; |
192 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | 187 | |
188 | if (likely(lw->inv_weight)) | ||
189 | return; | ||
190 | |||
191 | w = scale_load_down(lw->weight); | ||
192 | |||
193 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
194 | lw->inv_weight = 1; | ||
195 | else if (unlikely(!w)) | ||
196 | lw->inv_weight = WMULT_CONST; | ||
197 | else | ||
198 | lw->inv_weight = WMULT_CONST / w; | ||
199 | } | ||
193 | 200 | ||
194 | /* | 201 | /* |
195 | * delta *= weight / lw | 202 | * delta_exec * weight / lw.weight |
203 | * OR | ||
204 | * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT | ||
205 | * | ||
206 | * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case | ||
207 | * we're guaranteed shift stays positive because inv_weight is guaranteed to | ||
208 | * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. | ||
209 | * | ||
210 | * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus | ||
211 | * weight/lw.weight <= 1, and therefore our shift will also be positive. | ||
196 | */ | 212 | */ |
197 | static unsigned long | 213 | static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) |
198 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | ||
199 | struct load_weight *lw) | ||
200 | { | 214 | { |
201 | u64 tmp; | 215 | u64 fact = scale_load_down(weight); |
202 | 216 | int shift = WMULT_SHIFT; | |
203 | /* | ||
204 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
205 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
206 | * 2^SCHED_LOAD_RESOLUTION. | ||
207 | */ | ||
208 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
209 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
210 | else | ||
211 | tmp = (u64)delta_exec; | ||
212 | 217 | ||
213 | if (!lw->inv_weight) { | 218 | __update_inv_weight(lw); |
214 | unsigned long w = scale_load_down(lw->weight); | ||
215 | 219 | ||
216 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | 220 | if (unlikely(fact >> 32)) { |
217 | lw->inv_weight = 1; | 221 | while (fact >> 32) { |
218 | else if (unlikely(!w)) | 222 | fact >>= 1; |
219 | lw->inv_weight = WMULT_CONST; | 223 | shift--; |
220 | else | 224 | } |
221 | lw->inv_weight = WMULT_CONST / w; | ||
222 | } | 225 | } |
223 | 226 | ||
224 | /* | 227 | /* hint to use a 32x32->64 mul */ |
225 | * Check whether we'd overflow the 64-bit multiplication: | 228 | fact = (u64)(u32)fact * lw->inv_weight; |
226 | */ | 229 | |
227 | if (unlikely(tmp > WMULT_CONST)) | 230 | while (fact >> 32) { |
228 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, | 231 | fact >>= 1; |
229 | WMULT_SHIFT/2); | 232 | shift--; |
230 | else | 233 | } |
231 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); | ||
232 | 234 | ||
233 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 235 | return mul_u64_u32_shr(delta_exec, fact, shift); |
234 | } | 236 | } |
235 | 237 | ||
236 | 238 | ||
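The rewritten __calc_delta() computes delta_exec * weight / lw->weight with a 32.32 fixed-point reciprocal: inv_weight approximates 2^32 / lw->weight, the weight factor is shrunk until it fits 32 bits (adjusting the final shift), and mul_u64_u32_shr() performs the widening multiply. A minimal userspace sketch of the same arithmetic, with the kernel helper open-coded and the zero/oversized-weight special cases of __update_inv_weight() dropped; the sample numbers are assumed:

    #include <stdint.h>
    #include <stdio.h>

    #define WMULT_CONST (~0U)
    #define WMULT_SHIFT 32

    /* open-coded stand-in for the kernel's mul_u64_u32_shr() */
    static uint64_t mul_u64_u32_shr(uint64_t a, uint32_t mul, unsigned int shift)
    {
        return (uint64_t)(((__uint128_t)a * mul) >> shift);
    }

    /* delta_exec * weight / lw_weight via a precomputed ~2^32 / lw_weight reciprocal */
    static uint64_t calc_delta(uint64_t delta_exec, unsigned long weight,
                               unsigned long lw_weight)
    {
        uint64_t fact = weight;
        uint32_t inv_weight = WMULT_CONST / lw_weight;
        int shift = WMULT_SHIFT;

        while (fact >> 32) {            /* keep the factor within 32 bits */
            fact >>= 1;
            shift--;
        }
        fact = (uint64_t)(uint32_t)fact * inv_weight;
        while (fact >> 32) {
            fact >>= 1;
            shift--;
        }
        return mul_u64_u32_shr(delta_exec, (uint32_t)fact, shift);
    }

    int main(void)
    {
        /* weight 1024 against a runqueue weight of 3072: expect about a third */
        printf("%llu\n", (unsigned long long)calc_delta(3000000, 1024, 3072));
        return 0;
    }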
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) | |||
443 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 445 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
444 | 446 | ||
445 | static __always_inline | 447 | static __always_inline |
446 | void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); | 448 | void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); |
447 | 449 | ||
448 | /************************************************************** | 450 | /************************************************************** |
449 | * Scheduling class tree data structure manipulation methods: | 451 | * Scheduling class tree data structure manipulation methods: |
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write, | |||
612 | /* | 614 | /* |
613 | * delta /= w | 615 | * delta /= w |
614 | */ | 616 | */ |
615 | static inline unsigned long | 617 | static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) |
616 | calc_delta_fair(unsigned long delta, struct sched_entity *se) | ||
617 | { | 618 | { |
618 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 619 | if (unlikely(se->load.weight != NICE_0_LOAD)) |
619 | delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); | 620 | delta = __calc_delta(delta, NICE_0_LOAD, &se->load); |
620 | 621 | ||
621 | return delta; | 622 | return delta; |
622 | } | 623 | } |
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
665 | update_load_add(&lw, se->load.weight); | 666 | update_load_add(&lw, se->load.weight); |
666 | load = &lw; | 667 | load = &lw; |
667 | } | 668 | } |
668 | slice = calc_delta_mine(slice, se->load.weight, load); | 669 | slice = __calc_delta(slice, se->load.weight, load); |
669 | } | 670 | } |
670 | return slice; | 671 | return slice; |
671 | } | 672 | } |
@@ -681,6 +682,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
681 | } | 682 | } |
682 | 683 | ||
683 | #ifdef CONFIG_SMP | 684 | #ifdef CONFIG_SMP |
685 | static unsigned long task_h_load(struct task_struct *p); | ||
686 | |||
684 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 687 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
685 | 688 | ||
686 | /* Give new task start runnable values to heavy its load in infant time */ | 689 | /* Give new task start runnable values to heavy its load in infant time */ |
@@ -701,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p) | |||
701 | #endif | 704 | #endif |
702 | 705 | ||
703 | /* | 706 | /* |
704 | * Update the current task's runtime statistics. Skip current tasks that | 707 | * Update the current task's runtime statistics. |
705 | * are not in our scheduling class. | ||
706 | */ | 708 | */ |
707 | static inline void | ||
708 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | ||
709 | unsigned long delta_exec) | ||
710 | { | ||
711 | unsigned long delta_exec_weighted; | ||
712 | |||
713 | schedstat_set(curr->statistics.exec_max, | ||
714 | max((u64)delta_exec, curr->statistics.exec_max)); | ||
715 | |||
716 | curr->sum_exec_runtime += delta_exec; | ||
717 | schedstat_add(cfs_rq, exec_clock, delta_exec); | ||
718 | delta_exec_weighted = calc_delta_fair(delta_exec, curr); | ||
719 | |||
720 | curr->vruntime += delta_exec_weighted; | ||
721 | update_min_vruntime(cfs_rq); | ||
722 | } | ||
723 | |||
724 | static void update_curr(struct cfs_rq *cfs_rq) | 709 | static void update_curr(struct cfs_rq *cfs_rq) |
725 | { | 710 | { |
726 | struct sched_entity *curr = cfs_rq->curr; | 711 | struct sched_entity *curr = cfs_rq->curr; |
727 | u64 now = rq_clock_task(rq_of(cfs_rq)); | 712 | u64 now = rq_clock_task(rq_of(cfs_rq)); |
728 | unsigned long delta_exec; | 713 | u64 delta_exec; |
729 | 714 | ||
730 | if (unlikely(!curr)) | 715 | if (unlikely(!curr)) |
731 | return; | 716 | return; |
732 | 717 | ||
733 | /* | 718 | delta_exec = now - curr->exec_start; |
734 | * Get the amount of time the current task was running | 719 | if (unlikely((s64)delta_exec <= 0)) |
735 | * since the last time we changed load (this cannot | ||
736 | * overflow on 32 bits): | ||
737 | */ | ||
738 | delta_exec = (unsigned long)(now - curr->exec_start); | ||
739 | if (!delta_exec) | ||
740 | return; | 720 | return; |
741 | 721 | ||
742 | __update_curr(cfs_rq, curr, delta_exec); | ||
743 | curr->exec_start = now; | 722 | curr->exec_start = now; |
744 | 723 | ||
724 | schedstat_set(curr->statistics.exec_max, | ||
725 | max(delta_exec, curr->statistics.exec_max)); | ||
726 | |||
727 | curr->sum_exec_runtime += delta_exec; | ||
728 | schedstat_add(cfs_rq, exec_clock, delta_exec); | ||
729 | |||
730 | curr->vruntime += calc_delta_fair(delta_exec, curr); | ||
731 | update_min_vruntime(cfs_rq); | ||
732 | |||
745 | if (entity_is_task(curr)) { | 733 | if (entity_is_task(curr)) { |
746 | struct task_struct *curtask = task_of(curr); | 734 | struct task_struct *curtask = task_of(curr); |
747 | 735 | ||
@@ -818,11 +806,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
818 | 806 | ||
819 | #ifdef CONFIG_NUMA_BALANCING | 807 | #ifdef CONFIG_NUMA_BALANCING |
820 | /* | 808 | /* |
821 | * numa task sample period in ms | 809 | * Approximate time to scan a full NUMA task in ms. The task scan period is |
810 | * calculated based on the task's virtual memory size and | ||
811 | * numa_balancing_scan_size. | ||
822 | */ | 812 | */ |
823 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | 813 | unsigned int sysctl_numa_balancing_scan_period_min = 1000; |
824 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; | 814 | unsigned int sysctl_numa_balancing_scan_period_max = 60000; |
825 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | ||
826 | 815 | ||
827 | /* Portion of address space to scan in MB */ | 816 | /* Portion of address space to scan in MB */ |
828 | unsigned int sysctl_numa_balancing_scan_size = 256; | 817 | unsigned int sysctl_numa_balancing_scan_size = 256; |
@@ -830,41 +819,835 @@ unsigned int sysctl_numa_balancing_scan_size = 256; | |||
830 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | 819 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ |
831 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | 820 | unsigned int sysctl_numa_balancing_scan_delay = 1000; |
832 | 821 | ||
833 | static void task_numa_placement(struct task_struct *p) | 822 | /* |
823 | * After skipping a page migration on a shared page, skip N more numa page | ||
824 | * migrations unconditionally. This reduces the number of NUMA migrations | ||
825 | * in shared memory workloads, and has the effect of pulling tasks towards | ||
826 | * where their memory lives, over pulling the memory towards the task. | ||
827 | */ | ||
828 | unsigned int sysctl_numa_balancing_migrate_deferred = 16; | ||
829 | |||
830 | static unsigned int task_nr_scan_windows(struct task_struct *p) | ||
831 | { | ||
832 | unsigned long rss = 0; | ||
833 | unsigned long nr_scan_pages; | ||
834 | |||
835 | /* | ||
836 | * Calculations based on RSS as non-present and empty pages are skipped | ||
837 | * by the PTE scanner and NUMA hinting faults should be trapped based | ||
838 | * on resident pages | ||
839 | */ | ||
840 | nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); | ||
841 | rss = get_mm_rss(p->mm); | ||
842 | if (!rss) | ||
843 | rss = nr_scan_pages; | ||
844 | |||
845 | rss = round_up(rss, nr_scan_pages); | ||
846 | return rss / nr_scan_pages; | ||
847 | } | ||
848 | |||
849 | /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ | ||
850 | #define MAX_SCAN_WINDOW 2560 | ||
851 | |||
852 | static unsigned int task_scan_min(struct task_struct *p) | ||
853 | { | ||
854 | unsigned int scan, floor; | ||
855 | unsigned int windows = 1; | ||
856 | |||
857 | if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) | ||
858 | windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; | ||
859 | floor = 1000 / windows; | ||
860 | |||
861 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); | ||
862 | return max_t(unsigned int, floor, scan); | ||
863 | } | ||
864 | |||
865 | static unsigned int task_scan_max(struct task_struct *p) | ||
866 | { | ||
867 | unsigned int smin = task_scan_min(p); | ||
868 | unsigned int smax; | ||
869 | |||
870 | /* Watch for min being lower than max due to floor calculations */ | ||
871 | smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); | ||
872 | return max(smin, smax); | ||
873 | } | ||
874 | |||
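task_nr_scan_windows() sizes the scan effort by the task's resident set: RSS divided by the pages covered per scan window, so the base 1000 ms / 60000 ms periods stretch for large tasks while the MAX_SCAN_WINDOW floor caps the overall scan rate. A rough standalone sketch of that arithmetic, assuming 4 KiB pages and a 1 GiB RSS:

    #include <stdio.h>

    #define PAGE_SHIFT      12        /* assumed 4 KiB pages */
    #define MAX_SCAN_WINDOW 2560      /* MB/sec, as in the hunk above */

    static unsigned int scan_size = 256;            /* sysctl_numa_balancing_scan_size */
    static unsigned int scan_period_min = 1000;     /* ms */
    static unsigned int scan_period_max = 60000;    /* ms */

    static unsigned int nr_scan_windows(unsigned long rss_pages)
    {
        unsigned long nr_scan_pages = (unsigned long)scan_size << (20 - PAGE_SHIFT);

        if (!rss_pages)
            rss_pages = nr_scan_pages;
        /* round RSS up to a whole number of windows */
        return (rss_pages + nr_scan_pages - 1) / nr_scan_pages;
    }

    int main(void)
    {
        unsigned long rss_pages = 1UL << 18;        /* 1 GiB resident */
        unsigned int windows = nr_scan_windows(rss_pages);
        unsigned int floor = 1000 / (MAX_SCAN_WINDOW / scan_size);
        unsigned int smin = scan_period_min / windows;
        unsigned int smax = scan_period_max / windows;

        /* task_scan_min() == max(floor, smin); task_scan_max() == max(smin, smax) */
        printf("windows=%u floor=%u min=%u max=%u (ms)\n",
               windows, floor, smin > floor ? smin : floor,
               smax > smin ? smax : smin);
        return 0;
    }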
875 | /* | ||
876 | * Once a preferred node is selected the scheduler balancer will prefer moving | ||
877 | * a task to that node for sysctl_numa_balancing_settle_count number of PTE | ||
878 | * scans. This will give the process the chance to accumulate more faults on | ||
879 | * the preferred node but still allow the scheduler to move the task again if | ||
880 | * the node's CPUs are overloaded. | ||
881 | */ | ||
882 | unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; | ||
883 | |||
884 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
885 | { | ||
886 | rq->nr_numa_running += (p->numa_preferred_nid != -1); | ||
887 | rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); | ||
888 | } | ||
889 | |||
890 | static void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
891 | { | ||
892 | rq->nr_numa_running -= (p->numa_preferred_nid != -1); | ||
893 | rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); | ||
894 | } | ||
895 | |||
896 | struct numa_group { | ||
897 | atomic_t refcount; | ||
898 | |||
899 | spinlock_t lock; /* nr_tasks, tasks */ | ||
900 | int nr_tasks; | ||
901 | pid_t gid; | ||
902 | struct list_head task_list; | ||
903 | |||
904 | struct rcu_head rcu; | ||
905 | unsigned long total_faults; | ||
906 | unsigned long faults[0]; | ||
907 | }; | ||
908 | |||
909 | pid_t task_numa_group_id(struct task_struct *p) | ||
910 | { | ||
911 | return p->numa_group ? p->numa_group->gid : 0; | ||
912 | } | ||
913 | |||
914 | static inline int task_faults_idx(int nid, int priv) | ||
915 | { | ||
916 | return 2 * nid + priv; | ||
917 | } | ||
918 | |||
919 | static inline unsigned long task_faults(struct task_struct *p, int nid) | ||
920 | { | ||
921 | if (!p->numa_faults) | ||
922 | return 0; | ||
923 | |||
924 | return p->numa_faults[task_faults_idx(nid, 0)] + | ||
925 | p->numa_faults[task_faults_idx(nid, 1)]; | ||
926 | } | ||
927 | |||
928 | static inline unsigned long group_faults(struct task_struct *p, int nid) | ||
929 | { | ||
930 | if (!p->numa_group) | ||
931 | return 0; | ||
932 | |||
933 | return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; | ||
934 | } | ||
935 | |||
936 | /* | ||
937 | * These return the fraction of accesses done by a particular task, or | ||
938 | * task group, on a particular numa node. The group weight is given a | ||
939 | * larger multiplier, in order to group tasks together that are almost | ||
940 | * evenly spread out between numa nodes. | ||
941 | */ | ||
942 | static inline unsigned long task_weight(struct task_struct *p, int nid) | ||
943 | { | ||
944 | unsigned long total_faults; | ||
945 | |||
946 | if (!p->numa_faults) | ||
947 | return 0; | ||
948 | |||
949 | total_faults = p->total_numa_faults; | ||
950 | |||
951 | if (!total_faults) | ||
952 | return 0; | ||
953 | |||
954 | return 1000 * task_faults(p, nid) / total_faults; | ||
955 | } | ||
956 | |||
957 | static inline unsigned long group_weight(struct task_struct *p, int nid) | ||
958 | { | ||
959 | if (!p->numa_group || !p->numa_group->total_faults) | ||
960 | return 0; | ||
961 | |||
962 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; | ||
963 | } | ||
964 | |||
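task_weight() and group_weight() reduce the fault counters to a per-mille share of all recorded accesses that landed on a given node, which is what the placement and migration code compares between nodes. A trivial sketch of the calculation with assumed counts:

    #include <stdio.h>

    /* per-mille share of recorded NUMA faults that hit one node */
    static unsigned long weight(unsigned long node_faults, unsigned long total_faults)
    {
        return total_faults ? 1000 * node_faults / total_faults : 0;
    }

    int main(void)
    {
        /* assumed sample: 300 of 400 recorded faults were on node 1 */
        printf("weight(node1) = %lu\n", weight(300, 400));   /* 750 */
        return 0;
    }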
965 | static unsigned long weighted_cpuload(const int cpu); | ||
966 | static unsigned long source_load(int cpu, int type); | ||
967 | static unsigned long target_load(int cpu, int type); | ||
968 | static unsigned long power_of(int cpu); | ||
969 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg); | ||
970 | |||
971 | /* Cached statistics for all CPUs within a node */ | ||
972 | struct numa_stats { | ||
973 | unsigned long nr_running; | ||
974 | unsigned long load; | ||
975 | |||
976 | /* Total compute capacity of CPUs on a node */ | ||
977 | unsigned long power; | ||
978 | |||
979 | /* Approximate capacity in terms of runnable tasks on a node */ | ||
980 | unsigned long capacity; | ||
981 | int has_capacity; | ||
982 | }; | ||
983 | |||
984 | /* | ||
985 | * XXX borrowed from update_sg_lb_stats | ||
986 | */ | ||
987 | static void update_numa_stats(struct numa_stats *ns, int nid) | ||
988 | { | ||
989 | int cpu, cpus = 0; | ||
990 | |||
991 | memset(ns, 0, sizeof(*ns)); | ||
992 | for_each_cpu(cpu, cpumask_of_node(nid)) { | ||
993 | struct rq *rq = cpu_rq(cpu); | ||
994 | |||
995 | ns->nr_running += rq->nr_running; | ||
996 | ns->load += weighted_cpuload(cpu); | ||
997 | ns->power += power_of(cpu); | ||
998 | |||
999 | cpus++; | ||
1000 | } | ||
1001 | |||
1002 | /* | ||
1003 | * If we raced with hotplug and there are no CPUs left in our mask | ||
1004 | * the @ns structure is NULL'ed and task_numa_compare() will | ||
1005 | * not find this node attractive. | ||
1006 | * | ||
1007 | * We'll either bail at !has_capacity, or we'll detect a huge imbalance | ||
1008 | * and bail there. | ||
1009 | */ | ||
1010 | if (!cpus) | ||
1011 | return; | ||
1012 | |||
1013 | ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; | ||
1014 | ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); | ||
1015 | ns->has_capacity = (ns->nr_running < ns->capacity); | ||
1016 | } | ||
1017 | |||
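update_numa_stats() condenses a node into a load figure scaled by its compute power plus a task capacity derived from that power, and has_capacity records whether another runnable task would still fit. A standalone sketch of the same arithmetic, assuming SCHED_POWER_SCALE is 1024 and a four-CPU node:

    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024UL    /* one full CPU worth of compute */

    int main(void)
    {
        /* assumed node: four full-power CPUs, three runnable tasks */
        unsigned long nr_running = 3;
        unsigned long power = 4 * SCHED_POWER_SCALE;
        unsigned long load = 3 * 1024;              /* summed weighted_cpuload() */

        unsigned long scaled_load = load * SCHED_POWER_SCALE / power;
        unsigned long capacity = (power + SCHED_POWER_SCALE / 2) / SCHED_POWER_SCALE;
        int has_capacity = nr_running < capacity;

        printf("load=%lu capacity=%lu has_capacity=%d\n",
               scaled_load, capacity, has_capacity);
        return 0;
    }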
1018 | struct task_numa_env { | ||
1019 | struct task_struct *p; | ||
1020 | |||
1021 | int src_cpu, src_nid; | ||
1022 | int dst_cpu, dst_nid; | ||
1023 | |||
1024 | struct numa_stats src_stats, dst_stats; | ||
1025 | |||
1026 | int imbalance_pct, idx; | ||
1027 | |||
1028 | struct task_struct *best_task; | ||
1029 | long best_imp; | ||
1030 | int best_cpu; | ||
1031 | }; | ||
1032 | |||
1033 | static void task_numa_assign(struct task_numa_env *env, | ||
1034 | struct task_struct *p, long imp) | ||
1035 | { | ||
1036 | if (env->best_task) | ||
1037 | put_task_struct(env->best_task); | ||
1038 | if (p) | ||
1039 | get_task_struct(p); | ||
1040 | |||
1041 | env->best_task = p; | ||
1042 | env->best_imp = imp; | ||
1043 | env->best_cpu = env->dst_cpu; | ||
1044 | } | ||
1045 | |||
1046 | /* | ||
1047 | * This checks if the overall compute and NUMA accesses of the system would | ||
1048 | * be improved if the source task was migrated to the target dst_cpu, taking | ||
1049 | * into account that it might be best if the task running on the dst_cpu is | ||
1050 | * exchanged with the source task. | ||
1051 | */ | ||
1052 | static void task_numa_compare(struct task_numa_env *env, | ||
1053 | long taskimp, long groupimp) | ||
1054 | { | ||
1055 | struct rq *src_rq = cpu_rq(env->src_cpu); | ||
1056 | struct rq *dst_rq = cpu_rq(env->dst_cpu); | ||
1057 | struct task_struct *cur; | ||
1058 | long dst_load, src_load; | ||
1059 | long load; | ||
1060 | long imp = (groupimp > 0) ? groupimp : taskimp; | ||
1061 | |||
1062 | rcu_read_lock(); | ||
1063 | cur = ACCESS_ONCE(dst_rq->curr); | ||
1064 | if (cur->pid == 0) /* idle */ | ||
1065 | cur = NULL; | ||
1066 | |||
1067 | /* | ||
1068 | * "imp" is the fault differential for the source task between the | ||
1069 | * source and destination node. Calculate the total differential for | ||
1070 | * the source task and potential destination task. The more negative | ||
1071 | * the value is, the more remote accesses that would be expected to | ||
1072 | * be incurred if the tasks were swapped. | ||
1073 | */ | ||
1074 | if (cur) { | ||
1075 | /* Skip this swap candidate if cannot move to the source cpu */ | ||
1076 | if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) | ||
1077 | goto unlock; | ||
1078 | |||
1079 | /* | ||
1080 | * If dst and source tasks are in the same NUMA group, or not | ||
1081 | * in any group then look only at task weights. | ||
1082 | */ | ||
1083 | if (cur->numa_group == env->p->numa_group) { | ||
1084 | imp = taskimp + task_weight(cur, env->src_nid) - | ||
1085 | task_weight(cur, env->dst_nid); | ||
1086 | /* | ||
1087 | * Add some hysteresis to prevent swapping the | ||
1088 | * tasks within a group over tiny differences. | ||
1089 | */ | ||
1090 | if (cur->numa_group) | ||
1091 | imp -= imp/16; | ||
1092 | } else { | ||
1093 | /* | ||
1094 | * Compare the group weights. If a task is all by | ||
1095 | * itself (not part of a group), use the task weight | ||
1096 | * instead. | ||
1097 | */ | ||
1098 | if (env->p->numa_group) | ||
1099 | imp = groupimp; | ||
1100 | else | ||
1101 | imp = taskimp; | ||
1102 | |||
1103 | if (cur->numa_group) | ||
1104 | imp += group_weight(cur, env->src_nid) - | ||
1105 | group_weight(cur, env->dst_nid); | ||
1106 | else | ||
1107 | imp += task_weight(cur, env->src_nid) - | ||
1108 | task_weight(cur, env->dst_nid); | ||
1109 | } | ||
1110 | } | ||
1111 | |||
1112 | if (imp < env->best_imp) | ||
1113 | goto unlock; | ||
1114 | |||
1115 | if (!cur) { | ||
1116 | /* Is there capacity at our destination? */ | ||
1117 | if (env->src_stats.has_capacity && | ||
1118 | !env->dst_stats.has_capacity) | ||
1119 | goto unlock; | ||
1120 | |||
1121 | goto balance; | ||
1122 | } | ||
1123 | |||
1124 | /* Balance doesn't matter much if we're running a task per cpu */ | ||
1125 | if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) | ||
1126 | goto assign; | ||
1127 | |||
1128 | /* | ||
1129 | * In the overloaded case, try and keep the load balanced. | ||
1130 | */ | ||
1131 | balance: | ||
1132 | dst_load = env->dst_stats.load; | ||
1133 | src_load = env->src_stats.load; | ||
1134 | |||
1135 | /* XXX missing power terms */ | ||
1136 | load = task_h_load(env->p); | ||
1137 | dst_load += load; | ||
1138 | src_load -= load; | ||
1139 | |||
1140 | if (cur) { | ||
1141 | load = task_h_load(cur); | ||
1142 | dst_load -= load; | ||
1143 | src_load += load; | ||
1144 | } | ||
1145 | |||
1146 | /* make src_load the smaller */ | ||
1147 | if (dst_load < src_load) | ||
1148 | swap(dst_load, src_load); | ||
1149 | |||
1150 | if (src_load * env->imbalance_pct < dst_load * 100) | ||
1151 | goto unlock; | ||
1152 | |||
1153 | assign: | ||
1154 | task_numa_assign(env, cur, imp); | ||
1155 | unlock: | ||
1156 | rcu_read_unlock(); | ||
1157 | } | ||
1158 | |||
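The balance gate at the end of task_numa_compare() swaps the projected loads so that src_load is the smaller, then rejects the move unless src_load * imbalance_pct still covers dst_load * 100, i.e. the resulting imbalance stays within the percentage slack. A small sketch of that check with assumed load values:

    #include <stdio.h>

    /* 1 if moving/swapping would leave the two nodes acceptably balanced */
    static int balanced(long src_load, long dst_load, int imbalance_pct)
    {
        if (dst_load < src_load) {      /* make src_load the smaller */
            long tmp = dst_load;
            dst_load = src_load;
            src_load = tmp;
        }
        return src_load * imbalance_pct >= dst_load * 100;
    }

    int main(void)
    {
        /* assumed loads after adding the task to dst and removing it from src */
        printf("%d\n", balanced(1000, 1100, 112));  /* 1: within the 12% slack */
        printf("%d\n", balanced(1000, 1300, 112));  /* 0: too imbalanced */
        return 0;
    }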
1159 | static void task_numa_find_cpu(struct task_numa_env *env, | ||
1160 | long taskimp, long groupimp) | ||
834 | { | 1161 | { |
835 | int seq; | 1162 | int cpu; |
1163 | |||
1164 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { | ||
1165 | /* Skip this CPU if the source task cannot migrate */ | ||
1166 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) | ||
1167 | continue; | ||
1168 | |||
1169 | env->dst_cpu = cpu; | ||
1170 | task_numa_compare(env, taskimp, groupimp); | ||
1171 | } | ||
1172 | } | ||
1173 | |||
1174 | static int task_numa_migrate(struct task_struct *p) | ||
1175 | { | ||
1176 | struct task_numa_env env = { | ||
1177 | .p = p, | ||
1178 | |||
1179 | .src_cpu = task_cpu(p), | ||
1180 | .src_nid = task_node(p), | ||
1181 | |||
1182 | .imbalance_pct = 112, | ||
1183 | |||
1184 | .best_task = NULL, | ||
1185 | .best_imp = 0, | ||
1186 | .best_cpu = -1 | ||
1187 | }; | ||
1188 | struct sched_domain *sd; | ||
1189 | unsigned long taskweight, groupweight; | ||
1190 | int nid, ret; | ||
1191 | long taskimp, groupimp; | ||
1192 | |||
1193 | /* | ||
1194 | * Pick the lowest SD_NUMA domain, as that would have the smallest | ||
1195 | * imbalance and would be the first to start moving tasks about. | ||
1196 | * | ||
1197 | * And we want to avoid any moving of tasks about, as that would create | ||
1198 | * random movement of tasks -- countering the numa conditions we're trying | ||
1199 | * to satisfy here. | ||
1200 | */ | ||
1201 | rcu_read_lock(); | ||
1202 | sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); | ||
1203 | if (sd) | ||
1204 | env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; | ||
1205 | rcu_read_unlock(); | ||
1206 | |||
1207 | /* | ||
1208 | * Cpusets can break the scheduler domain tree into smaller | ||
1209 | * balance domains, some of which do not cross NUMA boundaries. | ||
1210 | * Tasks that are "trapped" in such domains cannot be migrated | ||
1211 | * elsewhere, so there is no point in (re)trying. | ||
1212 | */ | ||
1213 | if (unlikely(!sd)) { | ||
1214 | p->numa_preferred_nid = cpu_to_node(task_cpu(p)); | ||
1215 | return -EINVAL; | ||
1216 | } | ||
1217 | |||
1218 | taskweight = task_weight(p, env.src_nid); | ||
1219 | groupweight = group_weight(p, env.src_nid); | ||
1220 | update_numa_stats(&env.src_stats, env.src_nid); | ||
1221 | env.dst_nid = p->numa_preferred_nid; | ||
1222 | taskimp = task_weight(p, env.dst_nid) - taskweight; | ||
1223 | groupimp = group_weight(p, env.dst_nid) - groupweight; | ||
1224 | update_numa_stats(&env.dst_stats, env.dst_nid); | ||
1225 | |||
1226 | /* If the preferred nid has capacity, try to use it. */ | ||
1227 | if (env.dst_stats.has_capacity) | ||
1228 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
1229 | |||
1230 | /* No space available on the preferred nid. Look elsewhere. */ | ||
1231 | if (env.best_cpu == -1) { | ||
1232 | for_each_online_node(nid) { | ||
1233 | if (nid == env.src_nid || nid == p->numa_preferred_nid) | ||
1234 | continue; | ||
1235 | |||
1236 | /* Only consider nodes where both task and groups benefit */ | ||
1237 | taskimp = task_weight(p, nid) - taskweight; | ||
1238 | groupimp = group_weight(p, nid) - groupweight; | ||
1239 | if (taskimp < 0 && groupimp < 0) | ||
1240 | continue; | ||
1241 | |||
1242 | env.dst_nid = nid; | ||
1243 | update_numa_stats(&env.dst_stats, env.dst_nid); | ||
1244 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
1245 | } | ||
1246 | } | ||
1247 | |||
1248 | /* No better CPU than the current one was found. */ | ||
1249 | if (env.best_cpu == -1) | ||
1250 | return -EAGAIN; | ||
1251 | |||
1252 | sched_setnuma(p, env.dst_nid); | ||
1253 | |||
1254 | /* | ||
1255 | * Reset the scan period if the task is being rescheduled on an | ||
1256 | * alternative node to recheck if the task is now properly placed. | ||
1257 | */ | ||
1258 | p->numa_scan_period = task_scan_min(p); | ||
1259 | |||
1260 | if (env.best_task == NULL) { | ||
1261 | int ret = migrate_task_to(p, env.best_cpu); | ||
1262 | return ret; | ||
1263 | } | ||
1264 | |||
1265 | ret = migrate_swap(p, env.best_task); | ||
1266 | put_task_struct(env.best_task); | ||
1267 | return ret; | ||
1268 | } | ||
1269 | |||
1270 | /* Attempt to migrate a task to a CPU on the preferred node. */ | ||
1271 | static void numa_migrate_preferred(struct task_struct *p) | ||
1272 | { | ||
1273 | /* This task has no NUMA fault statistics yet */ | ||
1274 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | ||
1275 | return; | ||
1276 | |||
1277 | /* Periodically retry migrating the task to the preferred node */ | ||
1278 | p->numa_migrate_retry = jiffies + HZ; | ||
1279 | |||
1280 | /* Success if task is already running on preferred CPU */ | ||
1281 | if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) | ||
1282 | return; | ||
1283 | |||
1284 | /* Otherwise, try migrate to a CPU on the preferred node */ | ||
1285 | task_numa_migrate(p); | ||
1286 | } | ||
1287 | |||
1288 | /* | ||
1289 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS | ||
1290 | * increments. The more local the fault statistics are, the higher the scan | ||
1291 | * period will be for the next scan window. If local/remote ratio is below | ||
1292 | * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the | ||
1293 | * scan period will decrease | ||
1294 | */ | ||
1295 | #define NUMA_PERIOD_SLOTS 10 | ||
1296 | #define NUMA_PERIOD_THRESHOLD 3 | ||
1297 | |||
1298 | /* | ||
1299 | * Increase the scan period (slow down scanning) if the majority of | ||
1300 | * our memory is already on our local node, or if the majority of | ||
1301 | * the page accesses are shared with other processes. | ||
1302 | * Otherwise, decrease the scan period. | ||
1303 | */ | ||
1304 | static void update_task_scan_period(struct task_struct *p, | ||
1305 | unsigned long shared, unsigned long private) | ||
1306 | { | ||
1307 | unsigned int period_slot; | ||
1308 | int ratio; | ||
1309 | int diff; | ||
1310 | |||
1311 | unsigned long remote = p->numa_faults_locality[0]; | ||
1312 | unsigned long local = p->numa_faults_locality[1]; | ||
1313 | |||
1314 | /* | ||
1315 | * If there were no record hinting faults then either the task is | ||
1316 | * completely idle or all activity is areas that are not of interest | ||
1317 | * to automatic numa balancing. Scan slower | ||
1318 | */ | ||
1319 | if (local + shared == 0) { | ||
1320 | p->numa_scan_period = min(p->numa_scan_period_max, | ||
1321 | p->numa_scan_period << 1); | ||
1322 | |||
1323 | p->mm->numa_next_scan = jiffies + | ||
1324 | msecs_to_jiffies(p->numa_scan_period); | ||
836 | 1325 | ||
837 | if (!p->mm) /* for example, ksmd faulting in a user's mm */ | ||
838 | return; | 1326 | return; |
1327 | } | ||
1328 | |||
1329 | /* | ||
1330 | * Prepare to scale scan period relative to the current period. | ||
1331 | * == NUMA_PERIOD_THRESHOLD scan period stays the same | ||
1332 | * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) | ||
1333 | * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) | ||
1334 | */ | ||
1335 | period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); | ||
1336 | ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); | ||
1337 | if (ratio >= NUMA_PERIOD_THRESHOLD) { | ||
1338 | int slot = ratio - NUMA_PERIOD_THRESHOLD; | ||
1339 | if (!slot) | ||
1340 | slot = 1; | ||
1341 | diff = slot * period_slot; | ||
1342 | } else { | ||
1343 | diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; | ||
1344 | |||
1345 | /* | ||
1346 | * Scale scan rate increases based on sharing. There is an | ||
1347 | * inverse relationship between the degree of sharing and | ||
1348 | * the adjustment made to the scanning period. Broadly | ||
1349 | * speaking the intent is that there is little point | ||
1350 | * scanning faster if shared accesses dominate as it may | ||
1351 | * simply bounce migrations uselessly | ||
1352 | */ | ||
1353 | period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); | ||
1354 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | ||
1355 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | ||
1356 | } | ||
1357 | |||
1358 | p->numa_scan_period = clamp(p->numa_scan_period + diff, | ||
1359 | task_scan_min(p), task_scan_max(p)); | ||
1360 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | ||
1361 | } | ||
1362 | |||
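update_task_scan_period() turns the local/remote fault mix into a slot ratio out of NUMA_PERIOD_SLOTS and moves the scan period by whole period slots, scaling any speed-up by the private share so heavily shared workloads are not scanned faster to no effect. A standalone sketch of the adjustment with assumed fault counts:

    #include <stdio.h>

    #define NUMA_PERIOD_SLOTS     10
    #define NUMA_PERIOD_THRESHOLD 3

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* signed adjustment (ms) applied to the task's scan period */
    static int scan_period_diff(unsigned int period, long local, long remote,
                                long priv, long shared)
    {
        int period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
        int ratio = local * NUMA_PERIOD_SLOTS / (local + remote);
        int diff;

        if (ratio >= NUMA_PERIOD_THRESHOLD) {
            int slot = ratio - NUMA_PERIOD_THRESHOLD;
            if (!slot)
                slot = 1;
            diff = slot * period_slot;      /* mostly local: slow scanning down */
        } else {
            diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
            /* scale the speed-up by the private share of the faults */
            ratio = DIV_ROUND_UP(priv * NUMA_PERIOD_SLOTS, priv + shared);
            diff = diff * ratio / NUMA_PERIOD_SLOTS;
        }
        return diff;
    }

    int main(void)
    {
        printf("%+d ms\n", scan_period_diff(1000, 90, 10, 50, 50));  /* +600 */
        printf("%+d ms\n", scan_period_diff(1000, 10, 90, 90, 10));  /* -180 */
        return 0;
    }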
1363 | static void task_numa_placement(struct task_struct *p) | ||
1364 | { | ||
1365 | int seq, nid, max_nid = -1, max_group_nid = -1; | ||
1366 | unsigned long max_faults = 0, max_group_faults = 0; | ||
1367 | unsigned long fault_types[2] = { 0, 0 }; | ||
1368 | spinlock_t *group_lock = NULL; | ||
1369 | |||
839 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); | 1370 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); |
840 | if (p->numa_scan_seq == seq) | 1371 | if (p->numa_scan_seq == seq) |
841 | return; | 1372 | return; |
842 | p->numa_scan_seq = seq; | 1373 | p->numa_scan_seq = seq; |
1374 | p->numa_scan_period_max = task_scan_max(p); | ||
1375 | |||
1376 | /* If the task is part of a group prevent parallel updates to group stats */ | ||
1377 | if (p->numa_group) { | ||
1378 | group_lock = &p->numa_group->lock; | ||
1379 | spin_lock(group_lock); | ||
1380 | } | ||
1381 | |||
1382 | /* Find the node with the highest number of faults */ | ||
1383 | for_each_online_node(nid) { | ||
1384 | unsigned long faults = 0, group_faults = 0; | ||
1385 | int priv, i; | ||
1386 | |||
1387 | for (priv = 0; priv < 2; priv++) { | ||
1388 | long diff; | ||
1389 | |||
1390 | i = task_faults_idx(nid, priv); | ||
1391 | diff = -p->numa_faults[i]; | ||
1392 | |||
1393 | /* Decay existing window, copy faults since last scan */ | ||
1394 | p->numa_faults[i] >>= 1; | ||
1395 | p->numa_faults[i] += p->numa_faults_buffer[i]; | ||
1396 | fault_types[priv] += p->numa_faults_buffer[i]; | ||
1397 | p->numa_faults_buffer[i] = 0; | ||
1398 | |||
1399 | faults += p->numa_faults[i]; | ||
1400 | diff += p->numa_faults[i]; | ||
1401 | p->total_numa_faults += diff; | ||
1402 | if (p->numa_group) { | ||
1403 | /* safe because we can only change our own group */ | ||
1404 | p->numa_group->faults[i] += diff; | ||
1405 | p->numa_group->total_faults += diff; | ||
1406 | group_faults += p->numa_group->faults[i]; | ||
1407 | } | ||
1408 | } | ||
843 | 1409 | ||
844 | /* FIXME: Scheduling placement policy hints go here */ | 1410 | if (faults > max_faults) { |
1411 | max_faults = faults; | ||
1412 | max_nid = nid; | ||
1413 | } | ||
1414 | |||
1415 | if (group_faults > max_group_faults) { | ||
1416 | max_group_faults = group_faults; | ||
1417 | max_group_nid = nid; | ||
1418 | } | ||
1419 | } | ||
1420 | |||
1421 | update_task_scan_period(p, fault_types[0], fault_types[1]); | ||
1422 | |||
1423 | if (p->numa_group) { | ||
1424 | /* | ||
1425 | * If the preferred task and group nids are different, | ||
1426 | * iterate over the nodes again to find the best place. | ||
1427 | */ | ||
1428 | if (max_nid != max_group_nid) { | ||
1429 | unsigned long weight, max_weight = 0; | ||
1430 | |||
1431 | for_each_online_node(nid) { | ||
1432 | weight = task_weight(p, nid) + group_weight(p, nid); | ||
1433 | if (weight > max_weight) { | ||
1434 | max_weight = weight; | ||
1435 | max_nid = nid; | ||
1436 | } | ||
1437 | } | ||
1438 | } | ||
1439 | |||
1440 | spin_unlock(group_lock); | ||
1441 | } | ||
1442 | |||
1443 | /* Preferred node as the node with the most faults */ | ||
1444 | if (max_faults && max_nid != p->numa_preferred_nid) { | ||
1445 | /* Update the preferred nid and migrate task if possible */ | ||
1446 | sched_setnuma(p, max_nid); | ||
1447 | numa_migrate_preferred(p); | ||
1448 | } | ||
1449 | } | ||
1450 | |||
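Inside task_numa_placement() every per-node counter is aged by halving the previous value and folding in the faults buffered since the last scan; the signed difference is what gets propagated into the numa_group totals. A tiny sketch of that decay step with assumed counts:

    #include <stdio.h>

    int main(void)
    {
        /* assumed: previous decayed count, plus faults from the latest scan window */
        unsigned long numa_faults = 48, numa_faults_buffer = 20;

        long diff = -(long)numa_faults;
        numa_faults >>= 1;                      /* decay the old window */
        numa_faults += numa_faults_buffer;      /* fold in the new one */
        numa_faults_buffer = 0;
        diff += numa_faults;                    /* signed change fed into group totals */

        printf("faults=%lu diff=%ld\n", numa_faults, diff);  /* faults=44 diff=-4 */
        return 0;
    }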
1451 | static inline int get_numa_group(struct numa_group *grp) | ||
1452 | { | ||
1453 | return atomic_inc_not_zero(&grp->refcount); | ||
1454 | } | ||
1455 | |||
1456 | static inline void put_numa_group(struct numa_group *grp) | ||
1457 | { | ||
1458 | if (atomic_dec_and_test(&grp->refcount)) | ||
1459 | kfree_rcu(grp, rcu); | ||
1460 | } | ||
1461 | |||
1462 | static void task_numa_group(struct task_struct *p, int cpupid, int flags, | ||
1463 | int *priv) | ||
1464 | { | ||
1465 | struct numa_group *grp, *my_grp; | ||
1466 | struct task_struct *tsk; | ||
1467 | bool join = false; | ||
1468 | int cpu = cpupid_to_cpu(cpupid); | ||
1469 | int i; | ||
1470 | |||
1471 | if (unlikely(!p->numa_group)) { | ||
1472 | unsigned int size = sizeof(struct numa_group) + | ||
1473 | 2*nr_node_ids*sizeof(unsigned long); | ||
1474 | |||
1475 | grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); | ||
1476 | if (!grp) | ||
1477 | return; | ||
1478 | |||
1479 | atomic_set(&grp->refcount, 1); | ||
1480 | spin_lock_init(&grp->lock); | ||
1481 | INIT_LIST_HEAD(&grp->task_list); | ||
1482 | grp->gid = p->pid; | ||
1483 | |||
1484 | for (i = 0; i < 2*nr_node_ids; i++) | ||
1485 | grp->faults[i] = p->numa_faults[i]; | ||
1486 | |||
1487 | grp->total_faults = p->total_numa_faults; | ||
1488 | |||
1489 | list_add(&p->numa_entry, &grp->task_list); | ||
1490 | grp->nr_tasks++; | ||
1491 | rcu_assign_pointer(p->numa_group, grp); | ||
1492 | } | ||
1493 | |||
1494 | rcu_read_lock(); | ||
1495 | tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); | ||
1496 | |||
1497 | if (!cpupid_match_pid(tsk, cpupid)) | ||
1498 | goto no_join; | ||
1499 | |||
1500 | grp = rcu_dereference(tsk->numa_group); | ||
1501 | if (!grp) | ||
1502 | goto no_join; | ||
1503 | |||
1504 | my_grp = p->numa_group; | ||
1505 | if (grp == my_grp) | ||
1506 | goto no_join; | ||
1507 | |||
1508 | /* | ||
1509 | * Only join the other group if it's bigger; if we're the bigger group, | ||
1510 | * the other task will join us. | ||
1511 | */ | ||
1512 | if (my_grp->nr_tasks > grp->nr_tasks) | ||
1513 | goto no_join; | ||
1514 | |||
1515 | /* | ||
1516 | * Tie-break on the grp address. | ||
1517 | */ | ||
1518 | if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) | ||
1519 | goto no_join; | ||
1520 | |||
1521 | /* Always join threads in the same process. */ | ||
1522 | if (tsk->mm == current->mm) | ||
1523 | join = true; | ||
1524 | |||
1525 | /* Simple filter to avoid false positives due to PID collisions */ | ||
1526 | if (flags & TNF_SHARED) | ||
1527 | join = true; | ||
1528 | |||
1529 | /* Update priv based on whether false sharing was detected */ | ||
1530 | *priv = !join; | ||
1531 | |||
1532 | if (join && !get_numa_group(grp)) | ||
1533 | goto no_join; | ||
1534 | |||
1535 | rcu_read_unlock(); | ||
1536 | |||
1537 | if (!join) | ||
1538 | return; | ||
1539 | |||
1540 | double_lock(&my_grp->lock, &grp->lock); | ||
1541 | |||
1542 | for (i = 0; i < 2*nr_node_ids; i++) { | ||
1543 | my_grp->faults[i] -= p->numa_faults[i]; | ||
1544 | grp->faults[i] += p->numa_faults[i]; | ||
1545 | } | ||
1546 | my_grp->total_faults -= p->total_numa_faults; | ||
1547 | grp->total_faults += p->total_numa_faults; | ||
1548 | |||
1549 | list_move(&p->numa_entry, &grp->task_list); | ||
1550 | my_grp->nr_tasks--; | ||
1551 | grp->nr_tasks++; | ||
1552 | |||
1553 | spin_unlock(&my_grp->lock); | ||
1554 | spin_unlock(&grp->lock); | ||
1555 | |||
1556 | rcu_assign_pointer(p->numa_group, grp); | ||
1557 | |||
1558 | put_numa_group(my_grp); | ||
1559 | return; | ||
1560 | |||
1561 | no_join: | ||
1562 | rcu_read_unlock(); | ||
1563 | return; | ||
1564 | } | ||
1565 | |||
1566 | void task_numa_free(struct task_struct *p) | ||
1567 | { | ||
1568 | struct numa_group *grp = p->numa_group; | ||
1569 | int i; | ||
1570 | void *numa_faults = p->numa_faults; | ||
1571 | |||
1572 | if (grp) { | ||
1573 | spin_lock(&grp->lock); | ||
1574 | for (i = 0; i < 2*nr_node_ids; i++) | ||
1575 | grp->faults[i] -= p->numa_faults[i]; | ||
1576 | grp->total_faults -= p->total_numa_faults; | ||
1577 | |||
1578 | list_del(&p->numa_entry); | ||
1579 | grp->nr_tasks--; | ||
1580 | spin_unlock(&grp->lock); | ||
1581 | rcu_assign_pointer(p->numa_group, NULL); | ||
1582 | put_numa_group(grp); | ||
1583 | } | ||
1584 | |||
1585 | p->numa_faults = NULL; | ||
1586 | p->numa_faults_buffer = NULL; | ||
1587 | kfree(numa_faults); | ||
845 | } | 1588 | } |
846 | 1589 | ||
847 | /* | 1590 | /* |
848 | * Got a PROT_NONE fault for a page on @node. | 1591 | * Got a PROT_NONE fault for a page on @node. |
849 | */ | 1592 | */ |
850 | void task_numa_fault(int node, int pages, bool migrated) | 1593 | void task_numa_fault(int last_cpupid, int node, int pages, int flags) |
851 | { | 1594 | { |
852 | struct task_struct *p = current; | 1595 | struct task_struct *p = current; |
1596 | bool migrated = flags & TNF_MIGRATED; | ||
1597 | int priv; | ||
853 | 1598 | ||
854 | if (!numabalancing_enabled) | 1599 | if (!numabalancing_enabled) |
855 | return; | 1600 | return; |
856 | 1601 | ||
857 | /* FIXME: Allocate task-specific structure for placement policy here */ | 1602 | /* for example, ksmd faulting in a user's mm */ |
1603 | if (!p->mm) | ||
1604 | return; | ||
1605 | |||
1606 | /* Do not worry about placement if exiting */ | ||
1607 | if (p->state == TASK_DEAD) | ||
1608 | return; | ||
1609 | |||
1610 | /* Allocate buffer to track faults on a per-node basis */ | ||
1611 | if (unlikely(!p->numa_faults)) { | ||
1612 | int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; | ||
1613 | |||
1614 | /* numa_faults and numa_faults_buffer share the allocation */ | ||
1615 | p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); | ||
1616 | if (!p->numa_faults) | ||
1617 | return; | ||
1618 | |||
1619 | BUG_ON(p->numa_faults_buffer); | ||
1620 | p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); | ||
1621 | p->total_numa_faults = 0; | ||
1622 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | ||
1623 | } | ||
858 | 1624 | ||
859 | /* | 1625 | /* |
860 | * If pages are properly placed (did not migrate) then scan slower. | 1626 | * First accesses are treated as private, otherwise consider accesses |
861 | * This is reset periodically in case of phase changes | 1627 | * to be private if the accessing pid has not changed |
862 | */ | 1628 | */ |
863 | if (!migrated) | 1629 | if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { |
864 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | 1630 | priv = 1; |
865 | p->numa_scan_period + jiffies_to_msecs(10)); | 1631 | } else { |
1632 | priv = cpupid_match_pid(p, last_cpupid); | ||
1633 | if (!priv && !(flags & TNF_NO_GROUP)) | ||
1634 | task_numa_group(p, last_cpupid, flags, &priv); | ||
1635 | } | ||
866 | 1636 | ||
867 | task_numa_placement(p); | 1637 | task_numa_placement(p); |
1638 | |||
1639 | /* | ||
1640 | * Periodically retry migrating the task to its preferred node, in case it | ||
1641 | * previously failed, or the scheduler moved us. | ||
1642 | */ | ||
1643 | if (time_after(jiffies, p->numa_migrate_retry)) | ||
1644 | numa_migrate_preferred(p); | ||
1645 | |||
1646 | if (migrated) | ||
1647 | p->numa_pages_migrated += pages; | ||
1648 | |||
1649 | p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; | ||
1650 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; | ||
868 | } | 1651 | } |
869 | 1652 | ||
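task_numa_fault() now classifies each hinting fault: a never-written cpupid (all ones under LAST_CPUPID_MASK) is treated as a private first touch, a cpupid matching the current task is private, and anything else is shared and may call task_numa_group(). A deliberately simplified sketch of that decision flow; the real cpupid packing and helpers live in the mm headers, so the stand-ins below are purely illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-in for the kernel's cpupid_match_pid() */
    static bool cpupid_match_pid(int task_pid, int last_pid)
    {
        return task_pid == last_pid;
    }

    /* 1 = private fault, 0 = shared fault (candidate for NUMA grouping) */
    static int classify_fault(int task_pid, int last_pid, bool first_access)
    {
        if (first_access)
            return 1;               /* first touch counts as private */
        return cpupid_match_pid(task_pid, last_pid);
    }

    int main(void)
    {
        printf("%d %d %d\n",
               classify_fault(1234, -1, true),      /* 1 */
               classify_fault(1234, 1234, false),   /* 1 */
               classify_fault(1234, 99, false));    /* 0 */
        return 0;
    }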
870 | static void reset_ptenuma_scan(struct task_struct *p) | 1653 | static void reset_ptenuma_scan(struct task_struct *p) |
@@ -884,6 +1667,7 @@ void task_numa_work(struct callback_head *work) | |||
884 | struct mm_struct *mm = p->mm; | 1667 | struct mm_struct *mm = p->mm; |
885 | struct vm_area_struct *vma; | 1668 | struct vm_area_struct *vma; |
886 | unsigned long start, end; | 1669 | unsigned long start, end; |
1670 | unsigned long nr_pte_updates = 0; | ||
887 | long pages; | 1671 | long pages; |
888 | 1672 | ||
889 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 1673 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); |
@@ -900,35 +1684,9 @@ void task_numa_work(struct callback_head *work) | |||
900 | if (p->flags & PF_EXITING) | 1684 | if (p->flags & PF_EXITING) |
901 | return; | 1685 | return; |
902 | 1686 | ||
903 | /* | 1687 | if (!mm->numa_next_scan) { |
904 | * We do not care about task placement until a task runs on a node | 1688 | mm->numa_next_scan = now + |
905 | * other than the first one used by the address space. This is | 1689 | msecs_to_jiffies(sysctl_numa_balancing_scan_delay); |
906 | * largely because migrations are driven by what CPU the task | ||
907 | * is running on. If it's never scheduled on another node, it'll | ||
908 | * not migrate so why bother trapping the fault. | ||
909 | */ | ||
910 | if (mm->first_nid == NUMA_PTE_SCAN_INIT) | ||
911 | mm->first_nid = numa_node_id(); | ||
912 | if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { | ||
913 | /* Are we running on a new node yet? */ | ||
914 | if (numa_node_id() == mm->first_nid && | ||
915 | !sched_feat_numa(NUMA_FORCE)) | ||
916 | return; | ||
917 | |||
918 | mm->first_nid = NUMA_PTE_SCAN_ACTIVE; | ||
919 | } | ||
920 | |||
921 | /* | ||
922 | * Reset the scan period if enough time has gone by. Objective is that | ||
923 | * scanning will be reduced if pages are properly placed. As tasks | ||
924 | * can enter different phases this needs to be re-examined. Lacking | ||
925 | * proper tracking of reference behaviour, this blunt hammer is used. | ||
926 | */ | ||
927 | migrate = mm->numa_next_reset; | ||
928 | if (time_after(now, migrate)) { | ||
929 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
930 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | ||
931 | xchg(&mm->numa_next_reset, next_scan); | ||
932 | } | 1690 | } |
933 | 1691 | ||
934 | /* | 1692 | /* |
@@ -938,20 +1696,20 @@ void task_numa_work(struct callback_head *work) | |||
938 | if (time_before(now, migrate)) | 1696 | if (time_before(now, migrate)) |
939 | return; | 1697 | return; |
940 | 1698 | ||
941 | if (p->numa_scan_period == 0) | 1699 | if (p->numa_scan_period == 0) { |
942 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 1700 | p->numa_scan_period_max = task_scan_max(p); |
1701 | p->numa_scan_period = task_scan_min(p); | ||
1702 | } | ||
943 | 1703 | ||
944 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); | 1704 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); |
945 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) | 1705 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) |
946 | return; | 1706 | return; |
947 | 1707 | ||
948 | /* | 1708 | /* |
949 | * Do not set pte_numa if the current running node is rate-limited. | 1709 | * Delay this task enough that another task of this mm will likely win |
950 | * This loses statistics on the fault but if we are unwilling to | 1710 | * the next time around. |
951 | * migrate to this node, it is less likely we can do useful work | ||
952 | */ | 1711 | */ |
953 | if (migrate_ratelimited(numa_node_id())) | 1712 | p->node_stamp += 2 * TICK_NSEC; |
954 | return; | ||
955 | 1713 | ||
956 | start = mm->numa_scan_offset; | 1714 | start = mm->numa_scan_offset; |
957 | pages = sysctl_numa_balancing_scan_size; | 1715 | pages = sysctl_numa_balancing_scan_size; |
@@ -967,18 +1725,39 @@ void task_numa_work(struct callback_head *work) | |||
967 | vma = mm->mmap; | 1725 | vma = mm->mmap; |
968 | } | 1726 | } |
969 | for (; vma; vma = vma->vm_next) { | 1727 | for (; vma; vma = vma->vm_next) { |
970 | if (!vma_migratable(vma)) | 1728 | if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) |
971 | continue; | 1729 | continue; |
972 | 1730 | ||
973 | /* Skip small VMAs. They are not likely to be of relevance */ | 1731 | /* |
974 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) | 1732 | * Shared library pages mapped by multiple processes are not |
1733 | * migrated as it is expected they are cache replicated. Avoid | ||
1734 | * hinting faults in read-only file-backed mappings or the vdso | ||
1735 | * as migrating the pages will be of marginal benefit. | ||
1736 | */ | ||
1737 | if (!vma->vm_mm || | ||
1738 | (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) | ||
1739 | continue; | ||
1740 | |||
1741 | /* | ||
1742 | * Skip inaccessible VMAs to avoid any confusion between | ||
1743 | * PROT_NONE and NUMA hinting ptes | ||
1744 | */ | ||
1745 | if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) | ||
975 | continue; | 1746 | continue; |
976 | 1747 | ||
977 | do { | 1748 | do { |
978 | start = max(start, vma->vm_start); | 1749 | start = max(start, vma->vm_start); |
979 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | 1750 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); |
980 | end = min(end, vma->vm_end); | 1751 | end = min(end, vma->vm_end); |
981 | pages -= change_prot_numa(vma, start, end); | 1752 | nr_pte_updates += change_prot_numa(vma, start, end); |
1753 | |||
1754 | /* | ||
1755 | * Scan sysctl_numa_balancing_scan_size but ensure that | ||
1756 | * at least one PTE is updated so that unused virtual | ||
1757 | * address space is quickly skipped. | ||
1758 | */ | ||
1759 | if (nr_pte_updates) | ||
1760 | pages -= (end - start) >> PAGE_SHIFT; | ||
982 | 1761 | ||
983 | start = end; | 1762 | start = end; |
984 | if (pages <= 0) | 1763 | if (pages <= 0) |
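The scan budget in task_numa_work() is counted in pages: sysctl_numa_balancing_scan_size megabytes shifted by (20 - PAGE_SHIFT), then reduced by each range whose PTEs were actually updated so that unused address space is skipped cheaply. A short sketch of that bookkeeping, assuming 4 KiB pages and the default 256 MB scan size:

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumed 4 KiB pages */

    int main(void)
    {
        unsigned int scan_size_mb = 256;    /* sysctl_numa_balancing_scan_size */
        long pages = (long)scan_size_mb << (20 - PAGE_SHIFT);

        /* assumed: 64 MB of one VMA actually had its PTEs updated */
        unsigned long start = 0x400000, end = start + (64UL << 20);

        pages -= (end - start) >> PAGE_SHIFT;
        printf("budget left: %ld pages (%ld MB)\n",
               pages, pages >> (20 - PAGE_SHIFT));  /* 49152 pages, 192 MB */
        return 0;
    }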
@@ -988,10 +1767,10 @@ void task_numa_work(struct callback_head *work) | |||
988 | 1767 | ||
989 | out: | 1768 | out: |
990 | /* | 1769 | /* |
991 | * It is possible to reach the end of the VMA list but the last few VMAs are | 1770 | * It is possible to reach the end of the VMA list but the last few |
992 | * not guaranteed to be vma_migratable. If they are not, we would find the | 1771 | * VMAs are not guaranteed to be vma_migratable. If they are not, we |
993 | * !migratable VMA on the next scan but not reset the scanner to the start | 1772 | * would find the !migratable VMA on the next scan but not reset the |
994 | * so check it now. | 1773 | * scanner to the start so check it now. |
995 | */ | 1774 | */ |
996 | if (vma) | 1775 | if (vma) |
997 | mm->numa_scan_offset = start; | 1776 | mm->numa_scan_offset = start; |
@@ -1025,8 +1804,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
1025 | 1804 | ||
1026 | if (now - curr->node_stamp > period) { | 1805 | if (now - curr->node_stamp > period) { |
1027 | if (!curr->node_stamp) | 1806 | if (!curr->node_stamp) |
1028 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 1807 | curr->numa_scan_period = task_scan_min(curr); |
1029 | curr->node_stamp = now; | 1808 | curr->node_stamp += period; |
1030 | 1809 | ||
1031 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | 1810 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { |
1032 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ | 1811 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ |
@@ -1038,6 +1817,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
1038 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | 1817 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) |
1039 | { | 1818 | { |
1040 | } | 1819 | } |
1820 | |||
1821 | static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
1822 | { | ||
1823 | } | ||
1824 | |||
1825 | static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
1826 | { | ||
1827 | } | ||
1041 | #endif /* CONFIG_NUMA_BALANCING */ | 1828 | #endif /* CONFIG_NUMA_BALANCING */ |
1042 | 1829 | ||
1043 | static void | 1830 | static void |
@@ -1047,8 +1834,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1047 | if (!parent_entity(se)) | 1834 | if (!parent_entity(se)) |
1048 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 1835 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
1049 | #ifdef CONFIG_SMP | 1836 | #ifdef CONFIG_SMP |
1050 | if (entity_is_task(se)) | 1837 | if (entity_is_task(se)) { |
1051 | list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); | 1838 | struct rq *rq = rq_of(cfs_rq); |
1839 | |||
1840 | account_numa_enqueue(rq, task_of(se)); | ||
1841 | list_add(&se->group_node, &rq->cfs_tasks); | ||
1842 | } | ||
1052 | #endif | 1843 | #endif |
1053 | cfs_rq->nr_running++; | 1844 | cfs_rq->nr_running++; |
1054 | } | 1845 | } |
@@ -1059,8 +1850,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1059 | update_load_sub(&cfs_rq->load, se->load.weight); | 1850 | update_load_sub(&cfs_rq->load, se->load.weight); |
1060 | if (!parent_entity(se)) | 1851 | if (!parent_entity(se)) |
1061 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); | 1852 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
1062 | if (entity_is_task(se)) | 1853 | if (entity_is_task(se)) { |
1854 | account_numa_dequeue(rq_of(cfs_rq), task_of(se)); | ||
1063 | list_del_init(&se->group_node); | 1855 | list_del_init(&se->group_node); |
1856 | } | ||
1064 | cfs_rq->nr_running--; | 1857 | cfs_rq->nr_running--; |
1065 | } | 1858 | } |
1066 | 1859 | ||
@@ -1378,7 +2171,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, | |||
1378 | long contrib; | 2171 | long contrib; |
1379 | 2172 | ||
1380 | /* The fraction of a cpu used by this cfs_rq */ | 2173 | /* The fraction of a cpu used by this cfs_rq */ |
1381 | contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, | 2174 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, |
1382 | sa->runnable_avg_period + 1); | 2175 | sa->runnable_avg_period + 1); |
1383 | contrib -= cfs_rq->tg_runnable_contrib; | 2176 | contrib -= cfs_rq->tg_runnable_contrib; |
1384 | 2177 | ||
@@ -2070,13 +2863,14 @@ static inline bool cfs_bandwidth_used(void) | |||
2070 | return static_key_false(&__cfs_bandwidth_used); | 2863 | return static_key_false(&__cfs_bandwidth_used); |
2071 | } | 2864 | } |
2072 | 2865 | ||
2073 | void account_cfs_bandwidth_used(int enabled, int was_enabled) | 2866 | void cfs_bandwidth_usage_inc(void) |
2074 | { | 2867 | { |
2075 | /* only need to count groups transitioning between enabled/!enabled */ | 2868 | static_key_slow_inc(&__cfs_bandwidth_used); |
2076 | if (enabled && !was_enabled) | 2869 | } |
2077 | static_key_slow_inc(&__cfs_bandwidth_used); | 2870 | |
2078 | else if (!enabled && was_enabled) | 2871 | void cfs_bandwidth_usage_dec(void) |
2079 | static_key_slow_dec(&__cfs_bandwidth_used); | 2872 | { |
2873 | static_key_slow_dec(&__cfs_bandwidth_used); | ||
2080 | } | 2874 | } |
2081 | #else /* HAVE_JUMP_LABEL */ | 2875 | #else /* HAVE_JUMP_LABEL */ |
2082 | static bool cfs_bandwidth_used(void) | 2876 | static bool cfs_bandwidth_used(void) |
@@ -2084,7 +2878,8 @@ static bool cfs_bandwidth_used(void) | |||
2084 | return true; | 2878 | return true; |
2085 | } | 2879 | } |
2086 | 2880 | ||
2087 | void account_cfs_bandwidth_used(int enabled, int was_enabled) {} | 2881 | void cfs_bandwidth_usage_inc(void) {} |
2882 | void cfs_bandwidth_usage_dec(void) {} | ||
2088 | #endif /* HAVE_JUMP_LABEL */ | 2883 | #endif /* HAVE_JUMP_LABEL */ |
2089 | 2884 | ||
2090 | /* | 2885 | /* |
@@ -2213,8 +3008,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
2213 | } | 3008 | } |
2214 | } | 3009 | } |
2215 | 3010 | ||
2216 | static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 3011 | static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) |
2217 | unsigned long delta_exec) | ||
2218 | { | 3012 | { |
2219 | /* dock delta_exec before expiring quota (as it could span periods) */ | 3013 | /* dock delta_exec before expiring quota (as it could span periods) */ |
2220 | cfs_rq->runtime_remaining -= delta_exec; | 3014 | cfs_rq->runtime_remaining -= delta_exec; |
@@ -2232,7 +3026,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | |||
2232 | } | 3026 | } |
2233 | 3027 | ||
2234 | static __always_inline | 3028 | static __always_inline |
2235 | void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) | 3029 | void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) |
2236 | { | 3030 | { |
2237 | if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) | 3031 | if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) |
2238 | return; | 3032 | return; |
@@ -2335,6 +3129,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
2335 | cfs_rq->throttled_clock = rq_clock(rq); | 3129 | cfs_rq->throttled_clock = rq_clock(rq); |
2336 | raw_spin_lock(&cfs_b->lock); | 3130 | raw_spin_lock(&cfs_b->lock); |
2337 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 3131 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); |
3132 | if (!cfs_b->timer_active) | ||
3133 | __start_cfs_bandwidth(cfs_b); | ||
2338 | raw_spin_unlock(&cfs_b->lock); | 3134 | raw_spin_unlock(&cfs_b->lock); |
2339 | } | 3135 | } |
2340 | 3136 | ||
@@ -2448,6 +3244,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
2448 | if (idle) | 3244 | if (idle) |
2449 | goto out_unlock; | 3245 | goto out_unlock; |
2450 | 3246 | ||
3247 | /* | ||
3248 | * if we have relooped after returning idle once, we need to update our | ||
3249 | * status as actually running, so that other cpus doing | ||
3250 | * __start_cfs_bandwidth will stop trying to cancel us. | ||
3251 | */ | ||
3252 | cfs_b->timer_active = 1; | ||
3253 | |||
2451 | __refill_cfs_bandwidth_runtime(cfs_b); | 3254 | __refill_cfs_bandwidth_runtime(cfs_b); |
2452 | 3255 | ||
2453 | if (!throttled) { | 3256 | if (!throttled) { |
@@ -2508,7 +3311,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; | |||
2508 | /* how long we wait to gather additional slack before distributing */ | 3311 | /* how long we wait to gather additional slack before distributing */ |
2509 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; | 3312 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; |
2510 | 3313 | ||
2511 | /* are we near the end of the current quota period? */ | 3314 | /* |
3315 | * Are we near the end of the current quota period? | ||
3316 | * | ||
3317 | * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the | ||
3318 | * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of | ||
3319 | * migrate_hrtimers, base is never cleared, so we are fine. | ||
3320 | */ | ||
2512 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) | 3321 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) |
2513 | { | 3322 | { |
2514 | struct hrtimer *refresh_timer = &cfs_b->period_timer; | 3323 | struct hrtimer *refresh_timer = &cfs_b->period_timer; |
@@ -2584,10 +3393,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
2584 | u64 expires; | 3393 | u64 expires; |
2585 | 3394 | ||
2586 | /* confirm we're still not at a refresh boundary */ | 3395 | /* confirm we're still not at a refresh boundary */ |
2587 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) | 3396 | raw_spin_lock(&cfs_b->lock); |
3397 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { | ||
3398 | raw_spin_unlock(&cfs_b->lock); | ||
2588 | return; | 3399 | return; |
3400 | } | ||
2589 | 3401 | ||
2590 | raw_spin_lock(&cfs_b->lock); | ||
2591 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | 3402 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { |
2592 | runtime = cfs_b->runtime; | 3403 | runtime = cfs_b->runtime; |
2593 | cfs_b->runtime = 0; | 3404 | cfs_b->runtime = 0; |
@@ -2708,11 +3519,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
2708 | * (timer_active==0 becomes visible before the hrtimer call-back | 3519 | * (timer_active==0 becomes visible before the hrtimer call-back |
2709 | * terminates). In either case we ensure that it's re-programmed | 3520 | * terminates). In either case we ensure that it's re-programmed |
2710 | */ | 3521 | */ |
2711 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | 3522 | while (unlikely(hrtimer_active(&cfs_b->period_timer)) && |
3523 | hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { | ||
3524 | /* bounce the lock to allow do_sched_cfs_period_timer to run */ | ||
2712 | raw_spin_unlock(&cfs_b->lock); | 3525 | raw_spin_unlock(&cfs_b->lock); |
2713 | /* ensure cfs_b->lock is available while we wait */ | 3526 | cpu_relax(); |
2714 | hrtimer_cancel(&cfs_b->period_timer); | ||
2715 | |||
2716 | raw_spin_lock(&cfs_b->lock); | 3527 | raw_spin_lock(&cfs_b->lock); |
2717 | /* if someone else restarted the timer then we're done */ | 3528 | /* if someone else restarted the timer then we're done */ |
2718 | if (cfs_b->timer_active) | 3529 | if (cfs_b->timer_active) |
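Note on the hunk above: the blocking hrtimer_cancel() is replaced by a hrtimer_try_to_cancel() loop that drops and retakes cfs_b->lock on every failed attempt, so the period-timer callback can acquire the lock and make progress (and, per the earlier hunk, set timer_active itself). Below is a minimal user-space model of that wait-without-holding-the-lock pattern; the pthread mutex and the two atomic flags are illustrative stand-ins for cfs_b->lock, timer_active and hrtimer_active(), not kernel APIs, and hrtimer_try_to_cancel() itself is not modelled.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* plays cfs_b->lock        */
static atomic_int timer_active;                          /* plays cfs_b->timer_active */
static atomic_int callback_running;                      /* plays hrtimer_active()    */

/* Models the period-timer callback: it needs the same lock the starter holds. */
static void *period_timer(void *arg)
{
	pthread_mutex_lock(&lock);
	atomic_store(&timer_active, 1);   /* mirrors do_sched_cfs_period_timer() */
	pthread_mutex_unlock(&lock);
	atomic_store(&callback_running, 0);
	return NULL;
}

/* Models __start_cfs_bandwidth(): never wait for the callback with the lock held. */
static void start_bandwidth(void)
{
	pthread_mutex_lock(&lock);
	while (atomic_load(&callback_running)) {
		/* bounce the lock so the callback can run, as in the patch */
		pthread_mutex_unlock(&lock);
		sched_yield();            /* plays cpu_relax() */
		pthread_mutex_lock(&lock);
		if (atomic_load(&timer_active))  /* someone else restarted it */
			break;
	}
	atomic_store(&timer_active, 1);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	atomic_store(&callback_running, 1);
	pthread_create(&t, NULL, period_timer, NULL);
	start_bandwidth();
	pthread_join(t, NULL);
	printf("timer_active=%d\n", atomic_load(&timer_active));
	return 0;
}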
@@ -2755,8 +3566,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | |||
2755 | return rq_clock_task(rq_of(cfs_rq)); | 3566 | return rq_clock_task(rq_of(cfs_rq)); |
2756 | } | 3567 | } |
2757 | 3568 | ||
2758 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 3569 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} |
2759 | unsigned long delta_exec) {} | ||
2760 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 3570 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
2761 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 3571 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} |
2762 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 3572 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
@@ -3166,8 +3976,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
3166 | } | 3976 | } |
3167 | #else | 3977 | #else |
3168 | 3978 | ||
3169 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | 3979 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
3170 | unsigned long wl, unsigned long wg) | ||
3171 | { | 3980 | { |
3172 | return wl; | 3981 | return wl; |
3173 | } | 3982 | } |
@@ -3420,11 +4229,10 @@ done: | |||
3420 | * preempt must be disabled. | 4229 | * preempt must be disabled. |
3421 | */ | 4230 | */ |
3422 | static int | 4231 | static int |
3423 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | 4232 | select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) |
3424 | { | 4233 | { |
3425 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 4234 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
3426 | int cpu = smp_processor_id(); | 4235 | int cpu = smp_processor_id(); |
3427 | int prev_cpu = task_cpu(p); | ||
3428 | int new_cpu = cpu; | 4236 | int new_cpu = cpu; |
3429 | int want_affine = 0; | 4237 | int want_affine = 0; |
3430 | int sync = wake_flags & WF_SYNC; | 4238 | int sync = wake_flags & WF_SYNC; |
@@ -3904,9 +4712,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
3904 | 4712 | ||
3905 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 4713 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
3906 | 4714 | ||
4715 | enum fbq_type { regular, remote, all }; | ||
4716 | |||
3907 | #define LBF_ALL_PINNED 0x01 | 4717 | #define LBF_ALL_PINNED 0x01 |
3908 | #define LBF_NEED_BREAK 0x02 | 4718 | #define LBF_NEED_BREAK 0x02 |
3909 | #define LBF_SOME_PINNED 0x04 | 4719 | #define LBF_DST_PINNED 0x04 |
4720 | #define LBF_SOME_PINNED 0x08 | ||
3910 | 4721 | ||
3911 | struct lb_env { | 4722 | struct lb_env { |
3912 | struct sched_domain *sd; | 4723 | struct sched_domain *sd; |
@@ -3929,6 +4740,8 @@ struct lb_env { | |||
3929 | unsigned int loop; | 4740 | unsigned int loop; |
3930 | unsigned int loop_break; | 4741 | unsigned int loop_break; |
3931 | unsigned int loop_max; | 4742 | unsigned int loop_max; |
4743 | |||
4744 | enum fbq_type fbq_type; | ||
3932 | }; | 4745 | }; |
3933 | 4746 | ||
3934 | /* | 4747 | /* |
@@ -3975,6 +4788,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
3975 | return delta < (s64)sysctl_sched_migration_cost; | 4788 | return delta < (s64)sysctl_sched_migration_cost; |
3976 | } | 4789 | } |
3977 | 4790 | ||
4791 | #ifdef CONFIG_NUMA_BALANCING | ||
4792 | /* Returns true if the destination node has incurred more faults */ | ||
4793 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | ||
4794 | { | ||
4795 | int src_nid, dst_nid; | ||
4796 | |||
4797 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || | ||
4798 | !(env->sd->flags & SD_NUMA)) { | ||
4799 | return false; | ||
4800 | } | ||
4801 | |||
4802 | src_nid = cpu_to_node(env->src_cpu); | ||
4803 | dst_nid = cpu_to_node(env->dst_cpu); | ||
4804 | |||
4805 | if (src_nid == dst_nid) | ||
4806 | return false; | ||
4807 | |||
4808 | /* Always encourage migration to the preferred node. */ | ||
4809 | if (dst_nid == p->numa_preferred_nid) | ||
4810 | return true; | ||
4811 | |||
4812 | /* If both task and group weight improve, this move is a winner. */ | ||
4813 | if (task_weight(p, dst_nid) > task_weight(p, src_nid) && | ||
4814 | group_weight(p, dst_nid) > group_weight(p, src_nid)) | ||
4815 | return true; | ||
4816 | |||
4817 | return false; | ||
4818 | } | ||
4819 | |||
4820 | |||
4821 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | ||
4822 | { | ||
4823 | int src_nid, dst_nid; | ||
4824 | |||
4825 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | ||
4826 | return false; | ||
4827 | |||
4828 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) | ||
4829 | return false; | ||
4830 | |||
4831 | src_nid = cpu_to_node(env->src_cpu); | ||
4832 | dst_nid = cpu_to_node(env->dst_cpu); | ||
4833 | |||
4834 | if (src_nid == dst_nid) | ||
4835 | return false; | ||
4836 | |||
4837 | /* Migrating away from the preferred node is always bad. */ | ||
4838 | if (src_nid == p->numa_preferred_nid) | ||
4839 | return true; | ||
4840 | |||
4841 | /* If either task or group weight get worse, don't do it. */ | ||
4842 | if (task_weight(p, dst_nid) < task_weight(p, src_nid) || | ||
4843 | group_weight(p, dst_nid) < group_weight(p, src_nid)) | ||
4844 | return true; | ||
4845 | |||
4846 | return false; | ||
4847 | } | ||
4848 | |||
4849 | #else | ||
4850 | static inline bool migrate_improves_locality(struct task_struct *p, | ||
4851 | struct lb_env *env) | ||
4852 | { | ||
4853 | return false; | ||
4854 | } | ||
4855 | |||
4856 | static inline bool migrate_degrades_locality(struct task_struct *p, | ||
4857 | struct lb_env *env) | ||
4858 | { | ||
4859 | return false; | ||
4860 | } | ||
4861 | #endif | ||
4862 | |||
3978 | /* | 4863 | /* |
3979 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 4864 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
3980 | */ | 4865 | */ |
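The two NUMA helpers added above decide migrations from per-node fault weights: moving onto the preferred node is always encouraged, moving off it is always resisted, and otherwise the task weight and the group weight must both improve, while either getting worse blocks the move. As a worked example with made-up numbers, task weights 40 (source) vs 60 (destination) and group weights 50 vs 70 let the move through. The standalone sketch below reproduces just those comparisons with plain integers; it does not use the kernel's task_weight()/group_weight() accessors.

#include <stdbool.h>
#include <stdio.h>

struct numa_weights {
	int task_src, task_dst;    /* per-node fault weight of the task  */
	int group_src, group_dst;  /* per-node fault weight of its group */
};

/* Mirrors the "both task and group weight improve" rule from the patch. */
static bool move_improves_locality(const struct numa_weights *w, int dst_is_preferred)
{
	if (dst_is_preferred)
		return true;
	return w->task_dst > w->task_src && w->group_dst > w->group_src;
}

/* Mirrors the "either weight gets worse" rule used to resist a move. */
static bool move_degrades_locality(const struct numa_weights *w, int src_is_preferred)
{
	if (src_is_preferred)
		return true;
	return w->task_dst < w->task_src || w->group_dst < w->group_src;
}

int main(void)
{
	struct numa_weights w = { .task_src = 40, .task_dst = 60,
				  .group_src = 50, .group_dst = 70 };

	printf("improves=%d degrades=%d\n",
	       move_improves_locality(&w, 0), move_degrades_locality(&w, 0));
	return 0;
}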
@@ -3997,6 +4882,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3997 | 4882 | ||
3998 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 4883 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
3999 | 4884 | ||
4885 | env->flags |= LBF_SOME_PINNED; | ||
4886 | |||
4000 | /* | 4887 | /* |
4001 | * Remember if this task can be migrated to any other cpu in | 4888 | * Remember if this task can be migrated to any other cpu in |
4002 | * our sched_group. We may want to revisit it if we couldn't | 4889 | * our sched_group. We may want to revisit it if we couldn't |
@@ -4005,13 +4892,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
4005 | * Also avoid computing new_dst_cpu if we have already computed | 4892 | * Also avoid computing new_dst_cpu if we have already computed |
4006 | * one in current iteration. | 4893 | * one in current iteration. |
4007 | */ | 4894 | */ |
4008 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) | 4895 | if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) |
4009 | return 0; | 4896 | return 0; |
4010 | 4897 | ||
4011 | /* Prevent to re-select dst_cpu via env's cpus */ | 4898 | /* Prevent to re-select dst_cpu via env's cpus */ |
4012 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | 4899 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
4013 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { | 4900 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { |
4014 | env->flags |= LBF_SOME_PINNED; | 4901 | env->flags |= LBF_DST_PINNED; |
4015 | env->new_dst_cpu = cpu; | 4902 | env->new_dst_cpu = cpu; |
4016 | break; | 4903 | break; |
4017 | } | 4904 | } |
@@ -4030,11 +4917,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
4030 | 4917 | ||
4031 | /* | 4918 | /* |
4032 | * Aggressive migration if: | 4919 | * Aggressive migration if: |
4033 | * 1) task is cache cold, or | 4920 | * 1) destination numa is preferred |
4034 | * 2) too many balance attempts have failed. | 4921 | * 2) task is cache cold, or |
4922 | * 3) too many balance attempts have failed. | ||
4035 | */ | 4923 | */ |
4036 | |||
4037 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); | 4924 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); |
4925 | if (!tsk_cache_hot) | ||
4926 | tsk_cache_hot = migrate_degrades_locality(p, env); | ||
4927 | |||
4928 | if (migrate_improves_locality(p, env)) { | ||
4929 | #ifdef CONFIG_SCHEDSTATS | ||
4930 | if (tsk_cache_hot) { | ||
4931 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | ||
4932 | schedstat_inc(p, se.statistics.nr_forced_migrations); | ||
4933 | } | ||
4934 | #endif | ||
4935 | return 1; | ||
4936 | } | ||
4937 | |||
4038 | if (!tsk_cache_hot || | 4938 | if (!tsk_cache_hot || |
4039 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 4939 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
4040 | 4940 | ||
@@ -4077,8 +4977,6 @@ static int move_one_task(struct lb_env *env) | |||
4077 | return 0; | 4977 | return 0; |
4078 | } | 4978 | } |
4079 | 4979 | ||
4080 | static unsigned long task_h_load(struct task_struct *p); | ||
4081 | |||
4082 | static const unsigned int sched_nr_migrate_break = 32; | 4980 | static const unsigned int sched_nr_migrate_break = 32; |
4083 | 4981 | ||
4084 | /* | 4982 | /* |
@@ -4291,6 +5189,10 @@ struct sg_lb_stats { | |||
4291 | unsigned int group_weight; | 5189 | unsigned int group_weight; |
4292 | int group_imb; /* Is there an imbalance in the group ? */ | 5190 | int group_imb; /* Is there an imbalance in the group ? */ |
4293 | int group_has_capacity; /* Is there extra capacity in the group? */ | 5191 | int group_has_capacity; /* Is there extra capacity in the group? */ |
5192 | #ifdef CONFIG_NUMA_BALANCING | ||
5193 | unsigned int nr_numa_running; | ||
5194 | unsigned int nr_preferred_running; | ||
5195 | #endif | ||
4294 | }; | 5196 | }; |
4295 | 5197 | ||
4296 | /* | 5198 | /* |
@@ -4330,7 +5232,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | |||
4330 | /** | 5232 | /** |
4331 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 5233 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
4332 | * @sd: The sched_domain whose load_idx is to be obtained. | 5234 | * @sd: The sched_domain whose load_idx is to be obtained. |
4333 | * @idle: The Idle status of the CPU for whose sd load_icx is obtained. | 5235 | * @idle: The idle status of the CPU for whose sd load_idx is obtained. |
4334 | * | 5236 | * |
4335 | * Return: The load index. | 5237 | * Return: The load index. |
4336 | */ | 5238 | */ |
@@ -4447,7 +5349,7 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
4447 | { | 5349 | { |
4448 | struct sched_domain *child = sd->child; | 5350 | struct sched_domain *child = sd->child; |
4449 | struct sched_group *group, *sdg = sd->groups; | 5351 | struct sched_group *group, *sdg = sd->groups; |
4450 | unsigned long power; | 5352 | unsigned long power, power_orig; |
4451 | unsigned long interval; | 5353 | unsigned long interval; |
4452 | 5354 | ||
4453 | interval = msecs_to_jiffies(sd->balance_interval); | 5355 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -4459,7 +5361,7 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
4459 | return; | 5361 | return; |
4460 | } | 5362 | } |
4461 | 5363 | ||
4462 | power = 0; | 5364 | power_orig = power = 0; |
4463 | 5365 | ||
4464 | if (child->flags & SD_OVERLAP) { | 5366 | if (child->flags & SD_OVERLAP) { |
4465 | /* | 5367 | /* |
@@ -4467,8 +5369,33 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
4467 | * span the current group. | 5369 | * span the current group. |
4468 | */ | 5370 | */ |
4469 | 5371 | ||
4470 | for_each_cpu(cpu, sched_group_cpus(sdg)) | 5372 | for_each_cpu(cpu, sched_group_cpus(sdg)) { |
4471 | power += power_of(cpu); | 5373 | struct sched_group_power *sgp; |
5374 | struct rq *rq = cpu_rq(cpu); | ||
5375 | |||
5376 | /* | ||
5377 | * build_sched_domains() -> init_sched_groups_power() | ||
5378 | * gets here before we've attached the domains to the | ||
5379 | * runqueues. | ||
5380 | * | ||
5381 | * Use power_of(), which is set irrespective of domains | ||
5382 | * in update_cpu_power(). | ||
5383 | * | ||
5384 | * This avoids power/power_orig from being 0 and | ||
5385 | * causing divide-by-zero issues on boot. | ||
5386 | * | ||
5387 | * Runtime updates will correct power_orig. | ||
5388 | */ | ||
5389 | if (unlikely(!rq->sd)) { | ||
5390 | power_orig += power_of(cpu); | ||
5391 | power += power_of(cpu); | ||
5392 | continue; | ||
5393 | } | ||
5394 | |||
5395 | sgp = rq->sd->groups->sgp; | ||
5396 | power_orig += sgp->power_orig; | ||
5397 | power += sgp->power; | ||
5398 | } | ||
4472 | } else { | 5399 | } else { |
4473 | /* | 5400 | /* |
4474 | * !SD_OVERLAP domains can assume that child groups | 5401 | * !SD_OVERLAP domains can assume that child groups |
@@ -4477,12 +5404,14 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
4477 | 5404 | ||
4478 | group = child->groups; | 5405 | group = child->groups; |
4479 | do { | 5406 | do { |
5407 | power_orig += group->sgp->power_orig; | ||
4480 | power += group->sgp->power; | 5408 | power += group->sgp->power; |
4481 | group = group->next; | 5409 | group = group->next; |
4482 | } while (group != child->groups); | 5410 | } while (group != child->groups); |
4483 | } | 5411 | } |
4484 | 5412 | ||
4485 | sdg->sgp->power_orig = sdg->sgp->power = power; | 5413 | sdg->sgp->power_orig = power_orig; |
5414 | sdg->sgp->power = power; | ||
4486 | } | 5415 | } |
4487 | 5416 | ||
4488 | /* | 5417 | /* |
@@ -4526,13 +5455,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
4526 | * cpu 3 and leave one of the cpus in the second group unused. | 5455 | * cpu 3 and leave one of the cpus in the second group unused. |
4527 | * | 5456 | * |
4528 | * The current solution to this issue is detecting the skew in the first group | 5457 | * The current solution to this issue is detecting the skew in the first group |
4529 | * by noticing it has a cpu that is overloaded while the remaining cpus are | 5458 | * by noticing the lower domain failed to reach balance and had difficulty |
4530 | * idle -- or rather, there's a distinct imbalance in the cpus; see | 5459 | * moving tasks due to affinity constraints. |
4531 | * sg_imbalanced(). | ||
4532 | * | 5460 | * |
4533 | * When this is so detected; this group becomes a candidate for busiest; see | 5461 | * When this is so detected; this group becomes a candidate for busiest; see |
4534 | * update_sd_pick_busiest(). And calculcate_imbalance() and | 5462 | * update_sd_pick_busiest(). And calculate_imbalance() and |
4535 | * find_busiest_group() avoid some of the usual balance conditional to allow it | 5463 | * find_busiest_group() avoid some of the usual balance conditions to allow it |
4536 | * to create an effective group imbalance. | 5464 | * to create an effective group imbalance. |
4537 | * | 5465 | * |
4538 | * This is a somewhat tricky proposition since the next run might not find the | 5466 | * This is a somewhat tricky proposition since the next run might not find the |
@@ -4540,49 +5468,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
4540 | * subtle and fragile situation. | 5468 | * subtle and fragile situation. |
4541 | */ | 5469 | */ |
4542 | 5470 | ||
4543 | struct sg_imb_stats { | 5471 | static inline int sg_imbalanced(struct sched_group *group) |
4544 | unsigned long max_nr_running, min_nr_running; | ||
4545 | unsigned long max_cpu_load, min_cpu_load; | ||
4546 | }; | ||
4547 | |||
4548 | static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) | ||
4549 | { | 5472 | { |
4550 | sgi->max_cpu_load = sgi->max_nr_running = 0UL; | 5473 | return group->sgp->imbalance; |
4551 | sgi->min_cpu_load = sgi->min_nr_running = ~0UL; | ||
4552 | } | 5474 | } |
4553 | 5475 | ||
4554 | static inline void | 5476 | /* |
4555 | update_sg_imb_stats(struct sg_imb_stats *sgi, | 5477 | * Compute the group capacity. |
4556 | unsigned long load, unsigned long nr_running) | 5478 | * |
5479 | * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by | ||
5480 | * first dividing out the smt factor and computing the actual number of cores | ||
5481 | * and limit power unit capacity with that. | ||
5482 | */ | ||
5483 | static inline int sg_capacity(struct lb_env *env, struct sched_group *group) | ||
4557 | { | 5484 | { |
4558 | if (load > sgi->max_cpu_load) | 5485 | unsigned int capacity, smt, cpus; |
4559 | sgi->max_cpu_load = load; | 5486 | unsigned int power, power_orig; |
4560 | if (sgi->min_cpu_load > load) | ||
4561 | sgi->min_cpu_load = load; | ||
4562 | 5487 | ||
4563 | if (nr_running > sgi->max_nr_running) | 5488 | power = group->sgp->power; |
4564 | sgi->max_nr_running = nr_running; | 5489 | power_orig = group->sgp->power_orig; |
4565 | if (sgi->min_nr_running > nr_running) | 5490 | cpus = group->group_weight; |
4566 | sgi->min_nr_running = nr_running; | ||
4567 | } | ||
4568 | 5491 | ||
4569 | static inline int | 5492 | /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ |
4570 | sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) | 5493 | smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); |
4571 | { | 5494 | capacity = cpus / smt; /* cores */ |
4572 | /* | ||
4573 | * Consider the group unbalanced when the imbalance is larger | ||
4574 | * than the average weight of a task. | ||
4575 | * | ||
4576 | * APZ: with cgroup the avg task weight can vary wildly and | ||
4577 | * might not be a suitable number - should we keep a | ||
4578 | * normalized nr_running number somewhere that negates | ||
4579 | * the hierarchy? | ||
4580 | */ | ||
4581 | if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && | ||
4582 | (sgi->max_nr_running - sgi->min_nr_running) > 1) | ||
4583 | return 1; | ||
4584 | 5495 | ||
4585 | return 0; | 5496 | capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); |
5497 | if (!capacity) | ||
5498 | capacity = fix_small_capacity(env->sd, group); | ||
5499 | |||
5500 | return capacity; | ||
4586 | } | 5501 | } |
4587 | 5502 | ||
4588 | /** | 5503 | /** |
@@ -4597,12 +5512,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4597 | struct sched_group *group, int load_idx, | 5512 | struct sched_group *group, int load_idx, |
4598 | int local_group, struct sg_lb_stats *sgs) | 5513 | int local_group, struct sg_lb_stats *sgs) |
4599 | { | 5514 | { |
4600 | struct sg_imb_stats sgi; | ||
4601 | unsigned long nr_running; | 5515 | unsigned long nr_running; |
4602 | unsigned long load; | 5516 | unsigned long load; |
4603 | int i; | 5517 | int i; |
4604 | 5518 | ||
4605 | init_sg_imb_stats(&sgi); | 5519 | memset(sgs, 0, sizeof(*sgs)); |
4606 | 5520 | ||
4607 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5521 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
4608 | struct rq *rq = cpu_rq(i); | 5522 | struct rq *rq = cpu_rq(i); |
@@ -4610,24 +5524,22 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4610 | nr_running = rq->nr_running; | 5524 | nr_running = rq->nr_running; |
4611 | 5525 | ||
4612 | /* Bias balancing toward cpus of our domain */ | 5526 | /* Bias balancing toward cpus of our domain */ |
4613 | if (local_group) { | 5527 | if (local_group) |
4614 | load = target_load(i, load_idx); | 5528 | load = target_load(i, load_idx); |
4615 | } else { | 5529 | else |
4616 | load = source_load(i, load_idx); | 5530 | load = source_load(i, load_idx); |
4617 | update_sg_imb_stats(&sgi, load, nr_running); | ||
4618 | } | ||
4619 | 5531 | ||
4620 | sgs->group_load += load; | 5532 | sgs->group_load += load; |
4621 | sgs->sum_nr_running += nr_running; | 5533 | sgs->sum_nr_running += nr_running; |
5534 | #ifdef CONFIG_NUMA_BALANCING | ||
5535 | sgs->nr_numa_running += rq->nr_numa_running; | ||
5536 | sgs->nr_preferred_running += rq->nr_preferred_running; | ||
5537 | #endif | ||
4622 | sgs->sum_weighted_load += weighted_cpuload(i); | 5538 | sgs->sum_weighted_load += weighted_cpuload(i); |
4623 | if (idle_cpu(i)) | 5539 | if (idle_cpu(i)) |
4624 | sgs->idle_cpus++; | 5540 | sgs->idle_cpus++; |
4625 | } | 5541 | } |
4626 | 5542 | ||
4627 | if (local_group && (env->idle != CPU_NEWLY_IDLE || | ||
4628 | time_after_eq(jiffies, group->sgp->next_update))) | ||
4629 | update_group_power(env->sd, env->dst_cpu); | ||
4630 | |||
4631 | /* Adjust by relative CPU power of the group */ | 5543 | /* Adjust by relative CPU power of the group */ |
4632 | sgs->group_power = group->sgp->power; | 5544 | sgs->group_power = group->sgp->power; |
4633 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | 5545 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; |
@@ -4635,16 +5547,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4635 | if (sgs->sum_nr_running) | 5547 | if (sgs->sum_nr_running) |
4636 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 5548 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
4637 | 5549 | ||
4638 | sgs->group_imb = sg_imbalanced(sgs, &sgi); | ||
4639 | |||
4640 | sgs->group_capacity = | ||
4641 | DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); | ||
4642 | |||
4643 | if (!sgs->group_capacity) | ||
4644 | sgs->group_capacity = fix_small_capacity(env->sd, group); | ||
4645 | |||
4646 | sgs->group_weight = group->group_weight; | 5550 | sgs->group_weight = group->group_weight; |
4647 | 5551 | ||
5552 | sgs->group_imb = sg_imbalanced(group); | ||
5553 | sgs->group_capacity = sg_capacity(env, group); | ||
5554 | |||
4648 | if (sgs->group_capacity > sgs->sum_nr_running) | 5555 | if (sgs->group_capacity > sgs->sum_nr_running) |
4649 | sgs->group_has_capacity = 1; | 5556 | sgs->group_has_capacity = 1; |
4650 | } | 5557 | } |
@@ -4693,14 +5600,42 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
4693 | return false; | 5600 | return false; |
4694 | } | 5601 | } |
4695 | 5602 | ||
5603 | #ifdef CONFIG_NUMA_BALANCING | ||
5604 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
5605 | { | ||
5606 | if (sgs->sum_nr_running > sgs->nr_numa_running) | ||
5607 | return regular; | ||
5608 | if (sgs->sum_nr_running > sgs->nr_preferred_running) | ||
5609 | return remote; | ||
5610 | return all; | ||
5611 | } | ||
5612 | |||
5613 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
5614 | { | ||
5615 | if (rq->nr_running > rq->nr_numa_running) | ||
5616 | return regular; | ||
5617 | if (rq->nr_running > rq->nr_preferred_running) | ||
5618 | return remote; | ||
5619 | return all; | ||
5620 | } | ||
5621 | #else | ||
5622 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
5623 | { | ||
5624 | return all; | ||
5625 | } | ||
5626 | |||
5627 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
5628 | { | ||
5629 | return regular; | ||
5630 | } | ||
5631 | #endif /* CONFIG_NUMA_BALANCING */ | ||
5632 | |||
4696 | /** | 5633 | /** |
4697 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 5634 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
4698 | * @env: The load balancing environment. | 5635 | * @env: The load balancing environment. |
4699 | * @balance: Should we balance. | ||
4700 | * @sds: variable to hold the statistics for this sched_domain. | 5636 | * @sds: variable to hold the statistics for this sched_domain. |
4701 | */ | 5637 | */ |
4702 | static inline void update_sd_lb_stats(struct lb_env *env, | 5638 | static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) |
4703 | struct sd_lb_stats *sds) | ||
4704 | { | 5639 | { |
4705 | struct sched_domain *child = env->sd->child; | 5640 | struct sched_domain *child = env->sd->child; |
4706 | struct sched_group *sg = env->sd->groups; | 5641 | struct sched_group *sg = env->sd->groups; |
@@ -4720,11 +5655,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4720 | if (local_group) { | 5655 | if (local_group) { |
4721 | sds->local = sg; | 5656 | sds->local = sg; |
4722 | sgs = &sds->local_stat; | 5657 | sgs = &sds->local_stat; |
5658 | |||
5659 | if (env->idle != CPU_NEWLY_IDLE || | ||
5660 | time_after_eq(jiffies, sg->sgp->next_update)) | ||
5661 | update_group_power(env->sd, env->dst_cpu); | ||
4723 | } | 5662 | } |
4724 | 5663 | ||
4725 | memset(sgs, 0, sizeof(*sgs)); | ||
4726 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); | 5664 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
4727 | 5665 | ||
5666 | if (local_group) | ||
5667 | goto next_group; | ||
5668 | |||
4728 | /* | 5669 | /* |
4729 | * In case the child domain prefers tasks go to siblings | 5670 | * In case the child domain prefers tasks go to siblings |
4730 | * first, lower the sg capacity to one so that we'll try | 5671 | * first, lower the sg capacity to one so that we'll try |
@@ -4735,21 +5676,25 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4735 | * heaviest group when it is already under-utilized (possible | 5676 | * heaviest group when it is already under-utilized (possible |
4736 | * with a large weight task outweighs the tasks on the system). | 5677 | * with a large weight task outweighs the tasks on the system). |
4737 | */ | 5678 | */ |
4738 | if (prefer_sibling && !local_group && | 5679 | if (prefer_sibling && sds->local && |
4739 | sds->local && sds->local_stat.group_has_capacity) | 5680 | sds->local_stat.group_has_capacity) |
4740 | sgs->group_capacity = min(sgs->group_capacity, 1U); | 5681 | sgs->group_capacity = min(sgs->group_capacity, 1U); |
4741 | 5682 | ||
4742 | /* Now, start updating sd_lb_stats */ | 5683 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
4743 | sds->total_load += sgs->group_load; | ||
4744 | sds->total_pwr += sgs->group_power; | ||
4745 | |||
4746 | if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { | ||
4747 | sds->busiest = sg; | 5684 | sds->busiest = sg; |
4748 | sds->busiest_stat = *sgs; | 5685 | sds->busiest_stat = *sgs; |
4749 | } | 5686 | } |
4750 | 5687 | ||
5688 | next_group: | ||
5689 | /* Now, start updating sd_lb_stats */ | ||
5690 | sds->total_load += sgs->group_load; | ||
5691 | sds->total_pwr += sgs->group_power; | ||
5692 | |||
4751 | sg = sg->next; | 5693 | sg = sg->next; |
4752 | } while (sg != env->sd->groups); | 5694 | } while (sg != env->sd->groups); |
5695 | |||
5696 | if (env->sd->flags & SD_NUMA) | ||
5697 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); | ||
4753 | } | 5698 | } |
4754 | 5699 | ||
4755 | /** | 5700 | /** |
@@ -5053,15 +5998,39 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
5053 | int i; | 5998 | int i; |
5054 | 5999 | ||
5055 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 6000 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
5056 | unsigned long power = power_of(i); | 6001 | unsigned long power, capacity, wl; |
5057 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 6002 | enum fbq_type rt; |
5058 | SCHED_POWER_SCALE); | 6003 | |
5059 | unsigned long wl; | 6004 | rq = cpu_rq(i); |
6005 | rt = fbq_classify_rq(rq); | ||
5060 | 6006 | ||
6007 | /* | ||
6008 | * We classify groups/runqueues into three groups: | ||
6009 | * - regular: there are !numa tasks | ||
6010 | * - remote: there are numa tasks that run on the 'wrong' node | ||
6011 | * - all: there is no distinction | ||
6012 | * | ||
6013 | * In order to avoid migrating ideally placed numa tasks, | ||
6014 | * ignore those when there's better options. | ||
6014 | * ignore those when there are better options. | ||
6015 | * | ||
6016 | * If we ignore the actual busiest queue to migrate another | ||
6017 | * task, the next balance pass can still reduce the busiest | ||
6018 | * queue by moving tasks around inside the node. | ||
6019 | * | ||
6020 | * If we cannot move enough load due to this classification | ||
6021 | * the next pass will adjust the group classification and | ||
6022 | * allow migration of more tasks. | ||
6023 | * | ||
6024 | * Both cases only affect the total convergence complexity. | ||
6025 | */ | ||
6026 | if (rt > env->fbq_type) | ||
6027 | continue; | ||
6028 | |||
6029 | power = power_of(i); | ||
6030 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | ||
5061 | if (!capacity) | 6031 | if (!capacity) |
5062 | capacity = fix_small_capacity(env->sd, group); | 6032 | capacity = fix_small_capacity(env->sd, group); |
5063 | 6033 | ||
5064 | rq = cpu_rq(i); | ||
5065 | wl = weighted_cpuload(i); | 6034 | wl = weighted_cpuload(i); |
5066 | 6035 | ||
5067 | /* | 6036 | /* |
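The comment above leans on the declaration order of enum fbq_type introduced earlier: regular < remote < all. A runqueue is skipped whenever its own class is strictly greater than the class of the busiest group, so for instance a group that still has plain !numa tasks never pulls from a runqueue whose tasks are all on their preferred node. The fragment below isolates that comparison; the task counts are invented.

#include <stdio.h>

enum fbq_type { regular, remote, all };  /* order matters: regular < remote < all */

/* Same shape as fbq_classify_rq(): compare total vs numa vs preferred counts. */
static enum fbq_type classify(unsigned int nr_running,
			      unsigned int nr_numa_running,
			      unsigned int nr_preferred_running)
{
	if (nr_running > nr_numa_running)
		return regular;   /* some plain, non-numa tasks to move      */
	if (nr_running > nr_preferred_running)
		return remote;    /* numa tasks running on the 'wrong' node  */
	return all;               /* everything is ideally placed            */
}

int main(void)
{
	enum fbq_type group_type = regular;        /* busiest group still has !numa tasks */
	enum fbq_type rq_type = classify(4, 4, 4); /* this rq: all tasks well placed      */

	/* Mirrors "if (rt > env->fbq_type) continue;" in find_busiest_queue(). */
	printf("%s\n", rq_type > group_type ? "skip this rq" : "consider this rq");
	return 0;
}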
@@ -5164,6 +6133,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5164 | int *continue_balancing) | 6133 | int *continue_balancing) |
5165 | { | 6134 | { |
5166 | int ld_moved, cur_ld_moved, active_balance = 0; | 6135 | int ld_moved, cur_ld_moved, active_balance = 0; |
6136 | struct sched_domain *sd_parent = sd->parent; | ||
5167 | struct sched_group *group; | 6137 | struct sched_group *group; |
5168 | struct rq *busiest; | 6138 | struct rq *busiest; |
5169 | unsigned long flags; | 6139 | unsigned long flags; |
@@ -5177,6 +6147,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5177 | .idle = idle, | 6147 | .idle = idle, |
5178 | .loop_break = sched_nr_migrate_break, | 6148 | .loop_break = sched_nr_migrate_break, |
5179 | .cpus = cpus, | 6149 | .cpus = cpus, |
6150 | .fbq_type = all, | ||
5180 | }; | 6151 | }; |
5181 | 6152 | ||
5182 | /* | 6153 | /* |
@@ -5268,17 +6239,17 @@ more_balance: | |||
5268 | * moreover subsequent load balance cycles should correct the | 6239 | * moreover subsequent load balance cycles should correct the |
5269 | * excess load moved. | 6240 | * excess load moved. |
5270 | */ | 6241 | */ |
5271 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | 6242 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { |
6243 | |||
6244 | /* Prevent to re-select dst_cpu via env's cpus */ | ||
6245 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
5272 | 6246 | ||
5273 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 6247 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
5274 | env.dst_cpu = env.new_dst_cpu; | 6248 | env.dst_cpu = env.new_dst_cpu; |
5275 | env.flags &= ~LBF_SOME_PINNED; | 6249 | env.flags &= ~LBF_DST_PINNED; |
5276 | env.loop = 0; | 6250 | env.loop = 0; |
5277 | env.loop_break = sched_nr_migrate_break; | 6251 | env.loop_break = sched_nr_migrate_break; |
5278 | 6252 | ||
5279 | /* Prevent to re-select dst_cpu via env's cpus */ | ||
5280 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
5281 | |||
5282 | /* | 6253 | /* |
5283 | * Go back to "more_balance" rather than "redo" since we | 6254 | * Go back to "more_balance" rather than "redo" since we |
5284 | * need to continue with same src_cpu. | 6255 | * need to continue with same src_cpu. |
@@ -5286,6 +6257,18 @@ more_balance: | |||
5286 | goto more_balance; | 6257 | goto more_balance; |
5287 | } | 6258 | } |
5288 | 6259 | ||
6260 | /* | ||
6261 | * We failed to reach balance because of affinity. | ||
6262 | */ | ||
6263 | if (sd_parent) { | ||
6264 | int *group_imbalance = &sd_parent->groups->sgp->imbalance; | ||
6265 | |||
6266 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | ||
6267 | *group_imbalance = 1; | ||
6268 | } else if (*group_imbalance) | ||
6269 | *group_imbalance = 0; | ||
6270 | } | ||
6271 | |||
5289 | /* All tasks on this runqueue were pinned by CPU affinity */ | 6272 | /* All tasks on this runqueue were pinned by CPU affinity */ |
5290 | if (unlikely(env.flags & LBF_ALL_PINNED)) { | 6273 | if (unlikely(env.flags & LBF_ALL_PINNED)) { |
5291 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 6274 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
@@ -5393,6 +6376,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5393 | struct sched_domain *sd; | 6376 | struct sched_domain *sd; |
5394 | int pulled_task = 0; | 6377 | int pulled_task = 0; |
5395 | unsigned long next_balance = jiffies + HZ; | 6378 | unsigned long next_balance = jiffies + HZ; |
6379 | u64 curr_cost = 0; | ||
5396 | 6380 | ||
5397 | this_rq->idle_stamp = rq_clock(this_rq); | 6381 | this_rq->idle_stamp = rq_clock(this_rq); |
5398 | 6382 | ||
@@ -5409,15 +6393,27 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5409 | for_each_domain(this_cpu, sd) { | 6393 | for_each_domain(this_cpu, sd) { |
5410 | unsigned long interval; | 6394 | unsigned long interval; |
5411 | int continue_balancing = 1; | 6395 | int continue_balancing = 1; |
6396 | u64 t0, domain_cost; | ||
5412 | 6397 | ||
5413 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6398 | if (!(sd->flags & SD_LOAD_BALANCE)) |
5414 | continue; | 6399 | continue; |
5415 | 6400 | ||
6401 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) | ||
6402 | break; | ||
6403 | |||
5416 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 6404 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
6405 | t0 = sched_clock_cpu(this_cpu); | ||
6406 | |||
5417 | /* If we've pulled tasks over stop searching: */ | 6407 | /* If we've pulled tasks over stop searching: */ |
5418 | pulled_task = load_balance(this_cpu, this_rq, | 6408 | pulled_task = load_balance(this_cpu, this_rq, |
5419 | sd, CPU_NEWLY_IDLE, | 6409 | sd, CPU_NEWLY_IDLE, |
5420 | &continue_balancing); | 6410 | &continue_balancing); |
6411 | |||
6412 | domain_cost = sched_clock_cpu(this_cpu) - t0; | ||
6413 | if (domain_cost > sd->max_newidle_lb_cost) | ||
6414 | sd->max_newidle_lb_cost = domain_cost; | ||
6415 | |||
6416 | curr_cost += domain_cost; | ||
5421 | } | 6417 | } |
5422 | 6418 | ||
5423 | interval = msecs_to_jiffies(sd->balance_interval); | 6419 | interval = msecs_to_jiffies(sd->balance_interval); |
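The idle_balance() changes above time each newidle balance with sched_clock_cpu(), remember the per-domain maximum in max_newidle_lb_cost, and abandon the domain walk as soon as the accumulated cost would exceed the CPU's expected idle time (avg_idle). A compressed model of that bookkeeping, with invented nanosecond figures instead of real clock reads, could look like this:

#include <stdio.h>

typedef unsigned long long u64;

struct dom { const char *name; u64 max_newidle_lb_cost; };

/* Pretend cost of balancing each domain level on this pass, in ns (illustrative). */
static u64 fake_balance_cost(int level) { return 30000ull * (level + 1); }

int main(void)
{
	struct dom doms[] = { { "SMT", 20000 }, { "MC", 50000 }, { "NUMA", 400000 } };
	u64 avg_idle = 250000;  /* expected idle time of this CPU, ns */
	u64 curr_cost = 0;

	for (int i = 0; i < 3; i++) {
		struct dom *sd = &doms[i];

		/* Same early exit as the patch: don't spend longer balancing
		 * than we expect to stay idle. */
		if (avg_idle < curr_cost + sd->max_newidle_lb_cost) {
			printf("stop before %s\n", sd->name);
			break;
		}

		u64 domain_cost = fake_balance_cost(i);
		if (domain_cost > sd->max_newidle_lb_cost)
			sd->max_newidle_lb_cost = domain_cost;
		curr_cost += domain_cost;
		printf("balanced %s, curr_cost=%llu\n", sd->name, curr_cost);
	}
	return 0;
}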
@@ -5439,6 +6435,9 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5439 | */ | 6435 | */ |
5440 | this_rq->next_balance = next_balance; | 6436 | this_rq->next_balance = next_balance; |
5441 | } | 6437 | } |
6438 | |||
6439 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
6440 | this_rq->max_idle_balance_cost = curr_cost; | ||
5442 | } | 6441 | } |
5443 | 6442 | ||
5444 | /* | 6443 | /* |
@@ -5572,16 +6571,16 @@ static inline void nohz_balance_exit_idle(int cpu) | |||
5572 | static inline void set_cpu_sd_state_busy(void) | 6571 | static inline void set_cpu_sd_state_busy(void) |
5573 | { | 6572 | { |
5574 | struct sched_domain *sd; | 6573 | struct sched_domain *sd; |
6574 | int cpu = smp_processor_id(); | ||
5575 | 6575 | ||
5576 | rcu_read_lock(); | 6576 | rcu_read_lock(); |
5577 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); | 6577 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
5578 | 6578 | ||
5579 | if (!sd || !sd->nohz_idle) | 6579 | if (!sd || !sd->nohz_idle) |
5580 | goto unlock; | 6580 | goto unlock; |
5581 | sd->nohz_idle = 0; | 6581 | sd->nohz_idle = 0; |
5582 | 6582 | ||
5583 | for (; sd; sd = sd->parent) | 6583 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); |
5584 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | ||
5585 | unlock: | 6584 | unlock: |
5586 | rcu_read_unlock(); | 6585 | rcu_read_unlock(); |
5587 | } | 6586 | } |
@@ -5589,16 +6588,16 @@ unlock: | |||
5589 | void set_cpu_sd_state_idle(void) | 6588 | void set_cpu_sd_state_idle(void) |
5590 | { | 6589 | { |
5591 | struct sched_domain *sd; | 6590 | struct sched_domain *sd; |
6591 | int cpu = smp_processor_id(); | ||
5592 | 6592 | ||
5593 | rcu_read_lock(); | 6593 | rcu_read_lock(); |
5594 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); | 6594 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
5595 | 6595 | ||
5596 | if (!sd || sd->nohz_idle) | 6596 | if (!sd || sd->nohz_idle) |
5597 | goto unlock; | 6597 | goto unlock; |
5598 | sd->nohz_idle = 1; | 6598 | sd->nohz_idle = 1; |
5599 | 6599 | ||
5600 | for (; sd; sd = sd->parent) | 6600 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); |
5601 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | ||
5602 | unlock: | 6601 | unlock: |
5603 | rcu_read_unlock(); | 6602 | rcu_read_unlock(); |
5604 | } | 6603 | } |
@@ -5662,15 +6661,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5662 | /* Earliest time when we have to do rebalance again */ | 6661 | /* Earliest time when we have to do rebalance again */ |
5663 | unsigned long next_balance = jiffies + 60*HZ; | 6662 | unsigned long next_balance = jiffies + 60*HZ; |
5664 | int update_next_balance = 0; | 6663 | int update_next_balance = 0; |
5665 | int need_serialize; | 6664 | int need_serialize, need_decay = 0; |
6665 | u64 max_cost = 0; | ||
5666 | 6666 | ||
5667 | update_blocked_averages(cpu); | 6667 | update_blocked_averages(cpu); |
5668 | 6668 | ||
5669 | rcu_read_lock(); | 6669 | rcu_read_lock(); |
5670 | for_each_domain(cpu, sd) { | 6670 | for_each_domain(cpu, sd) { |
6671 | /* | ||
6672 | * Decay the newidle max times here because this is a regular | ||
6673 | * visit to all the domains. Decay ~1% per second. | ||
6674 | */ | ||
6675 | if (time_after(jiffies, sd->next_decay_max_lb_cost)) { | ||
6676 | sd->max_newidle_lb_cost = | ||
6677 | (sd->max_newidle_lb_cost * 253) / 256; | ||
6678 | sd->next_decay_max_lb_cost = jiffies + HZ; | ||
6679 | need_decay = 1; | ||
6680 | } | ||
6681 | max_cost += sd->max_newidle_lb_cost; | ||
6682 | |||
5671 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6683 | if (!(sd->flags & SD_LOAD_BALANCE)) |
5672 | continue; | 6684 | continue; |
5673 | 6685 | ||
6686 | /* | ||
6687 | * Stop the load balance at this level. There is another | ||
6688 | * CPU in our sched group which is doing load balancing more | ||
6689 | * actively. | ||
6690 | */ | ||
6691 | if (!continue_balancing) { | ||
6692 | if (need_decay) | ||
6693 | continue; | ||
6694 | break; | ||
6695 | } | ||
6696 | |||
5674 | interval = sd->balance_interval; | 6697 | interval = sd->balance_interval; |
5675 | if (idle != CPU_IDLE) | 6698 | if (idle != CPU_IDLE) |
5676 | interval *= sd->busy_factor; | 6699 | interval *= sd->busy_factor; |
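The 253/256 factor applied once per second decays max_newidle_lb_cost by about 1.2% per step, so a one-off expensive balance stops suppressing newidle balancing after roughly a minute rather than forever; the following hunk then keeps the rq-wide maximum at no less than sysctl_sched_migration_cost so rq->avg_idle comparisons stay sensible. A few lines of arithmetic make the time constant concrete (the starting value is illustrative):

#include <stdio.h>

int main(void)
{
	unsigned long long cost = 1000000;  /* 1 ms recorded as a one-off spike */

	for (int sec = 1; sec <= 60; sec++) {
		cost = cost * 253 / 256;    /* same decay as rebalance_domains() */
		if (sec % 20 == 0)
			printf("after %2ds: %llu ns\n", sec, cost);
	}
	/* roughly ~790us after 20s, ~624us after 40s, ~493us after 60s */
	return 0;
}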
@@ -5689,7 +6712,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5689 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 6712 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
5690 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { | 6713 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { |
5691 | /* | 6714 | /* |
5692 | * The LBF_SOME_PINNED logic could have changed | 6715 | * The LBF_DST_PINNED logic could have changed |
5693 | * env->dst_cpu, so we can't know our idle | 6716 | * env->dst_cpu, so we can't know our idle |
5694 | * state even if we migrated tasks. Update it. | 6717 | * state even if we migrated tasks. Update it. |
5695 | */ | 6718 | */ |
@@ -5704,14 +6727,14 @@ out: | |||
5704 | next_balance = sd->last_balance + interval; | 6727 | next_balance = sd->last_balance + interval; |
5705 | update_next_balance = 1; | 6728 | update_next_balance = 1; |
5706 | } | 6729 | } |
5707 | 6730 | } | |
6731 | if (need_decay) { | ||
5708 | /* | 6732 | /* |
5709 | * Stop the load balance at this level. There is another | 6733 | * Ensure the rq-wide value also decays but keep it at a |
5710 | * CPU in our sched group which is doing load balancing more | 6734 | * reasonable floor to avoid funnies with rq->avg_idle. |
5711 | * actively. | ||
5712 | */ | 6735 | */ |
5713 | if (!continue_balancing) | 6736 | rq->max_idle_balance_cost = |
5714 | break; | 6737 | max((u64)sysctl_sched_migration_cost, max_cost); |
5715 | } | 6738 | } |
5716 | rcu_read_unlock(); | 6739 | rcu_read_unlock(); |
5717 | 6740 | ||
@@ -5781,6 +6804,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
5781 | { | 6804 | { |
5782 | unsigned long now = jiffies; | 6805 | unsigned long now = jiffies; |
5783 | struct sched_domain *sd; | 6806 | struct sched_domain *sd; |
6807 | struct sched_group_power *sgp; | ||
6808 | int nr_busy; | ||
5784 | 6809 | ||
5785 | if (unlikely(idle_cpu(cpu))) | 6810 | if (unlikely(idle_cpu(cpu))) |
5786 | return 0; | 6811 | return 0; |
@@ -5806,22 +6831,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
5806 | goto need_kick; | 6831 | goto need_kick; |
5807 | 6832 | ||
5808 | rcu_read_lock(); | 6833 | rcu_read_lock(); |
5809 | for_each_domain(cpu, sd) { | 6834 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
5810 | struct sched_group *sg = sd->groups; | ||
5811 | struct sched_group_power *sgp = sg->sgp; | ||
5812 | int nr_busy = atomic_read(&sgp->nr_busy_cpus); | ||
5813 | 6835 | ||
5814 | if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) | 6836 | if (sd) { |
5815 | goto need_kick_unlock; | 6837 | sgp = sd->groups->sgp; |
6838 | nr_busy = atomic_read(&sgp->nr_busy_cpus); | ||
5816 | 6839 | ||
5817 | if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight | 6840 | if (nr_busy > 1) |
5818 | && (cpumask_first_and(nohz.idle_cpus_mask, | ||
5819 | sched_domain_span(sd)) < cpu)) | ||
5820 | goto need_kick_unlock; | 6841 | goto need_kick_unlock; |
5821 | |||
5822 | if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) | ||
5823 | break; | ||
5824 | } | 6842 | } |
6843 | |||
6844 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
6845 | |||
6846 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | ||
6847 | sched_domain_span(sd)) < cpu)) | ||
6848 | goto need_kick_unlock; | ||
6849 | |||
5825 | rcu_read_unlock(); | 6850 | rcu_read_unlock(); |
5826 | return 0; | 6851 | return 0; |
5827 | 6852 | ||
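nohz_kick_needed() above no longer walks the whole domain hierarchy on every tick; it dereferences the per-cpu sd_busy and sd_asym pointers (declared in the sched.h hunk further down) that were resolved once when the domains were built. The sketch below only illustrates that cache-the-lookup idea with a plain parent list and saved pointers; which level the kernel actually caches into sd_busy, and the RCU protection around the pointers, are not modelled here.

#include <stddef.h>
#include <stdio.h>

#define FLAG_SHARE_PKG  0x1
#define FLAG_ASYM_PACK  0x2

struct domain { const char *name; int flags; struct domain *parent; };

/* Done once at domain-build time: remember the interesting levels. */
static struct domain *cache_first_with(struct domain *sd, int flag)
{
	for (; sd; sd = sd->parent)
		if (sd->flags & flag)
			return sd;
	return NULL;
}

int main(void)
{
	struct domain numa = { "NUMA", 0, NULL };
	struct domain mc   = { "MC",  FLAG_SHARE_PKG, &numa };
	struct domain smt  = { "SMT", FLAG_SHARE_PKG, &mc };

	/* cached once ... */
	struct domain *sd_busy = cache_first_with(&smt, FLAG_SHARE_PKG);
	struct domain *sd_asym = cache_first_with(&smt, FLAG_ASYM_PACK);

	/* ... and on every tick the check is a single pointer dereference. */
	printf("busy check on %s, asym check: %s\n",
	       sd_busy ? sd_busy->name : "none", sd_asym ? "yes" : "skipped");
	return 0;
}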
@@ -6214,7 +7239,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
6214 | se->cfs_rq = parent->my_q; | 7239 | se->cfs_rq = parent->my_q; |
6215 | 7240 | ||
6216 | se->my_q = cfs_rq; | 7241 | se->my_q = cfs_rq; |
6217 | update_load_set(&se->load, 0); | 7242 | /* guarantee group entities always have weight */ |
7243 | update_load_set(&se->load, NICE_0_LOAD); | ||
6218 | se->parent = parent; | 7244 | se->parent = parent; |
6219 | } | 7245 | } |
6220 | 7246 | ||
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 99399f8e4799..5716929a2e3a 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false) | |||
63 | /* | 63 | /* |
64 | * Apply the automatic NUMA scheduling policy. Enabled automatically | 64 | * Apply the automatic NUMA scheduling policy. Enabled automatically |
65 | * at runtime if running on a NUMA machine. Can be controlled via | 65 | * at runtime if running on a NUMA machine. Can be controlled via |
66 | * numa_balancing=. Allow PTE scanning to be forced on UMA machines | 66 | * numa_balancing= |
67 | * for debugging the core machinery. | ||
68 | */ | 67 | */ |
69 | #ifdef CONFIG_NUMA_BALANCING | 68 | #ifdef CONFIG_NUMA_BALANCING |
70 | SCHED_FEAT(NUMA, false) | 69 | SCHED_FEAT(NUMA, false) |
71 | SCHED_FEAT(NUMA_FORCE, false) | 70 | |
71 | /* | ||
72 | * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a | ||
73 | * higher number of hinting faults are recorded during active load | ||
74 | * balancing. | ||
75 | */ | ||
76 | SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) | ||
77 | |||
78 | /* | ||
79 | * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a | ||
80 | * lower number of hinting faults have been recorded. As this has | ||
81 | * the potential to prevent a task ever migrating to a new node | ||
82 | * due to CPU overload it is disabled by default. | ||
83 | */ | ||
84 | SCHED_FEAT(NUMA_RESIST_LOWER, false) | ||
72 | #endif | 85 | #endif |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index d8da01008d39..516c3d9ceea1 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -9,7 +9,7 @@ | |||
9 | 9 | ||
10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
11 | static int | 11 | static int |
12 | select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) | 12 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) |
13 | { | 13 | { |
14 | return task_cpu(p); /* IDLE tasks are never migrated */ | 14 | return task_cpu(p); /* IDLE tasks are never migrated */ |
15 | } | 15 | } |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 01970c8e64df..1c4065575fa2 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq) | |||
246 | * if we should look at the mask. It would be a shame | 246 | * if we should look at the mask. It would be a shame |
247 | * if we looked at the mask, but the mask was not | 247 | * if we looked at the mask, but the mask was not |
248 | * updated yet. | 248 | * updated yet. |
249 | * | ||
250 | * Matched by the barrier in pull_rt_task(). | ||
249 | */ | 251 | */ |
250 | wmb(); | 252 | smp_wmb(); |
251 | atomic_inc(&rq->rd->rto_count); | 253 | atomic_inc(&rq->rd->rto_count); |
252 | } | 254 | } |
253 | 255 | ||
@@ -899,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | |||
899 | { | 901 | { |
900 | struct rq *rq = rq_of_rt_rq(rt_rq); | 902 | struct rq *rq = rq_of_rt_rq(rt_rq); |
901 | 903 | ||
904 | #ifdef CONFIG_RT_GROUP_SCHED | ||
905 | /* | ||
906 | * Change rq's cpupri only if rt_rq is the top queue. | ||
907 | */ | ||
908 | if (&rq->rt != rt_rq) | ||
909 | return; | ||
910 | #endif | ||
902 | if (rq->online && prio < prev_prio) | 911 | if (rq->online && prio < prev_prio) |
903 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); | 912 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); |
904 | } | 913 | } |
@@ -908,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | |||
908 | { | 917 | { |
909 | struct rq *rq = rq_of_rt_rq(rt_rq); | 918 | struct rq *rq = rq_of_rt_rq(rt_rq); |
910 | 919 | ||
920 | #ifdef CONFIG_RT_GROUP_SCHED | ||
921 | /* | ||
922 | * Change rq's cpupri only if rt_rq is the top queue. | ||
923 | */ | ||
924 | if (&rq->rt != rt_rq) | ||
925 | return; | ||
926 | #endif | ||
911 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) | 927 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) |
912 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); | 928 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); |
913 | } | 929 | } |
@@ -1169,13 +1185,10 @@ static void yield_task_rt(struct rq *rq) | |||
1169 | static int find_lowest_rq(struct task_struct *task); | 1185 | static int find_lowest_rq(struct task_struct *task); |
1170 | 1186 | ||
1171 | static int | 1187 | static int |
1172 | select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | 1188 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) |
1173 | { | 1189 | { |
1174 | struct task_struct *curr; | 1190 | struct task_struct *curr; |
1175 | struct rq *rq; | 1191 | struct rq *rq; |
1176 | int cpu; | ||
1177 | |||
1178 | cpu = task_cpu(p); | ||
1179 | 1192 | ||
1180 | if (p->nr_cpus_allowed == 1) | 1193 | if (p->nr_cpus_allowed == 1) |
1181 | goto out; | 1194 | goto out; |
@@ -1213,8 +1226,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1213 | */ | 1226 | */ |
1214 | if (curr && unlikely(rt_task(curr)) && | 1227 | if (curr && unlikely(rt_task(curr)) && |
1215 | (curr->nr_cpus_allowed < 2 || | 1228 | (curr->nr_cpus_allowed < 2 || |
1216 | curr->prio <= p->prio) && | 1229 | curr->prio <= p->prio)) { |
1217 | (p->nr_cpus_allowed > 1)) { | ||
1218 | int target = find_lowest_rq(p); | 1230 | int target = find_lowest_rq(p); |
1219 | 1231 | ||
1220 | if (target != -1) | 1232 | if (target != -1) |
@@ -1630,6 +1642,12 @@ static int pull_rt_task(struct rq *this_rq) | |||
1630 | if (likely(!rt_overloaded(this_rq))) | 1642 | if (likely(!rt_overloaded(this_rq))) |
1631 | return 0; | 1643 | return 0; |
1632 | 1644 | ||
1645 | /* | ||
1646 | * Match the barrier from rt_set_overloaded; this guarantees that if we | ||
1647 | * see overloaded we must also see the rto_mask bit. | ||
1648 | */ | ||
1649 | smp_rmb(); | ||
1650 | |||
1633 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 1651 | for_each_cpu(cpu, this_rq->rd->rto_mask) { |
1634 | if (this_cpu == cpu) | 1652 | if (this_cpu == cpu) |
1635 | continue; | 1653 | continue; |
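The pair of rt.c hunks turn the bare wmb() in rt_set_overload() into smp_wmb() and add the matching smp_rmb() in pull_rt_task(): whoever observes the incremented rto_count is then guaranteed to also observe the rto_mask bit that was set before it. A user-space C11 rendering of that publish/consume pairing is sketched below; the atomic fences stand in for the kernel barriers and the two variables for rd->rto_mask and rd->rto_count.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int rto_mask;   /* plays rd->rto_mask (a single bit here) */
static atomic_int rto_count;  /* plays rd->rto_count                    */

static void *overloaded_cpu(void *arg)
{
	atomic_store_explicit(&rto_mask, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);            /* smp_wmb() */
	atomic_fetch_add_explicit(&rto_count, 1, memory_order_relaxed);
	return NULL;
}

static void *pulling_cpu(void *arg)
{
	if (atomic_load_explicit(&rto_count, memory_order_relaxed)) {
		atomic_thread_fence(memory_order_acquire);    /* smp_rmb() */
		/* If the count was seen, the mask bit must be visible too. */
		printf("mask=%d\n",
		       atomic_load_explicit(&rto_mask, memory_order_relaxed));
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, overloaded_cpu, NULL);
	pthread_create(&b, NULL, pulling_cpu, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}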
@@ -1931,8 +1949,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
1931 | p->rt.time_slice = sched_rr_timeslice; | 1949 | p->rt.time_slice = sched_rr_timeslice; |
1932 | 1950 | ||
1933 | /* | 1951 | /* |
1934 | * Requeue to the end of queue if we (and all of our ancestors) are the | 1952 | * Requeue to the end of queue if we (and all of our ancestors) are not |
1935 | * only element on the queue | 1953 | * the only element on the queue |
1936 | */ | 1954 | */ |
1937 | for_each_sched_rt_entity(rt_se) { | 1955 | for_each_sched_rt_entity(rt_se) { |
1938 | if (rt_se->run_list.prev != rt_se->run_list.next) { | 1956 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3c5653e1dca..88c85b21d633 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/spinlock.h> | 6 | #include <linux/spinlock.h> |
7 | #include <linux/stop_machine.h> | 7 | #include <linux/stop_machine.h> |
8 | #include <linux/tick.h> | 8 | #include <linux/tick.h> |
9 | #include <linux/slab.h> | ||
9 | 10 | ||
10 | #include "cpupri.h" | 11 | #include "cpupri.h" |
11 | #include "cpuacct.h" | 12 | #include "cpuacct.h" |
@@ -408,6 +409,10 @@ struct rq { | |||
408 | * remote CPUs use both these fields when doing load calculation. | 409 | * remote CPUs use both these fields when doing load calculation. |
409 | */ | 410 | */ |
410 | unsigned int nr_running; | 411 | unsigned int nr_running; |
412 | #ifdef CONFIG_NUMA_BALANCING | ||
413 | unsigned int nr_numa_running; | ||
414 | unsigned int nr_preferred_running; | ||
415 | #endif | ||
411 | #define CPU_LOAD_IDX_MAX 5 | 416 | #define CPU_LOAD_IDX_MAX 5 |
412 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 417 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
413 | unsigned long last_load_update_tick; | 418 | unsigned long last_load_update_tick; |
@@ -476,6 +481,9 @@ struct rq { | |||
476 | u64 age_stamp; | 481 | u64 age_stamp; |
477 | u64 idle_stamp; | 482 | u64 idle_stamp; |
478 | u64 avg_idle; | 483 | u64 avg_idle; |
484 | |||
485 | /* This is used to determine avg_idle's max value */ | ||
486 | u64 max_idle_balance_cost; | ||
479 | #endif | 487 | #endif |
480 | 488 | ||
481 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 489 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq) | |||
552 | return rq->clock_task; | 560 | return rq->clock_task; |
553 | } | 561 | } |
554 | 562 | ||
563 | #ifdef CONFIG_NUMA_BALANCING | ||
564 | extern void sched_setnuma(struct task_struct *p, int node); | ||
565 | extern int migrate_task_to(struct task_struct *p, int cpu); | ||
566 | extern int migrate_swap(struct task_struct *, struct task_struct *); | ||
567 | #endif /* CONFIG_NUMA_BALANCING */ | ||
568 | |||
555 | #ifdef CONFIG_SMP | 569 | #ifdef CONFIG_SMP |
556 | 570 | ||
557 | #define rcu_dereference_check_sched_domain(p) \ | 571 | #define rcu_dereference_check_sched_domain(p) \ |
@@ -593,9 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
593 | return hsd; | 607 | return hsd; |
594 | } | 608 | } |
595 | 609 | ||
610 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
611 | { | ||
612 | struct sched_domain *sd; | ||
613 | |||
614 | for_each_domain(cpu, sd) { | ||
615 | if (sd->flags & flag) | ||
616 | break; | ||
617 | } | ||
618 | |||
619 | return sd; | ||
620 | } | ||
621 | |||
596 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 622 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
597 | DECLARE_PER_CPU(int, sd_llc_size); | 623 | DECLARE_PER_CPU(int, sd_llc_size); |
598 | DECLARE_PER_CPU(int, sd_llc_id); | 624 | DECLARE_PER_CPU(int, sd_llc_id); |
625 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); | ||
626 | DECLARE_PER_CPU(struct sched_domain *, sd_busy); | ||
627 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | ||
599 | 628 | ||
600 | struct sched_group_power { | 629 | struct sched_group_power { |
601 | atomic_t ref; | 630 | atomic_t ref; |
@@ -605,6 +634,7 @@ struct sched_group_power { | |||
605 | */ | 634 | */ |
606 | unsigned int power, power_orig; | 635 | unsigned int power, power_orig; |
607 | unsigned long next_update; | 636 | unsigned long next_update; |
637 | int imbalance; /* XXX unrelated to power but shared group state */ | ||
608 | /* | 638 | /* |
609 | * Number of busy cpus in this group. | 639 | * Number of busy cpus in this group. |
610 | */ | 640 | */ |
@@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
719 | */ | 749 | */ |
720 | smp_wmb(); | 750 | smp_wmb(); |
721 | task_thread_info(p)->cpu = cpu; | 751 | task_thread_info(p)->cpu = cpu; |
752 | p->wake_cpu = cpu; | ||
722 | #endif | 753 | #endif |
723 | } | 754 | } |
724 | 755 | ||
@@ -974,7 +1005,7 @@ struct sched_class { | |||
974 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1005 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
975 | 1006 | ||
976 | #ifdef CONFIG_SMP | 1007 | #ifdef CONFIG_SMP |
977 | int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); | 1008 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
978 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); | 1009 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); |
979 | 1010 | ||
980 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | 1011 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); |
@@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | |||
1220 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | 1251 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); |
1221 | } | 1252 | } |
1222 | 1253 | ||
1254 | static inline void double_lock(spinlock_t *l1, spinlock_t *l2) | ||
1255 | { | ||
1256 | if (l1 > l2) | ||
1257 | swap(l1, l2); | ||
1258 | |||
1259 | spin_lock(l1); | ||
1260 | spin_lock_nested(l2, SINGLE_DEPTH_NESTING); | ||
1261 | } | ||
1262 | |||
1263 | static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) | ||
1264 | { | ||
1265 | if (l1 > l2) | ||
1266 | swap(l1, l2); | ||
1267 | |||
1268 | raw_spin_lock(l1); | ||
1269 | raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); | ||
1270 | } | ||
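Both helpers sidestep ABBA deadlock by imposing a single global order: the lock with the lower address is always taken first, so callers may pass the two locks in either order. A small illustration with made-up lock names:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(lock_a);
static DEFINE_RAW_SPINLOCK(lock_b);

static void update_both(void)
{
	double_raw_lock(&lock_a, &lock_b);	/* acquired in address order */
	/* ... operate on the structures protected by both locks ... */
	raw_spin_unlock(&lock_b);
	raw_spin_unlock(&lock_a);
}

Only the acquisition order is constrained; the unlock order does not matter.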
1271 | |||
1223 | /* | 1272 | /* |
1224 | * double_rq_lock - safely lock two runqueues | 1273 | * double_rq_lock - safely lock two runqueues |
1225 | * | 1274 | * |
@@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
1305 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1354 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
1306 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1355 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
1307 | 1356 | ||
1308 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | 1357 | extern void cfs_bandwidth_usage_inc(void); |
1358 | extern void cfs_bandwidth_usage_dec(void); | ||
1309 | 1359 | ||
1310 | #ifdef CONFIG_NO_HZ_COMMON | 1360 | #ifdef CONFIG_NO_HZ_COMMON |
1311 | enum rq_nohz_flag_bits { | 1361 | enum rq_nohz_flag_bits { |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c7edee71bce8..4ab704339656 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) | |||
59 | * from dequeue_task() to account for possible rq->clock skew across cpus. The | 59 | * from dequeue_task() to account for possible rq->clock skew across cpus. The |
60 | * delta taken on each cpu would annul the skew. | 60 | * delta taken on each cpu would annul the skew. |
61 | */ | 61 | */ |
62 | static inline void sched_info_dequeued(struct task_struct *t) | 62 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) |
63 | { | 63 | { |
64 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; | 64 | unsigned long long now = rq_clock(rq), delta = 0; |
65 | 65 | ||
66 | if (unlikely(sched_info_on())) | 66 | if (unlikely(sched_info_on())) |
67 | if (t->sched_info.last_queued) | 67 | if (t->sched_info.last_queued) |
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t) | |||
69 | sched_info_reset_dequeued(t); | 69 | sched_info_reset_dequeued(t); |
70 | t->sched_info.run_delay += delta; | 70 | t->sched_info.run_delay += delta; |
71 | 71 | ||
72 | rq_sched_info_dequeued(task_rq(t), delta); | 72 | rq_sched_info_dequeued(rq, delta); |
73 | } | 73 | } |
74 | 74 | ||
75 | /* | 75 | /* |
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t) | |||
77 | * long it was waiting to run. We also note when it began so that we | 77 | * long it was waiting to run. We also note when it began so that we |
78 | * can keep stats on how long its timeslice is. | 78 | * can keep stats on how long its timeslice is. |
79 | */ | 79 | */ |
80 | static void sched_info_arrive(struct task_struct *t) | 80 | static void sched_info_arrive(struct rq *rq, struct task_struct *t) |
81 | { | 81 | { |
82 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; | 82 | unsigned long long now = rq_clock(rq), delta = 0; |
83 | 83 | ||
84 | if (t->sched_info.last_queued) | 84 | if (t->sched_info.last_queued) |
85 | delta = now - t->sched_info.last_queued; | 85 | delta = now - t->sched_info.last_queued; |
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t) | |||
88 | t->sched_info.last_arrival = now; | 88 | t->sched_info.last_arrival = now; |
89 | t->sched_info.pcount++; | 89 | t->sched_info.pcount++; |
90 | 90 | ||
91 | rq_sched_info_arrive(task_rq(t), delta); | 91 | rq_sched_info_arrive(rq, delta); |
92 | } | 92 | } |
93 | 93 | ||
94 | /* | 94 | /* |
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t) | |||
96 | * the timestamp if it is not already set. It's assumed that | 96 | * the timestamp if it is not already set. It's assumed that |
97 | * sched_info_dequeued() will clear that stamp when appropriate. | 97 | * sched_info_dequeued() will clear that stamp when appropriate. |
98 | */ | 98 | */ |
99 | static inline void sched_info_queued(struct task_struct *t) | 99 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) |
100 | { | 100 | { |
101 | if (unlikely(sched_info_on())) | 101 | if (unlikely(sched_info_on())) |
102 | if (!t->sched_info.last_queued) | 102 | if (!t->sched_info.last_queued) |
103 | t->sched_info.last_queued = rq_clock(task_rq(t)); | 103 | t->sched_info.last_queued = rq_clock(rq); |
104 | } | 104 | } |
105 | 105 | ||
106 | /* | 106 | /* |
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t) | |||
111 | * sched_info_queued() to mark that it has now again started waiting on | 111 | * sched_info_queued() to mark that it has now again started waiting on |
112 | * the runqueue. | 112 | * the runqueue. |
113 | */ | 113 | */ |
114 | static inline void sched_info_depart(struct task_struct *t) | 114 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) |
115 | { | 115 | { |
116 | unsigned long long delta = rq_clock(task_rq(t)) - | 116 | unsigned long long delta = rq_clock(rq) - |
117 | t->sched_info.last_arrival; | 117 | t->sched_info.last_arrival; |
118 | 118 | ||
119 | rq_sched_info_depart(task_rq(t), delta); | 119 | rq_sched_info_depart(rq, delta); |
120 | 120 | ||
121 | if (t->state == TASK_RUNNING) | 121 | if (t->state == TASK_RUNNING) |
122 | sched_info_queued(t); | 122 | sched_info_queued(rq, t); |
123 | } | 123 | } |
124 | 124 | ||
125 | /* | 125 | /* |
@@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t) | |||
128 | * the idle task.) We are only called when prev != next. | 128 | * the idle task.) We are only called when prev != next. |
129 | */ | 129 | */ |
130 | static inline void | 130 | static inline void |
131 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) | 131 | __sched_info_switch(struct rq *rq, |
132 | struct task_struct *prev, struct task_struct *next) | ||
132 | { | 133 | { |
133 | struct rq *rq = task_rq(prev); | ||
134 | |||
135 | /* | 134 | /* |
136 | * prev now departs the cpu. It's not interesting to record | 135 | * prev now departs the cpu. It's not interesting to record |
137 | * stats about how efficient we were at scheduling the idle | 136 | * stats about how efficient we were at scheduling the idle |
138 | * process, however. | 137 | * process, however. |
139 | */ | 138 | */ |
140 | if (prev != rq->idle) | 139 | if (prev != rq->idle) |
141 | sched_info_depart(prev); | 140 | sched_info_depart(rq, prev); |
142 | 141 | ||
143 | if (next != rq->idle) | 142 | if (next != rq->idle) |
144 | sched_info_arrive(next); | 143 | sched_info_arrive(rq, next); |
145 | } | 144 | } |
146 | static inline void | 145 | static inline void |
147 | sched_info_switch(struct task_struct *prev, struct task_struct *next) | 146 | sched_info_switch(struct rq *rq, |
147 | struct task_struct *prev, struct task_struct *next) | ||
148 | { | 148 | { |
149 | if (unlikely(sched_info_on())) | 149 | if (unlikely(sched_info_on())) |
150 | __sched_info_switch(prev, next); | 150 | __sched_info_switch(rq, prev, next); |
151 | } | 151 | } |
152 | #else | 152 | #else |
153 | #define sched_info_queued(t) do { } while (0) | 153 | #define sched_info_queued(rq, t) do { } while (0) |
154 | #define sched_info_reset_dequeued(t) do { } while (0) | 154 | #define sched_info_reset_dequeued(t) do { } while (0) |
155 | #define sched_info_dequeued(t) do { } while (0) | 155 | #define sched_info_dequeued(rq, t) do { } while (0) |
156 | #define sched_info_switch(t, next) do { } while (0) | 156 | #define sched_info_depart(rq, t) do { } while (0) |
157 | #define sched_info_arrive(rq, next) do { } while (0) | ||
158 | #define sched_info_switch(rq, t, next) do { } while (0) | ||
157 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | 159 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
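The point of threading struct rq through these helpers (and the stubs above) is that every caller in the scheduler core already holds the runqueue, so the repeated task_rq(t) lookups disappear. A sketch of the resulting call shape, with an illustrative caller name:

/* sketch; real callers live in kernel/sched/core.c */
static void dequeue_task_sketch(struct rq *rq, struct task_struct *p)
{
	sched_info_dequeued(rq, p);	/* previously sched_info_dequeued(p) */
	/* ... class-specific dequeue follows ... */
}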
158 | 160 | ||
159 | /* | 161 | /* |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index e08fbeeb54b9..47197de8abd9 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -11,7 +11,7 @@ | |||
11 | 11 | ||
12 | #ifdef CONFIG_SMP | 12 | #ifdef CONFIG_SMP |
13 | static int | 13 | static int |
14 | select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) | 14 | select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) |
15 | { | 15 | { |
16 | return task_cpu(p); /* stop tasks never migrate */ | 16 | return task_cpu(p); /* stop tasks never migrate */ |
17 | } | 17 | } |
diff --git a/kernel/wait.c b/kernel/sched/wait.c index d550920e040c..7d50f794e248 100644 --- a/kernel/wait.c +++ b/kernel/sched/wait.c | |||
@@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue); | |||
53 | 53 | ||
54 | 54 | ||
55 | /* | 55 | /* |
56 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | ||
57 | * wake everything up. If it's an exclusive wakeup (nr_exclusive is a small | ||
58 | * positive number) then we wake all the non-exclusive tasks and one exclusive task. | ||
59 | * | ||
60 | * There are circumstances in which we can try to wake a task which has already | ||
61 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | ||
62 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | ||
63 | */ | ||
64 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | ||
65 | int nr_exclusive, int wake_flags, void *key) | ||
66 | { | ||
67 | wait_queue_t *curr, *next; | ||
68 | |||
69 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | ||
70 | unsigned flags = curr->flags; | ||
71 | |||
72 | if (curr->func(curr, mode, wake_flags, key) && | ||
73 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | ||
74 | break; | ||
75 | } | ||
76 | } | ||
77 | |||
78 | /** | ||
79 | * __wake_up - wake up threads blocked on a waitqueue. | ||
80 | * @q: the waitqueue | ||
81 | * @mode: which threads | ||
82 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
83 | * @key: is directly passed to the wakeup function | ||
84 | * | ||
85 | * It may be assumed that this function implies a write memory barrier before | ||
86 | * changing the task state if and only if any tasks are woken up. | ||
87 | */ | ||
88 | void __wake_up(wait_queue_head_t *q, unsigned int mode, | ||
89 | int nr_exclusive, void *key) | ||
90 | { | ||
91 | unsigned long flags; | ||
92 | |||
93 | spin_lock_irqsave(&q->lock, flags); | ||
94 | __wake_up_common(q, mode, nr_exclusive, 0, key); | ||
95 | spin_unlock_irqrestore(&q->lock, flags); | ||
96 | } | ||
97 | EXPORT_SYMBOL(__wake_up); | ||
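The exclusive/non-exclusive split described above maps onto the usual waiter pattern: a waiter queued with prepare_to_wait_exclusive() carries WQ_FLAG_EXCLUSIVE and is woken at most one at a time, while plain waiters all wake. A hedged sketch with made-up names (wq, ready):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(wq);
static int ready;

static void consumer(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		/* exclusive waiter: at most one is woken per wake_up() */
		prepare_to_wait_exclusive(&wq, &wait, TASK_UNINTERRUPTIBLE);
		if (ready)
			break;
		schedule();
	}
	finish_wait(&wq, &wait);
}

static void producer(void)
{
	ready = 1;
	wake_up(&wq);	/* expands to __wake_up(&wq, TASK_NORMAL, 1, NULL) */
}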
98 | |||
99 | /* | ||
100 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | ||
101 | */ | ||
102 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) | ||
103 | { | ||
104 | __wake_up_common(q, mode, nr, 0, NULL); | ||
105 | } | ||
106 | EXPORT_SYMBOL_GPL(__wake_up_locked); | ||
107 | |||
108 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | ||
109 | { | ||
110 | __wake_up_common(q, mode, 1, 0, key); | ||
111 | } | ||
112 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
113 | |||
114 | /** | ||
115 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | ||
116 | * @q: the waitqueue | ||
117 | * @mode: which threads | ||
118 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
119 | * @key: opaque value to be passed to wakeup targets | ||
120 | * | ||
121 | * The sync wakeup differs in that the waker knows that it will schedule | ||
122 | * away soon, so while the target thread will be woken up, it will not | ||
123 | * be migrated to another CPU - ie. the two threads are 'synchronized' | ||
124 | * with each other. This can prevent needless bouncing between CPUs. | ||
125 | * | ||
126 | * On UP it can prevent extra preemption. | ||
127 | * | ||
128 | * It may be assumed that this function implies a write memory barrier before | ||
129 | * changing the task state if and only if any tasks are woken up. | ||
130 | */ | ||
131 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | ||
132 | int nr_exclusive, void *key) | ||
133 | { | ||
134 | unsigned long flags; | ||
135 | int wake_flags = 1; /* XXX WF_SYNC */ | ||
136 | |||
137 | if (unlikely(!q)) | ||
138 | return; | ||
139 | |||
140 | if (unlikely(nr_exclusive != 1)) | ||
141 | wake_flags = 0; | ||
142 | |||
143 | spin_lock_irqsave(&q->lock, flags); | ||
144 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); | ||
145 | spin_unlock_irqrestore(&q->lock, flags); | ||
146 | } | ||
147 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | ||
148 | |||
149 | /* | ||
150 | * __wake_up_sync - see __wake_up_sync_key() | ||
151 | */ | ||
152 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | ||
153 | { | ||
154 | __wake_up_sync_key(q, mode, nr_exclusive, NULL); | ||
155 | } | ||
156 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | ||
157 | |||
158 | /* | ||
56 | * Note: we use "set_current_state()" _after_ the wait-queue add, | 159 | * Note: we use "set_current_state()" _after_ the wait-queue add, |
57 | * because we need a memory barrier there on SMP, so that any | 160 | * because we need a memory barrier there on SMP, so that any |
58 | * wake-function that tests for the wait-queue being active | 161 | * wake-function that tests for the wait-queue being active |
@@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | |||
92 | } | 195 | } |
93 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | 196 | EXPORT_SYMBOL(prepare_to_wait_exclusive); |
94 | 197 | ||
198 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | ||
199 | { | ||
200 | unsigned long flags; | ||
201 | |||
202 | if (signal_pending_state(state, current)) | ||
203 | return -ERESTARTSYS; | ||
204 | |||
205 | wait->private = current; | ||
206 | wait->func = autoremove_wake_function; | ||
207 | |||
208 | spin_lock_irqsave(&q->lock, flags); | ||
209 | if (list_empty(&wait->task_list)) { | ||
210 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | ||
211 | __add_wait_queue_tail(q, wait); | ||
212 | else | ||
213 | __add_wait_queue(q, wait); | ||
214 | } | ||
215 | set_current_state(state); | ||
216 | spin_unlock_irqrestore(&q->lock, flags); | ||
217 | |||
218 | return 0; | ||
219 | } | ||
220 | EXPORT_SYMBOL(prepare_to_wait_event); | ||
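prepare_to_wait_event() is the backing helper for the wait_event*() macro family: in one step it queues the waiter, sets the task state and reports a pending signal. Roughly the loop those macros build around it, simplified and not the literal macro body:

#include <linux/wait.h>
#include <linux/sched.h>

static int wait_for_flag(wait_queue_head_t *q, int *flag)
{
	DEFINE_WAIT(wait);
	long ret = 0;

	for (;;) {
		ret = prepare_to_wait_event(q, &wait, TASK_INTERRUPTIBLE);
		if (*flag)
			break;		/* condition became true */
		if (ret)
			break;		/* -ERESTARTSYS: signal pending */
		schedule();
	}
	finish_wait(q, &wait);
	return *flag ? 0 : ret;
}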
221 | |||
95 | /** | 222 | /** |
96 | * finish_wait - clean up after waiting in a queue | 223 | * finish_wait - clean up after waiting in a queue |
97 | * @q: waitqueue waited on | 224 | * @q: waitqueue waited on |
diff --git a/kernel/signal.c b/kernel/signal.c index ded28b91fa53..940b30ee9a30 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -2723,7 +2723,7 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, | |||
2723 | 2723 | ||
2724 | #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER | 2724 | #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER |
2725 | 2725 | ||
2726 | int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | 2726 | int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) |
2727 | { | 2727 | { |
2728 | int err; | 2728 | int err; |
2729 | 2729 | ||
diff --git a/kernel/smp.c b/kernel/smp.c index 0564571dcdf7..bd9f94028838 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -15,9 +15,9 @@ | |||
15 | 15 | ||
16 | #include "smpboot.h" | 16 | #include "smpboot.h" |
17 | 17 | ||
18 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | ||
19 | enum { | 18 | enum { |
20 | CSD_FLAG_LOCK = 0x01, | 19 | CSD_FLAG_LOCK = 0x01, |
20 | CSD_FLAG_WAIT = 0x02, | ||
21 | }; | 21 | }; |
22 | 22 | ||
23 | struct call_function_data { | 23 | struct call_function_data { |
@@ -124,7 +124,7 @@ static void csd_lock(struct call_single_data *csd) | |||
124 | 124 | ||
125 | static void csd_unlock(struct call_single_data *csd) | 125 | static void csd_unlock(struct call_single_data *csd) |
126 | { | 126 | { |
127 | WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); | 127 | WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); |
128 | 128 | ||
129 | /* | 129 | /* |
130 | * ensure we're all done before releasing data: | 130 | * ensure we're all done before releasing data: |
@@ -139,13 +139,15 @@ static void csd_unlock(struct call_single_data *csd) | |||
139 | * for execution on the given CPU. data must already have | 139 | * for execution on the given CPU. data must already have |
140 | * ->func, ->info, and ->flags set. | 140 | * ->func, ->info, and ->flags set. |
141 | */ | 141 | */ |
142 | static | 142 | static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) |
143 | void generic_exec_single(int cpu, struct call_single_data *csd, int wait) | ||
144 | { | 143 | { |
145 | struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); | 144 | struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); |
146 | unsigned long flags; | 145 | unsigned long flags; |
147 | int ipi; | 146 | int ipi; |
148 | 147 | ||
148 | if (wait) | ||
149 | csd->flags |= CSD_FLAG_WAIT; | ||
150 | |||
149 | raw_spin_lock_irqsave(&dst->lock, flags); | 151 | raw_spin_lock_irqsave(&dst->lock, flags); |
150 | ipi = list_empty(&dst->list); | 152 | ipi = list_empty(&dst->list); |
151 | list_add_tail(&csd->list, &dst->list); | 153 | list_add_tail(&csd->list, &dst->list); |
@@ -340,6 +342,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd, | |||
340 | } | 342 | } |
341 | put_cpu(); | 343 | put_cpu(); |
342 | } | 344 | } |
345 | EXPORT_SYMBOL_GPL(__smp_call_function_single); | ||
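With the export in place, modular code can hand a pre-initialised call_single_data to another CPU directly. A hedged sketch with illustrative names; the csd must not be reused while it is still queued:

#include <linux/smp.h>
#include <linux/printk.h>

static void remote_fn(void *info)
{
	pr_info("csd ran on CPU%d\n", smp_processor_id());
}

static struct call_single_data my_csd = {
	.func	= remote_fn,
	.info	= NULL,
};

static void kick_cpu(int cpu)
{
	__smp_call_function_single(cpu, &my_csd, 0);	/* 0: do not wait */
}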
343 | 346 | ||
344 | /** | 347 | /** |
345 | * smp_call_function_many(): Run a function on a set of other CPUs. | 348 | * smp_call_function_many(): Run a function on a set of other CPUs. |
@@ -459,7 +462,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) | |||
459 | return 0; | 462 | return 0; |
460 | } | 463 | } |
461 | EXPORT_SYMBOL(smp_call_function); | 464 | EXPORT_SYMBOL(smp_call_function); |
462 | #endif /* USE_GENERIC_SMP_HELPERS */ | ||
463 | 465 | ||
464 | /* Setup configured maximum number of CPUs to activate */ | 466 | /* Setup configured maximum number of CPUs to activate */ |
465 | unsigned int setup_max_cpus = NR_CPUS; | 467 | unsigned int setup_max_cpus = NR_CPUS; |
@@ -524,6 +526,11 @@ void __init setup_nr_cpu_ids(void) | |||
524 | nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; | 526 | nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; |
525 | } | 527 | } |
526 | 528 | ||
529 | void __weak smp_announce(void) | ||
530 | { | ||
531 | printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); | ||
532 | } | ||
533 | |||
527 | /* Called by boot processor to activate the rest. */ | 534 | /* Called by boot processor to activate the rest. */ |
528 | void __init smp_init(void) | 535 | void __init smp_init(void) |
529 | { | 536 | { |
@@ -540,7 +547,7 @@ void __init smp_init(void) | |||
540 | } | 547 | } |
541 | 548 | ||
542 | /* Any cleanup work */ | 549 | /* Any cleanup work */ |
543 | printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); | 550 | smp_announce(); |
544 | smp_cpus_done(setup_max_cpus); | 551 | smp_cpus_done(setup_max_cpus); |
545 | } | 552 | } |
546 | 553 | ||
diff --git a/kernel/softirq.c b/kernel/softirq.c index d7d498d8cc4f..11025ccc06dd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -6,8 +6,6 @@ | |||
6 | * Distribute under GPLv2. | 6 | * Distribute under GPLv2. |
7 | * | 7 | * |
8 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) | 8 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) |
9 | * | ||
10 | * Remote softirq infrastructure is by Jens Axboe. | ||
11 | */ | 9 | */ |
12 | 10 | ||
13 | #include <linux/export.h> | 11 | #include <linux/export.h> |
@@ -29,7 +27,6 @@ | |||
29 | #define CREATE_TRACE_POINTS | 27 | #define CREATE_TRACE_POINTS |
30 | #include <trace/events/irq.h> | 28 | #include <trace/events/irq.h> |
31 | 29 | ||
32 | #include <asm/irq.h> | ||
33 | /* | 30 | /* |
34 | - No shared variables, all the data are CPU local. | 31 | - No shared variables, all the data are CPU local. |
35 | - If a softirq needs serialization, let it serialize itself | 32 | - If a softirq needs serialization, let it serialize itself |
@@ -100,13 +97,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
100 | 97 | ||
101 | raw_local_irq_save(flags); | 98 | raw_local_irq_save(flags); |
102 | /* | 99 | /* |
103 | * The preempt tracer hooks into add_preempt_count and will break | 100 | * The preempt tracer hooks into preempt_count_add and will break |
104 | * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET | 101 | * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET |
105 | * is set and before current->softirq_enabled is cleared. | 102 | * is set and before current->softirq_enabled is cleared. |
106 | * We must manually increment preempt_count here and manually | 103 | * We must manually increment preempt_count here and manually |
107 | * call the trace_preempt_off later. | 104 | * call the trace_preempt_off later. |
108 | */ | 105 | */ |
109 | preempt_count() += cnt; | 106 | __preempt_count_add(cnt); |
110 | /* | 107 | /* |
111 | * Were softirqs turned off above: | 108 | * Were softirqs turned off above: |
112 | */ | 109 | */ |
@@ -120,7 +117,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
120 | #else /* !CONFIG_TRACE_IRQFLAGS */ | 117 | #else /* !CONFIG_TRACE_IRQFLAGS */ |
121 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) | 118 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) |
122 | { | 119 | { |
123 | add_preempt_count(cnt); | 120 | preempt_count_add(cnt); |
124 | barrier(); | 121 | barrier(); |
125 | } | 122 | } |
126 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 123 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
@@ -134,12 +131,11 @@ EXPORT_SYMBOL(local_bh_disable); | |||
134 | 131 | ||
135 | static void __local_bh_enable(unsigned int cnt) | 132 | static void __local_bh_enable(unsigned int cnt) |
136 | { | 133 | { |
137 | WARN_ON_ONCE(in_irq()); | ||
138 | WARN_ON_ONCE(!irqs_disabled()); | 134 | WARN_ON_ONCE(!irqs_disabled()); |
139 | 135 | ||
140 | if (softirq_count() == cnt) | 136 | if (softirq_count() == cnt) |
141 | trace_softirqs_on(_RET_IP_); | 137 | trace_softirqs_on(_RET_IP_); |
142 | sub_preempt_count(cnt); | 138 | preempt_count_sub(cnt); |
143 | } | 139 | } |
144 | 140 | ||
145 | /* | 141 | /* |
@@ -149,6 +145,7 @@ static void __local_bh_enable(unsigned int cnt) | |||
149 | */ | 145 | */ |
150 | void _local_bh_enable(void) | 146 | void _local_bh_enable(void) |
151 | { | 147 | { |
148 | WARN_ON_ONCE(in_irq()); | ||
152 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); | 149 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); |
153 | } | 150 | } |
154 | 151 | ||
@@ -169,12 +166,17 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
169 | * Keep preemption disabled until we are done with | 166 | * Keep preemption disabled until we are done with |
170 | * softirq processing: | 167 | * softirq processing: |
171 | */ | 168 | */ |
172 | sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); | 169 | preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); |
173 | 170 | ||
174 | if (unlikely(!in_interrupt() && local_softirq_pending())) | 171 | if (unlikely(!in_interrupt() && local_softirq_pending())) { |
172 | /* | ||
173 | * Run softirq if any pending. And do it in its own stack | ||
174 | * as we may be calling this deep in a task call stack already. | ||
175 | */ | ||
175 | do_softirq(); | 176 | do_softirq(); |
177 | } | ||
176 | 178 | ||
177 | dec_preempt_count(); | 179 | preempt_count_dec(); |
178 | #ifdef CONFIG_TRACE_IRQFLAGS | 180 | #ifdef CONFIG_TRACE_IRQFLAGS |
179 | local_irq_enable(); | 181 | local_irq_enable(); |
180 | #endif | 182 | #endif |
@@ -256,7 +258,7 @@ restart: | |||
256 | " exited with %08x?\n", vec_nr, | 258 | " exited with %08x?\n", vec_nr, |
257 | softirq_to_name[vec_nr], h->action, | 259 | softirq_to_name[vec_nr], h->action, |
258 | prev_count, preempt_count()); | 260 | prev_count, preempt_count()); |
259 | preempt_count() = prev_count; | 261 | preempt_count_set(prev_count); |
260 | } | 262 | } |
261 | 263 | ||
262 | rcu_bh_qs(cpu); | 264 | rcu_bh_qs(cpu); |
@@ -280,10 +282,11 @@ restart: | |||
280 | 282 | ||
281 | account_irq_exit_time(current); | 283 | account_irq_exit_time(current); |
282 | __local_bh_enable(SOFTIRQ_OFFSET); | 284 | __local_bh_enable(SOFTIRQ_OFFSET); |
285 | WARN_ON_ONCE(in_interrupt()); | ||
283 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 286 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
284 | } | 287 | } |
285 | 288 | ||
286 | #ifndef __ARCH_HAS_DO_SOFTIRQ | 289 | |
287 | 290 | ||
288 | asmlinkage void do_softirq(void) | 291 | asmlinkage void do_softirq(void) |
289 | { | 292 | { |
@@ -298,13 +301,11 @@ asmlinkage void do_softirq(void) | |||
298 | pending = local_softirq_pending(); | 301 | pending = local_softirq_pending(); |
299 | 302 | ||
300 | if (pending) | 303 | if (pending) |
301 | __do_softirq(); | 304 | do_softirq_own_stack(); |
302 | 305 | ||
303 | local_irq_restore(flags); | 306 | local_irq_restore(flags); |
304 | } | 307 | } |
305 | 308 | ||
306 | #endif | ||
307 | |||
308 | /* | 309 | /* |
309 | * Enter an interrupt context. | 310 | * Enter an interrupt context. |
310 | */ | 311 | */ |
@@ -329,15 +330,21 @@ void irq_enter(void) | |||
329 | static inline void invoke_softirq(void) | 330 | static inline void invoke_softirq(void) |
330 | { | 331 | { |
331 | if (!force_irqthreads) { | 332 | if (!force_irqthreads) { |
333 | #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK | ||
332 | /* | 334 | /* |
333 | * We can safely execute softirq on the current stack if | 335 | * We can safely execute softirq on the current stack if |
334 | * it is the irq stack, because it should be near empty | 336 | * it is the irq stack, because it should be near empty |
335 | * at this stage. But we have no way to know if the arch | 337 | * at this stage. |
336 | * calls irq_exit() on the irq stack. So call softirq | ||
337 | * in its own stack to prevent from any overrun on top | ||
338 | * of a potentially deep task stack. | ||
339 | */ | 338 | */ |
340 | do_softirq(); | 339 | __do_softirq(); |
340 | #else | ||
341 | /* | ||
342 | * Otherwise, irq_exit() is called on the task stack, which may | ||
343 | * already be deep. So run the softirq on its own stack to | ||
344 | * prevent any overrun. | ||
345 | */ | ||
346 | do_softirq_own_stack(); | ||
347 | #endif | ||
341 | } else { | 348 | } else { |
342 | wakeup_softirqd(); | 349 | wakeup_softirqd(); |
343 | } | 350 | } |
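do_softirq_own_stack() is the arch hook introduced alongside this change: architectures with a dedicated softirq stack provide an out-of-line version, and the generic fallback is believed to simply run the pending softirqs inline, roughly:

/* assumed shape of the include/linux/interrupt.h fallback, for illustration */
#ifdef __ARCH_HAS_DO_SOFTIRQ
void do_softirq_own_stack(void);		/* arch switches to its softirq stack */
#else
static inline void do_softirq_own_stack(void)
{
	__do_softirq();				/* no separate stack available */
}
#endif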
@@ -369,7 +376,7 @@ void irq_exit(void) | |||
369 | 376 | ||
370 | account_irq_exit_time(current); | 377 | account_irq_exit_time(current); |
371 | trace_hardirq_exit(); | 378 | trace_hardirq_exit(); |
372 | sub_preempt_count(HARDIRQ_OFFSET); | 379 | preempt_count_sub(HARDIRQ_OFFSET); |
373 | if (!in_interrupt() && local_softirq_pending()) | 380 | if (!in_interrupt() && local_softirq_pending()) |
374 | invoke_softirq(); | 381 | invoke_softirq(); |
375 | 382 | ||
@@ -618,146 +625,17 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, | |||
618 | } | 625 | } |
619 | EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); | 626 | EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); |
620 | 627 | ||
621 | /* | ||
622 | * Remote softirq bits | ||
623 | */ | ||
624 | |||
625 | DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); | ||
626 | EXPORT_PER_CPU_SYMBOL(softirq_work_list); | ||
627 | |||
628 | static void __local_trigger(struct call_single_data *cp, int softirq) | ||
629 | { | ||
630 | struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); | ||
631 | |||
632 | list_add_tail(&cp->list, head); | ||
633 | |||
634 | /* Trigger the softirq only if the list was previously empty. */ | ||
635 | if (head->next == &cp->list) | ||
636 | raise_softirq_irqoff(softirq); | ||
637 | } | ||
638 | |||
639 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | ||
640 | static void remote_softirq_receive(void *data) | ||
641 | { | ||
642 | struct call_single_data *cp = data; | ||
643 | unsigned long flags; | ||
644 | int softirq; | ||
645 | |||
646 | softirq = *(int *)cp->info; | ||
647 | local_irq_save(flags); | ||
648 | __local_trigger(cp, softirq); | ||
649 | local_irq_restore(flags); | ||
650 | } | ||
651 | |||
652 | static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) | ||
653 | { | ||
654 | if (cpu_online(cpu)) { | ||
655 | cp->func = remote_softirq_receive; | ||
656 | cp->info = &softirq; | ||
657 | cp->flags = 0; | ||
658 | |||
659 | __smp_call_function_single(cpu, cp, 0); | ||
660 | return 0; | ||
661 | } | ||
662 | return 1; | ||
663 | } | ||
664 | #else /* CONFIG_USE_GENERIC_SMP_HELPERS */ | ||
665 | static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) | ||
666 | { | ||
667 | return 1; | ||
668 | } | ||
669 | #endif | ||
670 | |||
671 | /** | ||
672 | * __send_remote_softirq - try to schedule softirq work on a remote cpu | ||
673 | * @cp: private SMP call function data area | ||
674 | * @cpu: the remote cpu | ||
675 | * @this_cpu: the currently executing cpu | ||
676 | * @softirq: the softirq for the work | ||
677 | * | ||
678 | * Attempt to schedule softirq work on a remote cpu. If this cannot be | ||
679 | * done, the work is instead queued up on the local cpu. | ||
680 | * | ||
681 | * Interrupts must be disabled. | ||
682 | */ | ||
683 | void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) | ||
684 | { | ||
685 | if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) | ||
686 | __local_trigger(cp, softirq); | ||
687 | } | ||
688 | EXPORT_SYMBOL(__send_remote_softirq); | ||
689 | |||
690 | /** | ||
691 | * send_remote_softirq - try to schedule softirq work on a remote cpu | ||
692 | * @cp: private SMP call function data area | ||
693 | * @cpu: the remote cpu | ||
694 | * @softirq: the softirq for the work | ||
695 | * | ||
696 | * Like __send_remote_softirq except that disabling interrupts and | ||
697 | * computing the current cpu is done for the caller. | ||
698 | */ | ||
699 | void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) | ||
700 | { | ||
701 | unsigned long flags; | ||
702 | int this_cpu; | ||
703 | |||
704 | local_irq_save(flags); | ||
705 | this_cpu = smp_processor_id(); | ||
706 | __send_remote_softirq(cp, cpu, this_cpu, softirq); | ||
707 | local_irq_restore(flags); | ||
708 | } | ||
709 | EXPORT_SYMBOL(send_remote_softirq); | ||
710 | |||
711 | static int remote_softirq_cpu_notify(struct notifier_block *self, | ||
712 | unsigned long action, void *hcpu) | ||
713 | { | ||
714 | /* | ||
715 | * If a CPU goes away, splice its entries to the current CPU | ||
716 | * and trigger a run of the softirq | ||
717 | */ | ||
718 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | ||
719 | int cpu = (unsigned long) hcpu; | ||
720 | int i; | ||
721 | |||
722 | local_irq_disable(); | ||
723 | for (i = 0; i < NR_SOFTIRQS; i++) { | ||
724 | struct list_head *head = &per_cpu(softirq_work_list[i], cpu); | ||
725 | struct list_head *local_head; | ||
726 | |||
727 | if (list_empty(head)) | ||
728 | continue; | ||
729 | |||
730 | local_head = &__get_cpu_var(softirq_work_list[i]); | ||
731 | list_splice_init(head, local_head); | ||
732 | raise_softirq_irqoff(i); | ||
733 | } | ||
734 | local_irq_enable(); | ||
735 | } | ||
736 | |||
737 | return NOTIFY_OK; | ||
738 | } | ||
739 | |||
740 | static struct notifier_block remote_softirq_cpu_notifier = { | ||
741 | .notifier_call = remote_softirq_cpu_notify, | ||
742 | }; | ||
743 | |||
744 | void __init softirq_init(void) | 628 | void __init softirq_init(void) |
745 | { | 629 | { |
746 | int cpu; | 630 | int cpu; |
747 | 631 | ||
748 | for_each_possible_cpu(cpu) { | 632 | for_each_possible_cpu(cpu) { |
749 | int i; | ||
750 | |||
751 | per_cpu(tasklet_vec, cpu).tail = | 633 | per_cpu(tasklet_vec, cpu).tail = |
752 | &per_cpu(tasklet_vec, cpu).head; | 634 | &per_cpu(tasklet_vec, cpu).head; |
753 | per_cpu(tasklet_hi_vec, cpu).tail = | 635 | per_cpu(tasklet_hi_vec, cpu).tail = |
754 | &per_cpu(tasklet_hi_vec, cpu).head; | 636 | &per_cpu(tasklet_hi_vec, cpu).head; |
755 | for (i = 0; i < NR_SOFTIRQS; i++) | ||
756 | INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); | ||
757 | } | 637 | } |
758 | 638 | ||
759 | register_hotcpu_notifier(&remote_softirq_cpu_notifier); | ||
760 | |||
761 | open_softirq(TASKLET_SOFTIRQ, tasklet_action); | 639 | open_softirq(TASKLET_SOFTIRQ, tasklet_action); |
762 | open_softirq(HI_SOFTIRQ, tasklet_hi_action); | 640 | open_softirq(HI_SOFTIRQ, tasklet_hi_action); |
763 | } | 641 | } |
@@ -771,6 +649,10 @@ static void run_ksoftirqd(unsigned int cpu) | |||
771 | { | 649 | { |
772 | local_irq_disable(); | 650 | local_irq_disable(); |
773 | if (local_softirq_pending()) { | 651 | if (local_softirq_pending()) { |
652 | /* | ||
653 | * We can safely run the softirq inline on this stack, as we are | ||
654 | * not deep in the task stack here. | ||
655 | */ | ||
774 | __do_softirq(); | 656 | __do_softirq(); |
775 | rcu_note_context_switch(cpu); | 657 | rcu_note_context_switch(cpu); |
776 | local_irq_enable(); | 658 | local_irq_enable(); |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c09f2955ae30..84571e09c907 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/kallsyms.h> | 20 | #include <linux/kallsyms.h> |
21 | #include <linux/smpboot.h> | 21 | #include <linux/smpboot.h> |
22 | #include <linux/atomic.h> | 22 | #include <linux/atomic.h> |
23 | #include <linux/lglock.h> | ||
23 | 24 | ||
24 | /* | 25 | /* |
25 | * Structure to determine completion condition and record errors. May | 26 | * Structure to determine completion condition and record errors. May |
@@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | |||
43 | static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); | 44 | static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); |
44 | static bool stop_machine_initialized = false; | 45 | static bool stop_machine_initialized = false; |
45 | 46 | ||
47 | /* | ||
48 | * Avoids a race between stop_two_cpus and global stop_cpus, where | ||
49 | * the stoppers could get queued up in reverse order, leading to | ||
50 | * system deadlock. Using an lglock means stop_two_cpus remains | ||
51 | * relatively cheap. | ||
52 | */ | ||
53 | DEFINE_STATIC_LGLOCK(stop_cpus_lock); | ||
54 | |||
46 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) | 55 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) |
47 | { | 56 | { |
48 | memset(done, 0, sizeof(*done)); | 57 | memset(done, 0, sizeof(*done)); |
@@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | |||
115 | return done.executed ? done.ret : -ENOENT; | 124 | return done.executed ? done.ret : -ENOENT; |
116 | } | 125 | } |
117 | 126 | ||
127 | /* This controls the threads on each CPU. */ | ||
128 | enum multi_stop_state { | ||
129 | /* Dummy starting state for thread. */ | ||
130 | MULTI_STOP_NONE, | ||
131 | /* Awaiting everyone to be scheduled. */ | ||
132 | MULTI_STOP_PREPARE, | ||
133 | /* Disable interrupts. */ | ||
134 | MULTI_STOP_DISABLE_IRQ, | ||
135 | /* Run the function */ | ||
136 | MULTI_STOP_RUN, | ||
137 | /* Exit */ | ||
138 | MULTI_STOP_EXIT, | ||
139 | }; | ||
140 | |||
141 | struct multi_stop_data { | ||
142 | int (*fn)(void *); | ||
143 | void *data; | ||
144 | /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ | ||
145 | unsigned int num_threads; | ||
146 | const struct cpumask *active_cpus; | ||
147 | |||
148 | enum multi_stop_state state; | ||
149 | atomic_t thread_ack; | ||
150 | }; | ||
151 | |||
152 | static void set_state(struct multi_stop_data *msdata, | ||
153 | enum multi_stop_state newstate) | ||
154 | { | ||
155 | /* Reset ack counter. */ | ||
156 | atomic_set(&msdata->thread_ack, msdata->num_threads); | ||
157 | smp_wmb(); | ||
158 | msdata->state = newstate; | ||
159 | } | ||
160 | |||
161 | /* Last one to ack a state moves to the next state. */ | ||
162 | static void ack_state(struct multi_stop_data *msdata) | ||
163 | { | ||
164 | if (atomic_dec_and_test(&msdata->thread_ack)) | ||
165 | set_state(msdata, msdata->state + 1); | ||
166 | } | ||
167 | |||
168 | /* This is the cpu_stop function which stops the CPU. */ | ||
169 | static int multi_cpu_stop(void *data) | ||
170 | { | ||
171 | struct multi_stop_data *msdata = data; | ||
172 | enum multi_stop_state curstate = MULTI_STOP_NONE; | ||
173 | int cpu = smp_processor_id(), err = 0; | ||
174 | unsigned long flags; | ||
175 | bool is_active; | ||
176 | |||
177 | /* | ||
178 | * When called from stop_machine_from_inactive_cpu(), irq might | ||
179 | * already be disabled. Save the state and restore it on exit. | ||
180 | */ | ||
181 | local_save_flags(flags); | ||
182 | |||
183 | if (!msdata->active_cpus) | ||
184 | is_active = cpu == cpumask_first(cpu_online_mask); | ||
185 | else | ||
186 | is_active = cpumask_test_cpu(cpu, msdata->active_cpus); | ||
187 | |||
188 | /* Simple state machine */ | ||
189 | do { | ||
190 | /* Chill out and ensure we re-read multi_stop_state. */ | ||
191 | cpu_relax(); | ||
192 | if (msdata->state != curstate) { | ||
193 | curstate = msdata->state; | ||
194 | switch (curstate) { | ||
195 | case MULTI_STOP_DISABLE_IRQ: | ||
196 | local_irq_disable(); | ||
197 | hard_irq_disable(); | ||
198 | break; | ||
199 | case MULTI_STOP_RUN: | ||
200 | if (is_active) | ||
201 | err = msdata->fn(msdata->data); | ||
202 | break; | ||
203 | default: | ||
204 | break; | ||
205 | } | ||
206 | ack_state(msdata); | ||
207 | } | ||
208 | } while (curstate != MULTI_STOP_EXIT); | ||
209 | |||
210 | local_irq_restore(flags); | ||
211 | return err; | ||
212 | } | ||
213 | |||
214 | struct irq_cpu_stop_queue_work_info { | ||
215 | int cpu1; | ||
216 | int cpu2; | ||
217 | struct cpu_stop_work *work1; | ||
218 | struct cpu_stop_work *work2; | ||
219 | }; | ||
220 | |||
221 | /* | ||
222 | * This function is always run with irqs and preemption disabled. | ||
223 | * This guarantees that both work1 and work2 get queued, before | ||
224 | * our local migrate thread gets the chance to preempt us. | ||
225 | */ | ||
226 | static void irq_cpu_stop_queue_work(void *arg) | ||
227 | { | ||
228 | struct irq_cpu_stop_queue_work_info *info = arg; | ||
229 | cpu_stop_queue_work(info->cpu1, info->work1); | ||
230 | cpu_stop_queue_work(info->cpu2, info->work2); | ||
231 | } | ||
232 | |||
233 | /** | ||
234 | * stop_two_cpus - stops two cpus | ||
235 | * @cpu1: the cpu to stop | ||
236 | * @cpu2: the other cpu to stop | ||
237 | * @fn: function to execute | ||
238 | * @arg: argument to @fn | ||
239 | * | ||
240 | * Stops both the current and specified CPU and runs @fn on one of them. | ||
241 | * | ||
242 | * returns when both are completed. | ||
243 | */ | ||
244 | int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) | ||
245 | { | ||
246 | struct cpu_stop_done done; | ||
247 | struct cpu_stop_work work1, work2; | ||
248 | struct irq_cpu_stop_queue_work_info call_args; | ||
249 | struct multi_stop_data msdata; | ||
250 | |||
251 | preempt_disable(); | ||
252 | msdata = (struct multi_stop_data){ | ||
253 | .fn = fn, | ||
254 | .data = arg, | ||
255 | .num_threads = 2, | ||
256 | .active_cpus = cpumask_of(cpu1), | ||
257 | }; | ||
258 | |||
259 | work1 = work2 = (struct cpu_stop_work){ | ||
260 | .fn = multi_cpu_stop, | ||
261 | .arg = &msdata, | ||
262 | .done = &done | ||
263 | }; | ||
264 | |||
265 | call_args = (struct irq_cpu_stop_queue_work_info){ | ||
266 | .cpu1 = cpu1, | ||
267 | .cpu2 = cpu2, | ||
268 | .work1 = &work1, | ||
269 | .work2 = &work2, | ||
270 | }; | ||
271 | |||
272 | cpu_stop_init_done(&done, 2); | ||
273 | set_state(&msdata, MULTI_STOP_PREPARE); | ||
274 | |||
275 | /* | ||
276 | * If we observe both CPUs active we know _cpu_down() cannot yet have | ||
277 | * queued its stop_machine works and therefore ours will get executed | ||
278 | * first. Or it's not either one of our CPUs that's getting unplugged, | ||
279 | * in which case we don't care. | ||
280 | * | ||
281 | * This relies on the stopper workqueues to be FIFO. | ||
282 | */ | ||
283 | if (!cpu_active(cpu1) || !cpu_active(cpu2)) { | ||
284 | preempt_enable(); | ||
285 | return -ENOENT; | ||
286 | } | ||
287 | |||
288 | lg_local_lock(&stop_cpus_lock); | ||
289 | /* | ||
290 | * Queuing needs to be done by the lowest numbered CPU, to ensure | ||
291 | * that works are always queued in the same order on every CPU. | ||
292 | * This prevents deadlocks. | ||
293 | */ | ||
294 | smp_call_function_single(min(cpu1, cpu2), | ||
295 | &irq_cpu_stop_queue_work, | ||
296 | &call_args, 0); | ||
297 | lg_local_unlock(&stop_cpus_lock); | ||
298 | preempt_enable(); | ||
299 | |||
300 | wait_for_completion(&done.completion); | ||
301 | |||
302 | return done.executed ? done.ret : -ENOENT; | ||
303 | } | ||
304 | |||
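stop_two_cpus() lets a caller run a callback with exactly two CPUs stopped (the callback executes on cpu1 while interrupts are off on both) without taking the global stop_cpus path. A hedged sketch of a caller with made-up names; the scheduler's task-swap path (migrate_swap(), declared earlier in sched.h) is believed to be the in-tree user:

#include <linux/stop_machine.h>

static int swap_fn(void *arg)
{
	/* runs on cpu1 while both CPUs are stopped with IRQs disabled */
	return 0;
}

static int swap_something(int cpu_a, int cpu_b)
{
	return stop_two_cpus(cpu_a, cpu_b, swap_fn, NULL);
}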
118 | /** | 305 | /** |
119 | * stop_one_cpu_nowait - stop a cpu but don't wait for completion | 306 | * stop_one_cpu_nowait - stop a cpu but don't wait for completion |
120 | * @cpu: cpu to stop | 307 | * @cpu: cpu to stop |
@@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, | |||
159 | * preempted by a stopper which might wait for other stoppers | 346 | * preempted by a stopper which might wait for other stoppers |
160 | * to enter @fn which can lead to deadlock. | 347 | * to enter @fn which can lead to deadlock. |
161 | */ | 348 | */ |
162 | preempt_disable(); | 349 | lg_global_lock(&stop_cpus_lock); |
163 | for_each_cpu(cpu, cpumask) | 350 | for_each_cpu(cpu, cpumask) |
164 | cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); | 351 | cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); |
165 | preempt_enable(); | 352 | lg_global_unlock(&stop_cpus_lock); |
166 | } | 353 | } |
167 | 354 | ||
168 | static int __stop_cpus(const struct cpumask *cpumask, | 355 | static int __stop_cpus(const struct cpumask *cpumask, |
@@ -359,98 +546,14 @@ early_initcall(cpu_stop_init); | |||
359 | 546 | ||
360 | #ifdef CONFIG_STOP_MACHINE | 547 | #ifdef CONFIG_STOP_MACHINE |
361 | 548 | ||
362 | /* This controls the threads on each CPU. */ | ||
363 | enum stopmachine_state { | ||
364 | /* Dummy starting state for thread. */ | ||
365 | STOPMACHINE_NONE, | ||
366 | /* Awaiting everyone to be scheduled. */ | ||
367 | STOPMACHINE_PREPARE, | ||
368 | /* Disable interrupts. */ | ||
369 | STOPMACHINE_DISABLE_IRQ, | ||
370 | /* Run the function */ | ||
371 | STOPMACHINE_RUN, | ||
372 | /* Exit */ | ||
373 | STOPMACHINE_EXIT, | ||
374 | }; | ||
375 | |||
376 | struct stop_machine_data { | ||
377 | int (*fn)(void *); | ||
378 | void *data; | ||
379 | /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ | ||
380 | unsigned int num_threads; | ||
381 | const struct cpumask *active_cpus; | ||
382 | |||
383 | enum stopmachine_state state; | ||
384 | atomic_t thread_ack; | ||
385 | }; | ||
386 | |||
387 | static void set_state(struct stop_machine_data *smdata, | ||
388 | enum stopmachine_state newstate) | ||
389 | { | ||
390 | /* Reset ack counter. */ | ||
391 | atomic_set(&smdata->thread_ack, smdata->num_threads); | ||
392 | smp_wmb(); | ||
393 | smdata->state = newstate; | ||
394 | } | ||
395 | |||
396 | /* Last one to ack a state moves to the next state. */ | ||
397 | static void ack_state(struct stop_machine_data *smdata) | ||
398 | { | ||
399 | if (atomic_dec_and_test(&smdata->thread_ack)) | ||
400 | set_state(smdata, smdata->state + 1); | ||
401 | } | ||
402 | |||
403 | /* This is the cpu_stop function which stops the CPU. */ | ||
404 | static int stop_machine_cpu_stop(void *data) | ||
405 | { | ||
406 | struct stop_machine_data *smdata = data; | ||
407 | enum stopmachine_state curstate = STOPMACHINE_NONE; | ||
408 | int cpu = smp_processor_id(), err = 0; | ||
409 | unsigned long flags; | ||
410 | bool is_active; | ||
411 | |||
412 | /* | ||
413 | * When called from stop_machine_from_inactive_cpu(), irq might | ||
414 | * already be disabled. Save the state and restore it on exit. | ||
415 | */ | ||
416 | local_save_flags(flags); | ||
417 | |||
418 | if (!smdata->active_cpus) | ||
419 | is_active = cpu == cpumask_first(cpu_online_mask); | ||
420 | else | ||
421 | is_active = cpumask_test_cpu(cpu, smdata->active_cpus); | ||
422 | |||
423 | /* Simple state machine */ | ||
424 | do { | ||
425 | /* Chill out and ensure we re-read stopmachine_state. */ | ||
426 | cpu_relax(); | ||
427 | if (smdata->state != curstate) { | ||
428 | curstate = smdata->state; | ||
429 | switch (curstate) { | ||
430 | case STOPMACHINE_DISABLE_IRQ: | ||
431 | local_irq_disable(); | ||
432 | hard_irq_disable(); | ||
433 | break; | ||
434 | case STOPMACHINE_RUN: | ||
435 | if (is_active) | ||
436 | err = smdata->fn(smdata->data); | ||
437 | break; | ||
438 | default: | ||
439 | break; | ||
440 | } | ||
441 | ack_state(smdata); | ||
442 | } | ||
443 | } while (curstate != STOPMACHINE_EXIT); | ||
444 | |||
445 | local_irq_restore(flags); | ||
446 | return err; | ||
447 | } | ||
448 | |||
449 | int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | 549 | int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) |
450 | { | 550 | { |
451 | struct stop_machine_data smdata = { .fn = fn, .data = data, | 551 | struct multi_stop_data msdata = { |
452 | .num_threads = num_online_cpus(), | 552 | .fn = fn, |
453 | .active_cpus = cpus }; | 553 | .data = data, |
554 | .num_threads = num_online_cpus(), | ||
555 | .active_cpus = cpus, | ||
556 | }; | ||
454 | 557 | ||
455 | if (!stop_machine_initialized) { | 558 | if (!stop_machine_initialized) { |
456 | /* | 559 | /* |
@@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
461 | unsigned long flags; | 564 | unsigned long flags; |
462 | int ret; | 565 | int ret; |
463 | 566 | ||
464 | WARN_ON_ONCE(smdata.num_threads != 1); | 567 | WARN_ON_ONCE(msdata.num_threads != 1); |
465 | 568 | ||
466 | local_irq_save(flags); | 569 | local_irq_save(flags); |
467 | hard_irq_disable(); | 570 | hard_irq_disable(); |
@@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
472 | } | 575 | } |
473 | 576 | ||
474 | /* Set the initial state and stop all online cpus. */ | 577 | /* Set the initial state and stop all online cpus. */ |
475 | set_state(&smdata, STOPMACHINE_PREPARE); | 578 | set_state(&msdata, MULTI_STOP_PREPARE); |
476 | return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); | 579 | return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); |
477 | } | 580 | } |
478 | 581 | ||
479 | int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | 582 | int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) |
@@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine); | |||
513 | int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, | 616 | int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, |
514 | const struct cpumask *cpus) | 617 | const struct cpumask *cpus) |
515 | { | 618 | { |
516 | struct stop_machine_data smdata = { .fn = fn, .data = data, | 619 | struct multi_stop_data msdata = { .fn = fn, .data = data, |
517 | .active_cpus = cpus }; | 620 | .active_cpus = cpus }; |
518 | struct cpu_stop_done done; | 621 | struct cpu_stop_done done; |
519 | int ret; | 622 | int ret; |
520 | 623 | ||
521 | /* Local CPU must be inactive and CPU hotplug in progress. */ | 624 | /* Local CPU must be inactive and CPU hotplug in progress. */ |
522 | BUG_ON(cpu_active(raw_smp_processor_id())); | 625 | BUG_ON(cpu_active(raw_smp_processor_id())); |
523 | smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ | 626 | msdata.num_threads = num_active_cpus() + 1; /* +1 for local */ |
524 | 627 | ||
525 | /* No proper task established and can't sleep - busy wait for lock. */ | 628 | /* No proper task established and can't sleep - busy wait for lock. */ |
526 | while (!mutex_trylock(&stop_cpus_mutex)) | 629 | while (!mutex_trylock(&stop_cpus_mutex)) |
527 | cpu_relax(); | 630 | cpu_relax(); |
528 | 631 | ||
529 | /* Schedule work on other CPUs and execute directly for local CPU */ | 632 | /* Schedule work on other CPUs and execute directly for local CPU */ |
530 | set_state(&smdata, STOPMACHINE_PREPARE); | 633 | set_state(&msdata, MULTI_STOP_PREPARE); |
531 | cpu_stop_init_done(&done, num_active_cpus()); | 634 | cpu_stop_init_done(&done, num_active_cpus()); |
532 | queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, | 635 | queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata, |
533 | &done); | 636 | &done); |
534 | ret = stop_machine_cpu_stop(&smdata); | 637 | ret = multi_cpu_stop(&msdata); |
535 | 638 | ||
536 | /* Busy wait for completion. */ | 639 | /* Busy wait for completion. */ |
537 | while (!completion_done(&done.completion)) | 640 | while (!completion_done(&done.completion)) |
diff --git a/kernel/sys.c b/kernel/sys.c index c18ecca575b4..c72311324ea7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <linux/perf_event.h> | 16 | #include <linux/perf_event.h> |
17 | #include <linux/resource.h> | 17 | #include <linux/resource.h> |
18 | #include <linux/kernel.h> | 18 | #include <linux/kernel.h> |
19 | #include <linux/kexec.h> | ||
20 | #include <linux/workqueue.h> | 19 | #include <linux/workqueue.h> |
21 | #include <linux/capability.h> | 20 | #include <linux/capability.h> |
22 | #include <linux/device.h> | 21 | #include <linux/device.h> |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2f06f3c6a3f..34a604726d0b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, | |||
190 | 190 | ||
191 | #ifdef CONFIG_MAGIC_SYSRQ | 191 | #ifdef CONFIG_MAGIC_SYSRQ |
192 | /* Note: sysrq code uses its own private copy */ | 192 | /* Note: sysrq code uses its own private copy */ |
193 | static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; | 193 | static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; |
194 | 194 | ||
195 | static int sysrq_sysctl_handler(ctl_table *table, int write, | 195 | static int sysrq_sysctl_handler(ctl_table *table, int write, |
196 | void __user *buffer, size_t *lenp, | 196 | void __user *buffer, size_t *lenp, |
@@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = { | |||
371 | .proc_handler = proc_dointvec, | 371 | .proc_handler = proc_dointvec, |
372 | }, | 372 | }, |
373 | { | 373 | { |
374 | .procname = "numa_balancing_scan_period_reset", | ||
375 | .data = &sysctl_numa_balancing_scan_period_reset, | ||
376 | .maxlen = sizeof(unsigned int), | ||
377 | .mode = 0644, | ||
378 | .proc_handler = proc_dointvec, | ||
379 | }, | ||
380 | { | ||
381 | .procname = "numa_balancing_scan_period_max_ms", | 374 | .procname = "numa_balancing_scan_period_max_ms", |
382 | .data = &sysctl_numa_balancing_scan_period_max, | 375 | .data = &sysctl_numa_balancing_scan_period_max, |
383 | .maxlen = sizeof(unsigned int), | 376 | .maxlen = sizeof(unsigned int), |
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = { | |||
391 | .mode = 0644, | 384 | .mode = 0644, |
392 | .proc_handler = proc_dointvec, | 385 | .proc_handler = proc_dointvec, |
393 | }, | 386 | }, |
387 | { | ||
388 | .procname = "numa_balancing_settle_count", | ||
389 | .data = &sysctl_numa_balancing_settle_count, | ||
390 | .maxlen = sizeof(unsigned int), | ||
391 | .mode = 0644, | ||
392 | .proc_handler = proc_dointvec, | ||
393 | }, | ||
394 | { | ||
395 | .procname = "numa_balancing_migrate_deferred", | ||
396 | .data = &sysctl_numa_balancing_migrate_deferred, | ||
397 | .maxlen = sizeof(unsigned int), | ||
398 | .mode = 0644, | ||
399 | .proc_handler = proc_dointvec, | ||
400 | }, | ||
394 | #endif /* CONFIG_NUMA_BALANCING */ | 401 | #endif /* CONFIG_NUMA_BALANCING */ |
395 | #endif /* CONFIG_SCHED_DEBUG */ | 402 | #endif /* CONFIG_SCHED_DEBUG */ |
396 | { | 403 | { |
@@ -962,9 +969,10 @@ static struct ctl_table kern_table[] = { | |||
962 | { | 969 | { |
963 | .procname = "hung_task_check_count", | 970 | .procname = "hung_task_check_count", |
964 | .data = &sysctl_hung_task_check_count, | 971 | .data = &sysctl_hung_task_check_count, |
965 | .maxlen = sizeof(unsigned long), | 972 | .maxlen = sizeof(int), |
966 | .mode = 0644, | 973 | .mode = 0644, |
967 | .proc_handler = proc_doulongvec_minmax, | 974 | .proc_handler = proc_dointvec_minmax, |
975 | .extra1 = &zero, | ||
968 | }, | 976 | }, |
969 | { | 977 | { |
970 | .procname = "hung_task_timeout_secs", | 978 | .procname = "hung_task_timeout_secs", |
@@ -1049,6 +1057,7 @@ static struct ctl_table kern_table[] = { | |||
1049 | .maxlen = sizeof(sysctl_perf_event_sample_rate), | 1057 | .maxlen = sizeof(sysctl_perf_event_sample_rate), |
1050 | .mode = 0644, | 1058 | .mode = 0644, |
1051 | .proc_handler = perf_proc_update_handler, | 1059 | .proc_handler = perf_proc_update_handler, |
1060 | .extra1 = &one, | ||
1052 | }, | 1061 | }, |
1053 | { | 1062 | { |
1054 | .procname = "perf_cpu_time_max_percent", | 1063 | .procname = "perf_cpu_time_max_percent", |
@@ -2214,8 +2223,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int | |||
2214 | *i = val; | 2223 | *i = val; |
2215 | } else { | 2224 | } else { |
2216 | val = convdiv * (*i) / convmul; | 2225 | val = convdiv * (*i) / convmul; |
2217 | if (!first) | 2226 | if (!first) { |
2218 | err = proc_put_char(&buffer, &left, '\t'); | 2227 | err = proc_put_char(&buffer, &left, '\t'); |
2228 | if (err) | ||
2229 | break; | ||
2230 | } | ||
2219 | err = proc_put_long(&buffer, &left, val, false); | 2231 | err = proc_put_long(&buffer, &left, val, false); |
2220 | if (err) | 2232 | if (err) |
2221 | break; | 2233 | break; |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b609213ca9a2..653cbbd9e7ad 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -1024,7 +1024,7 @@ static ssize_t bin_intvec(struct file *file, | |||
1024 | if (get_user(value, vec + i)) | 1024 | if (get_user(value, vec + i)) |
1025 | goto out_kfree; | 1025 | goto out_kfree; |
1026 | 1026 | ||
1027 | str += snprintf(str, end - str, "%lu\t", value); | 1027 | str += scnprintf(str, end - str, "%lu\t", value); |
1028 | } | 1028 | } |
1029 | 1029 | ||
1030 | result = kernel_write(file, buffer, str - buffer, 0); | 1030 | result = kernel_write(file, buffer, str - buffer, 0); |
@@ -1095,7 +1095,7 @@ static ssize_t bin_ulongvec(struct file *file, | |||
1095 | if (get_user(value, vec + i)) | 1095 | if (get_user(value, vec + i)) |
1096 | goto out_kfree; | 1096 | goto out_kfree; |
1097 | 1097 | ||
1098 | str += snprintf(str, end - str, "%lu\t", value); | 1098 | str += scnprintf(str, end - str, "%lu\t", value); |
1099 | } | 1099 | } |
1100 | 1100 | ||
1101 | result = kernel_write(file, buffer, str - buffer, 0); | 1101 | result = kernel_write(file, buffer, str - buffer, 0); |
@@ -1205,7 +1205,7 @@ static ssize_t bin_dn_node_address(struct file *file, | |||
1205 | if (get_user(dnaddr, (__le16 __user *)newval)) | 1205 | if (get_user(dnaddr, (__le16 __user *)newval)) |
1206 | goto out; | 1206 | goto out; |
1207 | 1207 | ||
1208 | len = snprintf(buf, sizeof(buf), "%hu.%hu", | 1208 | len = scnprintf(buf, sizeof(buf), "%hu.%hu", |
1209 | le16_to_cpu(dnaddr) >> 10, | 1209 | le16_to_cpu(dnaddr) >> 10, |
1210 | le16_to_cpu(dnaddr) & 0x3ff); | 1210 | le16_to_cpu(dnaddr) & 0x3ff); |
1211 | 1211 | ||
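All three hunks above swap snprintf() for scnprintf() for the same reason: snprintf() returns the length the output would have had, so on truncation str += snprintf(str, end - str, ...) pushes str past end and the next end - str goes negative (and is then treated as a huge size); scnprintf() returns the number of characters actually stored. A small illustration of the difference, independent of the patch:

#include <linux/kernel.h>

static void show_truncation_difference(void)
{
	char buf[8];
	int n;

	n = snprintf(buf, sizeof(buf), "%s", "0123456789");
	/* n == 10: the would-be length, even though only 7 chars + NUL fit */

	n = scnprintf(buf, sizeof(buf), "%s", "0123456789");
	/* n == 7: bytes actually stored, so 'str += n' can never overshoot */
}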
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S new file mode 100644 index 000000000000..3e9868d47535 --- /dev/null +++ b/kernel/system_certificates.S | |||
@@ -0,0 +1,20 @@ | |||
1 | #include <linux/export.h> | ||
2 | #include <linux/init.h> | ||
3 | |||
4 | __INITRODATA | ||
5 | |||
6 | .align 8 | ||
7 | .globl VMLINUX_SYMBOL(system_certificate_list) | ||
8 | VMLINUX_SYMBOL(system_certificate_list): | ||
9 | __cert_list_start: | ||
10 | .incbin "kernel/x509_certificate_list" | ||
11 | __cert_list_end: | ||
12 | |||
13 | .align 8 | ||
14 | .globl VMLINUX_SYMBOL(system_certificate_list_size) | ||
15 | VMLINUX_SYMBOL(system_certificate_list_size): | ||
16 | #ifdef CONFIG_64BIT | ||
17 | .quad __cert_list_end - __cert_list_start | ||
18 | #else | ||
19 | .long __cert_list_end - __cert_list_start | ||
20 | #endif | ||
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c new file mode 100644 index 000000000000..52ebc70263f4 --- /dev/null +++ b/kernel/system_keyring.c | |||
@@ -0,0 +1,105 @@ | |||
1 | /* System trusted keyring for trusted public keys | ||
2 | * | ||
3 | * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/export.h> | ||
13 | #include <linux/kernel.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/cred.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <keys/asymmetric-type.h> | ||
18 | #include <keys/system_keyring.h> | ||
19 | #include "module-internal.h" | ||
20 | |||
21 | struct key *system_trusted_keyring; | ||
22 | EXPORT_SYMBOL_GPL(system_trusted_keyring); | ||
23 | |||
24 | extern __initconst const u8 system_certificate_list[]; | ||
25 | extern __initconst const unsigned long system_certificate_list_size; | ||
26 | |||
27 | /* | ||
28 | * Load the compiled-in keys | ||
29 | */ | ||
30 | static __init int system_trusted_keyring_init(void) | ||
31 | { | ||
32 | pr_notice("Initialise system trusted keyring\n"); | ||
33 | |||
34 | system_trusted_keyring = | ||
35 | keyring_alloc(".system_keyring", | ||
36 | KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), | ||
37 | ((KEY_POS_ALL & ~KEY_POS_SETATTR) | | ||
38 | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), | ||
39 | KEY_ALLOC_NOT_IN_QUOTA, NULL); | ||
40 | if (IS_ERR(system_trusted_keyring)) | ||
41 | panic("Can't allocate system trusted keyring\n"); | ||
42 | |||
43 | set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); | ||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | /* | ||
48 | * Must be initialised before we try and load the keys into the keyring. | ||
49 | */ | ||
50 | device_initcall(system_trusted_keyring_init); | ||
51 | |||
52 | /* | ||
53 | * Load the compiled-in list of X.509 certificates. | ||
54 | */ | ||
55 | static __init int load_system_certificate_list(void) | ||
56 | { | ||
57 | key_ref_t key; | ||
58 | const u8 *p, *end; | ||
59 | size_t plen; | ||
60 | |||
61 | pr_notice("Loading compiled-in X.509 certificates\n"); | ||
62 | |||
63 | p = system_certificate_list; | ||
64 | end = p + system_certificate_list_size; | ||
65 | while (p < end) { | ||
66 | /* Each cert begins with an ASN.1 SEQUENCE tag and must be more | ||
67 | * than 256 bytes in size. | ||
68 | */ | ||
69 | if (end - p < 4) | ||
70 | goto dodgy_cert; | ||
71 | if (p[0] != 0x30 && | ||
72 | p[1] != 0x82) | ||
73 | goto dodgy_cert; | ||
74 | plen = (p[2] << 8) | p[3]; | ||
75 | plen += 4; | ||
76 | if (plen > end - p) | ||
77 | goto dodgy_cert; | ||
78 | |||
79 | key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), | ||
80 | "asymmetric", | ||
81 | NULL, | ||
82 | p, | ||
83 | plen, | ||
84 | ((KEY_POS_ALL & ~KEY_POS_SETATTR) | | ||
85 | KEY_USR_VIEW | KEY_USR_READ), | ||
86 | KEY_ALLOC_NOT_IN_QUOTA | | ||
87 | KEY_ALLOC_TRUSTED); | ||
88 | if (IS_ERR(key)) { | ||
89 | pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", | ||
90 | PTR_ERR(key)); | ||
91 | } else { | ||
92 | pr_notice("Loaded X.509 cert '%s'\n", | ||
93 | key_ref_to_ptr(key)->description); | ||
94 | key_ref_put(key); | ||
95 | } | ||
96 | p += plen; | ||
97 | } | ||
98 | |||
99 | return 0; | ||
100 | |||
101 | dodgy_cert: | ||
102 | pr_err("Problem parsing in-kernel X.509 certificate list\n"); | ||
103 | return 0; | ||
104 | } | ||
105 | late_initcall(load_system_certificate_list); | ||
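load_system_certificate_list() above walks a blob of concatenated DER-encoded certificates: each record starts with an ASN.1 SEQUENCE tag (0x30) followed by the long-form length prefix 0x82 and two length bytes, so the full record size is that 16-bit length plus the 4-byte header. A self-contained sketch of the same framing walk (the buffer is fabricated for illustration and is not real X.509 data; the sketch treats either unexpected tag byte as an error):

#include <stdio.h>
#include <stddef.h>

/* Walk concatenated DER records of the form 30 82 <len hi> <len lo> <body>. */
static void walk_cert_list(const unsigned char *p, size_t len)
{
        const unsigned char *end = p + len;

        while (p < end) {
                size_t plen;

                if (end - p < 4 || p[0] != 0x30 || p[1] != 0x82) {
                        fprintf(stderr, "malformed certificate list\n");
                        return;
                }
                plen = (((size_t)p[2] << 8) | p[3]) + 4; /* body + header */
                if (plen > (size_t)(end - p)) {
                        fprintf(stderr, "truncated certificate\n");
                        return;
                }
                printf("certificate record of %zu bytes\n", plen);
                p += plen;
        }
}

int main(void)
{
        /* Two fake records with 1- and 2-byte bodies. */
        const unsigned char blob[] = {
                0x30, 0x82, 0x00, 0x01, 0xaa,
                0x30, 0x82, 0x00, 0x02, 0xbb, 0xcc,
        };

        walk_cert_list(blob, sizeof(blob));
        return 0;
}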
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 145bb4d3bd4d..13d2f7cd65db 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -290,6 +290,7 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) | |||
290 | struct listener_list *listeners; | 290 | struct listener_list *listeners; |
291 | struct listener *s, *tmp, *s2; | 291 | struct listener *s, *tmp, *s2; |
292 | unsigned int cpu; | 292 | unsigned int cpu; |
293 | int ret = 0; | ||
293 | 294 | ||
294 | if (!cpumask_subset(mask, cpu_possible_mask)) | 295 | if (!cpumask_subset(mask, cpu_possible_mask)) |
295 | return -EINVAL; | 296 | return -EINVAL; |
@@ -304,9 +305,10 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) | |||
304 | for_each_cpu(cpu, mask) { | 305 | for_each_cpu(cpu, mask) { |
305 | s = kmalloc_node(sizeof(struct listener), | 306 | s = kmalloc_node(sizeof(struct listener), |
306 | GFP_KERNEL, cpu_to_node(cpu)); | 307 | GFP_KERNEL, cpu_to_node(cpu)); |
307 | if (!s) | 308 | if (!s) { |
309 | ret = -ENOMEM; | ||
308 | goto cleanup; | 310 | goto cleanup; |
309 | 311 | } | |
310 | s->pid = pid; | 312 | s->pid = pid; |
311 | s->valid = 1; | 313 | s->valid = 1; |
312 | 314 | ||
@@ -339,7 +341,7 @@ cleanup: | |||
339 | } | 341 | } |
340 | up_write(&listeners->sem); | 342 | up_write(&listeners->sem); |
341 | } | 343 | } |
342 | return 0; | 344 | return ret; |
343 | } | 345 | } |
344 | 346 | ||
345 | static int parse(struct nlattr *na, struct cpumask *mask) | 347 | static int parse(struct nlattr *na, struct cpumask *mask) |
@@ -404,11 +406,15 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) | |||
404 | if (!na) | 406 | if (!na) |
405 | goto err; | 407 | goto err; |
406 | 408 | ||
407 | if (nla_put(skb, type, sizeof(pid), &pid) < 0) | 409 | if (nla_put(skb, type, sizeof(pid), &pid) < 0) { |
410 | nla_nest_cancel(skb, na); | ||
408 | goto err; | 411 | goto err; |
412 | } | ||
409 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); | 413 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); |
410 | if (!ret) | 414 | if (!ret) { |
415 | nla_nest_cancel(skb, na); | ||
411 | goto err; | 416 | goto err; |
417 | } | ||
412 | nla_nest_end(skb, na); | 418 | nla_nest_end(skb, na); |
413 | 419 | ||
414 | return nla_data(ret); | 420 | return nla_data(ret); |
@@ -667,17 +673,18 @@ err: | |||
667 | nlmsg_free(rep_skb); | 673 | nlmsg_free(rep_skb); |
668 | } | 674 | } |
669 | 675 | ||
670 | static struct genl_ops taskstats_ops = { | 676 | static const struct genl_ops taskstats_ops[] = { |
671 | .cmd = TASKSTATS_CMD_GET, | 677 | { |
672 | .doit = taskstats_user_cmd, | 678 | .cmd = TASKSTATS_CMD_GET, |
673 | .policy = taskstats_cmd_get_policy, | 679 | .doit = taskstats_user_cmd, |
674 | .flags = GENL_ADMIN_PERM, | 680 | .policy = taskstats_cmd_get_policy, |
675 | }; | 681 | .flags = GENL_ADMIN_PERM, |
676 | 682 | }, | |
677 | static struct genl_ops cgroupstats_ops = { | 683 | { |
678 | .cmd = CGROUPSTATS_CMD_GET, | 684 | .cmd = CGROUPSTATS_CMD_GET, |
679 | .doit = cgroupstats_user_cmd, | 685 | .doit = cgroupstats_user_cmd, |
680 | .policy = cgroupstats_cmd_get_policy, | 686 | .policy = cgroupstats_cmd_get_policy, |
687 | }, | ||
681 | }; | 688 | }; |
682 | 689 | ||
683 | /* Needed early in initialization */ | 690 | /* Needed early in initialization */ |
@@ -696,26 +703,13 @@ static int __init taskstats_init(void) | |||
696 | { | 703 | { |
697 | int rc; | 704 | int rc; |
698 | 705 | ||
699 | rc = genl_register_family(&family); | 706 | rc = genl_register_family_with_ops(&family, taskstats_ops); |
700 | if (rc) | 707 | if (rc) |
701 | return rc; | 708 | return rc; |
702 | 709 | ||
703 | rc = genl_register_ops(&family, &taskstats_ops); | ||
704 | if (rc < 0) | ||
705 | goto err; | ||
706 | |||
707 | rc = genl_register_ops(&family, &cgroupstats_ops); | ||
708 | if (rc < 0) | ||
709 | goto err_cgroup_ops; | ||
710 | |||
711 | family_registered = 1; | 710 | family_registered = 1; |
712 | pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); | 711 | pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); |
713 | return 0; | 712 | return 0; |
714 | err_cgroup_ops: | ||
715 | genl_unregister_ops(&family, &taskstats_ops); | ||
716 | err: | ||
717 | genl_unregister_family(&family); | ||
718 | return rc; | ||
719 | } | 713 | } |
720 | 714 | ||
721 | /* | 715 | /* |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 2b62fe86f9ec..3ce6e8c5f3fc 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -100,7 +100,7 @@ config NO_HZ_FULL | |||
100 | # RCU_USER_QS dependency | 100 | # RCU_USER_QS dependency |
101 | depends on HAVE_CONTEXT_TRACKING | 101 | depends on HAVE_CONTEXT_TRACKING |
102 | # VIRT_CPU_ACCOUNTING_GEN dependency | 102 | # VIRT_CPU_ACCOUNTING_GEN dependency |
103 | depends on 64BIT | 103 | depends on HAVE_VIRT_CPU_ACCOUNTING_GEN |
104 | select NO_HZ_COMMON | 104 | select NO_HZ_COMMON |
105 | select RCU_USER_QS | 105 | select RCU_USER_QS |
106 | select RCU_NOCB_CPU | 106 | select RCU_NOCB_CPU |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index eec50fcef9e4..88c9c65a430d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) | |||
490 | clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; | 490 | clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; |
491 | 491 | ||
492 | if (!alarmtimer_get_rtcdev()) | 492 | if (!alarmtimer_get_rtcdev()) |
493 | return -ENOTSUPP; | 493 | return -EINVAL; |
494 | 494 | ||
495 | return hrtimer_get_res(baseid, tp); | 495 | return hrtimer_get_res(baseid, tp); |
496 | } | 496 | } |
@@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) | |||
507 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; | 507 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; |
508 | 508 | ||
509 | if (!alarmtimer_get_rtcdev()) | 509 | if (!alarmtimer_get_rtcdev()) |
510 | return -ENOTSUPP; | 510 | return -EINVAL; |
511 | 511 | ||
512 | *tp = ktime_to_timespec(base->gettime()); | 512 | *tp = ktime_to_timespec(base->gettime()); |
513 | return 0; | 513 | return 0; |
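Switching the alarmtimer error returns above from -ENOTSUPP to -EINVAL matters because ENOTSUPP (524) lives in the kernel's internal errno range and has no userspace definition, so clock_getres()/clock_gettime() callers would otherwise see an unnamed error. A quick userspace check of how the two values render:

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        /* 524 is the kernel-internal ENOTSUPP value; userspace has no name for it. */
        printf("errno 524: %s\n", strerror(524));    /* "Unknown error 524" */
        printf("EINVAL:    %s\n", strerror(EINVAL)); /* "Invalid argument"  */
        return 0;
}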
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 662c5798a685..086ad6043bcb 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -619,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev, | |||
619 | const char *buf, size_t count) | 619 | const char *buf, size_t count) |
620 | { | 620 | { |
621 | char name[CS_NAME_LEN]; | 621 | char name[CS_NAME_LEN]; |
622 | size_t ret = sysfs_get_uname(buf, name, count); | 622 | ssize_t ret = sysfs_get_uname(buf, name, count); |
623 | struct clock_event_device *ce; | 623 | struct clock_event_device *ce; |
624 | 624 | ||
625 | if (ret < 0) | 625 | if (ret < 0) |
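The size_t -> ssize_t change above (mirrored in the clocksource.c hunks below) fixes a signedness bug: sysfs_get_uname() returns a negative errno on failure, but storing that in an unsigned size_t makes the subsequent "if (ret < 0)" test always false. A two-variable userspace demonstration:

#include <stdio.h>
#include <sys/types.h>

static long fake_get_uname(void)
{
        return -22;     /* -EINVAL */
}

int main(void)
{
        size_t ret_unsigned = fake_get_uname();  /* wraps to a huge value */
        ssize_t ret_signed  = fake_get_uname();

        /* Compilers usually warn that the first test is always false. */
        printf("size_t:  ret < 0 is %d\n", ret_unsigned < 0);   /* 0 */
        printf("ssize_t: ret < 0 is %d\n", ret_signed < 0);     /* 1 */
        return 0;
}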
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 50a8736757f3..ba3e502c955a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } | |||
479 | static inline void clocksource_resume_watchdog(void) { } | 479 | static inline void clocksource_resume_watchdog(void) { } |
480 | static inline int __clocksource_watchdog_kthread(void) { return 0; } | 480 | static inline int __clocksource_watchdog_kthread(void) { return 0; } |
481 | static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } | 481 | static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } |
482 | void clocksource_mark_unstable(struct clocksource *cs) { } | ||
482 | 483 | ||
483 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ | 484 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ |
484 | 485 | ||
@@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) | |||
537 | } | 538 | } |
538 | 539 | ||
539 | /** | 540 | /** |
540 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 541 | * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted |
541 | * @cs: Pointer to clocksource | 542 | * @mult: cycle to nanosecond multiplier |
542 | * | 543 | * @shift: cycle to nanosecond divisor (power of two) |
544 | * @maxadj: maximum adjustment value to mult (~11%) | ||
545 | * @mask: bitmask for two's complement subtraction of non 64 bit counters | ||
543 | */ | 546 | */ |
544 | static u64 clocksource_max_deferment(struct clocksource *cs) | 547 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) |
545 | { | 548 | { |
546 | u64 max_nsecs, max_cycles; | 549 | u64 max_nsecs, max_cycles; |
547 | 550 | ||
548 | /* | 551 | /* |
549 | * Calculate the maximum number of cycles that we can pass to the | 552 | * Calculate the maximum number of cycles that we can pass to the |
550 | * cyc2ns function without overflowing a 64-bit signed result. The | 553 | * cyc2ns function without overflowing a 64-bit signed result. The |
551 | * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) | 554 | * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) |
552 | * which is equivalent to the below. | 555 | * which is equivalent to the below. |
553 | * max_cycles < (2^63)/(cs->mult + cs->maxadj) | 556 | * max_cycles < (2^63)/(mult + maxadj) |
554 | * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) | 557 | * max_cycles < 2^(log2((2^63)/(mult + maxadj))) |
555 | * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) | 558 | * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) |
556 | * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) | 559 | * max_cycles < 2^(63 - log2(mult + maxadj)) |
557 | * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) | 560 | * max_cycles < 1 << (63 - log2(mult + maxadj)) |
558 | * Please note that we add 1 to the result of the log2 to account for | 561 | * Please note that we add 1 to the result of the log2 to account for |
559 | * any rounding errors, ensure the above inequality is satisfied and | 562 | * any rounding errors, ensure the above inequality is satisfied and |
560 | * no overflow will occur. | 563 | * no overflow will occur. |
561 | */ | 564 | */ |
562 | max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); | 565 | max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); |
563 | 566 | ||
564 | /* | 567 | /* |
565 | * The actual maximum number of cycles we can defer the clocksource is | 568 | * The actual maximum number of cycles we can defer the clocksource is |
566 | * determined by the minimum of max_cycles and cs->mask. | 569 | * determined by the minimum of max_cycles and mask. |
567 | * Note: Here we subtract the maxadj to make sure we don't sleep for | 570 | * Note: Here we subtract the maxadj to make sure we don't sleep for |
568 | * too long if there's a large negative adjustment. | 571 | * too long if there's a large negative adjustment. |
569 | */ | 572 | */ |
570 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); | 573 | max_cycles = min(max_cycles, mask); |
571 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, | 574 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); |
572 | cs->shift); | 575 | |
576 | return max_nsecs; | ||
577 | } | ||
578 | |||
579 | /** | ||
580 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | ||
581 | * @cs: Pointer to clocksource | ||
582 | * | ||
583 | */ | ||
584 | static u64 clocksource_max_deferment(struct clocksource *cs) | ||
585 | { | ||
586 | u64 max_nsecs; | ||
573 | 587 | ||
588 | max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, | ||
589 | cs->mask); | ||
574 | /* | 590 | /* |
575 | * To ensure that the clocksource does not wrap whilst we are idle, | 591 | * To ensure that the clocksource does not wrap whilst we are idle, |
576 | * limit the time the clocksource can be deferred by 12.5%. Please | 592 | * limit the time the clocksource can be deferred by 12.5%. Please |
@@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev, | |||
893 | return count; | 909 | return count; |
894 | } | 910 | } |
895 | 911 | ||
896 | size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) | 912 | ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) |
897 | { | 913 | { |
898 | size_t ret = cnt; | 914 | size_t ret = cnt; |
899 | 915 | ||
@@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, | |||
924 | struct device_attribute *attr, | 940 | struct device_attribute *attr, |
925 | const char *buf, size_t count) | 941 | const char *buf, size_t count) |
926 | { | 942 | { |
927 | size_t ret; | 943 | ssize_t ret; |
928 | 944 | ||
929 | mutex_lock(&clocksource_mutex); | 945 | mutex_lock(&clocksource_mutex); |
930 | 946 | ||
@@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, | |||
952 | { | 968 | { |
953 | struct clocksource *cs; | 969 | struct clocksource *cs; |
954 | char name[CS_NAME_LEN]; | 970 | char name[CS_NAME_LEN]; |
955 | size_t ret; | 971 | ssize_t ret; |
956 | 972 | ||
957 | ret = sysfs_get_uname(buf, name, count); | 973 | ret = sysfs_get_uname(buf, name, count); |
958 | if (ret < 0) | 974 | if (ret < 0) |
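The clocksource.c refactor above extracts the deferment math into clocks_calc_max_nsecs() so the sched_clock code further down can reuse it; the bound follows the in-code derivation, max_cycles < 2^(63 - (ilog2(mult + maxadj) + 1)), clamped to the counter mask and converted with the reduced multiplier. A small userspace reproduction of that arithmetic for a hypothetical ~1 GHz, 64-bit counter (mult, shift and the ~11% maxadj are illustrative values, not taken from any real driver):

#include <stdio.h>
#include <stdint.h>

static int ilog2_u32(uint32_t v)        /* floor(log2(v)) */
{
        int r = -1;

        while (v) {
                v >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        /* ~1 GHz counter: one cycle ~= 1 ns, expressed as mult = 2^24, shift = 24. */
        uint32_t mult = 1u << 24, shift = 24;
        uint32_t maxadj = mult / 100 * 11;      /* ~11%, like clocksource_max_adjustment() */
        uint64_t mask = ~0ull;                  /* 64-bit counter */

        uint64_t max_cycles = 1ull << (63 - (ilog2_u32(mult + maxadj) + 1));
        if (max_cycles > mask)
                max_cycles = mask;
        /* clocksource_cyc2ns() with the conservative (mult - maxadj) multiplier. */
        uint64_t max_nsecs = (max_cycles * (mult - maxadj)) >> shift;

        printf("max_cycles = %llu\n", (unsigned long long)max_cycles);
        printf("max_nsecs  = %llu (~%llu s of deferral)\n",
               (unsigned long long)max_nsecs,
               (unsigned long long)(max_nsecs / 1000000000ull));
        return 0;
}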
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bb2215174f05..af8d1d4f3d55 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work) | |||
475 | * called as close as possible to 500 ms before the new second starts. | 475 | * called as close as possible to 500 ms before the new second starts. |
476 | * This code is run on a timer. If the clock is set, that timer | 476 | * This code is run on a timer. If the clock is set, that timer |
477 | * may not expire at the correct time. Thus, we adjust... | 477 | * may not expire at the correct time. Thus, we adjust... |
478 | * We want the clock to be within a couple of ticks from the target. | ||
478 | */ | 479 | */ |
479 | if (!ntp_synced()) { | 480 | if (!ntp_synced()) { |
480 | /* | 481 | /* |
@@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work) | |||
485 | } | 486 | } |
486 | 487 | ||
487 | getnstimeofday(&now); | 488 | getnstimeofday(&now); |
488 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { | 489 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { |
489 | struct timespec adjust = now; | 490 | struct timespec adjust = now; |
490 | 491 | ||
491 | fail = -ENODEV; | 492 | fail = -ENODEV; |
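The sync_cmos_clock() change above widens the window around the half-second mark in which the RTC update is attempted, from tick_nsec/2 to tick_nsec*5. Rough numbers, assuming HZ=100 so one tick is about 10 ms (the real values depend on the configured HZ):

#include <stdio.h>

int main(void)
{
        const long nsec_per_sec = 1000000000L;
        const long hz = 100;                    /* assumed; configuration dependent */
        const long tick_nsec = nsec_per_sec / hz;

        printf("old window: +/- %ld ms around the 500 ms mark\n",
               (tick_nsec / 2) / 1000000);      /* ~5 ms  */
        printf("new window: +/- %ld ms around the 500 ms mark\n",
               (tick_nsec * 5) / 1000000);      /* ~50 ms */
        return 0;
}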
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0b479a6a22bb..0abb36464281 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
@@ -8,25 +8,28 @@ | |||
8 | #include <linux/clocksource.h> | 8 | #include <linux/clocksource.h> |
9 | #include <linux/init.h> | 9 | #include <linux/init.h> |
10 | #include <linux/jiffies.h> | 10 | #include <linux/jiffies.h> |
11 | #include <linux/ktime.h> | ||
11 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
12 | #include <linux/moduleparam.h> | 13 | #include <linux/moduleparam.h> |
13 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
14 | #include <linux/syscore_ops.h> | 15 | #include <linux/syscore_ops.h> |
15 | #include <linux/timer.h> | 16 | #include <linux/hrtimer.h> |
16 | #include <linux/sched_clock.h> | 17 | #include <linux/sched_clock.h> |
18 | #include <linux/seqlock.h> | ||
19 | #include <linux/bitops.h> | ||
17 | 20 | ||
18 | struct clock_data { | 21 | struct clock_data { |
22 | ktime_t wrap_kt; | ||
19 | u64 epoch_ns; | 23 | u64 epoch_ns; |
20 | u32 epoch_cyc; | 24 | u64 epoch_cyc; |
21 | u32 epoch_cyc_copy; | 25 | seqcount_t seq; |
22 | unsigned long rate; | 26 | unsigned long rate; |
23 | u32 mult; | 27 | u32 mult; |
24 | u32 shift; | 28 | u32 shift; |
25 | bool suspended; | 29 | bool suspended; |
26 | }; | 30 | }; |
27 | 31 | ||
28 | static void sched_clock_poll(unsigned long wrap_ticks); | 32 | static struct hrtimer sched_clock_timer; |
29 | static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); | ||
30 | static int irqtime = -1; | 33 | static int irqtime = -1; |
31 | 34 | ||
32 | core_param(irqtime, irqtime, int, 0400); | 35 | core_param(irqtime, irqtime, int, 0400); |
@@ -35,42 +38,46 @@ static struct clock_data cd = { | |||
35 | .mult = NSEC_PER_SEC / HZ, | 38 | .mult = NSEC_PER_SEC / HZ, |
36 | }; | 39 | }; |
37 | 40 | ||
38 | static u32 __read_mostly sched_clock_mask = 0xffffffff; | 41 | static u64 __read_mostly sched_clock_mask; |
39 | 42 | ||
40 | static u32 notrace jiffy_sched_clock_read(void) | 43 | static u64 notrace jiffy_sched_clock_read(void) |
41 | { | 44 | { |
42 | return (u32)(jiffies - INITIAL_JIFFIES); | 45 | /* |
46 | * We don't need to use get_jiffies_64 on 32-bit arches here | ||
47 | * because we register with BITS_PER_LONG | ||
48 | */ | ||
49 | return (u64)(jiffies - INITIAL_JIFFIES); | ||
43 | } | 50 | } |
44 | 51 | ||
45 | static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | 52 | static u32 __read_mostly (*read_sched_clock_32)(void); |
53 | |||
54 | static u64 notrace read_sched_clock_32_wrapper(void) | ||
55 | { | ||
56 | return read_sched_clock_32(); | ||
57 | } | ||
58 | |||
59 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | ||
46 | 60 | ||
47 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 61 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) |
48 | { | 62 | { |
49 | return (cyc * mult) >> shift; | 63 | return (cyc * mult) >> shift; |
50 | } | 64 | } |
51 | 65 | ||
52 | static unsigned long long notrace sched_clock_32(void) | 66 | unsigned long long notrace sched_clock(void) |
53 | { | 67 | { |
54 | u64 epoch_ns; | 68 | u64 epoch_ns; |
55 | u32 epoch_cyc; | 69 | u64 epoch_cyc; |
56 | u32 cyc; | 70 | u64 cyc; |
71 | unsigned long seq; | ||
57 | 72 | ||
58 | if (cd.suspended) | 73 | if (cd.suspended) |
59 | return cd.epoch_ns; | 74 | return cd.epoch_ns; |
60 | 75 | ||
61 | /* | ||
62 | * Load the epoch_cyc and epoch_ns atomically. We do this by | ||
63 | * ensuring that we always write epoch_cyc, epoch_ns and | ||
64 | * epoch_cyc_copy in strict order, and read them in strict order. | ||
65 | * If epoch_cyc and epoch_cyc_copy are not equal, then we're in | ||
66 | * the middle of an update, and we should repeat the load. | ||
67 | */ | ||
68 | do { | 76 | do { |
77 | seq = raw_read_seqcount_begin(&cd.seq); | ||
69 | epoch_cyc = cd.epoch_cyc; | 78 | epoch_cyc = cd.epoch_cyc; |
70 | smp_rmb(); | ||
71 | epoch_ns = cd.epoch_ns; | 79 | epoch_ns = cd.epoch_ns; |
72 | smp_rmb(); | 80 | } while (read_seqcount_retry(&cd.seq, seq)); |
73 | } while (epoch_cyc != cd.epoch_cyc_copy); | ||
74 | 81 | ||
75 | cyc = read_sched_clock(); | 82 | cyc = read_sched_clock(); |
76 | cyc = (cyc - epoch_cyc) & sched_clock_mask; | 83 | cyc = (cyc - epoch_cyc) & sched_clock_mask; |
@@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void) | |||
83 | static void notrace update_sched_clock(void) | 90 | static void notrace update_sched_clock(void) |
84 | { | 91 | { |
85 | unsigned long flags; | 92 | unsigned long flags; |
86 | u32 cyc; | 93 | u64 cyc; |
87 | u64 ns; | 94 | u64 ns; |
88 | 95 | ||
89 | cyc = read_sched_clock(); | 96 | cyc = read_sched_clock(); |
90 | ns = cd.epoch_ns + | 97 | ns = cd.epoch_ns + |
91 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | 98 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, |
92 | cd.mult, cd.shift); | 99 | cd.mult, cd.shift); |
93 | /* | 100 | |
94 | * Write epoch_cyc and epoch_ns in a way that the update is | ||
95 | * detectable in cyc_to_fixed_sched_clock(). | ||
96 | */ | ||
97 | raw_local_irq_save(flags); | 101 | raw_local_irq_save(flags); |
98 | cd.epoch_cyc_copy = cyc; | 102 | raw_write_seqcount_begin(&cd.seq); |
99 | smp_wmb(); | ||
100 | cd.epoch_ns = ns; | 103 | cd.epoch_ns = ns; |
101 | smp_wmb(); | ||
102 | cd.epoch_cyc = cyc; | 104 | cd.epoch_cyc = cyc; |
105 | raw_write_seqcount_end(&cd.seq); | ||
103 | raw_local_irq_restore(flags); | 106 | raw_local_irq_restore(flags); |
104 | } | 107 | } |
105 | 108 | ||
106 | static void sched_clock_poll(unsigned long wrap_ticks) | 109 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) |
107 | { | 110 | { |
108 | mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); | ||
109 | update_sched_clock(); | 111 | update_sched_clock(); |
112 | hrtimer_forward_now(hrt, cd.wrap_kt); | ||
113 | return HRTIMER_RESTART; | ||
110 | } | 114 | } |
111 | 115 | ||
112 | void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | 116 | void __init sched_clock_register(u64 (*read)(void), int bits, |
117 | unsigned long rate) | ||
113 | { | 118 | { |
114 | unsigned long r, w; | 119 | unsigned long r; |
115 | u64 res, wrap; | 120 | u64 res, wrap; |
116 | char r_unit; | 121 | char r_unit; |
117 | 122 | ||
118 | if (cd.rate > rate) | 123 | if (cd.rate > rate) |
119 | return; | 124 | return; |
120 | 125 | ||
121 | BUG_ON(bits > 32); | ||
122 | WARN_ON(!irqs_disabled()); | 126 | WARN_ON(!irqs_disabled()); |
123 | read_sched_clock = read; | 127 | read_sched_clock = read; |
124 | sched_clock_mask = (1ULL << bits) - 1; | 128 | sched_clock_mask = CLOCKSOURCE_MASK(bits); |
125 | cd.rate = rate; | 129 | cd.rate = rate; |
126 | 130 | ||
127 | /* calculate the mult/shift to convert counter ticks to ns. */ | 131 | /* calculate the mult/shift to convert counter ticks to ns. */ |
128 | clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); | 132 | clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); |
129 | 133 | ||
130 | r = rate; | 134 | r = rate; |
131 | if (r >= 4000000) { | 135 | if (r >= 4000000) { |
@@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | |||
138 | r_unit = ' '; | 142 | r_unit = ' '; |
139 | 143 | ||
140 | /* calculate how many ns until we wrap */ | 144 | /* calculate how many ns until we wrap */ |
141 | wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); | 145 | wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); |
142 | do_div(wrap, NSEC_PER_MSEC); | 146 | cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); |
143 | w = wrap; | ||
144 | 147 | ||
145 | /* calculate the ns resolution of this counter */ | 148 | /* calculate the ns resolution of this counter */ |
146 | res = cyc_to_ns(1ULL, cd.mult, cd.shift); | 149 | res = cyc_to_ns(1ULL, cd.mult, cd.shift); |
147 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", | 150 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", |
148 | bits, r, r_unit, res, w); | 151 | bits, r, r_unit, res, wrap); |
149 | 152 | ||
150 | /* | ||
151 | * Start the timer to keep sched_clock() properly updated and | ||
152 | * sets the initial epoch. | ||
153 | */ | ||
154 | sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); | ||
155 | update_sched_clock(); | 153 | update_sched_clock(); |
156 | 154 | ||
157 | /* | 155 | /* |
@@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | |||
166 | pr_debug("Registered %pF as sched_clock source\n", read); | 164 | pr_debug("Registered %pF as sched_clock source\n", read); |
167 | } | 165 | } |
168 | 166 | ||
169 | unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; | 167 | void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) |
170 | |||
171 | unsigned long long notrace sched_clock(void) | ||
172 | { | 168 | { |
173 | return sched_clock_func(); | 169 | read_sched_clock_32 = read; |
170 | sched_clock_register(read_sched_clock_32_wrapper, bits, rate); | ||
174 | } | 171 | } |
175 | 172 | ||
176 | void __init sched_clock_postinit(void) | 173 | void __init sched_clock_postinit(void) |
@@ -180,14 +177,22 @@ void __init sched_clock_postinit(void) | |||
180 | * make it the final one one. | 177 | * make it the final one one. |
181 | */ | 178 | */ |
182 | if (read_sched_clock == jiffy_sched_clock_read) | 179 | if (read_sched_clock == jiffy_sched_clock_read) |
183 | setup_sched_clock(jiffy_sched_clock_read, 32, HZ); | 180 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); |
184 | 181 | ||
185 | sched_clock_poll(sched_clock_timer.data); | 182 | update_sched_clock(); |
183 | |||
184 | /* | ||
185 | * Start the timer to keep sched_clock() properly updated and | ||
186 | * sets the initial epoch. | ||
187 | */ | ||
188 | hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
189 | sched_clock_timer.function = sched_clock_poll; | ||
190 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | ||
186 | } | 191 | } |
187 | 192 | ||
188 | static int sched_clock_suspend(void) | 193 | static int sched_clock_suspend(void) |
189 | { | 194 | { |
190 | sched_clock_poll(sched_clock_timer.data); | 195 | sched_clock_poll(&sched_clock_timer); |
191 | cd.suspended = true; | 196 | cd.suspended = true; |
192 | return 0; | 197 | return 0; |
193 | } | 198 | } |
@@ -195,7 +200,6 @@ static int sched_clock_suspend(void) | |||
195 | static void sched_clock_resume(void) | 200 | static void sched_clock_resume(void) |
196 | { | 201 | { |
197 | cd.epoch_cyc = read_sched_clock(); | 202 | cd.epoch_cyc = read_sched_clock(); |
198 | cd.epoch_cyc_copy = cd.epoch_cyc; | ||
199 | cd.suspended = false; | 203 | cd.suspended = false; |
200 | } | 204 | } |
201 | 205 | ||
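The sched_clock.c rewrite above drops the hand-rolled epoch_cyc/epoch_ns/epoch_cyc_copy ordering trick in favour of a seqcount: update_sched_clock() brackets the epoch update with raw_write_seqcount_begin()/end(), and readers retry whenever read_seqcount_retry() reports a concurrent update. A structural, single-threaded sketch of that protocol (plain fields and no memory barriers; the kernel's seqcount primitives supply the ordering this toy omits):

#include <stdint.h>
#include <stdio.h>

struct clock_data {
        unsigned int seq;       /* even = stable, odd = writer in progress */
        uint64_t epoch_ns;
        uint64_t epoch_cyc;
};

static struct clock_data cd;

/* Writer: make the count odd, update the epoch, make it even again. */
static void update_epoch(uint64_t ns, uint64_t cyc)
{
        cd.seq++;
        cd.epoch_ns = ns;
        cd.epoch_cyc = cyc;
        cd.seq++;
}

/* Reader: retry if the count was odd or changed while we were reading. */
static void read_epoch(uint64_t *ns, uint64_t *cyc)
{
        unsigned int seq;

        do {
                seq = cd.seq;
                *ns = cd.epoch_ns;
                *cyc = cd.epoch_cyc;
        } while ((seq & 1) || seq != cd.seq);
}

int main(void)
{
        uint64_t ns, cyc;

        update_epoch(1000, 42);
        read_epoch(&ns, &cyc);
        printf("epoch_ns=%llu epoch_cyc=%llu\n",
               (unsigned long long)ns, (unsigned long long)cyc);
        return 0;
}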
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 218bcb565fed..9532690daaa9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev, | |||
70 | struct clock_event_device *newdev) | 70 | struct clock_event_device *newdev) |
71 | { | 71 | { |
72 | if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || | 72 | if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || |
73 | (newdev->features & CLOCK_EVT_FEAT_PERCPU) || | ||
73 | (newdev->features & CLOCK_EVT_FEAT_C3STOP)) | 74 | (newdev->features & CLOCK_EVT_FEAT_C3STOP)) |
74 | return false; | 75 | return false; |
75 | 76 | ||
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 64522ecdfe0e..162b03ab0ad2 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); | |||
33 | */ | 33 | */ |
34 | ktime_t tick_next_period; | 34 | ktime_t tick_next_period; |
35 | ktime_t tick_period; | 35 | ktime_t tick_period; |
36 | |||
37 | /* | ||
38 | * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR | ||
39 | * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This | ||
40 | * variable has two functions: | ||
41 | * | ||
42 | * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the | ||
43 | * timekeeping lock all at once. Only the CPU which is assigned to do the | ||
44 | * update is handling it. | ||
45 | * | ||
46 | * 2) Hand off the duty in the NOHZ idle case by setting the value to | ||
47 | * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks | ||
48 | * at it will take over and keep the time keeping alive. The handover | ||
49 | * procedure also covers cpu hotplug. | ||
50 | */ | ||
36 | int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; | 51 | int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; |
37 | 52 | ||
38 | /* | 53 | /* |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index bc906cad709b..18e71f7fbc2a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev); | |||
31 | 31 | ||
32 | extern void clockevents_shutdown(struct clock_event_device *dev); | 32 | extern void clockevents_shutdown(struct clock_event_device *dev); |
33 | 33 | ||
34 | extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | 34 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); |
35 | 35 | ||
36 | /* | 36 | /* |
37 | * NO_HZ / high resolution timer shared code | 37 | * NO_HZ / high resolution timer shared code |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3612fc77f834..ea20f7d1ac2c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -361,8 +361,8 @@ void __init tick_nohz_init(void) | |||
361 | /* | 361 | /* |
362 | * NO HZ enabled ? | 362 | * NO HZ enabled ? |
363 | */ | 363 | */ |
364 | int tick_nohz_enabled __read_mostly = 1; | 364 | static int tick_nohz_enabled __read_mostly = 1; |
365 | 365 | int tick_nohz_active __read_mostly; | |
366 | /* | 366 | /* |
367 | * Enable / Disable tickless mode | 367 | * Enable / Disable tickless mode |
368 | */ | 368 | */ |
@@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | |||
465 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 465 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
466 | ktime_t now, idle; | 466 | ktime_t now, idle; |
467 | 467 | ||
468 | if (!tick_nohz_enabled) | 468 | if (!tick_nohz_active) |
469 | return -1; | 469 | return -1; |
470 | 470 | ||
471 | now = ktime_get(); | 471 | now = ktime_get(); |
@@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) | |||
506 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 506 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
507 | ktime_t now, iowait; | 507 | ktime_t now, iowait; |
508 | 508 | ||
509 | if (!tick_nohz_enabled) | 509 | if (!tick_nohz_active) |
510 | return -1; | 510 | return -1; |
511 | 511 | ||
512 | now = ktime_get(); | 512 | now = ktime_get(); |
@@ -711,8 +711,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
711 | return false; | 711 | return false; |
712 | } | 712 | } |
713 | 713 | ||
714 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | 714 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { |
715 | ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; | ||
715 | return false; | 716 | return false; |
717 | } | ||
716 | 718 | ||
717 | if (need_resched()) | 719 | if (need_resched()) |
718 | return false; | 720 | return false; |
@@ -799,11 +801,6 @@ void tick_nohz_idle_enter(void) | |||
799 | local_irq_disable(); | 801 | local_irq_disable(); |
800 | 802 | ||
801 | ts = &__get_cpu_var(tick_cpu_sched); | 803 | ts = &__get_cpu_var(tick_cpu_sched); |
802 | /* | ||
803 | * set ts->inidle unconditionally. even if the system did not | ||
804 | * switch to nohz mode the cpu frequency governers rely on the | ||
805 | * update of the idle time accounting in tick_nohz_start_idle(). | ||
806 | */ | ||
807 | ts->inidle = 1; | 804 | ts->inidle = 1; |
808 | __tick_nohz_idle_enter(ts); | 805 | __tick_nohz_idle_enter(ts); |
809 | 806 | ||
@@ -973,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void) | |||
973 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 970 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
974 | ktime_t next; | 971 | ktime_t next; |
975 | 972 | ||
976 | if (!tick_nohz_enabled) | 973 | if (!tick_nohz_active) |
977 | return; | 974 | return; |
978 | 975 | ||
979 | local_irq_disable(); | 976 | local_irq_disable(); |
@@ -981,7 +978,7 @@ static void tick_nohz_switch_to_nohz(void) | |||
981 | local_irq_enable(); | 978 | local_irq_enable(); |
982 | return; | 979 | return; |
983 | } | 980 | } |
984 | 981 | tick_nohz_active = 1; | |
985 | ts->nohz_mode = NOHZ_MODE_LOWRES; | 982 | ts->nohz_mode = NOHZ_MODE_LOWRES; |
986 | 983 | ||
987 | /* | 984 | /* |
@@ -1139,8 +1136,10 @@ void tick_setup_sched_timer(void) | |||
1139 | } | 1136 | } |
1140 | 1137 | ||
1141 | #ifdef CONFIG_NO_HZ_COMMON | 1138 | #ifdef CONFIG_NO_HZ_COMMON |
1142 | if (tick_nohz_enabled) | 1139 | if (tick_nohz_enabled) { |
1143 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | 1140 | ts->nohz_mode = NOHZ_MODE_HIGHRES; |
1141 | tick_nohz_active = 1; | ||
1142 | } | ||
1144 | #endif | 1143 | #endif |
1145 | } | 1144 | } |
1146 | #endif /* HIGH_RES_TIMERS */ | 1145 | #endif /* HIGH_RES_TIMERS */ |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 947ba25a95a0..87b4f00284c9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) | |||
1347 | tk->xtime_nsec -= remainder; | 1347 | tk->xtime_nsec -= remainder; |
1348 | tk->xtime_nsec += 1ULL << tk->shift; | 1348 | tk->xtime_nsec += 1ULL << tk->shift; |
1349 | tk->ntp_error += remainder << tk->ntp_error_shift; | 1349 | tk->ntp_error += remainder << tk->ntp_error_shift; |
1350 | 1350 | tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; | |
1351 | } | 1351 | } |
1352 | #else | 1352 | #else |
1353 | #define old_vsyscall_fixup(tk) | 1353 | #define old_vsyscall_fixup(tk) |
@@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
1613 | * ktime_get_update_offsets - hrtimer helper | 1613 | * ktime_get_update_offsets - hrtimer helper |
1614 | * @offs_real: pointer to storage for monotonic -> realtime offset | 1614 | * @offs_real: pointer to storage for monotonic -> realtime offset |
1615 | * @offs_boot: pointer to storage for monotonic -> boottime offset | 1615 | * @offs_boot: pointer to storage for monotonic -> boottime offset |
1616 | * @offs_tai: pointer to storage for monotonic -> clock tai offset | ||
1616 | * | 1617 | * |
1617 | * Returns current monotonic time and updates the offsets | 1618 | * Returns current monotonic time and updates the offsets |
1618 | * Called from hrtimer_interupt() or retrigger_next_event() | 1619 | * Called from hrtimer_interrupt() or retrigger_next_event() |
1619 | */ | 1620 | */ |
1620 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, | 1621 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, |
1621 | ktime_t *offs_tai) | 1622 | ktime_t *offs_tai) |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 0b537f27b559..1fb08f21302e 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v) | |||
298 | period = ktime_to_timespec(time); | 298 | period = ktime_to_timespec(time); |
299 | ms = period.tv_nsec / 1000000; | 299 | ms = period.tv_nsec / 1000000; |
300 | 300 | ||
301 | seq_puts(m, "Timer Stats Version: v0.2\n"); | 301 | seq_puts(m, "Timer Stats Version: v0.3\n"); |
302 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); | 302 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); |
303 | if (atomic_read(&overflow_count)) | 303 | if (atomic_read(&overflow_count)) |
304 | seq_printf(m, "Overflow: %d entries\n", | 304 | seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); |
305 | atomic_read(&overflow_count)); | 305 | seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); |
306 | 306 | ||
307 | for (i = 0; i < nr_entries; i++) { | 307 | for (i = 0; i < nr_entries; i++) { |
308 | entry = entries + i; | 308 | entry = entries + i; |
309 | if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { | 309 | if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { |
310 | seq_printf(m, "%4luD, %5d %-16s ", | 310 | seq_printf(m, "%4luD, %5d %-16s ", |
311 | entry->count, entry->pid, entry->comm); | 311 | entry->count, entry->pid, entry->comm); |
312 | } else { | 312 | } else { |
diff --git a/kernel/timer.c b/kernel/timer.c index 4296d13db3d1..accfd241b9e5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) | |||
1092 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | 1092 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), |
1093 | unsigned long data) | 1093 | unsigned long data) |
1094 | { | 1094 | { |
1095 | int preempt_count = preempt_count(); | 1095 | int count = preempt_count(); |
1096 | 1096 | ||
1097 | #ifdef CONFIG_LOCKDEP | 1097 | #ifdef CONFIG_LOCKDEP |
1098 | /* | 1098 | /* |
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
1119 | 1119 | ||
1120 | lock_map_release(&lockdep_map); | 1120 | lock_map_release(&lockdep_map); |
1121 | 1121 | ||
1122 | if (preempt_count != preempt_count()) { | 1122 | if (count != preempt_count()) { |
1123 | WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", | 1123 | WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", |
1124 | fn, preempt_count, preempt_count()); | 1124 | fn, count, preempt_count()); |
1125 | /* | 1125 | /* |
1126 | * Restore the preempt count. That gives us a decent | 1126 | * Restore the preempt count. That gives us a decent |
1127 | * chance to survive and extract information. If the | 1127 | * chance to survive and extract information. If the |
1128 | * callback kept a lock held, bad luck, but not worse | 1128 | * callback kept a lock held, bad luck, but not worse |
1129 | * than the BUG() we had. | 1129 | * than the BUG() we had. |
1130 | */ | 1130 | */ |
1131 | preempt_count() = preempt_count; | 1131 | preempt_count_set(count); |
1132 | } | 1132 | } |
1133 | } | 1133 | } |
1134 | 1134 | ||
@@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu) | |||
1518 | /* | 1518 | /* |
1519 | * The APs use this path later in boot | 1519 | * The APs use this path later in boot |
1520 | */ | 1520 | */ |
1521 | base = kmalloc_node(sizeof(*base), | 1521 | base = kzalloc_node(sizeof(*base), GFP_KERNEL, |
1522 | GFP_KERNEL | __GFP_ZERO, | 1522 | cpu_to_node(cpu)); |
1523 | cpu_to_node(cpu)); | ||
1524 | if (!base) | 1523 | if (!base) |
1525 | return -ENOMEM; | 1524 | return -ENOMEM; |
1526 | 1525 | ||
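In call_timer_fn() above, the local is renamed and the restore now goes through preempt_count_set() because preempt_count() is a plain accessor after the preempt-count rework merged for 3.13 and can no longer be assigned to; the init_timers_cpu() hunk is just the idiomatic kzalloc_node() spelling of kmalloc_node(... | __GFP_ZERO). The surrounding logic is a snapshot/compare/restore guard around the callback, sketched below in userspace form (the callback and counter are made up for illustration):

#include <stdio.h>

static int fake_preempt_count;          /* stand-in for preempt_count() */

static void leaky_callback(void)
{
        fake_preempt_count++;           /* "forgets" to drop what it took */
}

static void call_timer_fn(void (*fn)(void))
{
        int count = fake_preempt_count; /* snapshot before the callback */

        fn();

        if (count != fake_preempt_count) {
                fprintf(stderr, "timer: callback leaked preempt count %d -> %d\n",
                        count, fake_preempt_count);
                fake_preempt_count = count;     /* restore, like preempt_count_set() */
        }
}

int main(void)
{
        call_timer_fn(leaky_callback);
        return 0;
}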
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b8b8560bfb95..f785aef65799 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/export.h> | 26 | #include <linux/export.h> |
27 | #include <linux/time.h> | 27 | #include <linux/time.h> |
28 | #include <linux/uaccess.h> | 28 | #include <linux/uaccess.h> |
29 | #include <linux/list.h> | ||
29 | 30 | ||
30 | #include <trace/events/block.h> | 31 | #include <trace/events/block.h> |
31 | 32 | ||
@@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1; | |||
38 | static struct trace_array *blk_tr; | 39 | static struct trace_array *blk_tr; |
39 | static bool blk_tracer_enabled __read_mostly; | 40 | static bool blk_tracer_enabled __read_mostly; |
40 | 41 | ||
42 | static LIST_HEAD(running_trace_list); | ||
43 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); | ||
44 | |||
41 | /* Select an alternative, minimalistic output than the original one */ | 45 | /* Select an alternative, minimalistic output than the original one */ |
42 | #define TRACE_BLK_OPT_CLASSIC 0x1 | 46 | #define TRACE_BLK_OPT_CLASSIC 0x1 |
43 | 47 | ||
@@ -107,10 +111,18 @@ record_it: | |||
107 | * Send out a notify for this process, if we haven't done so since a trace | 111 | * Send out a notify for this process, if we haven't done so since a trace |
108 | * started | 112 | * started |
109 | */ | 113 | */ |
110 | static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) | 114 | static void trace_note_tsk(struct task_struct *tsk) |
111 | { | 115 | { |
116 | unsigned long flags; | ||
117 | struct blk_trace *bt; | ||
118 | |||
112 | tsk->btrace_seq = blktrace_seq; | 119 | tsk->btrace_seq = blktrace_seq; |
113 | trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); | 120 | spin_lock_irqsave(&running_trace_lock, flags); |
121 | list_for_each_entry(bt, &running_trace_list, running_list) { | ||
122 | trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, | ||
123 | sizeof(tsk->comm)); | ||
124 | } | ||
125 | spin_unlock_irqrestore(&running_trace_lock, flags); | ||
114 | } | 126 | } |
115 | 127 | ||
116 | static void trace_note_time(struct blk_trace *bt) | 128 | static void trace_note_time(struct blk_trace *bt) |
@@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
229 | goto record_it; | 241 | goto record_it; |
230 | } | 242 | } |
231 | 243 | ||
244 | if (unlikely(tsk->btrace_seq != blktrace_seq)) | ||
245 | trace_note_tsk(tsk); | ||
246 | |||
232 | /* | 247 | /* |
233 | * A word about the locking here - we disable interrupts to reserve | 248 | * A word about the locking here - we disable interrupts to reserve |
234 | * some space in the relay per-cpu buffer, to prevent an irq | 249 | * some space in the relay per-cpu buffer, to prevent an irq |
235 | * from coming in and stepping on our toes. | 250 | * from coming in and stepping on our toes. |
236 | */ | 251 | */ |
237 | local_irq_save(flags); | 252 | local_irq_save(flags); |
238 | |||
239 | if (unlikely(tsk->btrace_seq != blktrace_seq)) | ||
240 | trace_note_tsk(bt, tsk); | ||
241 | |||
242 | t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); | 253 | t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); |
243 | if (t) { | 254 | if (t) { |
244 | sequence = per_cpu_ptr(bt->sequence, cpu); | 255 | sequence = per_cpu_ptr(bt->sequence, cpu); |
@@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
477 | bt->dir = dir; | 488 | bt->dir = dir; |
478 | bt->dev = dev; | 489 | bt->dev = dev; |
479 | atomic_set(&bt->dropped, 0); | 490 | atomic_set(&bt->dropped, 0); |
491 | INIT_LIST_HEAD(&bt->running_list); | ||
480 | 492 | ||
481 | ret = -EIO; | 493 | ret = -EIO; |
482 | bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, | 494 | bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, |
@@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, | |||
567 | .end_lba = cbuts.end_lba, | 579 | .end_lba = cbuts.end_lba, |
568 | .pid = cbuts.pid, | 580 | .pid = cbuts.pid, |
569 | }; | 581 | }; |
570 | memcpy(&buts.name, &cbuts.name, 32); | ||
571 | 582 | ||
572 | ret = do_blk_trace_setup(q, name, dev, bdev, &buts); | 583 | ret = do_blk_trace_setup(q, name, dev, bdev, &buts); |
573 | if (ret) | 584 | if (ret) |
574 | return ret; | 585 | return ret; |
575 | 586 | ||
576 | if (copy_to_user(arg, &buts.name, 32)) { | 587 | if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { |
577 | blk_trace_remove(q); | 588 | blk_trace_remove(q); |
578 | return -EFAULT; | 589 | return -EFAULT; |
579 | } | 590 | } |
@@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start) | |||
601 | blktrace_seq++; | 612 | blktrace_seq++; |
602 | smp_mb(); | 613 | smp_mb(); |
603 | bt->trace_state = Blktrace_running; | 614 | bt->trace_state = Blktrace_running; |
615 | spin_lock_irq(&running_trace_lock); | ||
616 | list_add(&bt->running_list, &running_trace_list); | ||
617 | spin_unlock_irq(&running_trace_lock); | ||
604 | 618 | ||
605 | trace_note_time(bt); | 619 | trace_note_time(bt); |
606 | ret = 0; | 620 | ret = 0; |
@@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start) | |||
608 | } else { | 622 | } else { |
609 | if (bt->trace_state == Blktrace_running) { | 623 | if (bt->trace_state == Blktrace_running) { |
610 | bt->trace_state = Blktrace_stopped; | 624 | bt->trace_state = Blktrace_stopped; |
625 | spin_lock_irq(&running_trace_lock); | ||
626 | list_del_init(&bt->running_list); | ||
627 | spin_unlock_irq(&running_trace_lock); | ||
611 | relay_flush(bt->rchan); | 628 | relay_flush(bt->rchan); |
612 | ret = 0; | 629 | ret = 0; |
613 | } | 630 | } |
@@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q) | |||
1472 | if (atomic_dec_and_test(&blk_probes_ref)) | 1489 | if (atomic_dec_and_test(&blk_probes_ref)) |
1473 | blk_unregister_tracepoints(); | 1490 | blk_unregister_tracepoints(); |
1474 | 1491 | ||
1492 | spin_lock_irq(&running_trace_lock); | ||
1493 | list_del(&bt->running_list); | ||
1494 | spin_unlock_irq(&running_trace_lock); | ||
1475 | blk_trace_free(bt); | 1495 | blk_trace_free(bt); |
1476 | return 0; | 1496 | return 0; |
1477 | } | 1497 | } |
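The blktrace change above replaces the single-bt notification with a global running_trace_list: when a task first generates trace data after a new trace starts, trace_note_tsk() now walks every running trace under running_trace_lock, and does so before interrupts are disabled for the relay reservation. A userspace analogue of that walk-all-registered-consumers pattern (the structures and names below are illustrative, not the real blktrace ones):

#include <pthread.h>
#include <stdio.h>

struct tracer {
        const char *name;
        struct tracer *next;
};

static struct tracer *running_list;     /* every currently running tracer */
static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;

/* Tell every running tracer about a task, holding the list lock throughout. */
static void trace_note_tsk(const char *task_comm)
{
        pthread_mutex_lock(&running_lock);
        for (struct tracer *t = running_list; t; t = t->next)
                printf("tracer %s: note task %s\n", t->name, task_comm);
        pthread_mutex_unlock(&running_lock);
}

int main(void)
{
        struct tracer a = { "sda", NULL };
        struct tracer b = { "sdb", &a };

        running_list = &b;
        trace_note_tsk("kworker/0:1");
        return 0;
}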
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 03cf44ac54d3..72a0f81dc5a8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -367,9 +367,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, | |||
367 | 367 | ||
368 | static int __register_ftrace_function(struct ftrace_ops *ops) | 368 | static int __register_ftrace_function(struct ftrace_ops *ops) |
369 | { | 369 | { |
370 | if (unlikely(ftrace_disabled)) | ||
371 | return -ENODEV; | ||
372 | |||
373 | if (FTRACE_WARN_ON(ops == &global_ops)) | 370 | if (FTRACE_WARN_ON(ops == &global_ops)) |
374 | return -EINVAL; | 371 | return -EINVAL; |
375 | 372 | ||
@@ -428,9 +425,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
428 | { | 425 | { |
429 | int ret; | 426 | int ret; |
430 | 427 | ||
431 | if (ftrace_disabled) | ||
432 | return -ENODEV; | ||
433 | |||
434 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) | 428 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) |
435 | return -EBUSY; | 429 | return -EBUSY; |
436 | 430 | ||
@@ -781,7 +775,7 @@ static int ftrace_profile_init(void) | |||
781 | int cpu; | 775 | int cpu; |
782 | int ret = 0; | 776 | int ret = 0; |
783 | 777 | ||
784 | for_each_online_cpu(cpu) { | 778 | for_each_possible_cpu(cpu) { |
785 | ret = ftrace_profile_init_cpu(cpu); | 779 | ret = ftrace_profile_init_cpu(cpu); |
786 | if (ret) | 780 | if (ret) |
787 | break; | 781 | break; |
@@ -2088,10 +2082,15 @@ static void ftrace_startup_enable(int command) | |||
2088 | static int ftrace_startup(struct ftrace_ops *ops, int command) | 2082 | static int ftrace_startup(struct ftrace_ops *ops, int command) |
2089 | { | 2083 | { |
2090 | bool hash_enable = true; | 2084 | bool hash_enable = true; |
2085 | int ret; | ||
2091 | 2086 | ||
2092 | if (unlikely(ftrace_disabled)) | 2087 | if (unlikely(ftrace_disabled)) |
2093 | return -ENODEV; | 2088 | return -ENODEV; |
2094 | 2089 | ||
2090 | ret = __register_ftrace_function(ops); | ||
2091 | if (ret) | ||
2092 | return ret; | ||
2093 | |||
2095 | ftrace_start_up++; | 2094 | ftrace_start_up++; |
2096 | command |= FTRACE_UPDATE_CALLS; | 2095 | command |= FTRACE_UPDATE_CALLS; |
2097 | 2096 | ||
@@ -2113,12 +2112,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) | |||
2113 | return 0; | 2112 | return 0; |
2114 | } | 2113 | } |
2115 | 2114 | ||
2116 | static void ftrace_shutdown(struct ftrace_ops *ops, int command) | 2115 | static int ftrace_shutdown(struct ftrace_ops *ops, int command) |
2117 | { | 2116 | { |
2118 | bool hash_disable = true; | 2117 | bool hash_disable = true; |
2118 | int ret; | ||
2119 | 2119 | ||
2120 | if (unlikely(ftrace_disabled)) | 2120 | if (unlikely(ftrace_disabled)) |
2121 | return; | 2121 | return -ENODEV; |
2122 | |||
2123 | ret = __unregister_ftrace_function(ops); | ||
2124 | if (ret) | ||
2125 | return ret; | ||
2122 | 2126 | ||
2123 | ftrace_start_up--; | 2127 | ftrace_start_up--; |
2124 | /* | 2128 | /* |
@@ -2153,9 +2157,10 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
2153 | } | 2157 | } |
2154 | 2158 | ||
2155 | if (!command || !ftrace_enabled) | 2159 | if (!command || !ftrace_enabled) |
2156 | return; | 2160 | return 0; |
2157 | 2161 | ||
2158 | ftrace_run_update_code(command); | 2162 | ftrace_run_update_code(command); |
2163 | return 0; | ||
2159 | } | 2164 | } |
2160 | 2165 | ||
2161 | static void ftrace_startup_sysctl(void) | 2166 | static void ftrace_startup_sysctl(void) |
@@ -3060,16 +3065,13 @@ static void __enable_ftrace_function_probe(void) | |||
3060 | if (i == FTRACE_FUNC_HASHSIZE) | 3065 | if (i == FTRACE_FUNC_HASHSIZE) |
3061 | return; | 3066 | return; |
3062 | 3067 | ||
3063 | ret = __register_ftrace_function(&trace_probe_ops); | 3068 | ret = ftrace_startup(&trace_probe_ops, 0); |
3064 | if (!ret) | ||
3065 | ret = ftrace_startup(&trace_probe_ops, 0); | ||
3066 | 3069 | ||
3067 | ftrace_probe_registered = 1; | 3070 | ftrace_probe_registered = 1; |
3068 | } | 3071 | } |
3069 | 3072 | ||
3070 | static void __disable_ftrace_function_probe(void) | 3073 | static void __disable_ftrace_function_probe(void) |
3071 | { | 3074 | { |
3072 | int ret; | ||
3073 | int i; | 3075 | int i; |
3074 | 3076 | ||
3075 | if (!ftrace_probe_registered) | 3077 | if (!ftrace_probe_registered) |
@@ -3082,9 +3084,7 @@ static void __disable_ftrace_function_probe(void) | |||
3082 | } | 3084 | } |
3083 | 3085 | ||
3084 | /* no more funcs left */ | 3086 | /* no more funcs left */ |
3085 | ret = __unregister_ftrace_function(&trace_probe_ops); | 3087 | ftrace_shutdown(&trace_probe_ops, 0); |
3086 | if (!ret) | ||
3087 | ftrace_shutdown(&trace_probe_ops, 0); | ||
3088 | 3088 | ||
3089 | ftrace_probe_registered = 0; | 3089 | ftrace_probe_registered = 0; |
3090 | } | 3090 | } |
@@ -3307,7 +3307,11 @@ void unregister_ftrace_function_probe_all(char *glob) | |||
3307 | static LIST_HEAD(ftrace_commands); | 3307 | static LIST_HEAD(ftrace_commands); |
3308 | static DEFINE_MUTEX(ftrace_cmd_mutex); | 3308 | static DEFINE_MUTEX(ftrace_cmd_mutex); |
3309 | 3309 | ||
3310 | int register_ftrace_command(struct ftrace_func_command *cmd) | 3310 | /* |
3311 | * Currently we only register ftrace commands from __init, so mark this | ||
3312 | * __init too. | ||
3313 | */ | ||
3314 | __init int register_ftrace_command(struct ftrace_func_command *cmd) | ||
3311 | { | 3315 | { |
3312 | struct ftrace_func_command *p; | 3316 | struct ftrace_func_command *p; |
3313 | int ret = 0; | 3317 | int ret = 0; |
@@ -3326,7 +3330,11 @@ int register_ftrace_command(struct ftrace_func_command *cmd) | |||
3326 | return ret; | 3330 | return ret; |
3327 | } | 3331 | } |
3328 | 3332 | ||
3329 | int unregister_ftrace_command(struct ftrace_func_command *cmd) | 3333 | /* |
3334 | * Currently we only unregister ftrace commands from __init, so mark | ||
3335 | * this __init too. | ||
3336 | */ | ||
3337 | __init int unregister_ftrace_command(struct ftrace_func_command *cmd) | ||
3330 | { | 3338 | { |
3331 | struct ftrace_func_command *p, *n; | 3339 | struct ftrace_func_command *p, *n; |
3332 | int ret = -ENODEV; | 3340 | int ret = -ENODEV; |
@@ -3641,7 +3649,7 @@ __setup("ftrace_filter=", set_ftrace_filter); | |||
3641 | 3649 | ||
3642 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3650 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
3643 | static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; | 3651 | static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; |
3644 | static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); | 3652 | static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); |
3645 | 3653 | ||
3646 | static int __init set_graph_function(char *str) | 3654 | static int __init set_graph_function(char *str) |
3647 | { | 3655 | { |
@@ -3659,7 +3667,7 @@ static void __init set_ftrace_early_graph(char *buf) | |||
3659 | func = strsep(&buf, ","); | 3667 | func = strsep(&buf, ","); |
3660 | /* we allow only one expression at a time */ | 3668 | /* we allow only one expression at a time */ |
3661 | ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, | 3669 | ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, |
3662 | func); | 3670 | FTRACE_GRAPH_MAX_FUNCS, func); |
3663 | if (ret) | 3671 | if (ret) |
3664 | printk(KERN_DEBUG "ftrace: function %s not " | 3672 | printk(KERN_DEBUG "ftrace: function %s not " |
3665 | "traceable\n", func); | 3673 | "traceable\n", func); |
@@ -3776,15 +3784,25 @@ static const struct file_operations ftrace_notrace_fops = { | |||
3776 | static DEFINE_MUTEX(graph_lock); | 3784 | static DEFINE_MUTEX(graph_lock); |
3777 | 3785 | ||
3778 | int ftrace_graph_count; | 3786 | int ftrace_graph_count; |
3779 | int ftrace_graph_filter_enabled; | 3787 | int ftrace_graph_notrace_count; |
3780 | unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; | 3788 | unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; |
3789 | unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; | ||
3790 | |||
3791 | struct ftrace_graph_data { | ||
3792 | unsigned long *table; | ||
3793 | size_t size; | ||
3794 | int *count; | ||
3795 | const struct seq_operations *seq_ops; | ||
3796 | }; | ||
3781 | 3797 | ||
3782 | static void * | 3798 | static void * |
3783 | __g_next(struct seq_file *m, loff_t *pos) | 3799 | __g_next(struct seq_file *m, loff_t *pos) |
3784 | { | 3800 | { |
3785 | if (*pos >= ftrace_graph_count) | 3801 | struct ftrace_graph_data *fgd = m->private; |
3802 | |||
3803 | if (*pos >= *fgd->count) | ||
3786 | return NULL; | 3804 | return NULL; |
3787 | return &ftrace_graph_funcs[*pos]; | 3805 | return &fgd->table[*pos]; |
3788 | } | 3806 | } |
3789 | 3807 | ||
3790 | static void * | 3808 | static void * |
@@ -3796,10 +3814,12 @@ g_next(struct seq_file *m, void *v, loff_t *pos) | |||
3796 | 3814 | ||
3797 | static void *g_start(struct seq_file *m, loff_t *pos) | 3815 | static void *g_start(struct seq_file *m, loff_t *pos) |
3798 | { | 3816 | { |
3817 | struct ftrace_graph_data *fgd = m->private; | ||
3818 | |||
3799 | mutex_lock(&graph_lock); | 3819 | mutex_lock(&graph_lock); |
3800 | 3820 | ||
3801 | /* Nothing, tell g_show to print all functions are enabled */ | 3821 | /* Nothing, tell g_show to print all functions are enabled */ |
3802 | if (!ftrace_graph_filter_enabled && !*pos) | 3822 | if (!*fgd->count && !*pos) |
3803 | return (void *)1; | 3823 | return (void *)1; |
3804 | 3824 | ||
3805 | return __g_next(m, pos); | 3825 | return __g_next(m, pos); |
@@ -3835,38 +3855,88 @@ static const struct seq_operations ftrace_graph_seq_ops = { | |||
3835 | }; | 3855 | }; |
3836 | 3856 | ||
3837 | static int | 3857 | static int |
3838 | ftrace_graph_open(struct inode *inode, struct file *file) | 3858 | __ftrace_graph_open(struct inode *inode, struct file *file, |
3859 | struct ftrace_graph_data *fgd) | ||
3839 | { | 3860 | { |
3840 | int ret = 0; | 3861 | int ret = 0; |
3841 | 3862 | ||
3842 | if (unlikely(ftrace_disabled)) | ||
3843 | return -ENODEV; | ||
3844 | |||
3845 | mutex_lock(&graph_lock); | 3863 | mutex_lock(&graph_lock); |
3846 | if ((file->f_mode & FMODE_WRITE) && | 3864 | if ((file->f_mode & FMODE_WRITE) && |
3847 | (file->f_flags & O_TRUNC)) { | 3865 | (file->f_flags & O_TRUNC)) { |
3848 | ftrace_graph_filter_enabled = 0; | 3866 | *fgd->count = 0; |
3849 | ftrace_graph_count = 0; | 3867 | memset(fgd->table, 0, fgd->size * sizeof(*fgd->table)); |
3850 | memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); | ||
3851 | } | 3868 | } |
3852 | mutex_unlock(&graph_lock); | 3869 | mutex_unlock(&graph_lock); |
3853 | 3870 | ||
3854 | if (file->f_mode & FMODE_READ) | 3871 | if (file->f_mode & FMODE_READ) { |
3855 | ret = seq_open(file, &ftrace_graph_seq_ops); | 3872 | ret = seq_open(file, fgd->seq_ops); |
3873 | if (!ret) { | ||
3874 | struct seq_file *m = file->private_data; | ||
3875 | m->private = fgd; | ||
3876 | } | ||
3877 | } else | ||
3878 | file->private_data = fgd; | ||
3856 | 3879 | ||
3857 | return ret; | 3880 | return ret; |
3858 | } | 3881 | } |
3859 | 3882 | ||
3860 | static int | 3883 | static int |
3884 | ftrace_graph_open(struct inode *inode, struct file *file) | ||
3885 | { | ||
3886 | struct ftrace_graph_data *fgd; | ||
3887 | |||
3888 | if (unlikely(ftrace_disabled)) | ||
3889 | return -ENODEV; | ||
3890 | |||
3891 | fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); | ||
3892 | if (fgd == NULL) | ||
3893 | return -ENOMEM; | ||
3894 | |||
3895 | fgd->table = ftrace_graph_funcs; | ||
3896 | fgd->size = FTRACE_GRAPH_MAX_FUNCS; | ||
3897 | fgd->count = &ftrace_graph_count; | ||
3898 | fgd->seq_ops = &ftrace_graph_seq_ops; | ||
3899 | |||
3900 | return __ftrace_graph_open(inode, file, fgd); | ||
3901 | } | ||
3902 | |||
3903 | static int | ||
3904 | ftrace_graph_notrace_open(struct inode *inode, struct file *file) | ||
3905 | { | ||
3906 | struct ftrace_graph_data *fgd; | ||
3907 | |||
3908 | if (unlikely(ftrace_disabled)) | ||
3909 | return -ENODEV; | ||
3910 | |||
3911 | fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); | ||
3912 | if (fgd == NULL) | ||
3913 | return -ENOMEM; | ||
3914 | |||
3915 | fgd->table = ftrace_graph_notrace_funcs; | ||
3916 | fgd->size = FTRACE_GRAPH_MAX_FUNCS; | ||
3917 | fgd->count = &ftrace_graph_notrace_count; | ||
3918 | fgd->seq_ops = &ftrace_graph_seq_ops; | ||
3919 | |||
3920 | return __ftrace_graph_open(inode, file, fgd); | ||
3921 | } | ||
3922 | |||
3923 | static int | ||
3861 | ftrace_graph_release(struct inode *inode, struct file *file) | 3924 | ftrace_graph_release(struct inode *inode, struct file *file) |
3862 | { | 3925 | { |
3863 | if (file->f_mode & FMODE_READ) | 3926 | if (file->f_mode & FMODE_READ) { |
3927 | struct seq_file *m = file->private_data; | ||
3928 | |||
3929 | kfree(m->private); | ||
3864 | seq_release(inode, file); | 3930 | seq_release(inode, file); |
3931 | } else { | ||
3932 | kfree(file->private_data); | ||
3933 | } | ||
3934 | |||
3865 | return 0; | 3935 | return 0; |
3866 | } | 3936 | } |
3867 | 3937 | ||
3868 | static int | 3938 | static int |
3869 | ftrace_set_func(unsigned long *array, int *idx, char *buffer) | 3939 | ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) |
3870 | { | 3940 | { |
3871 | struct dyn_ftrace *rec; | 3941 | struct dyn_ftrace *rec; |
3872 | struct ftrace_page *pg; | 3942 | struct ftrace_page *pg; |
@@ -3879,7 +3949,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
3879 | 3949 | ||
3880 | /* decode regex */ | 3950 | /* decode regex */ |
3881 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); | 3951 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); |
3882 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) | 3952 | if (!not && *idx >= size) |
3883 | return -EBUSY; | 3953 | return -EBUSY; |
3884 | 3954 | ||
3885 | search_len = strlen(search); | 3955 | search_len = strlen(search); |
@@ -3907,7 +3977,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
3907 | fail = 0; | 3977 | fail = 0; |
3908 | if (!exists) { | 3978 | if (!exists) { |
3909 | array[(*idx)++] = rec->ip; | 3979 | array[(*idx)++] = rec->ip; |
3910 | if (*idx >= FTRACE_GRAPH_MAX_FUNCS) | 3980 | if (*idx >= size) |
3911 | goto out; | 3981 | goto out; |
3912 | } | 3982 | } |
3913 | } else { | 3983 | } else { |
@@ -3925,8 +3995,6 @@ out: | |||
3925 | if (fail) | 3995 | if (fail) |
3926 | return -EINVAL; | 3996 | return -EINVAL; |
3927 | 3997 | ||
3928 | ftrace_graph_filter_enabled = !!(*idx); | ||
3929 | |||
3930 | return 0; | 3998 | return 0; |
3931 | } | 3999 | } |
3932 | 4000 | ||
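
The hunks above replace the single global graph-filter table with a small descriptor each debugfs file carries, and drop ftrace_graph_filter_enabled because a non-zero count already says the filter is active. A minimal standalone sketch of that pattern (stand-in types and names, not the kernel structures):

#include <stdlib.h>

#define MAX_FUNCS 32

/* Stand-in for struct ftrace_graph_data: which table a file operates on. */
struct graph_data {
	unsigned long *table;
	int size;
	int *count;
};

static unsigned long filter_funcs[MAX_FUNCS];
static unsigned long notrace_funcs[MAX_FUNCS];
static int filter_count, notrace_count;

/* One constructor serves both files; only the table/count bindings differ,
 * mirroring ftrace_graph_open() vs ftrace_graph_notrace_open(). */
static struct graph_data *graph_data_alloc(int notrace)
{
	struct graph_data *gd = malloc(sizeof(*gd));

	if (!gd)
		return NULL;
	gd->table = notrace ? notrace_funcs : filter_funcs;
	gd->size  = MAX_FUNCS;
	gd->count = notrace ? &notrace_count : &filter_count;
	return gd;
}

/* Adding an entry checks the descriptor's own size, the way ftrace_set_func()
 * now takes a size argument instead of hard-coding FTRACE_GRAPH_MAX_FUNCS;
 * a non-zero *count is the "filter enabled" signal. */
static int graph_data_add(struct graph_data *gd, unsigned long ip)
{
	if (*gd->count >= gd->size)
		return -1;
	gd->table[(*gd->count)++] = ip;
	return 0;
}
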
@@ -3935,36 +4003,33 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, | |||
3935 | size_t cnt, loff_t *ppos) | 4003 | size_t cnt, loff_t *ppos) |
3936 | { | 4004 | { |
3937 | struct trace_parser parser; | 4005 | struct trace_parser parser; |
3938 | ssize_t read, ret; | 4006 | ssize_t read, ret = 0; |
4007 | struct ftrace_graph_data *fgd = file->private_data; | ||
3939 | 4008 | ||
3940 | if (!cnt) | 4009 | if (!cnt) |
3941 | return 0; | 4010 | return 0; |
3942 | 4011 | ||
3943 | mutex_lock(&graph_lock); | 4012 | if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) |
3944 | 4013 | return -ENOMEM; | |
3945 | if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { | ||
3946 | ret = -ENOMEM; | ||
3947 | goto out_unlock; | ||
3948 | } | ||
3949 | 4014 | ||
3950 | read = trace_get_user(&parser, ubuf, cnt, ppos); | 4015 | read = trace_get_user(&parser, ubuf, cnt, ppos); |
3951 | 4016 | ||
3952 | if (read >= 0 && trace_parser_loaded((&parser))) { | 4017 | if (read >= 0 && trace_parser_loaded((&parser))) { |
3953 | parser.buffer[parser.idx] = 0; | 4018 | parser.buffer[parser.idx] = 0; |
3954 | 4019 | ||
4020 | mutex_lock(&graph_lock); | ||
4021 | |||
3955 | /* we allow only one expression at a time */ | 4022 | /* we allow only one expression at a time */ |
3956 | ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, | 4023 | ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, |
3957 | parser.buffer); | 4024 | parser.buffer); |
3958 | if (ret) | 4025 | |
3959 | goto out_free; | 4026 | mutex_unlock(&graph_lock); |
3960 | } | 4027 | } |
3961 | 4028 | ||
3962 | ret = read; | 4029 | if (!ret) |
4030 | ret = read; | ||
3963 | 4031 | ||
3964 | out_free: | ||
3965 | trace_parser_put(&parser); | 4032 | trace_parser_put(&parser); |
3966 | out_unlock: | ||
3967 | mutex_unlock(&graph_lock); | ||
3968 | 4033 | ||
3969 | return ret; | 4034 | return ret; |
3970 | } | 4035 | } |
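
The rewritten ftrace_graph_write() above narrows the critical section: the parser is allocated and the user buffer consumed with no lock held, and graph_lock is taken only around the table update. A generic sketch of that shape (hypothetical names, a pthread mutex standing in for the kernel mutex):

#include <pthread.h>
#include <string.h>

#define MAX_ENTRIES 32

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static char table[MAX_ENTRIES][64];
static int table_count;

/* Parse one whitespace-delimited token; stands in for the trace_parser. */
static int parse_token(const char *buf, char *tok, size_t tok_sz)
{
	size_t n = strcspn(buf, " \t\n");

	if (n == 0 || n >= tok_sz)
		return 0;
	memcpy(tok, buf, n);
	tok[n] = '\0';
	return 1;
}

int graph_write(const char *buf)
{
	char tok[64];
	int ret = 0;

	/* Parsing touches only local state, so no lock is needed here. */
	if (!parse_token(buf, tok, sizeof(tok)))
		return 0;

	/* Only the shared-table update runs under the lock, mirroring how
	 * graph_lock now wraps just the ftrace_set_func() call. */
	pthread_mutex_lock(&table_lock);
	if (table_count < MAX_ENTRIES)
		strcpy(table[table_count++], tok);
	else
		ret = -1;
	pthread_mutex_unlock(&table_lock);

	return ret;
}
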
@@ -3976,6 +4041,14 @@ static const struct file_operations ftrace_graph_fops = { | |||
3976 | .llseek = ftrace_filter_lseek, | 4041 | .llseek = ftrace_filter_lseek, |
3977 | .release = ftrace_graph_release, | 4042 | .release = ftrace_graph_release, |
3978 | }; | 4043 | }; |
4044 | |||
4045 | static const struct file_operations ftrace_graph_notrace_fops = { | ||
4046 | .open = ftrace_graph_notrace_open, | ||
4047 | .read = seq_read, | ||
4048 | .write = ftrace_graph_write, | ||
4049 | .llseek = ftrace_filter_lseek, | ||
4050 | .release = ftrace_graph_release, | ||
4051 | }; | ||
3979 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 4052 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
3980 | 4053 | ||
3981 | static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | 4054 | static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) |
@@ -3997,6 +4070,9 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
3997 | trace_create_file("set_graph_function", 0444, d_tracer, | 4070 | trace_create_file("set_graph_function", 0444, d_tracer, |
3998 | NULL, | 4071 | NULL, |
3999 | &ftrace_graph_fops); | 4072 | &ftrace_graph_fops); |
4073 | trace_create_file("set_graph_notrace", 0444, d_tracer, | ||
4074 | NULL, | ||
4075 | &ftrace_graph_notrace_fops); | ||
4000 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 4076 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
4001 | 4077 | ||
4002 | return 0; | 4078 | return 0; |
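
With the set_graph_notrace file registered above, excluding a function from the graph trace is a plain write from userspace. A small example, assuming debugfs is mounted at the conventional /sys/kernel/debug, the kernel has CONFIG_FUNCTION_GRAPH_TRACER, and the program runs as root:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Conventional tracing location for v3.13; adjust if mounted elsewhere. */
	const char *path = "/sys/kernel/debug/tracing/set_graph_notrace";
	const char *func = "schedule\n";	/* example function to exclude */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, func, strlen(func)) < 0)
		perror("write");
	close(fd);
	return 0;
}
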
@@ -4290,12 +4366,15 @@ core_initcall(ftrace_nodyn_init); | |||
4290 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 4366 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } |
4291 | static inline void ftrace_startup_enable(int command) { } | 4367 | static inline void ftrace_startup_enable(int command) { } |
4292 | /* Keep as macros so we do not need to define the commands */ | 4368 | /* Keep as macros so we do not need to define the commands */ |
4293 | # define ftrace_startup(ops, command) \ | 4369 | # define ftrace_startup(ops, command) \ |
4294 | ({ \ | 4370 | ({ \ |
4295 | (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ | 4371 | int ___ret = __register_ftrace_function(ops); \ |
4296 | 0; \ | 4372 | if (!___ret) \ |
4373 | (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ | ||
4374 | ___ret; \ | ||
4297 | }) | 4375 | }) |
4298 | # define ftrace_shutdown(ops, command) do { } while (0) | 4376 | # define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) |
4377 | |||
4299 | # define ftrace_startup_sysctl() do { } while (0) | 4378 | # define ftrace_startup_sysctl() do { } while (0) |
4300 | # define ftrace_shutdown_sysctl() do { } while (0) | 4379 | # define ftrace_shutdown_sysctl() do { } while (0) |
4301 | 4380 | ||
@@ -4320,12 +4399,21 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, | |||
4320 | */ | 4399 | */ |
4321 | preempt_disable_notrace(); | 4400 | preempt_disable_notrace(); |
4322 | trace_recursion_set(TRACE_CONTROL_BIT); | 4401 | trace_recursion_set(TRACE_CONTROL_BIT); |
4402 | |||
4403 | /* | ||
4404 | * Control funcs (perf) uses RCU. Only trace if | ||
4405 | * RCU is currently active. | ||
4406 | */ | ||
4407 | if (!rcu_is_watching()) | ||
4408 | goto out; | ||
4409 | |||
4323 | do_for_each_ftrace_op(op, ftrace_control_list) { | 4410 | do_for_each_ftrace_op(op, ftrace_control_list) { |
4324 | if (!(op->flags & FTRACE_OPS_FL_STUB) && | 4411 | if (!(op->flags & FTRACE_OPS_FL_STUB) && |
4325 | !ftrace_function_local_disabled(op) && | 4412 | !ftrace_function_local_disabled(op) && |
4326 | ftrace_ops_test(op, ip, regs)) | 4413 | ftrace_ops_test(op, ip, regs)) |
4327 | op->func(ip, parent_ip, op, regs); | 4414 | op->func(ip, parent_ip, op, regs); |
4328 | } while_for_each_ftrace_op(op); | 4415 | } while_for_each_ftrace_op(op); |
4416 | out: | ||
4329 | trace_recursion_clear(TRACE_CONTROL_BIT); | 4417 | trace_recursion_clear(TRACE_CONTROL_BIT); |
4330 | preempt_enable_notrace(); | 4418 | preempt_enable_notrace(); |
4331 | } | 4419 | } |
@@ -4695,9 +4783,7 @@ int register_ftrace_function(struct ftrace_ops *ops) | |||
4695 | 4783 | ||
4696 | mutex_lock(&ftrace_lock); | 4784 | mutex_lock(&ftrace_lock); |
4697 | 4785 | ||
4698 | ret = __register_ftrace_function(ops); | 4786 | ret = ftrace_startup(ops, 0); |
4699 | if (!ret) | ||
4700 | ret = ftrace_startup(ops, 0); | ||
4701 | 4787 | ||
4702 | mutex_unlock(&ftrace_lock); | 4788 | mutex_unlock(&ftrace_lock); |
4703 | 4789 | ||
@@ -4716,9 +4802,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) | |||
4716 | int ret; | 4802 | int ret; |
4717 | 4803 | ||
4718 | mutex_lock(&ftrace_lock); | 4804 | mutex_lock(&ftrace_lock); |
4719 | ret = __unregister_ftrace_function(ops); | 4805 | ret = ftrace_shutdown(ops, 0); |
4720 | if (!ret) | ||
4721 | ftrace_shutdown(ops, 0); | ||
4722 | mutex_unlock(&ftrace_lock); | 4806 | mutex_unlock(&ftrace_lock); |
4723 | 4807 | ||
4724 | return ret; | 4808 | return ret; |
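
After the two hunks above, register_ftrace_function() and unregister_ftrace_function() no longer call __register_ftrace_function()/__unregister_ftrace_function() themselves; ftrace_startup() and ftrace_shutdown() own both the registration and the enable/disable step, so every caller gets the same ordering. The resulting contract, as a stand-alone sketch with stand-in types:

struct ops { int registered; int enabled; };

static int register_internal(struct ops *o)   { o->registered = 1; return 0; }
static int unregister_internal(struct ops *o) { o->registered = 0; return 0; }

/* startup registers first and only then enables, mirroring the reworked
 * ftrace_startup(); on failure nothing is left half-done. */
static int startup(struct ops *o)
{
	int ret = register_internal(o);

	if (ret)
		return ret;
	o->enabled = 1;
	return 0;
}

static int shutdown(struct ops *o)
{
	o->enabled = 0;
	return unregister_internal(o);
}

/* The public entry points just wrap startup/shutdown under the lock. */
int register_tracer_ops(struct ops *o)   { return startup(o); }
int unregister_tracer_ops(struct ops *o) { return shutdown(o); }
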
@@ -4912,6 +4996,13 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, | |||
4912 | return NOTIFY_DONE; | 4996 | return NOTIFY_DONE; |
4913 | } | 4997 | } |
4914 | 4998 | ||
4999 | /* Just a place holder for function graph */ | ||
5000 | static struct ftrace_ops fgraph_ops __read_mostly = { | ||
5001 | .func = ftrace_stub, | ||
5002 | .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | | ||
5003 | FTRACE_OPS_FL_RECURSION_SAFE, | ||
5004 | }; | ||
5005 | |||
4915 | int register_ftrace_graph(trace_func_graph_ret_t retfunc, | 5006 | int register_ftrace_graph(trace_func_graph_ret_t retfunc, |
4916 | trace_func_graph_ent_t entryfunc) | 5007 | trace_func_graph_ent_t entryfunc) |
4917 | { | 5008 | { |
@@ -4938,7 +5029,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
4938 | ftrace_graph_return = retfunc; | 5029 | ftrace_graph_return = retfunc; |
4939 | ftrace_graph_entry = entryfunc; | 5030 | ftrace_graph_entry = entryfunc; |
4940 | 5031 | ||
4941 | ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); | 5032 | ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); |
4942 | 5033 | ||
4943 | out: | 5034 | out: |
4944 | mutex_unlock(&ftrace_lock); | 5035 | mutex_unlock(&ftrace_lock); |
@@ -4955,7 +5046,7 @@ void unregister_ftrace_graph(void) | |||
4955 | ftrace_graph_active--; | 5046 | ftrace_graph_active--; |
4956 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; | 5047 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; |
4957 | ftrace_graph_entry = ftrace_graph_entry_stub; | 5048 | ftrace_graph_entry = ftrace_graph_entry_stub; |
4958 | ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); | 5049 | ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); |
4959 | unregister_pm_notifier(&ftrace_suspend_notifier); | 5050 | unregister_pm_notifier(&ftrace_suspend_notifier); |
4960 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 5051 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
4961 | 5052 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7974ba20557d..9d20cd9743ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -235,13 +235,33 @@ void trace_array_put(struct trace_array *this_tr) | |||
235 | mutex_unlock(&trace_types_lock); | 235 | mutex_unlock(&trace_types_lock); |
236 | } | 236 | } |
237 | 237 | ||
238 | int filter_current_check_discard(struct ring_buffer *buffer, | 238 | int filter_check_discard(struct ftrace_event_file *file, void *rec, |
239 | struct ftrace_event_call *call, void *rec, | 239 | struct ring_buffer *buffer, |
240 | struct ring_buffer_event *event) | 240 | struct ring_buffer_event *event) |
241 | { | 241 | { |
242 | return filter_check_discard(call, rec, buffer, event); | 242 | if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) && |
243 | !filter_match_preds(file->filter, rec)) { | ||
244 | ring_buffer_discard_commit(buffer, event); | ||
245 | return 1; | ||
246 | } | ||
247 | |||
248 | return 0; | ||
249 | } | ||
250 | EXPORT_SYMBOL_GPL(filter_check_discard); | ||
251 | |||
252 | int call_filter_check_discard(struct ftrace_event_call *call, void *rec, | ||
253 | struct ring_buffer *buffer, | ||
254 | struct ring_buffer_event *event) | ||
255 | { | ||
256 | if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && | ||
257 | !filter_match_preds(call->filter, rec)) { | ||
258 | ring_buffer_discard_commit(buffer, event); | ||
259 | return 1; | ||
260 | } | ||
261 | |||
262 | return 0; | ||
243 | } | 263 | } |
244 | EXPORT_SYMBOL_GPL(filter_current_check_discard); | 264 | EXPORT_SYMBOL_GPL(call_filter_check_discard); |
245 | 265 | ||
246 | cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) | 266 | cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) |
247 | { | 267 | { |
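
The hunk above splits the old helper in two: filter_check_discard() consults the per-instance ftrace_event_file filter, while call_filter_check_discard() consults the per-call filter kept by events that still use a single global filter. Both share the same shape, shown here as a sketch over stand-in types:

#include <stdbool.h>

struct filter { int dummy; };	/* stand-in for compiled predicate state */
struct record { int dummy; };

static bool match_preds(struct filter *f, struct record *r)
{
	(void)f; (void)r;
	return true;		/* real code walks the predicate tree */
}

/* Shared shape of both helpers: if a filter is attached and the record
 * does not match, discard the just-reserved ring-buffer event. */
static int check_discard(bool filtered, struct filter *f, struct record *r,
			 void (*discard)(void))
{
	if (filtered && !match_preds(f, r)) {
		discard();
		return 1;	/* caller skips the commit */
	}
	return 0;		/* caller commits the event */
}
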
@@ -843,9 +863,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, | |||
843 | if (isspace(ch)) { | 863 | if (isspace(ch)) { |
844 | parser->buffer[parser->idx] = 0; | 864 | parser->buffer[parser->idx] = 0; |
845 | parser->cont = false; | 865 | parser->cont = false; |
846 | } else { | 866 | } else if (parser->idx < parser->size - 1) { |
847 | parser->cont = true; | 867 | parser->cont = true; |
848 | parser->buffer[parser->idx++] = ch; | 868 | parser->buffer[parser->idx++] = ch; |
869 | } else { | ||
870 | ret = -EINVAL; | ||
871 | goto out; | ||
849 | } | 872 | } |
850 | 873 | ||
851 | *ppos += read; | 874 | *ppos += read; |
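
The change above closes an overflow in trace_get_user(): characters are appended only while parser->idx < parser->size - 1, reserving the final byte for the NUL terminator, and anything longer now fails with -EINVAL instead of writing past the buffer. The same bound in isolation:

#include <errno.h>
#include <stddef.h>

/* Append one character to a fixed-size parser buffer, keeping one byte
 * free for the terminating NUL -- the bound the fix above enforces. */
static int parser_append(char *buf, size_t size, size_t *idx, char ch)
{
	if (*idx < size - 1) {
		buf[(*idx)++] = ch;
		return 0;
	}
	return -EINVAL;
}
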
@@ -1261,21 +1284,6 @@ int is_tracing_stopped(void) | |||
1261 | } | 1284 | } |
1262 | 1285 | ||
1263 | /** | 1286 | /** |
1264 | * ftrace_off_permanent - disable all ftrace code permanently | ||
1265 | * | ||
1266 | * This should only be called when a serious anomally has | ||
1267 | * been detected. This will turn off the function tracing, | ||
1268 | * ring buffers, and other tracing utilites. It takes no | ||
1269 | * locks and can be called from any context. | ||
1270 | */ | ||
1271 | void ftrace_off_permanent(void) | ||
1272 | { | ||
1273 | tracing_disabled = 1; | ||
1274 | ftrace_stop(); | ||
1275 | tracing_off_permanent(); | ||
1276 | } | ||
1277 | |||
1278 | /** | ||
1279 | * tracing_start - quick start of the tracer | 1287 | * tracing_start - quick start of the tracer |
1280 | * | 1288 | * |
1281 | * If tracing is enabled but was stopped by tracing_stop, | 1289 | * If tracing is enabled but was stopped by tracing_stop, |
@@ -1509,7 +1517,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, | |||
1509 | #endif | 1517 | #endif |
1510 | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | | 1518 | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | |
1511 | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | | 1519 | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | |
1512 | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); | 1520 | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | |
1521 | (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); | ||
1513 | } | 1522 | } |
1514 | EXPORT_SYMBOL_GPL(tracing_generic_entry_update); | 1523 | EXPORT_SYMBOL_GPL(tracing_generic_entry_update); |
1515 | 1524 | ||
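
With the change above, an entry's flags carry two independent bits: TRACE_FLAG_NEED_RESCHED from tif_need_resched() and the new TRACE_FLAG_PREEMPT_RESCHED from test_preempt_need_resched() (the flag value itself is added to trace.h further down). Decoding them is a two-bit lookup; a sketch using the same bit values:

#define TRACE_FLAG_NEED_RESCHED    0x04
#define TRACE_FLAG_PREEMPT_RESCHED 0x20

/* Describe the resched state recorded in an entry's flags byte. */
static const char *resched_state(unsigned char flags)
{
	int need    = !!(flags & TRACE_FLAG_NEED_RESCHED);
	int preempt = !!(flags & TRACE_FLAG_PREEMPT_RESCHED);

	if (need && preempt)
		return "need_resched + preempt_need_resched";
	if (need)
		return "need_resched";
	if (preempt)
		return "preempt_need_resched";
	return "none";
}
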
@@ -1630,7 +1639,7 @@ trace_function(struct trace_array *tr, | |||
1630 | entry->ip = ip; | 1639 | entry->ip = ip; |
1631 | entry->parent_ip = parent_ip; | 1640 | entry->parent_ip = parent_ip; |
1632 | 1641 | ||
1633 | if (!filter_check_discard(call, entry, buffer, event)) | 1642 | if (!call_filter_check_discard(call, entry, buffer, event)) |
1634 | __buffer_unlock_commit(buffer, event); | 1643 | __buffer_unlock_commit(buffer, event); |
1635 | } | 1644 | } |
1636 | 1645 | ||
@@ -1714,7 +1723,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
1714 | 1723 | ||
1715 | entry->size = trace.nr_entries; | 1724 | entry->size = trace.nr_entries; |
1716 | 1725 | ||
1717 | if (!filter_check_discard(call, entry, buffer, event)) | 1726 | if (!call_filter_check_discard(call, entry, buffer, event)) |
1718 | __buffer_unlock_commit(buffer, event); | 1727 | __buffer_unlock_commit(buffer, event); |
1719 | 1728 | ||
1720 | out: | 1729 | out: |
@@ -1816,7 +1825,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1816 | trace.entries = entry->caller; | 1825 | trace.entries = entry->caller; |
1817 | 1826 | ||
1818 | save_stack_trace_user(&trace); | 1827 | save_stack_trace_user(&trace); |
1819 | if (!filter_check_discard(call, entry, buffer, event)) | 1828 | if (!call_filter_check_discard(call, entry, buffer, event)) |
1820 | __buffer_unlock_commit(buffer, event); | 1829 | __buffer_unlock_commit(buffer, event); |
1821 | 1830 | ||
1822 | out_drop_count: | 1831 | out_drop_count: |
@@ -2008,7 +2017,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
2008 | entry->fmt = fmt; | 2017 | entry->fmt = fmt; |
2009 | 2018 | ||
2010 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); | 2019 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); |
2011 | if (!filter_check_discard(call, entry, buffer, event)) { | 2020 | if (!call_filter_check_discard(call, entry, buffer, event)) { |
2012 | __buffer_unlock_commit(buffer, event); | 2021 | __buffer_unlock_commit(buffer, event); |
2013 | ftrace_trace_stack(buffer, flags, 6, pc); | 2022 | ftrace_trace_stack(buffer, flags, 6, pc); |
2014 | } | 2023 | } |
@@ -2063,7 +2072,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, | |||
2063 | 2072 | ||
2064 | memcpy(&entry->buf, tbuffer, len); | 2073 | memcpy(&entry->buf, tbuffer, len); |
2065 | entry->buf[len] = '\0'; | 2074 | entry->buf[len] = '\0'; |
2066 | if (!filter_check_discard(call, entry, buffer, event)) { | 2075 | if (!call_filter_check_discard(call, entry, buffer, event)) { |
2067 | __buffer_unlock_commit(buffer, event); | 2076 | __buffer_unlock_commit(buffer, event); |
2068 | ftrace_trace_stack(buffer, flags, 6, pc); | 2077 | ftrace_trace_stack(buffer, flags, 6, pc); |
2069 | } | 2078 | } |
@@ -2760,7 +2769,7 @@ static void show_snapshot_main_help(struct seq_file *m) | |||
2760 | seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); | 2769 | seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); |
2761 | seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); | 2770 | seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); |
2762 | seq_printf(m, "# Takes a snapshot of the main buffer.\n"); | 2771 | seq_printf(m, "# Takes a snapshot of the main buffer.\n"); |
2763 | seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); | 2772 | seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); |
2764 | seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); | 2773 | seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); |
2765 | seq_printf(m, "# is not a '0' or '1')\n"); | 2774 | seq_printf(m, "# is not a '0' or '1')\n"); |
2766 | } | 2775 | } |
@@ -2964,6 +2973,11 @@ int tracing_open_generic(struct inode *inode, struct file *filp) | |||
2964 | return 0; | 2973 | return 0; |
2965 | } | 2974 | } |
2966 | 2975 | ||
2976 | bool tracing_is_disabled(void) | ||
2977 | { | ||
2978 | return (tracing_disabled) ? true: false; | ||
2979 | } | ||
2980 | |||
2967 | /* | 2981 | /* |
2968 | * Open and update trace_array ref count. | 2982 | * Open and update trace_array ref count. |
2969 | * Must have the current trace_array passed to it. | 2983 | * Must have the current trace_array passed to it. |
@@ -5454,12 +5468,12 @@ static struct ftrace_func_command ftrace_snapshot_cmd = { | |||
5454 | .func = ftrace_trace_snapshot_callback, | 5468 | .func = ftrace_trace_snapshot_callback, |
5455 | }; | 5469 | }; |
5456 | 5470 | ||
5457 | static int register_snapshot_cmd(void) | 5471 | static __init int register_snapshot_cmd(void) |
5458 | { | 5472 | { |
5459 | return register_ftrace_command(&ftrace_snapshot_cmd); | 5473 | return register_ftrace_command(&ftrace_snapshot_cmd); |
5460 | } | 5474 | } |
5461 | #else | 5475 | #else |
5462 | static inline int register_snapshot_cmd(void) { return 0; } | 5476 | static inline __init int register_snapshot_cmd(void) { return 0; } |
5463 | #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ | 5477 | #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ |
5464 | 5478 | ||
5465 | struct dentry *tracing_init_dentry_tr(struct trace_array *tr) | 5479 | struct dentry *tracing_init_dentry_tr(struct trace_array *tr) |
@@ -6253,6 +6267,17 @@ void trace_init_global_iter(struct trace_iterator *iter) | |||
6253 | iter->trace = iter->tr->current_trace; | 6267 | iter->trace = iter->tr->current_trace; |
6254 | iter->cpu_file = RING_BUFFER_ALL_CPUS; | 6268 | iter->cpu_file = RING_BUFFER_ALL_CPUS; |
6255 | iter->trace_buffer = &global_trace.trace_buffer; | 6269 | iter->trace_buffer = &global_trace.trace_buffer; |
6270 | |||
6271 | if (iter->trace && iter->trace->open) | ||
6272 | iter->trace->open(iter); | ||
6273 | |||
6274 | /* Annotate start of buffers if we had overruns */ | ||
6275 | if (ring_buffer_overruns(iter->trace_buffer->buffer)) | ||
6276 | iter->iter_flags |= TRACE_FILE_ANNOTATE; | ||
6277 | |||
6278 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ | ||
6279 | if (trace_clocks[iter->tr->clock_id].in_ns) | ||
6280 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | ||
6256 | } | 6281 | } |
6257 | 6282 | ||
6258 | void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) | 6283 | void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 10c86fb7a2b4..ea189e027b80 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -124,6 +124,7 @@ enum trace_flag_type { | |||
124 | TRACE_FLAG_NEED_RESCHED = 0x04, | 124 | TRACE_FLAG_NEED_RESCHED = 0x04, |
125 | TRACE_FLAG_HARDIRQ = 0x08, | 125 | TRACE_FLAG_HARDIRQ = 0x08, |
126 | TRACE_FLAG_SOFTIRQ = 0x10, | 126 | TRACE_FLAG_SOFTIRQ = 0x10, |
127 | TRACE_FLAG_PREEMPT_RESCHED = 0x20, | ||
127 | }; | 128 | }; |
128 | 129 | ||
129 | #define TRACE_BUF_SIZE 1024 | 130 | #define TRACE_BUF_SIZE 1024 |
@@ -192,8 +193,8 @@ struct trace_array { | |||
192 | #ifdef CONFIG_FTRACE_SYSCALLS | 193 | #ifdef CONFIG_FTRACE_SYSCALLS |
193 | int sys_refcount_enter; | 194 | int sys_refcount_enter; |
194 | int sys_refcount_exit; | 195 | int sys_refcount_exit; |
195 | DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); | 196 | struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls]; |
196 | DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); | 197 | struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls]; |
197 | #endif | 198 | #endif |
198 | int stop_count; | 199 | int stop_count; |
199 | int clock_id; | 200 | int clock_id; |
@@ -514,6 +515,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf); | |||
514 | void tracing_reset_current(int cpu); | 515 | void tracing_reset_current(int cpu); |
515 | void tracing_reset_all_online_cpus(void); | 516 | void tracing_reset_all_online_cpus(void); |
516 | int tracing_open_generic(struct inode *inode, struct file *filp); | 517 | int tracing_open_generic(struct inode *inode, struct file *filp); |
518 | bool tracing_is_disabled(void); | ||
517 | struct dentry *trace_create_file(const char *name, | 519 | struct dentry *trace_create_file(const char *name, |
518 | umode_t mode, | 520 | umode_t mode, |
519 | struct dentry *parent, | 521 | struct dentry *parent, |
@@ -711,6 +713,8 @@ extern unsigned long trace_flags; | |||
711 | #define TRACE_GRAPH_PRINT_PROC 0x8 | 713 | #define TRACE_GRAPH_PRINT_PROC 0x8 |
712 | #define TRACE_GRAPH_PRINT_DURATION 0x10 | 714 | #define TRACE_GRAPH_PRINT_DURATION 0x10 |
713 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 | 715 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 |
716 | #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 | ||
717 | #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) | ||
714 | 718 | ||
715 | extern enum print_line_t | 719 | extern enum print_line_t |
716 | print_graph_function_flags(struct trace_iterator *iter, u32 flags); | 720 | print_graph_function_flags(struct trace_iterator *iter, u32 flags); |
@@ -730,15 +734,16 @@ extern void __trace_graph_return(struct trace_array *tr, | |||
730 | #ifdef CONFIG_DYNAMIC_FTRACE | 734 | #ifdef CONFIG_DYNAMIC_FTRACE |
731 | /* TODO: make this variable */ | 735 | /* TODO: make this variable */ |
732 | #define FTRACE_GRAPH_MAX_FUNCS 32 | 736 | #define FTRACE_GRAPH_MAX_FUNCS 32 |
733 | extern int ftrace_graph_filter_enabled; | ||
734 | extern int ftrace_graph_count; | 737 | extern int ftrace_graph_count; |
735 | extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; | 738 | extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; |
739 | extern int ftrace_graph_notrace_count; | ||
740 | extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; | ||
736 | 741 | ||
737 | static inline int ftrace_graph_addr(unsigned long addr) | 742 | static inline int ftrace_graph_addr(unsigned long addr) |
738 | { | 743 | { |
739 | int i; | 744 | int i; |
740 | 745 | ||
741 | if (!ftrace_graph_filter_enabled) | 746 | if (!ftrace_graph_count) |
742 | return 1; | 747 | return 1; |
743 | 748 | ||
744 | for (i = 0; i < ftrace_graph_count; i++) { | 749 | for (i = 0; i < ftrace_graph_count; i++) { |
@@ -758,11 +763,31 @@ static inline int ftrace_graph_addr(unsigned long addr) | |||
758 | 763 | ||
759 | return 0; | 764 | return 0; |
760 | } | 765 | } |
766 | |||
767 | static inline int ftrace_graph_notrace_addr(unsigned long addr) | ||
768 | { | ||
769 | int i; | ||
770 | |||
771 | if (!ftrace_graph_notrace_count) | ||
772 | return 0; | ||
773 | |||
774 | for (i = 0; i < ftrace_graph_notrace_count; i++) { | ||
775 | if (addr == ftrace_graph_notrace_funcs[i]) | ||
776 | return 1; | ||
777 | } | ||
778 | |||
779 | return 0; | ||
780 | } | ||
761 | #else | 781 | #else |
762 | static inline int ftrace_graph_addr(unsigned long addr) | 782 | static inline int ftrace_graph_addr(unsigned long addr) |
763 | { | 783 | { |
764 | return 1; | 784 | return 1; |
765 | } | 785 | } |
786 | |||
787 | static inline int ftrace_graph_notrace_addr(unsigned long addr) | ||
788 | { | ||
789 | return 0; | ||
790 | } | ||
766 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 791 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
767 | #else /* CONFIG_FUNCTION_GRAPH_TRACER */ | 792 | #else /* CONFIG_FUNCTION_GRAPH_TRACER */ |
768 | static inline enum print_line_t | 793 | static inline enum print_line_t |
@@ -986,9 +1011,9 @@ struct filter_pred { | |||
986 | 1011 | ||
987 | extern enum regex_type | 1012 | extern enum regex_type |
988 | filter_parse_regex(char *buff, int len, char **search, int *not); | 1013 | filter_parse_regex(char *buff, int len, char **search, int *not); |
989 | extern void print_event_filter(struct ftrace_event_call *call, | 1014 | extern void print_event_filter(struct ftrace_event_file *file, |
990 | struct trace_seq *s); | 1015 | struct trace_seq *s); |
991 | extern int apply_event_filter(struct ftrace_event_call *call, | 1016 | extern int apply_event_filter(struct ftrace_event_file *file, |
992 | char *filter_string); | 1017 | char *filter_string); |
993 | extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, | 1018 | extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, |
994 | char *filter_string); | 1019 | char *filter_string); |
@@ -999,20 +1024,6 @@ extern int filter_assign_type(const char *type); | |||
999 | struct ftrace_event_field * | 1024 | struct ftrace_event_field * |
1000 | trace_find_event_field(struct ftrace_event_call *call, char *name); | 1025 | trace_find_event_field(struct ftrace_event_call *call, char *name); |
1001 | 1026 | ||
1002 | static inline int | ||
1003 | filter_check_discard(struct ftrace_event_call *call, void *rec, | ||
1004 | struct ring_buffer *buffer, | ||
1005 | struct ring_buffer_event *event) | ||
1006 | { | ||
1007 | if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && | ||
1008 | !filter_match_preds(call->filter, rec)) { | ||
1009 | ring_buffer_discard_commit(buffer, event); | ||
1010 | return 1; | ||
1011 | } | ||
1012 | |||
1013 | return 0; | ||
1014 | } | ||
1015 | |||
1016 | extern void trace_event_enable_cmd_record(bool enable); | 1027 | extern void trace_event_enable_cmd_record(bool enable); |
1017 | extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); | 1028 | extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); |
1018 | extern int event_trace_del_tracer(struct trace_array *tr); | 1029 | extern int event_trace_del_tracer(struct trace_array *tr); |
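
The trace.h changes above give the graph tracer two independent tests: ftrace_graph_addr() returns 1 when the filter list is empty or contains the address, and the new ftrace_graph_notrace_addr() returns 1 only when the address is on the exclude list. One way a caller could combine them ("in the filter and not excluded") is sketched here with stand-in arrays; this is an illustration, not the kernel's own call site:

#define MAX_FUNCS 32

static unsigned long graph_funcs[MAX_FUNCS];
static unsigned long graph_notrace_funcs[MAX_FUNCS];
static int graph_count, graph_notrace_count;

static int in_list(unsigned long addr, const unsigned long *tbl, int count)
{
	int i;

	for (i = 0; i < count; i++)
		if (tbl[i] == addr)
			return 1;
	return 0;
}

/* Empty filter list means "trace everything"; empty notrace list means
 * "exclude nothing" -- the same defaults as the inline helpers above. */
static int should_graph(unsigned long addr)
{
	int filtered = !graph_count || in_list(addr, graph_funcs, graph_count);
	int excluded = graph_notrace_count &&
		       in_list(addr, graph_notrace_funcs, graph_notrace_count);

	return filtered && !excluded;
}
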
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index d594da0dc03c..697fb9bac8f0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
@@ -78,7 +78,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) | |||
78 | entry->line = f->line; | 78 | entry->line = f->line; |
79 | entry->correct = val == expect; | 79 | entry->correct = val == expect; |
80 | 80 | ||
81 | if (!filter_check_discard(call, entry, buffer, event)) | 81 | if (!call_filter_check_discard(call, entry, buffer, event)) |
82 | __buffer_unlock_commit(buffer, event); | 82 | __buffer_unlock_commit(buffer, event); |
83 | 83 | ||
84 | out: | 84 | out: |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 80c36bcf66e8..e854f420e033 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -24,9 +24,15 @@ static int total_ref_count; | |||
24 | static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | 24 | static int perf_trace_event_perm(struct ftrace_event_call *tp_event, |
25 | struct perf_event *p_event) | 25 | struct perf_event *p_event) |
26 | { | 26 | { |
27 | if (tp_event->perf_perm) { | ||
28 | int ret = tp_event->perf_perm(tp_event, p_event); | ||
29 | if (ret) | ||
30 | return ret; | ||
31 | } | ||
32 | |||
27 | /* The ftrace function trace is allowed only for root. */ | 33 | /* The ftrace function trace is allowed only for root. */ |
28 | if (ftrace_event_is_function(tp_event) && | 34 | if (ftrace_event_is_function(tp_event) && |
29 | perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | 35 | perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) |
30 | return -EPERM; | 36 | return -EPERM; |
31 | 37 | ||
32 | /* No tracing, just counting, so no obvious leak */ | 38 | /* No tracing, just counting, so no obvious leak */ |
@@ -173,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, | |||
173 | int perf_trace_init(struct perf_event *p_event) | 179 | int perf_trace_init(struct perf_event *p_event) |
174 | { | 180 | { |
175 | struct ftrace_event_call *tp_event; | 181 | struct ftrace_event_call *tp_event; |
176 | int event_id = p_event->attr.config; | 182 | u64 event_id = p_event->attr.config; |
177 | int ret = -EINVAL; | 183 | int ret = -EINVAL; |
178 | 184 | ||
179 | mutex_lock(&event_mutex); | 185 | mutex_lock(&event_mutex); |
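
The trace_event_perf.c hunk above lets an individual event veto perf access before the generic checks run: if the event supplies a perf_perm callback and it returns an error, that error is returned immediately; only then do the existing function-trace/root and paranoid checks apply. The ordering as a sketch (stand-in types, hypothetical callback):

#include <errno.h>

struct event {
	int (*perf_perm)(struct event *ev);	/* optional per-event hook */
	int is_function_trace;
};

static int capable_admin(void) { return 0; }	/* stand-in for capable() */

static int perf_event_perm(struct event *ev)
{
	/* Per-event veto runs first, exactly like tp_event->perf_perm. */
	if (ev->perf_perm) {
		int ret = ev->perf_perm(ev);
		if (ret)
			return ret;
	}

	/* Then the generic rule: function tracing stays root-only. */
	if (ev->is_function_trace && !capable_admin())
		return -EPERM;

	return 0;
}
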
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 368a4d50cc30..a11800ae96de 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -989,7 +989,7 @@ static ssize_t | |||
989 | event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, | 989 | event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, |
990 | loff_t *ppos) | 990 | loff_t *ppos) |
991 | { | 991 | { |
992 | struct ftrace_event_call *call; | 992 | struct ftrace_event_file *file; |
993 | struct trace_seq *s; | 993 | struct trace_seq *s; |
994 | int r = -ENODEV; | 994 | int r = -ENODEV; |
995 | 995 | ||
@@ -1004,12 +1004,12 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
1004 | trace_seq_init(s); | 1004 | trace_seq_init(s); |
1005 | 1005 | ||
1006 | mutex_lock(&event_mutex); | 1006 | mutex_lock(&event_mutex); |
1007 | call = event_file_data(filp); | 1007 | file = event_file_data(filp); |
1008 | if (call) | 1008 | if (file) |
1009 | print_event_filter(call, s); | 1009 | print_event_filter(file, s); |
1010 | mutex_unlock(&event_mutex); | 1010 | mutex_unlock(&event_mutex); |
1011 | 1011 | ||
1012 | if (call) | 1012 | if (file) |
1013 | r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); | 1013 | r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); |
1014 | 1014 | ||
1015 | kfree(s); | 1015 | kfree(s); |
@@ -1021,7 +1021,7 @@ static ssize_t | |||
1021 | event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, | 1021 | event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, |
1022 | loff_t *ppos) | 1022 | loff_t *ppos) |
1023 | { | 1023 | { |
1024 | struct ftrace_event_call *call; | 1024 | struct ftrace_event_file *file; |
1025 | char *buf; | 1025 | char *buf; |
1026 | int err = -ENODEV; | 1026 | int err = -ENODEV; |
1027 | 1027 | ||
@@ -1039,9 +1039,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
1039 | buf[cnt] = '\0'; | 1039 | buf[cnt] = '\0'; |
1040 | 1040 | ||
1041 | mutex_lock(&event_mutex); | 1041 | mutex_lock(&event_mutex); |
1042 | call = event_file_data(filp); | 1042 | file = event_file_data(filp); |
1043 | if (call) | 1043 | if (file) |
1044 | err = apply_event_filter(call, buf); | 1044 | err = apply_event_filter(file, buf); |
1045 | mutex_unlock(&event_mutex); | 1045 | mutex_unlock(&event_mutex); |
1046 | 1046 | ||
1047 | free_page((unsigned long) buf); | 1047 | free_page((unsigned long) buf); |
@@ -1062,6 +1062,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) | |||
1062 | struct trace_array *tr; | 1062 | struct trace_array *tr; |
1063 | int ret; | 1063 | int ret; |
1064 | 1064 | ||
1065 | if (tracing_is_disabled()) | ||
1066 | return -ENODEV; | ||
1067 | |||
1065 | /* Make sure the system still exists */ | 1068 | /* Make sure the system still exists */ |
1066 | mutex_lock(&trace_types_lock); | 1069 | mutex_lock(&trace_types_lock); |
1067 | mutex_lock(&event_mutex); | 1070 | mutex_lock(&event_mutex); |
@@ -1108,6 +1111,9 @@ static int system_tr_open(struct inode *inode, struct file *filp) | |||
1108 | struct trace_array *tr = inode->i_private; | 1111 | struct trace_array *tr = inode->i_private; |
1109 | int ret; | 1112 | int ret; |
1110 | 1113 | ||
1114 | if (tracing_is_disabled()) | ||
1115 | return -ENODEV; | ||
1116 | |||
1111 | if (trace_array_get(tr) < 0) | 1117 | if (trace_array_get(tr) < 0) |
1112 | return -ENODEV; | 1118 | return -ENODEV; |
1113 | 1119 | ||
@@ -1124,11 +1130,12 @@ static int system_tr_open(struct inode *inode, struct file *filp) | |||
1124 | if (ret < 0) { | 1130 | if (ret < 0) { |
1125 | trace_array_put(tr); | 1131 | trace_array_put(tr); |
1126 | kfree(dir); | 1132 | kfree(dir); |
1133 | return ret; | ||
1127 | } | 1134 | } |
1128 | 1135 | ||
1129 | filp->private_data = dir; | 1136 | filp->private_data = dir; |
1130 | 1137 | ||
1131 | return ret; | 1138 | return 0; |
1132 | } | 1139 | } |
1133 | 1140 | ||
1134 | static int subsystem_release(struct inode *inode, struct file *file) | 1141 | static int subsystem_release(struct inode *inode, struct file *file) |
@@ -1539,7 +1546,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) | |||
1539 | return -1; | 1546 | return -1; |
1540 | } | 1547 | } |
1541 | } | 1548 | } |
1542 | trace_create_file("filter", 0644, file->dir, call, | 1549 | trace_create_file("filter", 0644, file->dir, file, |
1543 | &ftrace_event_filter_fops); | 1550 | &ftrace_event_filter_fops); |
1544 | 1551 | ||
1545 | trace_create_file("format", 0444, file->dir, call, | 1552 | trace_create_file("format", 0444, file->dir, call, |
@@ -1577,6 +1584,7 @@ static void event_remove(struct ftrace_event_call *call) | |||
1577 | if (file->event_call != call) | 1584 | if (file->event_call != call) |
1578 | continue; | 1585 | continue; |
1579 | ftrace_event_enable_disable(file, 0); | 1586 | ftrace_event_enable_disable(file, 0); |
1587 | destroy_preds(file); | ||
1580 | /* | 1588 | /* |
1581 | * The do_for_each_event_file() is | 1589 | * The do_for_each_event_file() is |
1582 | * a double loop. After finding the call for this | 1590 | * a double loop. After finding the call for this |
@@ -1700,7 +1708,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) | |||
1700 | { | 1708 | { |
1701 | event_remove(call); | 1709 | event_remove(call); |
1702 | trace_destroy_fields(call); | 1710 | trace_destroy_fields(call); |
1703 | destroy_preds(call); | 1711 | destroy_call_preds(call); |
1704 | } | 1712 | } |
1705 | 1713 | ||
1706 | static int probe_remove_event_call(struct ftrace_event_call *call) | 1714 | static int probe_remove_event_call(struct ftrace_event_call *call) |
@@ -2306,6 +2314,9 @@ int event_trace_del_tracer(struct trace_array *tr) | |||
2306 | /* Disable any running events */ | 2314 | /* Disable any running events */ |
2307 | __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); | 2315 | __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); |
2308 | 2316 | ||
2317 | /* Access to events are within rcu_read_lock_sched() */ | ||
2318 | synchronize_sched(); | ||
2319 | |||
2309 | down_write(&trace_event_sem); | 2320 | down_write(&trace_event_sem); |
2310 | __trace_remove_event_dirs(tr); | 2321 | __trace_remove_event_dirs(tr); |
2311 | debugfs_remove_recursive(tr->event_dir); | 2322 | debugfs_remove_recursive(tr->event_dir); |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 97daa8cf958d..2468f56dc5db 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -637,10 +637,18 @@ static void append_filter_err(struct filter_parse_state *ps, | |||
637 | free_page((unsigned long) buf); | 637 | free_page((unsigned long) buf); |
638 | } | 638 | } |
639 | 639 | ||
640 | static inline struct event_filter *event_filter(struct ftrace_event_file *file) | ||
641 | { | ||
642 | if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) | ||
643 | return file->event_call->filter; | ||
644 | else | ||
645 | return file->filter; | ||
646 | } | ||
647 | |||
640 | /* caller must hold event_mutex */ | 648 | /* caller must hold event_mutex */ |
641 | void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) | 649 | void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s) |
642 | { | 650 | { |
643 | struct event_filter *filter = call->filter; | 651 | struct event_filter *filter = event_filter(file); |
644 | 652 | ||
645 | if (filter && filter->filter_string) | 653 | if (filter && filter->filter_string) |
646 | trace_seq_printf(s, "%s\n", filter->filter_string); | 654 | trace_seq_printf(s, "%s\n", filter->filter_string); |
@@ -766,11 +774,21 @@ static void __free_preds(struct event_filter *filter) | |||
766 | filter->n_preds = 0; | 774 | filter->n_preds = 0; |
767 | } | 775 | } |
768 | 776 | ||
769 | static void filter_disable(struct ftrace_event_call *call) | 777 | static void call_filter_disable(struct ftrace_event_call *call) |
770 | { | 778 | { |
771 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | 779 | call->flags &= ~TRACE_EVENT_FL_FILTERED; |
772 | } | 780 | } |
773 | 781 | ||
782 | static void filter_disable(struct ftrace_event_file *file) | ||
783 | { | ||
784 | struct ftrace_event_call *call = file->event_call; | ||
785 | |||
786 | if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) | ||
787 | call_filter_disable(call); | ||
788 | else | ||
789 | file->flags &= ~FTRACE_EVENT_FL_FILTERED; | ||
790 | } | ||
791 | |||
774 | static void __free_filter(struct event_filter *filter) | 792 | static void __free_filter(struct event_filter *filter) |
775 | { | 793 | { |
776 | if (!filter) | 794 | if (!filter) |
@@ -781,16 +799,30 @@ static void __free_filter(struct event_filter *filter) | |||
781 | kfree(filter); | 799 | kfree(filter); |
782 | } | 800 | } |
783 | 801 | ||
802 | void destroy_call_preds(struct ftrace_event_call *call) | ||
803 | { | ||
804 | __free_filter(call->filter); | ||
805 | call->filter = NULL; | ||
806 | } | ||
807 | |||
808 | static void destroy_file_preds(struct ftrace_event_file *file) | ||
809 | { | ||
810 | __free_filter(file->filter); | ||
811 | file->filter = NULL; | ||
812 | } | ||
813 | |||
784 | /* | 814 | /* |
785 | * Called when destroying the ftrace_event_call. | 815 | * Called when destroying the ftrace_event_file. |
786 | * The call is being freed, so we do not need to worry about | 816 | * The file is being freed, so we do not need to worry about |
787 | * the call being currently used. This is for module code removing | 817 | * the file being currently used. This is for module code removing |
788 | * the tracepoints from within it. | 818 | * the tracepoints from within it. |
789 | */ | 819 | */ |
790 | void destroy_preds(struct ftrace_event_call *call) | 820 | void destroy_preds(struct ftrace_event_file *file) |
791 | { | 821 | { |
792 | __free_filter(call->filter); | 822 | if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) |
793 | call->filter = NULL; | 823 | destroy_call_preds(file->event_call); |
824 | else | ||
825 | destroy_file_preds(file); | ||
794 | } | 826 | } |
795 | 827 | ||
796 | static struct event_filter *__alloc_filter(void) | 828 | static struct event_filter *__alloc_filter(void) |
@@ -825,28 +857,56 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) | |||
825 | return 0; | 857 | return 0; |
826 | } | 858 | } |
827 | 859 | ||
828 | static void filter_free_subsystem_preds(struct event_subsystem *system) | 860 | static inline void __remove_filter(struct ftrace_event_file *file) |
829 | { | 861 | { |
862 | struct ftrace_event_call *call = file->event_call; | ||
863 | |||
864 | filter_disable(file); | ||
865 | if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) | ||
866 | remove_filter_string(call->filter); | ||
867 | else | ||
868 | remove_filter_string(file->filter); | ||
869 | } | ||
870 | |||
871 | static void filter_free_subsystem_preds(struct event_subsystem *system, | ||
872 | struct trace_array *tr) | ||
873 | { | ||
874 | struct ftrace_event_file *file; | ||
830 | struct ftrace_event_call *call; | 875 | struct ftrace_event_call *call; |
831 | 876 | ||
832 | list_for_each_entry(call, &ftrace_events, list) { | 877 | list_for_each_entry(file, &tr->events, list) { |
878 | call = file->event_call; | ||
833 | if (strcmp(call->class->system, system->name) != 0) | 879 | if (strcmp(call->class->system, system->name) != 0) |
834 | continue; | 880 | continue; |
835 | 881 | ||
836 | filter_disable(call); | 882 | __remove_filter(file); |
837 | remove_filter_string(call->filter); | ||
838 | } | 883 | } |
839 | } | 884 | } |
840 | 885 | ||
841 | static void filter_free_subsystem_filters(struct event_subsystem *system) | 886 | static inline void __free_subsystem_filter(struct ftrace_event_file *file) |
842 | { | 887 | { |
888 | struct ftrace_event_call *call = file->event_call; | ||
889 | |||
890 | if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { | ||
891 | __free_filter(call->filter); | ||
892 | call->filter = NULL; | ||
893 | } else { | ||
894 | __free_filter(file->filter); | ||
895 | file->filter = NULL; | ||
896 | } | ||
897 | } | ||
898 | |||
899 | static void filter_free_subsystem_filters(struct event_subsystem *system, | ||
900 | struct trace_array *tr) | ||
901 | { | ||
902 | struct ftrace_event_file *file; | ||
843 | struct ftrace_event_call *call; | 903 | struct ftrace_event_call *call; |
844 | 904 | ||
845 | list_for_each_entry(call, &ftrace_events, list) { | 905 | list_for_each_entry(file, &tr->events, list) { |
906 | call = file->event_call; | ||
846 | if (strcmp(call->class->system, system->name) != 0) | 907 | if (strcmp(call->class->system, system->name) != 0) |
847 | continue; | 908 | continue; |
848 | __free_filter(call->filter); | 909 | __free_subsystem_filter(file); |
849 | call->filter = NULL; | ||
850 | } | 910 | } |
851 | } | 911 | } |
852 | 912 | ||
@@ -1617,15 +1677,85 @@ fail: | |||
1617 | return err; | 1677 | return err; |
1618 | } | 1678 | } |
1619 | 1679 | ||
1680 | static inline void event_set_filtered_flag(struct ftrace_event_file *file) | ||
1681 | { | ||
1682 | struct ftrace_event_call *call = file->event_call; | ||
1683 | |||
1684 | if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) | ||
1685 | call->flags |= TRACE_EVENT_FL_FILTERED; | ||
1686 | else | ||
1687 | file->flags |= FTRACE_EVENT_FL_FILTERED; | ||
1688 | } | ||
1689 | |||
1690 | static inline void event_set_filter(struct ftrace_event_file *file, | ||
1691 | struct event_filter *filter) | ||
1692 | { | ||
1693 | struct ftrace_event_call *call = file->event_call; | ||
1694 | |||
1695 | if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) | ||
1696 | rcu_assign_pointer(call->filter, filter); | ||
1697 | else | ||
1698 | rcu_assign_pointer(file->filter, filter); | ||
1699 | } | ||
1700 | |||
1701 | static inline void event_clear_filter(struct ftrace_event_file *file) | ||
1702 | { | ||
1703 | struct ftrace_event_call *call = file->event_call; | ||
1704 | |||
1705 | if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) | ||
1706 | RCU_INIT_POINTER(call->filter, NULL); | ||
1707 | else | ||
1708 | RCU_INIT_POINTER(file->filter, NULL); | ||
1709 | } | ||
1710 | |||
1711 | static inline void | ||
1712 | event_set_no_set_filter_flag(struct ftrace_event_file *file) | ||
1713 | { | ||
1714 | struct ftrace_event_call *call = file->event_call; | ||
1715 | |||
1716 | if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) | ||
1717 | call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; | ||
1718 | else | ||
1719 | file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER; | ||
1720 | } | ||
1721 | |||
1722 | static inline void | ||
1723 | event_clear_no_set_filter_flag(struct ftrace_event_file *file) | ||
1724 | { | ||
1725 | struct ftrace_event_call *call = file->event_call; | ||
1726 | |||
1727 | if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) | ||
1728 | call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; | ||
1729 | else | ||
1730 | file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER; | ||
1731 | } | ||
1732 | |||
1733 | static inline bool | ||
1734 | event_no_set_filter_flag(struct ftrace_event_file *file) | ||
1735 | { | ||
1736 | struct ftrace_event_call *call = file->event_call; | ||
1737 | |||
1738 | if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER) | ||
1739 | return true; | ||
1740 | |||
1741 | if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && | ||
1742 | (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) | ||
1743 | return true; | ||
1744 | |||
1745 | return false; | ||
1746 | } | ||
1747 | |||
1620 | struct filter_list { | 1748 | struct filter_list { |
1621 | struct list_head list; | 1749 | struct list_head list; |
1622 | struct event_filter *filter; | 1750 | struct event_filter *filter; |
1623 | }; | 1751 | }; |
1624 | 1752 | ||
1625 | static int replace_system_preds(struct event_subsystem *system, | 1753 | static int replace_system_preds(struct event_subsystem *system, |
1754 | struct trace_array *tr, | ||
1626 | struct filter_parse_state *ps, | 1755 | struct filter_parse_state *ps, |
1627 | char *filter_string) | 1756 | char *filter_string) |
1628 | { | 1757 | { |
1758 | struct ftrace_event_file *file; | ||
1629 | struct ftrace_event_call *call; | 1759 | struct ftrace_event_call *call; |
1630 | struct filter_list *filter_item; | 1760 | struct filter_list *filter_item; |
1631 | struct filter_list *tmp; | 1761 | struct filter_list *tmp; |
@@ -1633,8 +1763,8 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1633 | bool fail = true; | 1763 | bool fail = true; |
1634 | int err; | 1764 | int err; |
1635 | 1765 | ||
1636 | list_for_each_entry(call, &ftrace_events, list) { | 1766 | list_for_each_entry(file, &tr->events, list) { |
1637 | 1767 | call = file->event_call; | |
1638 | if (strcmp(call->class->system, system->name) != 0) | 1768 | if (strcmp(call->class->system, system->name) != 0) |
1639 | continue; | 1769 | continue; |
1640 | 1770 | ||
@@ -1644,18 +1774,20 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1644 | */ | 1774 | */ |
1645 | err = replace_preds(call, NULL, ps, filter_string, true); | 1775 | err = replace_preds(call, NULL, ps, filter_string, true); |
1646 | if (err) | 1776 | if (err) |
1647 | call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; | 1777 | event_set_no_set_filter_flag(file); |
1648 | else | 1778 | else |
1649 | call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; | 1779 | event_clear_no_set_filter_flag(file); |
1650 | } | 1780 | } |
1651 | 1781 | ||
1652 | list_for_each_entry(call, &ftrace_events, list) { | 1782 | list_for_each_entry(file, &tr->events, list) { |
1653 | struct event_filter *filter; | 1783 | struct event_filter *filter; |
1654 | 1784 | ||
1785 | call = file->event_call; | ||
1786 | |||
1655 | if (strcmp(call->class->system, system->name) != 0) | 1787 | if (strcmp(call->class->system, system->name) != 0) |
1656 | continue; | 1788 | continue; |
1657 | 1789 | ||
1658 | if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) | 1790 | if (event_no_set_filter_flag(file)) |
1659 | continue; | 1791 | continue; |
1660 | 1792 | ||
1661 | filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); | 1793 | filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); |
@@ -1676,17 +1808,17 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1676 | 1808 | ||
1677 | err = replace_preds(call, filter, ps, filter_string, false); | 1809 | err = replace_preds(call, filter, ps, filter_string, false); |
1678 | if (err) { | 1810 | if (err) { |
1679 | filter_disable(call); | 1811 | filter_disable(file); |
1680 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | 1812 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); |
1681 | append_filter_err(ps, filter); | 1813 | append_filter_err(ps, filter); |
1682 | } else | 1814 | } else |
1683 | call->flags |= TRACE_EVENT_FL_FILTERED; | 1815 | event_set_filtered_flag(file); |
1684 | /* | 1816 | /* |
1685 | * Regardless of if this returned an error, we still | 1817 | * Regardless of if this returned an error, we still |
1686 | * replace the filter for the call. | 1818 | * replace the filter for the call. |
1687 | */ | 1819 | */ |
1688 | filter = call->filter; | 1820 | filter = event_filter(file); |
1689 | rcu_assign_pointer(call->filter, filter_item->filter); | 1821 | event_set_filter(file, filter_item->filter); |
1690 | filter_item->filter = filter; | 1822 | filter_item->filter = filter; |
1691 | 1823 | ||
1692 | fail = false; | 1824 | fail = false; |
@@ -1816,6 +1948,7 @@ static int create_filter(struct ftrace_event_call *call, | |||
1816 | * and always remembers @filter_str. | 1948 | * and always remembers @filter_str. |
1817 | */ | 1949 | */ |
1818 | static int create_system_filter(struct event_subsystem *system, | 1950 | static int create_system_filter(struct event_subsystem *system, |
1951 | struct trace_array *tr, | ||
1819 | char *filter_str, struct event_filter **filterp) | 1952 | char *filter_str, struct event_filter **filterp) |
1820 | { | 1953 | { |
1821 | struct event_filter *filter = NULL; | 1954 | struct event_filter *filter = NULL; |
@@ -1824,7 +1957,7 @@ static int create_system_filter(struct event_subsystem *system, | |||
1824 | 1957 | ||
1825 | err = create_filter_start(filter_str, true, &ps, &filter); | 1958 | err = create_filter_start(filter_str, true, &ps, &filter); |
1826 | if (!err) { | 1959 | if (!err) { |
1827 | err = replace_system_preds(system, ps, filter_str); | 1960 | err = replace_system_preds(system, tr, ps, filter_str); |
1828 | if (!err) { | 1961 | if (!err) { |
1829 | /* System filters just show a default message */ | 1962 | /* System filters just show a default message */ |
1830 | kfree(filter->filter_string); | 1963 | kfree(filter->filter_string); |
@@ -1840,20 +1973,25 @@ static int create_system_filter(struct event_subsystem *system, | |||
1840 | } | 1973 | } |
1841 | 1974 | ||
1842 | /* caller must hold event_mutex */ | 1975 | /* caller must hold event_mutex */ |
1843 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | 1976 | int apply_event_filter(struct ftrace_event_file *file, char *filter_string) |
1844 | { | 1977 | { |
1978 | struct ftrace_event_call *call = file->event_call; | ||
1845 | struct event_filter *filter; | 1979 | struct event_filter *filter; |
1846 | int err; | 1980 | int err; |
1847 | 1981 | ||
1848 | if (!strcmp(strstrip(filter_string), "0")) { | 1982 | if (!strcmp(strstrip(filter_string), "0")) { |
1849 | filter_disable(call); | 1983 | filter_disable(file); |
1850 | filter = call->filter; | 1984 | filter = event_filter(file); |
1985 | |||
1851 | if (!filter) | 1986 | if (!filter) |
1852 | return 0; | 1987 | return 0; |
1853 | RCU_INIT_POINTER(call->filter, NULL); | 1988 | |
1989 | event_clear_filter(file); | ||
1990 | |||
1854 | /* Make sure the filter is not being used */ | 1991 | /* Make sure the filter is not being used */ |
1855 | synchronize_sched(); | 1992 | synchronize_sched(); |
1856 | __free_filter(filter); | 1993 | __free_filter(filter); |
1994 | |||
1857 | return 0; | 1995 | return 0; |
1858 | } | 1996 | } |
1859 | 1997 | ||
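The cleared-filter path above follows the usual RCU teardown ordering: unpublish the pointer, wait for pre-existing readers to finish, then free. A minimal kernel-style sketch of that ordering, assuming hypothetical names (my_event, my_filter, my_clear_filter) that are not part of this patch:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_filter { char *pred; };
struct my_event  { struct my_filter __rcu *filter; };

static void my_clear_filter(struct my_event *ev)
{
	/* updaters are assumed to be serialized, e.g. by a mutex */
	struct my_filter *old = rcu_dereference_protected(ev->filter, 1);

	if (!old)
		return;

	RCU_INIT_POINTER(ev->filter, NULL);	/* readers now observe NULL */
	synchronize_sched();			/* wait out preempt-disabled readers */
	kfree(old->pred);
	kfree(old);
}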
@@ -1866,14 +2004,15 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | |||
1866 | * string | 2004 | * string |
1867 | */ | 2005 | */ |
1868 | if (filter) { | 2006 | if (filter) { |
1869 | struct event_filter *tmp = call->filter; | 2007 | struct event_filter *tmp; |
1870 | 2008 | ||
2009 | tmp = event_filter(file); | ||
1871 | if (!err) | 2010 | if (!err) |
1872 | call->flags |= TRACE_EVENT_FL_FILTERED; | 2011 | event_set_filtered_flag(file); |
1873 | else | 2012 | else |
1874 | filter_disable(call); | 2013 | filter_disable(file); |
1875 | 2014 | ||
1876 | rcu_assign_pointer(call->filter, filter); | 2015 | event_set_filter(file, filter); |
1877 | 2016 | ||
1878 | if (tmp) { | 2017 | if (tmp) { |
1879 | /* Make sure the call is done with the filter */ | 2018 | /* Make sure the call is done with the filter */ |
@@ -1889,6 +2028,7 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, | |||
1889 | char *filter_string) | 2028 | char *filter_string) |
1890 | { | 2029 | { |
1891 | struct event_subsystem *system = dir->subsystem; | 2030 | struct event_subsystem *system = dir->subsystem; |
2031 | struct trace_array *tr = dir->tr; | ||
1892 | struct event_filter *filter; | 2032 | struct event_filter *filter; |
1893 | int err = 0; | 2033 | int err = 0; |
1894 | 2034 | ||
@@ -1901,18 +2041,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, | |||
1901 | } | 2041 | } |
1902 | 2042 | ||
1903 | if (!strcmp(strstrip(filter_string), "0")) { | 2043 | if (!strcmp(strstrip(filter_string), "0")) { |
1904 | filter_free_subsystem_preds(system); | 2044 | filter_free_subsystem_preds(system, tr); |
1905 | remove_filter_string(system->filter); | 2045 | remove_filter_string(system->filter); |
1906 | filter = system->filter; | 2046 | filter = system->filter; |
1907 | system->filter = NULL; | 2047 | system->filter = NULL; |
1908 | /* Ensure all filters are no longer used */ | 2048 | /* Ensure all filters are no longer used */ |
1909 | synchronize_sched(); | 2049 | synchronize_sched(); |
1910 | filter_free_subsystem_filters(system); | 2050 | filter_free_subsystem_filters(system, tr); |
1911 | __free_filter(filter); | 2051 | __free_filter(filter); |
1912 | goto out_unlock; | 2052 | goto out_unlock; |
1913 | } | 2053 | } |
1914 | 2054 | ||
1915 | err = create_system_filter(system, filter_string, &filter); | 2055 | err = create_system_filter(system, tr, filter_string, &filter); |
1916 | if (filter) { | 2056 | if (filter) { |
1917 | /* | 2057 | /* |
1918 | * No event actually uses the system filter | 2058 | * No event actually uses the system filter |
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d21a74670088..7c3e3e72e2b6 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -180,7 +180,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
180 | .event.type = etype, \ | 180 | .event.type = etype, \ |
181 | .class = &event_class_ftrace_##call, \ | 181 | .class = &event_class_ftrace_##call, \ |
182 | .print_fmt = print, \ | 182 | .print_fmt = print, \ |
183 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ | 183 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ |
184 | }; \ | 184 | }; \ |
185 | struct ftrace_event_call __used \ | 185 | struct ftrace_event_call __used \ |
186 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 186 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b5c09242683d..0b99120d395c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -82,9 +82,9 @@ static struct trace_array *graph_array; | |||
82 | * to fill in space into DURATION column. | 82 | * to fill in space into DURATION column. |
83 | */ | 83 | */ |
84 | enum { | 84 | enum { |
85 | DURATION_FILL_FULL = -1, | 85 | FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT, |
86 | DURATION_FILL_START = -2, | 86 | FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT, |
87 | DURATION_FILL_END = -3, | 87 | FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, |
88 | }; | 88 | }; |
89 | 89 | ||
90 | static enum print_line_t | 90 | static enum print_line_t |
@@ -114,16 +114,37 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
114 | return -EBUSY; | 114 | return -EBUSY; |
115 | } | 115 | } |
116 | 116 | ||
117 | /* | ||
118 | * The curr_ret_stack is an index into the ftrace return stack | ||
119 | * of the current task. Its value should be in [0, FTRACE_RETFUNC_ | ||
120 | * DEPTH) when the function graph tracer is used. To support | ||
121 | * filtering out specific functions, this function makes the index | ||
122 | * negative by subtracting a huge value (FTRACE_NOTRACE_DEPTH), | ||
123 | * so that when ftrace sees a negative index it ignores | ||
124 | * the record. The index is recovered when returning from the | ||
125 | * filtered function by adding FTRACE_NOTRACE_DEPTH back, | ||
126 | * after which functions are recorded normally again. | ||
127 | * | ||
128 | * The curr_ret_stack is initialized to -1 and gets increased | ||
129 | * in this function, so it can be less than -1 only if it was | ||
130 | * filtered out via ftrace_graph_notrace_addr(), which the user | ||
131 | * can set through the set_graph_notrace file in debugfs. | ||
132 | */ | ||
133 | if (current->curr_ret_stack < -1) | ||
134 | return -EBUSY; | ||
135 | |||
117 | calltime = trace_clock_local(); | 136 | calltime = trace_clock_local(); |
118 | 137 | ||
119 | index = ++current->curr_ret_stack; | 138 | index = ++current->curr_ret_stack; |
139 | if (ftrace_graph_notrace_addr(func)) | ||
140 | current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; | ||
120 | barrier(); | 141 | barrier(); |
121 | current->ret_stack[index].ret = ret; | 142 | current->ret_stack[index].ret = ret; |
122 | current->ret_stack[index].func = func; | 143 | current->ret_stack[index].func = func; |
123 | current->ret_stack[index].calltime = calltime; | 144 | current->ret_stack[index].calltime = calltime; |
124 | current->ret_stack[index].subtime = 0; | 145 | current->ret_stack[index].subtime = 0; |
125 | current->ret_stack[index].fp = frame_pointer; | 146 | current->ret_stack[index].fp = frame_pointer; |
126 | *depth = index; | 147 | *depth = current->curr_ret_stack; |
127 | 148 | ||
128 | return 0; | 149 | return 0; |
129 | } | 150 | } |
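The bias trick described in the comment above can be exercised in isolation. A small userspace C toy, with an invented bias constant rather than the kernel's FTRACE_NOTRACE_DEPTH, showing how subtracting the bias on push and adding it back on return round-trips the index:

#include <stdio.h>

/* Toy model of the curr_ret_stack scheme: a filtered call is marked by
 * biasing the index negative, and the index is recovered on return. */
#define NOTRACE_BIAS	0x10000

static int curr = -1;			/* mirrors curr_ret_stack's -1 start */

static void push(int filtered)
{
	curr++;				/* the real slot index */
	if (filtered)
		curr -= NOTRACE_BIAS;	/* go negative: ignore what follows */
}

static int pop(void)
{
	int index = curr;

	curr--;
	if (index < -1) {		/* returning from a filtered function */
		curr += NOTRACE_BIAS;	/* recover the original depth */
		return -1;		/* caller skips tracing this return */
	}
	return index;
}

int main(void)
{
	int r;

	push(0);
	push(1);			/* second call is "notrace" filtered */
	printf("after filtered push, curr=%d (negative)\n", curr);
	r = pop();
	printf("pop -> %d, curr=%d\n", r, curr);
	r = pop();
	printf("pop -> %d, curr=%d\n", r, curr);
	return 0;
}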
@@ -137,7 +158,17 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, | |||
137 | 158 | ||
138 | index = current->curr_ret_stack; | 159 | index = current->curr_ret_stack; |
139 | 160 | ||
140 | if (unlikely(index < 0)) { | 161 | /* |
162 | * A negative index here means that it's just returned from a | ||
163 | * notrace'd function. Recover the index to get the original | ||
164 | * return address. See ftrace_push_return_trace(). | ||
165 | * | ||
166 | * TODO: Need to check whether the stack gets corrupted. | ||
167 | */ | ||
168 | if (index < 0) | ||
169 | index += FTRACE_NOTRACE_DEPTH; | ||
170 | |||
171 | if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { | ||
141 | ftrace_graph_stop(); | 172 | ftrace_graph_stop(); |
142 | WARN_ON(1); | 173 | WARN_ON(1); |
143 | /* Might as well panic, otherwise we have nowhere to go */ | 174 |
@@ -193,6 +224,15 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) | |||
193 | trace.rettime = trace_clock_local(); | 224 | trace.rettime = trace_clock_local(); |
194 | barrier(); | 225 | barrier(); |
195 | current->curr_ret_stack--; | 226 | current->curr_ret_stack--; |
227 | /* | ||
228 | * The curr_ret_stack can be less than -1 only if it was | ||
229 | * filtered out and it's about to return from the function. | ||
230 | * Recover the index and continue to trace normal functions. | ||
231 | */ | ||
232 | if (current->curr_ret_stack < -1) { | ||
233 | current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; | ||
234 | return ret; | ||
235 | } | ||
196 | 236 | ||
197 | /* | 237 | /* |
198 | * The trace should run after decrementing the ret counter | 238 | * The trace should run after decrementing the ret counter |
@@ -230,7 +270,7 @@ int __trace_graph_entry(struct trace_array *tr, | |||
230 | return 0; | 270 | return 0; |
231 | entry = ring_buffer_event_data(event); | 271 | entry = ring_buffer_event_data(event); |
232 | entry->graph_ent = *trace; | 272 | entry->graph_ent = *trace; |
233 | if (!filter_current_check_discard(buffer, call, entry, event)) | 273 | if (!call_filter_check_discard(call, entry, buffer, event)) |
234 | __buffer_unlock_commit(buffer, event); | 274 | __buffer_unlock_commit(buffer, event); |
235 | 275 | ||
236 | return 1; | 276 | return 1; |
@@ -259,10 +299,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
259 | 299 | ||
260 | /* trace it when it is-nested-in or is a function enabled. */ | 300 | /* trace it when it is-nested-in or is a function enabled. */ |
261 | if ((!(trace->depth || ftrace_graph_addr(trace->func)) || | 301 | if ((!(trace->depth || ftrace_graph_addr(trace->func)) || |
262 | ftrace_graph_ignore_irqs()) || | 302 | ftrace_graph_ignore_irqs()) || (trace->depth < 0) || |
263 | (max_depth && trace->depth >= max_depth)) | 303 | (max_depth && trace->depth >= max_depth)) |
264 | return 0; | 304 | return 0; |
265 | 305 | ||
306 | /* | ||
307 | * Do not trace a function if it's filtered by set_graph_notrace. | ||
308 | * Make the index of ret stack negative to indicate that it should | ||
309 | * ignore further functions. But it needs its own ret stack entry | ||
310 | * to recover the original index in order to continue tracing after | ||
311 | * returning from the function. | ||
312 | */ | ||
313 | if (ftrace_graph_notrace_addr(trace->func)) | ||
314 | return 1; | ||
315 | |||
266 | local_irq_save(flags); | 316 | local_irq_save(flags); |
267 | cpu = raw_smp_processor_id(); | 317 | cpu = raw_smp_processor_id(); |
268 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); | 318 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
@@ -335,7 +385,7 @@ void __trace_graph_return(struct trace_array *tr, | |||
335 | return; | 385 | return; |
336 | entry = ring_buffer_event_data(event); | 386 | entry = ring_buffer_event_data(event); |
337 | entry->ret = *trace; | 387 | entry->ret = *trace; |
338 | if (!filter_current_check_discard(buffer, call, entry, event)) | 388 | if (!call_filter_check_discard(call, entry, buffer, event)) |
339 | __buffer_unlock_commit(buffer, event); | 389 | __buffer_unlock_commit(buffer, event); |
340 | } | 390 | } |
341 | 391 | ||
@@ -652,7 +702,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, | |||
652 | } | 702 | } |
653 | 703 | ||
654 | /* No overhead */ | 704 | /* No overhead */ |
655 | ret = print_graph_duration(DURATION_FILL_START, s, flags); | 705 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); |
656 | if (ret != TRACE_TYPE_HANDLED) | 706 | if (ret != TRACE_TYPE_HANDLED) |
657 | return ret; | 707 | return ret; |
658 | 708 | ||
@@ -664,7 +714,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, | |||
664 | if (!ret) | 714 | if (!ret) |
665 | return TRACE_TYPE_PARTIAL_LINE; | 715 | return TRACE_TYPE_PARTIAL_LINE; |
666 | 716 | ||
667 | ret = print_graph_duration(DURATION_FILL_END, s, flags); | 717 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); |
668 | if (ret != TRACE_TYPE_HANDLED) | 718 | if (ret != TRACE_TYPE_HANDLED) |
669 | return ret; | 719 | return ret; |
670 | 720 | ||
@@ -729,14 +779,14 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, | |||
729 | return TRACE_TYPE_HANDLED; | 779 | return TRACE_TYPE_HANDLED; |
730 | 780 | ||
731 | /* No real data, just filling the column with spaces */ | 781 |
732 | switch (duration) { | 782 | switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { |
733 | case DURATION_FILL_FULL: | 783 | case FLAGS_FILL_FULL: |
734 | ret = trace_seq_puts(s, " | "); | 784 | ret = trace_seq_puts(s, " | "); |
735 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 785 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; |
736 | case DURATION_FILL_START: | 786 | case FLAGS_FILL_START: |
737 | ret = trace_seq_puts(s, " "); | 787 | ret = trace_seq_puts(s, " "); |
738 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 788 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; |
739 | case DURATION_FILL_END: | 789 | case FLAGS_FILL_END: |
740 | ret = trace_seq_puts(s, " |"); | 790 | ret = trace_seq_puts(s, " |"); |
741 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 791 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; |
742 | } | 792 | } |
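The hunk above stops overloading the duration argument with negative sentinels and instead packs the fill mode into spare flag bits. A standalone C toy of the same encoding, using made-up shift and mask values rather than TRACE_GRAPH_PRINT_FILL_SHIFT/MASK:

#include <stdio.h>

#define FILL_SHIFT	28
#define FILL_MASK	(0x3 << FILL_SHIFT)
#define FILL_FULL	(1 << FILL_SHIFT)
#define FILL_START	(2 << FILL_SHIFT)
#define FILL_END	(3 << FILL_SHIFT)

/* Returns the filler string for the duration column, or NULL when the
 * flags carry no fill request and a real duration should be printed. */
static const char *fill_str(unsigned int flags)
{
	switch (flags & FILL_MASK) {
	case FILL_FULL:
		return " |  ";
	case FILL_START:
		return "  ";
	case FILL_END:
		return " |";
	default:
		return NULL;
	}
}

int main(void)
{
	unsigned int flags = 0x5;	/* unrelated option bits, left intact */

	printf("start: [%s]\n", fill_str(flags | FILL_START));
	printf("end:   [%s]\n", fill_str(flags | FILL_END));
	printf("plain duration? %s\n", fill_str(flags) ? "no" : "yes");
	return 0;
}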
@@ -852,7 +902,7 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
852 | } | 902 | } |
853 | 903 | ||
854 | /* No time */ | 904 | /* No time */ |
855 | ret = print_graph_duration(DURATION_FILL_FULL, s, flags); | 905 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); |
856 | if (ret != TRACE_TYPE_HANDLED) | 906 | if (ret != TRACE_TYPE_HANDLED) |
857 | return ret; | 907 | return ret; |
858 | 908 | ||
@@ -1172,7 +1222,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
1172 | return TRACE_TYPE_PARTIAL_LINE; | 1222 | return TRACE_TYPE_PARTIAL_LINE; |
1173 | 1223 | ||
1174 | /* No time */ | 1224 | /* No time */ |
1175 | ret = print_graph_duration(DURATION_FILL_FULL, s, flags); | 1225 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); |
1176 | if (ret != TRACE_TYPE_HANDLED) | 1226 | if (ret != TRACE_TYPE_HANDLED) |
1177 | return ret; | 1227 | return ret; |
1178 | 1228 | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 243f6834d026..dae9541ada9e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -835,7 +835,7 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, | |||
835 | entry->ip = (unsigned long)tp->rp.kp.addr; | 835 | entry->ip = (unsigned long)tp->rp.kp.addr; |
836 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 836 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
837 | 837 | ||
838 | if (!filter_current_check_discard(buffer, call, entry, event)) | 838 | if (!filter_check_discard(ftrace_file, entry, buffer, event)) |
839 | trace_buffer_unlock_commit_regs(buffer, event, | 839 | trace_buffer_unlock_commit_regs(buffer, event, |
840 | irq_flags, pc, regs); | 840 | irq_flags, pc, regs); |
841 | } | 841 | } |
@@ -884,7 +884,7 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, | |||
884 | entry->ret_ip = (unsigned long)ri->ret_addr; | 884 | entry->ret_ip = (unsigned long)ri->ret_addr; |
885 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 885 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
886 | 886 | ||
887 | if (!filter_current_check_discard(buffer, call, entry, event)) | 887 | if (!filter_check_discard(ftrace_file, entry, buffer, event)) |
888 | trace_buffer_unlock_commit_regs(buffer, event, | 888 | trace_buffer_unlock_commit_regs(buffer, event, |
889 | irq_flags, pc, regs); | 889 | irq_flags, pc, regs); |
890 | } | 890 | } |
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index b3dcfb2f0fef..0abd9b863474 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c | |||
@@ -323,7 +323,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, | |||
323 | entry = ring_buffer_event_data(event); | 323 | entry = ring_buffer_event_data(event); |
324 | entry->rw = *rw; | 324 | entry->rw = *rw; |
325 | 325 | ||
326 | if (!filter_check_discard(call, entry, buffer, event)) | 326 | if (!call_filter_check_discard(call, entry, buffer, event)) |
327 | trace_buffer_unlock_commit(buffer, event, 0, pc); | 327 | trace_buffer_unlock_commit(buffer, event, 0, pc); |
328 | } | 328 | } |
329 | 329 | ||
@@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, | |||
353 | entry = ring_buffer_event_data(event); | 353 | entry = ring_buffer_event_data(event); |
354 | entry->map = *map; | 354 | entry->map = *map; |
355 | 355 | ||
356 | if (!filter_check_discard(call, entry, buffer, event)) | 356 | if (!call_filter_check_discard(call, entry, buffer, event)) |
357 | trace_buffer_unlock_commit(buffer, event, 0, pc); | 357 | trace_buffer_unlock_commit(buffer, event, 0, pc); |
358 | } | 358 | } |
359 | 359 | ||
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 34e7cbac0c9c..ed32284fbe32 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |||
618 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : | 618 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : |
619 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : | 619 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : |
620 | '.'; | 620 | '.'; |
621 | need_resched = | 621 | |
622 | (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; | 622 | switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | |
623 | TRACE_FLAG_PREEMPT_RESCHED)) { | ||
624 | case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: | ||
625 | need_resched = 'N'; | ||
626 | break; | ||
627 | case TRACE_FLAG_NEED_RESCHED: | ||
628 | need_resched = 'n'; | ||
629 | break; | ||
630 | case TRACE_FLAG_PREEMPT_RESCHED: | ||
631 | need_resched = 'p'; | ||
632 | break; | ||
633 | default: | ||
634 | need_resched = '.'; | ||
635 | break; | ||
636 | } | ||
637 | |||
623 | hardsoft_irq = | 638 | hardsoft_irq = |
624 | (hardirq && softirq) ? 'H' : | 639 | (hardirq && softirq) ? 'H' : |
625 | hardirq ? 'h' : | 640 | hardirq ? 'h' : |
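The latency-format change above turns one flag bit into two, so the single 'N'/'.' column becomes a four-way character. A tiny standalone C decoder with invented bit values (the real ones are the TRACE_FLAG_* definitions):

#include <stdio.h>

#define FLAG_NEED_RESCHED	0x1
#define FLAG_PREEMPT_RESCHED	0x2

static char resched_char(unsigned int flags)
{
	switch (flags & (FLAG_NEED_RESCHED | FLAG_PREEMPT_RESCHED)) {
	case FLAG_NEED_RESCHED | FLAG_PREEMPT_RESCHED:
		return 'N';		/* both are set */
	case FLAG_NEED_RESCHED:
		return 'n';
	case FLAG_PREEMPT_RESCHED:
		return 'p';
	default:
		return '.';		/* neither is set */
	}
}

int main(void)
{
	unsigned int f;

	for (f = 0; f < 4; f++)
		printf("flags=%u -> %c\n", f, resched_char(f));
	return 0;
}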
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 4e98e3b257a3..3f34dc9b40f3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
@@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr, | |||
45 | entry->next_state = next->state; | 45 | entry->next_state = next->state; |
46 | entry->next_cpu = task_cpu(next); | 46 | entry->next_cpu = task_cpu(next); |
47 | 47 | ||
48 | if (!filter_check_discard(call, entry, buffer, event)) | 48 | if (!call_filter_check_discard(call, entry, buffer, event)) |
49 | trace_buffer_unlock_commit(buffer, event, flags, pc); | 49 | trace_buffer_unlock_commit(buffer, event, flags, pc); |
50 | } | 50 | } |
51 | 51 | ||
@@ -101,7 +101,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, | |||
101 | entry->next_state = wakee->state; | 101 | entry->next_state = wakee->state; |
102 | entry->next_cpu = task_cpu(wakee); | 102 | entry->next_cpu = task_cpu(wakee); |
103 | 103 | ||
104 | if (!filter_check_discard(call, entry, buffer, event)) | 104 | if (!call_filter_check_discard(call, entry, buffer, event)) |
105 | trace_buffer_unlock_commit(buffer, event, flags, pc); | 105 | trace_buffer_unlock_commit(buffer, event, flags, pc); |
106 | } | 106 | } |
107 | 107 | ||
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 847f88a6194b..7af67360b330 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c | |||
@@ -43,46 +43,15 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); | |||
43 | /* The root directory for all stat files */ | 43 | /* The root directory for all stat files */ |
44 | static struct dentry *stat_dir; | 44 | static struct dentry *stat_dir; |
45 | 45 | ||
46 | /* | 46 | static void __reset_stat_session(struct stat_session *session) |
47 | * Iterate through the rbtree using a post order traversal path | ||
48 | * to release the next node. | ||
49 | * It won't necessary release one at each iteration | ||
50 | * but it will at least advance closer to the next one | ||
51 | * to be released. | ||
52 | */ | ||
53 | static struct rb_node *release_next(struct tracer_stat *ts, | ||
54 | struct rb_node *node) | ||
55 | { | 47 | { |
56 | struct stat_node *snode; | 48 | struct stat_node *snode, *n; |
57 | struct rb_node *parent = rb_parent(node); | ||
58 | |||
59 | if (node->rb_left) | ||
60 | return node->rb_left; | ||
61 | else if (node->rb_right) | ||
62 | return node->rb_right; | ||
63 | else { | ||
64 | if (!parent) | ||
65 | ; | ||
66 | else if (parent->rb_left == node) | ||
67 | parent->rb_left = NULL; | ||
68 | else | ||
69 | parent->rb_right = NULL; | ||
70 | 49 | ||
71 | snode = container_of(node, struct stat_node, node); | 50 | rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) { |
72 | if (ts->stat_release) | 51 | if (session->ts->stat_release) |
73 | ts->stat_release(snode->stat); | 52 | session->ts->stat_release(snode->stat); |
74 | kfree(snode); | 53 | kfree(snode); |
75 | |||
76 | return parent; | ||
77 | } | 54 | } |
78 | } | ||
79 | |||
80 | static void __reset_stat_session(struct stat_session *session) | ||
81 | { | ||
82 | struct rb_node *node = session->stat_root.rb_node; | ||
83 | |||
84 | while (node) | ||
85 | node = release_next(session->ts, node); | ||
86 | 55 | ||
87 | session->stat_root = RB_ROOT; | 56 | session->stat_root = RB_ROOT; |
88 | } | 57 | } |
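The rewrite above replaces the hand-rolled post-order walk with rbtree_postorder_for_each_entry_safe(), which visits children before their parent so each node can be freed as it is handed out. A minimal kernel-style sketch on a hypothetical tree type (struct item and my_free_tree are invented names):

#include <linux/rbtree.h>
#include <linux/slab.h>

struct item {
	struct rb_node node;
	long payload;
};

static void my_free_tree(struct rb_root *root)
{
	struct item *pos, *n;

	/* post-order traversal: safe to free each entry as it is visited */
	rbtree_postorder_for_each_entry_safe(pos, n, root, node)
		kfree(pos);

	*root = RB_ROOT;
}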
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 559329d9bd2f..ea90eb5f6f17 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -302,6 +302,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call) | |||
302 | static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | 302 | static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) |
303 | { | 303 | { |
304 | struct trace_array *tr = data; | 304 | struct trace_array *tr = data; |
305 | struct ftrace_event_file *ftrace_file; | ||
305 | struct syscall_trace_enter *entry; | 306 | struct syscall_trace_enter *entry; |
306 | struct syscall_metadata *sys_data; | 307 | struct syscall_metadata *sys_data; |
307 | struct ring_buffer_event *event; | 308 | struct ring_buffer_event *event; |
@@ -314,7 +315,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
314 | syscall_nr = trace_get_syscall_nr(current, regs); | 315 | syscall_nr = trace_get_syscall_nr(current, regs); |
315 | if (syscall_nr < 0) | 316 | if (syscall_nr < 0) |
316 | return; | 317 | return; |
317 | if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) | 318 | |
319 | /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ | ||
320 | ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); | ||
321 | if (!ftrace_file) | ||
322 | return; | ||
323 | |||
324 | if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) | ||
318 | return; | 325 | return; |
319 | 326 | ||
320 | sys_data = syscall_nr_to_meta(syscall_nr); | 327 | sys_data = syscall_nr_to_meta(syscall_nr); |
@@ -336,8 +343,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
336 | entry->nr = syscall_nr; | 343 | entry->nr = syscall_nr; |
337 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); | 344 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); |
338 | 345 | ||
339 | if (!filter_current_check_discard(buffer, sys_data->enter_event, | 346 | if (!filter_check_discard(ftrace_file, entry, buffer, event)) |
340 | entry, event)) | ||
341 | trace_current_buffer_unlock_commit(buffer, event, | 347 | trace_current_buffer_unlock_commit(buffer, event, |
342 | irq_flags, pc); | 348 | irq_flags, pc); |
343 | } | 349 | } |
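The enter path above now reads a per-syscall ftrace_event_file pointer under the tracepoint's rcu_read_lock_sched() section and bails out when nothing is registered. A hedged sketch of the same reader/updater pairing on an invented slot array (struct handler, slots[], NR_SLOTS and the function names are illustrative, not from this patch):

#include <linux/rcupdate.h>

#define NR_SLOTS 16

struct handler {
	void (*fn)(int nr);
};

static struct handler __rcu *slots[NR_SLOTS];

/* reader: caller is inside rcu_read_lock_sched(), as __DO_TRACE() is */
static void slot_fire(int nr)
{
	struct handler *h = rcu_dereference_sched(slots[nr]);

	if (!h)
		return;			/* nothing registered for this slot */
	h->fn(nr);
}

/* updater: serialized by a mutex; pass NULL to clear the slot */
static void slot_set(int nr, struct handler *h)
{
	rcu_assign_pointer(slots[nr], h);
}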
@@ -345,6 +351,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
345 | static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | 351 | static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) |
346 | { | 352 | { |
347 | struct trace_array *tr = data; | 353 | struct trace_array *tr = data; |
354 | struct ftrace_event_file *ftrace_file; | ||
348 | struct syscall_trace_exit *entry; | 355 | struct syscall_trace_exit *entry; |
349 | struct syscall_metadata *sys_data; | 356 | struct syscall_metadata *sys_data; |
350 | struct ring_buffer_event *event; | 357 | struct ring_buffer_event *event; |
@@ -356,7 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | |||
356 | syscall_nr = trace_get_syscall_nr(current, regs); | 363 | syscall_nr = trace_get_syscall_nr(current, regs); |
357 | if (syscall_nr < 0) | 364 | if (syscall_nr < 0) |
358 | return; | 365 | return; |
359 | if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) | 366 | |
367 | /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ | ||
368 | ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); | ||
369 | if (!ftrace_file) | ||
370 | return; | ||
371 | |||
372 | if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) | ||
360 | return; | 373 | return; |
361 | 374 | ||
362 | sys_data = syscall_nr_to_meta(syscall_nr); | 375 | sys_data = syscall_nr_to_meta(syscall_nr); |
@@ -377,8 +390,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | |||
377 | entry->nr = syscall_nr; | 390 | entry->nr = syscall_nr; |
378 | entry->ret = syscall_get_return_value(current, regs); | 391 | entry->ret = syscall_get_return_value(current, regs); |
379 | 392 | ||
380 | if (!filter_current_check_discard(buffer, sys_data->exit_event, | 393 | if (!filter_check_discard(ftrace_file, entry, buffer, event)) |
381 | entry, event)) | ||
382 | trace_current_buffer_unlock_commit(buffer, event, | 394 | trace_current_buffer_unlock_commit(buffer, event, |
383 | irq_flags, pc); | 395 | irq_flags, pc); |
384 | } | 396 | } |
@@ -397,7 +409,7 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file, | |||
397 | if (!tr->sys_refcount_enter) | 409 | if (!tr->sys_refcount_enter) |
398 | ret = register_trace_sys_enter(ftrace_syscall_enter, tr); | 410 | ret = register_trace_sys_enter(ftrace_syscall_enter, tr); |
399 | if (!ret) { | 411 | if (!ret) { |
400 | set_bit(num, tr->enabled_enter_syscalls); | 412 | rcu_assign_pointer(tr->enter_syscall_files[num], file); |
401 | tr->sys_refcount_enter++; | 413 | tr->sys_refcount_enter++; |
402 | } | 414 | } |
403 | mutex_unlock(&syscall_trace_lock); | 415 | mutex_unlock(&syscall_trace_lock); |
@@ -415,7 +427,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, | |||
415 | return; | 427 | return; |
416 | mutex_lock(&syscall_trace_lock); | 428 | mutex_lock(&syscall_trace_lock); |
417 | tr->sys_refcount_enter--; | 429 | tr->sys_refcount_enter--; |
418 | clear_bit(num, tr->enabled_enter_syscalls); | 430 | rcu_assign_pointer(tr->enter_syscall_files[num], NULL); |
419 | if (!tr->sys_refcount_enter) | 431 | if (!tr->sys_refcount_enter) |
420 | unregister_trace_sys_enter(ftrace_syscall_enter, tr); | 432 | unregister_trace_sys_enter(ftrace_syscall_enter, tr); |
421 | mutex_unlock(&syscall_trace_lock); | 433 | mutex_unlock(&syscall_trace_lock); |
@@ -435,7 +447,7 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file, | |||
435 | if (!tr->sys_refcount_exit) | 447 | if (!tr->sys_refcount_exit) |
436 | ret = register_trace_sys_exit(ftrace_syscall_exit, tr); | 448 | ret = register_trace_sys_exit(ftrace_syscall_exit, tr); |
437 | if (!ret) { | 449 | if (!ret) { |
438 | set_bit(num, tr->enabled_exit_syscalls); | 450 | rcu_assign_pointer(tr->exit_syscall_files[num], file); |
439 | tr->sys_refcount_exit++; | 451 | tr->sys_refcount_exit++; |
440 | } | 452 | } |
441 | mutex_unlock(&syscall_trace_lock); | 453 | mutex_unlock(&syscall_trace_lock); |
@@ -453,7 +465,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, | |||
453 | return; | 465 | return; |
454 | mutex_lock(&syscall_trace_lock); | 466 | mutex_lock(&syscall_trace_lock); |
455 | tr->sys_refcount_exit--; | 467 | tr->sys_refcount_exit--; |
456 | clear_bit(num, tr->enabled_exit_syscalls); | 468 | rcu_assign_pointer(tr->exit_syscall_files[num], NULL); |
457 | if (!tr->sys_refcount_exit) | 469 | if (!tr->sys_refcount_exit) |
458 | unregister_trace_sys_exit(ftrace_syscall_exit, tr); | 470 | unregister_trace_sys_exit(ftrace_syscall_exit, tr); |
459 | mutex_unlock(&syscall_trace_lock); | 471 | mutex_unlock(&syscall_trace_lock); |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 272261b5f94f..b6dcc42ef7f5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -128,6 +128,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) | |||
128 | if (is_ret) | 128 | if (is_ret) |
129 | tu->consumer.ret_handler = uretprobe_dispatcher; | 129 | tu->consumer.ret_handler = uretprobe_dispatcher; |
130 | init_trace_uprobe_filter(&tu->filter); | 130 | init_trace_uprobe_filter(&tu->filter); |
131 | tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; | ||
131 | return tu; | 132 | return tu; |
132 | 133 | ||
133 | error: | 134 | error: |
@@ -561,7 +562,7 @@ static void uprobe_trace_print(struct trace_uprobe *tu, | |||
561 | for (i = 0; i < tu->nr_args; i++) | 562 | for (i = 0; i < tu->nr_args; i++) |
562 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | 563 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); |
563 | 564 | ||
564 | if (!filter_current_check_discard(buffer, call, entry, event)) | 565 | if (!call_filter_check_discard(call, entry, buffer, event)) |
565 | trace_buffer_unlock_commit(buffer, event, 0, 0); | 566 | trace_buffer_unlock_commit(buffer, event, 0, 0); |
566 | } | 567 | } |
567 | 568 | ||
diff --git a/kernel/up.c b/kernel/up.c index 630d72bf7e41..509403e3fbc6 100644 --- a/kernel/up.c +++ b/kernel/up.c | |||
@@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, | |||
22 | } | 22 | } |
23 | EXPORT_SYMBOL(smp_call_function_single); | 23 | EXPORT_SYMBOL(smp_call_function_single); |
24 | 24 | ||
25 | void __smp_call_function_single(int cpu, struct call_single_data *csd, | ||
26 | int wait) | ||
27 | { | ||
28 | unsigned long flags; | ||
29 | |||
30 | local_irq_save(flags); | ||
31 | csd->func(csd->info); | ||
32 | local_irq_restore(flags); | ||
33 | } | ||
34 | EXPORT_SYMBOL(__smp_call_function_single); | ||
35 | |||
25 | int on_each_cpu(smp_call_func_t func, void *info, int wait) | 36 | int on_each_cpu(smp_call_func_t func, void *info, int wait) |
26 | { | 37 | { |
27 | unsigned long flags; | 38 | unsigned long flags; |
diff --git a/kernel/user.c b/kernel/user.c index 5bbb91988e69..c006131beb77 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -51,6 +51,10 @@ struct user_namespace init_user_ns = { | |||
51 | .owner = GLOBAL_ROOT_UID, | 51 | .owner = GLOBAL_ROOT_UID, |
52 | .group = GLOBAL_ROOT_GID, | 52 | .group = GLOBAL_ROOT_GID, |
53 | .proc_inum = PROC_USER_INIT_INO, | 53 | .proc_inum = PROC_USER_INIT_INO, |
54 | #ifdef CONFIG_PERSISTENT_KEYRINGS | ||
55 | .persistent_keyring_register_sem = | ||
56 | __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), | ||
57 | #endif | ||
54 | }; | 58 | }; |
55 | EXPORT_SYMBOL_GPL(init_user_ns); | 59 | EXPORT_SYMBOL_GPL(init_user_ns); |
56 | 60 | ||
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 13fb1134ba58..240fb62cf394 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -101,6 +101,9 @@ int create_user_ns(struct cred *new) | |||
101 | 101 | ||
102 | set_cred_user_ns(new, ns); | 102 | set_cred_user_ns(new, ns); |
103 | 103 | ||
104 | #ifdef CONFIG_PERSISTENT_KEYRINGS | ||
105 | init_rwsem(&ns->persistent_keyring_register_sem); | ||
106 | #endif | ||
104 | return 0; | 107 | return 0; |
105 | } | 108 | } |
106 | 109 | ||
@@ -130,6 +133,9 @@ void free_user_ns(struct user_namespace *ns) | |||
130 | 133 | ||
131 | do { | 134 | do { |
132 | parent = ns->parent; | 135 | parent = ns->parent; |
136 | #ifdef CONFIG_PERSISTENT_KEYRINGS | ||
137 | key_put(ns->persistent_keyring_register); | ||
138 | #endif | ||
133 | proc_free_inum(ns->proc_inum); | 139 | proc_free_inum(ns->proc_inum); |
134 | kmem_cache_free(user_ns_cachep, ns); | 140 | kmem_cache_free(user_ns_cachep, ns); |
135 | ns = parent; | 141 | ns = parent; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 987293d03ebc..b010eac595d2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); | |||
305 | /* I: attributes used when instantiating standard unbound pools on demand */ | 305 | /* I: attributes used when instantiating standard unbound pools on demand */ |
306 | static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; | 306 | static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; |
307 | 307 | ||
308 | /* I: attributes used when instantiating ordered pools on demand */ | ||
309 | static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; | ||
310 | |||
308 | struct workqueue_struct *system_wq __read_mostly; | 311 | struct workqueue_struct *system_wq __read_mostly; |
309 | EXPORT_SYMBOL(system_wq); | 312 | EXPORT_SYMBOL(system_wq); |
310 | struct workqueue_struct *system_highpri_wq __read_mostly; | 313 | struct workqueue_struct *system_highpri_wq __read_mostly; |
@@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { } | |||
518 | static inline void debug_work_deactivate(struct work_struct *work) { } | 521 | static inline void debug_work_deactivate(struct work_struct *work) { } |
519 | #endif | 522 | #endif |
520 | 523 | ||
521 | /* allocate ID and assign it to @pool */ | 524 | /** |
525 | * worker_pool_assign_id - allocate ID and assign it to @pool | ||
526 | * @pool: the pool pointer of interest | ||
527 | * | ||
528 | * Returns 0 if an ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned | ||
529 | * successfully, -errno on failure. | ||
530 | */ | ||
522 | static int worker_pool_assign_id(struct worker_pool *pool) | 531 | static int worker_pool_assign_id(struct worker_pool *pool) |
523 | { | 532 | { |
524 | int ret; | 533 | int ret; |
525 | 534 | ||
526 | lockdep_assert_held(&wq_pool_mutex); | 535 | lockdep_assert_held(&wq_pool_mutex); |
527 | 536 | ||
528 | ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); | 537 | ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, |
538 | GFP_KERNEL); | ||
529 | if (ret >= 0) { | 539 | if (ret >= 0) { |
530 | pool->id = ret; | 540 | pool->id = ret; |
531 | return 0; | 541 | return 0; |
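The idr_alloc() call above now passes an upper bound so pool IDs are guaranteed to stay below WORK_OFFQ_POOL_NONE. As a reminder of the API's contract, IDs come from [start, end) and the call returns a negative errno once that range is exhausted; a hedged sketch on an invented idr (my_idr, MY_ID_LIMIT and struct my_obj are not from this patch):

#include <linux/idr.h>
#include <linux/slab.h>

#define MY_ID_LIMIT 1024

struct my_obj {
	int id;
};

static DEFINE_IDR(my_idr);

static int my_assign_id(struct my_obj *obj)
{
	int ret;

	/* returns an id in [0, MY_ID_LIMIT) or a negative errno */
	ret = idr_alloc(&my_idr, obj, 0, MY_ID_LIMIT, GFP_KERNEL);
	if (ret < 0)
		return ret;		/* -ENOMEM or -ENOSPC */

	obj->id = ret;
	return 0;
}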
@@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, | |||
1320 | 1330 | ||
1321 | debug_work_activate(work); | 1331 | debug_work_activate(work); |
1322 | 1332 | ||
1323 | /* if dying, only works from the same workqueue are allowed */ | 1333 | /* if draining, only works from the same workqueue are allowed */ |
1324 | if (unlikely(wq->flags & __WQ_DRAINING) && | 1334 | if (unlikely(wq->flags & __WQ_DRAINING) && |
1325 | WARN_ON_ONCE(!is_chained_work(wq))) | 1335 | WARN_ON_ONCE(!is_chained_work(wq))) |
1326 | return; | 1336 | return; |
@@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
1736 | if (IS_ERR(worker->task)) | 1746 | if (IS_ERR(worker->task)) |
1737 | goto fail; | 1747 | goto fail; |
1738 | 1748 | ||
1749 | set_user_nice(worker->task, pool->attrs->nice); | ||
1750 | |||
1751 | /* prevent userland from meddling with cpumask of workqueue workers */ | ||
1752 | worker->task->flags |= PF_NO_SETAFFINITY; | ||
1753 | |||
1739 | /* | 1754 | /* |
1740 | * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any | 1755 | * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any |
1741 | * online CPUs. It'll be re-applied when any of the CPUs come up. | 1756 | * online CPUs. It'll be re-applied when any of the CPUs come up. |
1742 | */ | 1757 | */ |
1743 | set_user_nice(worker->task, pool->attrs->nice); | ||
1744 | set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); | 1758 | set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); |
1745 | 1759 | ||
1746 | /* prevent userland from meddling with cpumask of workqueue workers */ | ||
1747 | worker->task->flags |= PF_NO_SETAFFINITY; | ||
1748 | |||
1749 | /* | 1760 | /* |
1750 | * The caller is responsible for ensuring %POOL_DISASSOCIATED | 1761 | * The caller is responsible for ensuring %POOL_DISASSOCIATED |
1751 | * remains stable across this function. See the comments above the | 1762 | * remains stable across this function. See the comments above the |
@@ -2840,19 +2851,6 @@ already_gone: | |||
2840 | return false; | 2851 | return false; |
2841 | } | 2852 | } |
2842 | 2853 | ||
2843 | static bool __flush_work(struct work_struct *work) | ||
2844 | { | ||
2845 | struct wq_barrier barr; | ||
2846 | |||
2847 | if (start_flush_work(work, &barr)) { | ||
2848 | wait_for_completion(&barr.done); | ||
2849 | destroy_work_on_stack(&barr.work); | ||
2850 | return true; | ||
2851 | } else { | ||
2852 | return false; | ||
2853 | } | ||
2854 | } | ||
2855 | |||
2856 | /** | 2854 | /** |
2857 | * flush_work - wait for a work to finish executing the last queueing instance | 2855 | * flush_work - wait for a work to finish executing the last queueing instance |
2858 | * @work: the work to flush | 2856 | * @work: the work to flush |
@@ -2866,10 +2864,18 @@ static bool __flush_work(struct work_struct *work) | |||
2866 | */ | 2864 | */ |
2867 | bool flush_work(struct work_struct *work) | 2865 | bool flush_work(struct work_struct *work) |
2868 | { | 2866 | { |
2867 | struct wq_barrier barr; | ||
2868 | |||
2869 | lock_map_acquire(&work->lockdep_map); | 2869 | lock_map_acquire(&work->lockdep_map); |
2870 | lock_map_release(&work->lockdep_map); | 2870 | lock_map_release(&work->lockdep_map); |
2871 | 2871 | ||
2872 | return __flush_work(work); | 2872 | if (start_flush_work(work, &barr)) { |
2873 | wait_for_completion(&barr.done); | ||
2874 | destroy_work_on_stack(&barr.work); | ||
2875 | return true; | ||
2876 | } else { | ||
2877 | return false; | ||
2878 | } | ||
2873 | } | 2879 | } |
2874 | EXPORT_SYMBOL_GPL(flush_work); | 2880 | EXPORT_SYMBOL_GPL(flush_work); |
2875 | 2881 | ||
@@ -4106,7 +4112,7 @@ out_unlock: | |||
4106 | static int alloc_and_link_pwqs(struct workqueue_struct *wq) | 4112 | static int alloc_and_link_pwqs(struct workqueue_struct *wq) |
4107 | { | 4113 | { |
4108 | bool highpri = wq->flags & WQ_HIGHPRI; | 4114 | bool highpri = wq->flags & WQ_HIGHPRI; |
4109 | int cpu; | 4115 | int cpu, ret; |
4110 | 4116 | ||
4111 | if (!(wq->flags & WQ_UNBOUND)) { | 4117 | if (!(wq->flags & WQ_UNBOUND)) { |
4112 | wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); | 4118 | wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); |
@@ -4126,6 +4132,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) | |||
4126 | mutex_unlock(&wq->mutex); | 4132 | mutex_unlock(&wq->mutex); |
4127 | } | 4133 | } |
4128 | return 0; | 4134 | return 0; |
4135 | } else if (wq->flags & __WQ_ORDERED) { | ||
4136 | ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); | ||
4137 | /* there should only be a single pwq for the ordering guarantee */ | ||
4138 | WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || | ||
4139 | wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), | ||
4140 | "ordering guarantee broken for workqueue %s\n", wq->name); | ||
4141 | return ret; | ||
4129 | } else { | 4142 | } else { |
4130 | return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); | 4143 | return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); |
4131 | } | 4144 | } |
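The new __WQ_ORDERED branch above backs an ordered workqueue with exactly one pool_workqueue, which is what preserves queueing order. A hedged caller-side sketch of using such a workqueue (all my_* names and the work functions are invented):

#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_ordered_wq;
static struct work_struct my_first, my_second;

static void my_first_fn(struct work_struct *work)
{
	pr_info("first\n");
}

static void my_second_fn(struct work_struct *work)
{
	pr_info("second\n");
}

static int my_setup(void)
{
	my_ordered_wq = alloc_ordered_workqueue("my_ordered", 0);
	if (!my_ordered_wq)
		return -ENOMEM;

	INIT_WORK(&my_first, my_first_fn);
	INIT_WORK(&my_second, my_second_fn);

	queue_work(my_ordered_wq, &my_first);	/* runs first */
	queue_work(my_ordered_wq, &my_second);	/* runs only after my_first finishes */
	return 0;
}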
@@ -4814,14 +4827,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) | |||
4814 | 4827 | ||
4815 | INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); | 4828 | INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); |
4816 | schedule_work_on(cpu, &wfc.work); | 4829 | schedule_work_on(cpu, &wfc.work); |
4817 | 4830 | flush_work(&wfc.work); | |
4818 | /* | ||
4819 | * The work item is on-stack and can't lead to deadlock through | ||
4820 | * flushing. Use __flush_work() to avoid spurious lockdep warnings | ||
4821 | * when work_on_cpu()s are nested. | ||
4822 | */ | ||
4823 | __flush_work(&wfc.work); | ||
4824 | |||
4825 | return wfc.ret; | 4831 | return wfc.ret; |
4826 | } | 4832 | } |
4827 | EXPORT_SYMBOL_GPL(work_on_cpu); | 4833 | EXPORT_SYMBOL_GPL(work_on_cpu); |
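With __flush_work() removed, work_on_cpu() above simply queues its on-stack work item and waits with flush_work(). For reference, a hedged caller-side sketch of work_on_cpu() itself (my_probe_fn and my_probe_cpu are invented names):

#include <linux/printk.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static long my_probe_fn(void *arg)
{
	int *value = arg;

	*value = raw_smp_processor_id();	/* runs on the target CPU */
	return 0;
}

static void my_probe_cpu(int cpu)
{
	int value = -1;
	long ret;

	/* blocks until my_probe_fn() has completed on @cpu */
	ret = work_on_cpu(cpu, my_probe_fn, &value);

	pr_info("cpu%d reported %d (ret=%ld)\n", cpu, value, ret);
}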
@@ -5009,10 +5015,6 @@ static int __init init_workqueues(void) | |||
5009 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; | 5015 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; |
5010 | int i, cpu; | 5016 | int i, cpu; |
5011 | 5017 | ||
5012 | /* make sure we have enough bits for OFFQ pool ID */ | ||
5013 | BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < | ||
5014 | WORK_CPU_END * NR_STD_WORKER_POOLS); | ||
5015 | |||
5016 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); | 5018 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); |
5017 | 5019 | ||
5018 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); | 5020 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |
@@ -5051,13 +5053,23 @@ static int __init init_workqueues(void) | |||
5051 | } | 5053 | } |
5052 | } | 5054 | } |
5053 | 5055 | ||
5054 | /* create default unbound wq attrs */ | 5056 | /* create default unbound and ordered wq attrs */ |
5055 | for (i = 0; i < NR_STD_WORKER_POOLS; i++) { | 5057 | for (i = 0; i < NR_STD_WORKER_POOLS; i++) { |
5056 | struct workqueue_attrs *attrs; | 5058 | struct workqueue_attrs *attrs; |
5057 | 5059 | ||
5058 | BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); | 5060 | BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); |
5059 | attrs->nice = std_nice[i]; | 5061 | attrs->nice = std_nice[i]; |
5060 | unbound_std_wq_attrs[i] = attrs; | 5062 | unbound_std_wq_attrs[i] = attrs; |
5063 | |||
5064 | /* | ||
5065 | * An ordered wq should have only one pwq as ordering is | ||
5066 | * guaranteed by max_active which is enforced by pwqs. | ||
5067 | * Turn off NUMA so that dfl_pwq is used for all nodes. | ||
5068 | */ | ||
5069 | BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); | ||
5070 | attrs->nice = std_nice[i]; | ||
5071 | attrs->no_numa = true; | ||
5072 | ordered_wq_attrs[i] = attrs; | ||
5061 | } | 5073 | } |
5062 | 5074 | ||
5063 | system_wq = alloc_workqueue("events", 0, 0); | 5075 | system_wq = alloc_workqueue("events", 0, 0); |