Diffstat (limited to 'kernel')
134 files changed, 11155 insertions, 6708 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 2251882daf53..44511d100eaa 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -87,6 +87,9 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ | |||
87 | config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE | 87 | config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE |
88 | bool | 88 | bool |
89 | 89 | ||
90 | config UNINLINE_SPIN_UNLOCK | ||
91 | bool | ||
92 | |||
90 | # | 93 | # |
91 | # lock_* functions are inlined when: | 94 | # lock_* functions are inlined when: |
92 | # - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y | 95 | # - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y |
@@ -103,100 +106,120 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE | |||
103 | # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y | 106 | # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y |
104 | # | 107 | # |
105 | 108 | ||
109 | if !DEBUG_SPINLOCK | ||
110 | |||
106 | config INLINE_SPIN_TRYLOCK | 111 | config INLINE_SPIN_TRYLOCK |
107 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK | 112 | def_bool y |
113 | depends on ARCH_INLINE_SPIN_TRYLOCK | ||
108 | 114 | ||
109 | config INLINE_SPIN_TRYLOCK_BH | 115 | config INLINE_SPIN_TRYLOCK_BH |
110 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH | 116 | def_bool y |
117 | depends on ARCH_INLINE_SPIN_TRYLOCK_BH | ||
111 | 118 | ||
112 | config INLINE_SPIN_LOCK | 119 | config INLINE_SPIN_LOCK |
113 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK | 120 | def_bool y |
121 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK | ||
114 | 122 | ||
115 | config INLINE_SPIN_LOCK_BH | 123 | config INLINE_SPIN_LOCK_BH |
116 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 124 | def_bool y |
117 | ARCH_INLINE_SPIN_LOCK_BH | 125 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH |
118 | 126 | ||
119 | config INLINE_SPIN_LOCK_IRQ | 127 | config INLINE_SPIN_LOCK_IRQ |
120 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 128 | def_bool y |
121 | ARCH_INLINE_SPIN_LOCK_IRQ | 129 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ |
122 | 130 | ||
123 | config INLINE_SPIN_LOCK_IRQSAVE | 131 | config INLINE_SPIN_LOCK_IRQSAVE |
124 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 132 | def_bool y |
125 | ARCH_INLINE_SPIN_LOCK_IRQSAVE | 133 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE |
126 | |||
127 | config UNINLINE_SPIN_UNLOCK | ||
128 | bool | ||
129 | 134 | ||
130 | config INLINE_SPIN_UNLOCK_BH | 135 | config INLINE_SPIN_UNLOCK_BH |
131 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH | 136 | def_bool y |
137 | depends on ARCH_INLINE_SPIN_UNLOCK_BH | ||
132 | 138 | ||
133 | config INLINE_SPIN_UNLOCK_IRQ | 139 | config INLINE_SPIN_UNLOCK_IRQ |
134 | def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH) | 140 | def_bool y |
141 | depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH | ||
135 | 142 | ||
136 | config INLINE_SPIN_UNLOCK_IRQRESTORE | 143 | config INLINE_SPIN_UNLOCK_IRQRESTORE |
137 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE | 144 | def_bool y |
145 | depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE | ||
138 | 146 | ||
139 | 147 | ||
140 | config INLINE_READ_TRYLOCK | 148 | config INLINE_READ_TRYLOCK |
141 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK | 149 | def_bool y |
150 | depends on ARCH_INLINE_READ_TRYLOCK | ||
142 | 151 | ||
143 | config INLINE_READ_LOCK | 152 | config INLINE_READ_LOCK |
144 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK | 153 | def_bool y |
154 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK | ||
145 | 155 | ||
146 | config INLINE_READ_LOCK_BH | 156 | config INLINE_READ_LOCK_BH |
147 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 157 | def_bool y |
148 | ARCH_INLINE_READ_LOCK_BH | 158 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH |
149 | 159 | ||
150 | config INLINE_READ_LOCK_IRQ | 160 | config INLINE_READ_LOCK_IRQ |
151 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 161 | def_bool y |
152 | ARCH_INLINE_READ_LOCK_IRQ | 162 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ |
153 | 163 | ||
154 | config INLINE_READ_LOCK_IRQSAVE | 164 | config INLINE_READ_LOCK_IRQSAVE |
155 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 165 | def_bool y |
156 | ARCH_INLINE_READ_LOCK_IRQSAVE | 166 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE |
157 | 167 | ||
158 | config INLINE_READ_UNLOCK | 168 | config INLINE_READ_UNLOCK |
159 | def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK) | 169 | def_bool y |
170 | depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK | ||
160 | 171 | ||
161 | config INLINE_READ_UNLOCK_BH | 172 | config INLINE_READ_UNLOCK_BH |
162 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH | 173 | def_bool y |
174 | depends on ARCH_INLINE_READ_UNLOCK_BH | ||
163 | 175 | ||
164 | config INLINE_READ_UNLOCK_IRQ | 176 | config INLINE_READ_UNLOCK_IRQ |
165 | def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH) | 177 | def_bool y |
178 | depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH | ||
166 | 179 | ||
167 | config INLINE_READ_UNLOCK_IRQRESTORE | 180 | config INLINE_READ_UNLOCK_IRQRESTORE |
168 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE | 181 | def_bool y |
182 | depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE | ||
169 | 183 | ||
170 | 184 | ||
171 | config INLINE_WRITE_TRYLOCK | 185 | config INLINE_WRITE_TRYLOCK |
172 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK | 186 | def_bool y |
187 | depends on ARCH_INLINE_WRITE_TRYLOCK | ||
173 | 188 | ||
174 | config INLINE_WRITE_LOCK | 189 | config INLINE_WRITE_LOCK |
175 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK | 190 | def_bool y |
191 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK | ||
176 | 192 | ||
177 | config INLINE_WRITE_LOCK_BH | 193 | config INLINE_WRITE_LOCK_BH |
178 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 194 | def_bool y |
179 | ARCH_INLINE_WRITE_LOCK_BH | 195 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH |
180 | 196 | ||
181 | config INLINE_WRITE_LOCK_IRQ | 197 | config INLINE_WRITE_LOCK_IRQ |
182 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 198 | def_bool y |
183 | ARCH_INLINE_WRITE_LOCK_IRQ | 199 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ |
184 | 200 | ||
185 | config INLINE_WRITE_LOCK_IRQSAVE | 201 | config INLINE_WRITE_LOCK_IRQSAVE |
186 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 202 | def_bool y |
187 | ARCH_INLINE_WRITE_LOCK_IRQSAVE | 203 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE |
188 | 204 | ||
189 | config INLINE_WRITE_UNLOCK | 205 | config INLINE_WRITE_UNLOCK |
190 | def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK) | 206 | def_bool y |
207 | depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK | ||
191 | 208 | ||
192 | config INLINE_WRITE_UNLOCK_BH | 209 | config INLINE_WRITE_UNLOCK_BH |
193 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH | 210 | def_bool y |
211 | depends on ARCH_INLINE_WRITE_UNLOCK_BH | ||
194 | 212 | ||
195 | config INLINE_WRITE_UNLOCK_IRQ | 213 | config INLINE_WRITE_UNLOCK_IRQ |
196 | def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH) | 214 | def_bool y |
215 | depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH | ||
197 | 216 | ||
198 | config INLINE_WRITE_UNLOCK_IRQRESTORE | 217 | config INLINE_WRITE_UNLOCK_IRQRESTORE |
199 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE | 218 | def_bool y |
219 | depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE | ||
220 | |||
221 | endif | ||
200 | 222 | ||
201 | config MUTEX_SPIN_ON_OWNER | 223 | config MUTEX_SPIN_ON_OWNER |
202 | def_bool SMP && !DEBUG_MUTEXES | 224 | def_bool y |
225 | depends on SMP && !DEBUG_MUTEXES | ||
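The Kconfig.locks hunk above is a mechanical conversion: each INLINE_* option drops its open-coded boolean expression in favour of "def_bool y" plus a "depends on" line, and the !DEBUG_SPINLOCK term that all of those expressions shared is hoisted into a single if !DEBUG_SPINLOCK ... endif block (UNINLINE_SPIN_UNLOCK simply moves above that block, and MUTEX_SPIN_ON_OWNER gets the same def_bool y treatment outside it). Taking INLINE_SPIN_TRYLOCK from the hunk as an example, the old and new forms select the same configurations:

# before
config INLINE_SPIN_TRYLOCK
	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK

# after
if !DEBUG_SPINLOCK

config INLINE_SPIN_TRYLOCK
	def_bool y
	depends on ARCH_INLINE_SPIN_TRYLOCK

endif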
diff --git a/kernel/Makefile b/kernel/Makefile
index c0cc67ad764c..6c072b6da239 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ | |||
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o cred.o \ | 12 | notifier.o ksysfs.o cred.o \ |
13 | async.o range.o groups.o lglock.o | 13 | async.o range.o groups.o lglock.o smpboot.o |
14 | 14 | ||
15 | ifdef CONFIG_FUNCTION_TRACER | 15 | ifdef CONFIG_FUNCTION_TRACER |
16 | # Do not trace debug files and internal ftrace files | 16 | # Do not trace debug files and internal ftrace files |
@@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | |||
46 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | 46 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o |
47 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 47 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
48 | obj-$(CONFIG_SMP) += smp.o | 48 | obj-$(CONFIG_SMP) += smp.o |
49 | obj-$(CONFIG_SMP) += smpboot.o | ||
50 | ifneq ($(CONFIG_SMP),y) | 49 | ifneq ($(CONFIG_SMP),y) |
51 | obj-y += up.o | 50 | obj-y += up.o |
52 | endif | 51 | endif |
@@ -55,6 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | |||
55 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | 54 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o |
56 | obj-$(CONFIG_UID16) += uid16.o | 55 | obj-$(CONFIG_UID16) += uid16.o |
57 | obj-$(CONFIG_MODULES) += module.o | 56 | obj-$(CONFIG_MODULES) += module.o |
57 | obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o | ||
58 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 58 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
59 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 59 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
60 | obj-$(CONFIG_KEXEC) += kexec.o | 60 | obj-$(CONFIG_KEXEC) += kexec.o |
@@ -98,7 +98,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o | |||
98 | obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o | 98 | obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o |
99 | obj-$(CONFIG_FUNCTION_TRACER) += trace/ | 99 | obj-$(CONFIG_FUNCTION_TRACER) += trace/ |
100 | obj-$(CONFIG_TRACING) += trace/ | 100 | obj-$(CONFIG_TRACING) += trace/ |
101 | obj-$(CONFIG_X86_DS) += trace/ | 101 | obj-$(CONFIG_TRACE_CLOCK) += trace/ |
102 | obj-$(CONFIG_RING_BUFFER) += trace/ | 102 | obj-$(CONFIG_RING_BUFFER) += trace/ |
103 | obj-$(CONFIG_TRACEPOINTS) += trace/ | 103 | obj-$(CONFIG_TRACEPOINTS) += trace/ |
104 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | 104 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
@@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | |||
110 | obj-$(CONFIG_PADATA) += padata.o | 110 | obj-$(CONFIG_PADATA) += padata.o |
111 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 111 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
112 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o | 112 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o |
113 | obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o | ||
113 | 114 | ||
114 | $(obj)/configs.o: $(obj)/config_data.h | 115 | $(obj)/configs.o: $(obj)/config_data.h |
115 | 116 | ||
@@ -131,3 +132,81 @@ quiet_cmd_timeconst = TIMEC $@ | |||
131 | targets += timeconst.h | 132 | targets += timeconst.h |
132 | $(obj)/timeconst.h: $(src)/timeconst.pl FORCE | 133 | $(obj)/timeconst.h: $(src)/timeconst.pl FORCE |
133 | $(call if_changed,timeconst) | 134 | $(call if_changed,timeconst) |
135 | |||
136 | ifeq ($(CONFIG_MODULE_SIG),y) | ||
137 | # | ||
138 | # Pull the signing certificate and any extra certificates into the kernel | ||
139 | # | ||
140 | |||
141 | quiet_cmd_touch = TOUCH $@ | ||
142 | cmd_touch = touch $@ | ||
143 | |||
144 | extra_certificates: | ||
145 | $(call cmd,touch) | ||
146 | |||
147 | kernel/modsign_certificate.o: signing_key.x509 extra_certificates | ||
148 | |||
149 | ############################################################################### | ||
150 | # | ||
151 | # If module signing is requested, say by allyesconfig, but a key has not been | ||
152 | # supplied, then one will need to be generated to make sure the build does not | ||
153 | # fail and that the kernel may be used afterwards. | ||
154 | # | ||
155 | ############################################################################### | ||
156 | sign_key_with_hash := | ||
157 | ifeq ($(CONFIG_MODULE_SIG_SHA1),y) | ||
158 | sign_key_with_hash := -sha1 | ||
159 | endif | ||
160 | ifeq ($(CONFIG_MODULE_SIG_SHA224),y) | ||
161 | sign_key_with_hash := -sha224 | ||
162 | endif | ||
163 | ifeq ($(CONFIG_MODULE_SIG_SHA256),y) | ||
164 | sign_key_with_hash := -sha256 | ||
165 | endif | ||
166 | ifeq ($(CONFIG_MODULE_SIG_SHA384),y) | ||
167 | sign_key_with_hash := -sha384 | ||
168 | endif | ||
169 | ifeq ($(CONFIG_MODULE_SIG_SHA512),y) | ||
170 | sign_key_with_hash := -sha512 | ||
171 | endif | ||
172 | ifeq ($(sign_key_with_hash),) | ||
173 | $(error Could not determine digest type to use from kernel config) | ||
174 | endif | ||
175 | |||
176 | signing_key.priv signing_key.x509: x509.genkey | ||
177 | @echo "###" | ||
178 | @echo "### Now generating an X.509 key pair to be used for signing modules." | ||
179 | @echo "###" | ||
180 | @echo "### If this takes a long time, you might wish to run rngd in the" | ||
181 | @echo "### background to keep the supply of entropy topped up. It" | ||
182 | @echo "### needs to be run as root, and uses a hardware random" | ||
183 | @echo "### number generator if one is available." | ||
184 | @echo "###" | ||
185 | openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ | ||
186 | -x509 -config x509.genkey \ | ||
187 | -outform DER -out signing_key.x509 \ | ||
188 | -keyout signing_key.priv | ||
189 | @echo "###" | ||
190 | @echo "### Key pair generated." | ||
191 | @echo "###" | ||
192 | |||
193 | x509.genkey: | ||
194 | @echo Generating X.509 key generation config | ||
195 | @echo >x509.genkey "[ req ]" | ||
196 | @echo >>x509.genkey "default_bits = 4096" | ||
197 | @echo >>x509.genkey "distinguished_name = req_distinguished_name" | ||
198 | @echo >>x509.genkey "prompt = no" | ||
199 | @echo >>x509.genkey "string_mask = utf8only" | ||
200 | @echo >>x509.genkey "x509_extensions = myexts" | ||
201 | @echo >>x509.genkey | ||
202 | @echo >>x509.genkey "[ req_distinguished_name ]" | ||
203 | @echo >>x509.genkey "O = Magrathea" | ||
204 | @echo >>x509.genkey "CN = Glacier signing key" | ||
205 | @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2" | ||
206 | @echo >>x509.genkey | ||
207 | @echo >>x509.genkey "[ myexts ]" | ||
208 | @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" | ||
209 | @echo >>x509.genkey "keyUsage=digitalSignature" | ||
210 | @echo >>x509.genkey "subjectKeyIdentifier=hash" | ||
211 | @echo >>x509.genkey "authorityKeyIdentifier=keyid" | ||
212 | endif | ||
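For reference, the x509.genkey rule above only assembles a small openssl request configuration out of echo lines; written to disk, the generated file is:

[ req ]
default_bits = 4096
distinguished_name = req_distinguished_name
prompt = no
string_mask = utf8only
x509_extensions = myexts

[ req_distinguished_name ]
O = Magrathea
CN = Glacier signing key
emailAddress = slartibartfast@magrathea.h2g2

[ myexts ]
basicConstraints=critical,CA:FALSE
keyUsage=digitalSignature
subjectKeyIdentifier=hash
authorityKeyIdentifier=keyid

The signing_key.priv / signing_key.x509 rule then feeds this file to openssl req to produce a DER-encoded self-signed certificate plus the matching private key, using whichever digest the CONFIG_MODULE_SIG_SHA* option selected.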
diff --git a/kernel/acct.c b/kernel/acct.c
index 02e6167a53b0..051e071a06e7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, | |||
193 | } | 193 | } |
194 | } | 194 | } |
195 | 195 | ||
196 | static int acct_on(char *name) | 196 | static int acct_on(struct filename *pathname) |
197 | { | 197 | { |
198 | struct file *file; | 198 | struct file *file; |
199 | struct vfsmount *mnt; | 199 | struct vfsmount *mnt; |
@@ -201,7 +201,7 @@ static int acct_on(char *name) | |||
201 | struct bsd_acct_struct *acct = NULL; | 201 | struct bsd_acct_struct *acct = NULL; |
202 | 202 | ||
203 | /* Difference from BSD - they don't do O_APPEND */ | 203 | /* Difference from BSD - they don't do O_APPEND */ |
204 | file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); | 204 | file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); |
205 | if (IS_ERR(file)) | 205 | if (IS_ERR(file)) |
206 | return PTR_ERR(file); | 206 | return PTR_ERR(file); |
207 | 207 | ||
@@ -260,7 +260,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) | |||
260 | return -EPERM; | 260 | return -EPERM; |
261 | 261 | ||
262 | if (name) { | 262 | if (name) { |
263 | char *tmp = getname(name); | 263 | struct filename *tmp = getname(name); |
264 | if (IS_ERR(tmp)) | 264 | if (IS_ERR(tmp)) |
265 | return (PTR_ERR(tmp)); | 265 | return (PTR_ERR(tmp)); |
266 | error = acct_on(tmp); | 266 | error = acct_on(tmp); |
@@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, | |||
507 | do_div(elapsed, AHZ); | 507 | do_div(elapsed, AHZ); |
508 | ac.ac_btime = get_seconds() - elapsed; | 508 | ac.ac_btime = get_seconds() - elapsed; |
509 | /* we really need to bite the bullet and change layout */ | 509 | /* we really need to bite the bullet and change layout */ |
510 | ac.ac_uid = orig_cred->uid; | 510 | ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); |
511 | ac.ac_gid = orig_cred->gid; | 511 | ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); |
512 | #if ACCT_VERSION==2 | 512 | #if ACCT_VERSION==2 |
513 | ac.ac_ahz = AHZ; | 513 | ac.ac_ahz = AHZ; |
514 | #endif | 514 | #endif |
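The acct.c hunk above is one instance of the kuid_t/kgid_t conversion that runs through most of this diff: credentials are kept as kernel-internal kuid_t/kgid_t values and are only translated to plain uid_t/gid_t numbers at a boundary, always relative to an explicit user namespace. A minimal sketch of that pattern, using the helpers from linux/uidgid.h (the two example functions themselves are hypothetical, not part of the patch):

#include <linux/uidgid.h>
#include <linux/cred.h>
#include <linux/errno.h>

/* Hypothetical example: produce a userspace-visible uid for an on-disk
 * record, munging to the overflow uid if there is no mapping in 'ns'
 * (what do_acct_process() now does via from_kuid_munged()). */
static uid_t example_record_uid(struct user_namespace *ns)
{
	kuid_t kuid = current_fsuid();		/* kernel-internal value */

	return from_kuid_munged(ns, kuid);	/* userspace-visible value */
}

/* Hypothetical example: take a uid supplied by userspace and refuse it
 * unless it maps into the caller's namespace (the same check the audit
 * filter code below applies with make_kuid()/uid_valid()). */
static int example_store_uid(uid_t val, kuid_t *out)
{
	kuid_t kuid = make_kuid(current_user_ns(), val);

	if (!uid_valid(kuid))
		return -EINVAL;
	*out = kuid;
	return 0;
}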
diff --git a/kernel/audit.c b/kernel/audit.c
index ea3b7b6191c7..40414e9143db 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,6 +61,7 @@ | |||
61 | #include <linux/netlink.h> | 61 | #include <linux/netlink.h> |
62 | #include <linux/freezer.h> | 62 | #include <linux/freezer.h> |
63 | #include <linux/tty.h> | 63 | #include <linux/tty.h> |
64 | #include <linux/pid_namespace.h> | ||
64 | 65 | ||
65 | #include "audit.h" | 66 | #include "audit.h" |
66 | 67 | ||
@@ -87,11 +88,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK; | |||
87 | 88 | ||
88 | /* | 89 | /* |
89 | * If audit records are to be written to the netlink socket, audit_pid | 90 | * If audit records are to be written to the netlink socket, audit_pid |
90 | * contains the pid of the auditd process and audit_nlk_pid contains | 91 | * contains the pid of the auditd process and audit_nlk_portid contains |
91 | * the pid to use to send netlink messages to that process. | 92 | * the portid to use to send netlink messages to that process. |
92 | */ | 93 | */ |
93 | int audit_pid; | 94 | int audit_pid; |
94 | static int audit_nlk_pid; | 95 | static int audit_nlk_portid; |
95 | 96 | ||
96 | /* If audit_rate_limit is non-zero, limit the rate of sending audit records | 97 | /* If audit_rate_limit is non-zero, limit the rate of sending audit records |
97 | * to that number per second. This prevents DoS attacks, but results in | 98 | * to that number per second. This prevents DoS attacks, but results in |
@@ -104,7 +105,7 @@ static int audit_backlog_wait_time = 60 * HZ; | |||
104 | static int audit_backlog_wait_overflow = 0; | 105 | static int audit_backlog_wait_overflow = 0; |
105 | 106 | ||
106 | /* The identity of the user shutting down the audit system. */ | 107 | /* The identity of the user shutting down the audit system. */ |
107 | uid_t audit_sig_uid = -1; | 108 | kuid_t audit_sig_uid = INVALID_UID; |
108 | pid_t audit_sig_pid = -1; | 109 | pid_t audit_sig_pid = -1; |
109 | u32 audit_sig_sid = 0; | 110 | u32 audit_sig_sid = 0; |
110 | 111 | ||
@@ -264,7 +265,7 @@ void audit_log_lost(const char *message) | |||
264 | } | 265 | } |
265 | 266 | ||
266 | static int audit_log_config_change(char *function_name, int new, int old, | 267 | static int audit_log_config_change(char *function_name, int new, int old, |
267 | uid_t loginuid, u32 sessionid, u32 sid, | 268 | kuid_t loginuid, u32 sessionid, u32 sid, |
268 | int allow_changes) | 269 | int allow_changes) |
269 | { | 270 | { |
270 | struct audit_buffer *ab; | 271 | struct audit_buffer *ab; |
@@ -272,7 +273,7 @@ static int audit_log_config_change(char *function_name, int new, int old, | |||
272 | 273 | ||
273 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 274 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); |
274 | audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, | 275 | audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, |
275 | old, loginuid, sessionid); | 276 | old, from_kuid(&init_user_ns, loginuid), sessionid); |
276 | if (sid) { | 277 | if (sid) { |
277 | char *ctx = NULL; | 278 | char *ctx = NULL; |
278 | u32 len; | 279 | u32 len; |
@@ -292,7 +293,7 @@ static int audit_log_config_change(char *function_name, int new, int old, | |||
292 | } | 293 | } |
293 | 294 | ||
294 | static int audit_do_config_change(char *function_name, int *to_change, | 295 | static int audit_do_config_change(char *function_name, int *to_change, |
295 | int new, uid_t loginuid, u32 sessionid, | 296 | int new, kuid_t loginuid, u32 sessionid, |
296 | u32 sid) | 297 | u32 sid) |
297 | { | 298 | { |
298 | int allow_changes, rc = 0, old = *to_change; | 299 | int allow_changes, rc = 0, old = *to_change; |
@@ -319,21 +320,21 @@ static int audit_do_config_change(char *function_name, int *to_change, | |||
319 | return rc; | 320 | return rc; |
320 | } | 321 | } |
321 | 322 | ||
322 | static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, | 323 | static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, |
323 | u32 sid) | 324 | u32 sid) |
324 | { | 325 | { |
325 | return audit_do_config_change("audit_rate_limit", &audit_rate_limit, | 326 | return audit_do_config_change("audit_rate_limit", &audit_rate_limit, |
326 | limit, loginuid, sessionid, sid); | 327 | limit, loginuid, sessionid, sid); |
327 | } | 328 | } |
328 | 329 | ||
329 | static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, | 330 | static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, |
330 | u32 sid) | 331 | u32 sid) |
331 | { | 332 | { |
332 | return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, | 333 | return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, |
333 | limit, loginuid, sessionid, sid); | 334 | limit, loginuid, sessionid, sid); |
334 | } | 335 | } |
335 | 336 | ||
336 | static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) | 337 | static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) |
337 | { | 338 | { |
338 | int rc; | 339 | int rc; |
339 | if (state < AUDIT_OFF || state > AUDIT_LOCKED) | 340 | if (state < AUDIT_OFF || state > AUDIT_LOCKED) |
@@ -348,7 +349,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) | |||
348 | return rc; | 349 | return rc; |
349 | } | 350 | } |
350 | 351 | ||
351 | static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) | 352 | static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) |
352 | { | 353 | { |
353 | if (state != AUDIT_FAIL_SILENT | 354 | if (state != AUDIT_FAIL_SILENT |
354 | && state != AUDIT_FAIL_PRINTK | 355 | && state != AUDIT_FAIL_PRINTK |
@@ -401,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb) | |||
401 | int err; | 402 | int err; |
402 | /* take a reference in case we can't send it and we want to hold it */ | 403 | /* take a reference in case we can't send it and we want to hold it */ |
403 | skb_get(skb); | 404 | skb_get(skb); |
404 | err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); | 405 | err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); |
405 | if (err < 0) { | 406 | if (err < 0) { |
406 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ | 407 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ |
407 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | 408 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); |
@@ -467,24 +468,6 @@ static int kauditd_thread(void *dummy) | |||
467 | return 0; | 468 | return 0; |
468 | } | 469 | } |
469 | 470 | ||
470 | static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) | ||
471 | { | ||
472 | struct task_struct *tsk; | ||
473 | int err; | ||
474 | |||
475 | rcu_read_lock(); | ||
476 | tsk = find_task_by_vpid(pid); | ||
477 | if (!tsk) { | ||
478 | rcu_read_unlock(); | ||
479 | return -ESRCH; | ||
480 | } | ||
481 | get_task_struct(tsk); | ||
482 | rcu_read_unlock(); | ||
483 | err = tty_audit_push_task(tsk, loginuid, sessionid); | ||
484 | put_task_struct(tsk); | ||
485 | return err; | ||
486 | } | ||
487 | |||
488 | int audit_send_list(void *_dest) | 471 | int audit_send_list(void *_dest) |
489 | { | 472 | { |
490 | struct audit_netlink_list *dest = _dest; | 473 | struct audit_netlink_list *dest = _dest; |
@@ -588,6 +571,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
588 | { | 571 | { |
589 | int err = 0; | 572 | int err = 0; |
590 | 573 | ||
574 | /* Only support the initial namespaces for now. */ | ||
575 | if ((current_user_ns() != &init_user_ns) || | ||
576 | (task_active_pid_ns(current) != &init_pid_ns)) | ||
577 | return -EPERM; | ||
578 | |||
591 | switch (msg_type) { | 579 | switch (msg_type) { |
592 | case AUDIT_GET: | 580 | case AUDIT_GET: |
593 | case AUDIT_LIST: | 581 | case AUDIT_LIST: |
@@ -619,8 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
619 | } | 607 | } |
620 | 608 | ||
621 | static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | 609 | static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, |
622 | u32 pid, u32 uid, uid_t auid, u32 ses, | 610 | kuid_t auid, u32 ses, u32 sid) |
623 | u32 sid) | ||
624 | { | 611 | { |
625 | int rc = 0; | 612 | int rc = 0; |
626 | char *ctx = NULL; | 613 | char *ctx = NULL; |
@@ -633,7 +620,9 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | |||
633 | 620 | ||
634 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 621 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
635 | audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", | 622 | audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", |
636 | pid, uid, auid, ses); | 623 | task_tgid_vnr(current), |
624 | from_kuid(&init_user_ns, current_uid()), | ||
625 | from_kuid(&init_user_ns, auid), ses); | ||
637 | if (sid) { | 626 | if (sid) { |
638 | rc = security_secid_to_secctx(sid, &ctx, &len); | 627 | rc = security_secid_to_secctx(sid, &ctx, &len); |
639 | if (rc) | 628 | if (rc) |
@@ -649,13 +638,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | |||
649 | 638 | ||
650 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | 639 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) |
651 | { | 640 | { |
652 | u32 uid, pid, seq, sid; | 641 | u32 seq, sid; |
653 | void *data; | 642 | void *data; |
654 | struct audit_status *status_get, status_set; | 643 | struct audit_status *status_get, status_set; |
655 | int err; | 644 | int err; |
656 | struct audit_buffer *ab; | 645 | struct audit_buffer *ab; |
657 | u16 msg_type = nlh->nlmsg_type; | 646 | u16 msg_type = nlh->nlmsg_type; |
658 | uid_t loginuid; /* loginuid of sender */ | 647 | kuid_t loginuid; /* loginuid of sender */ |
659 | u32 sessionid; | 648 | u32 sessionid; |
660 | struct audit_sig_info *sig_data; | 649 | struct audit_sig_info *sig_data; |
661 | char *ctx = NULL; | 650 | char *ctx = NULL; |
@@ -675,8 +664,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
675 | return err; | 664 | return err; |
676 | } | 665 | } |
677 | 666 | ||
678 | pid = NETLINK_CREDS(skb)->pid; | ||
679 | uid = NETLINK_CREDS(skb)->uid; | ||
680 | loginuid = audit_get_loginuid(current); | 667 | loginuid = audit_get_loginuid(current); |
681 | sessionid = audit_get_sessionid(current); | 668 | sessionid = audit_get_sessionid(current); |
682 | security_task_getsecid(current, &sid); | 669 | security_task_getsecid(current, &sid); |
@@ -692,7 +679,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
692 | status_set.backlog_limit = audit_backlog_limit; | 679 | status_set.backlog_limit = audit_backlog_limit; |
693 | status_set.lost = atomic_read(&audit_lost); | 680 | status_set.lost = atomic_read(&audit_lost); |
694 | status_set.backlog = skb_queue_len(&audit_skb_queue); | 681 | status_set.backlog = skb_queue_len(&audit_skb_queue); |
695 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, | 682 | audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, |
696 | &status_set, sizeof(status_set)); | 683 | &status_set, sizeof(status_set)); |
697 | break; | 684 | break; |
698 | case AUDIT_SET: | 685 | case AUDIT_SET: |
@@ -720,7 +707,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
720 | sessionid, sid, 1); | 707 | sessionid, sid, 1); |
721 | 708 | ||
722 | audit_pid = new_pid; | 709 | audit_pid = new_pid; |
723 | audit_nlk_pid = NETLINK_CB(skb).pid; | 710 | audit_nlk_portid = NETLINK_CB(skb).portid; |
724 | } | 711 | } |
725 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { | 712 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { |
726 | err = audit_set_rate_limit(status_get->rate_limit, | 713 | err = audit_set_rate_limit(status_get->rate_limit, |
@@ -738,16 +725,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
738 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) | 725 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) |
739 | return 0; | 726 | return 0; |
740 | 727 | ||
741 | err = audit_filter_user(&NETLINK_CB(skb)); | 728 | err = audit_filter_user(); |
742 | if (err == 1) { | 729 | if (err == 1) { |
743 | err = 0; | 730 | err = 0; |
744 | if (msg_type == AUDIT_USER_TTY) { | 731 | if (msg_type == AUDIT_USER_TTY) { |
745 | err = audit_prepare_user_tty(pid, loginuid, | 732 | err = tty_audit_push_task(current, loginuid, |
746 | sessionid); | 733 | sessionid); |
747 | if (err) | 734 | if (err) |
748 | break; | 735 | break; |
749 | } | 736 | } |
750 | audit_log_common_recv_msg(&ab, msg_type, pid, uid, | 737 | audit_log_common_recv_msg(&ab, msg_type, |
751 | loginuid, sessionid, sid); | 738 | loginuid, sessionid, sid); |
752 | 739 | ||
753 | if (msg_type != AUDIT_USER_TTY) | 740 | if (msg_type != AUDIT_USER_TTY) |
@@ -763,7 +750,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
763 | size--; | 750 | size--; |
764 | audit_log_n_untrustedstring(ab, data, size); | 751 | audit_log_n_untrustedstring(ab, data, size); |
765 | } | 752 | } |
766 | audit_set_pid(ab, pid); | 753 | audit_set_pid(ab, NETLINK_CB(skb).portid); |
767 | audit_log_end(ab); | 754 | audit_log_end(ab); |
768 | } | 755 | } |
769 | break; | 756 | break; |
@@ -772,8 +759,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
772 | if (nlmsg_len(nlh) < sizeof(struct audit_rule)) | 759 | if (nlmsg_len(nlh) < sizeof(struct audit_rule)) |
773 | return -EINVAL; | 760 | return -EINVAL; |
774 | if (audit_enabled == AUDIT_LOCKED) { | 761 | if (audit_enabled == AUDIT_LOCKED) { |
775 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 762 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, |
776 | uid, loginuid, sessionid, sid); | 763 | loginuid, sessionid, sid); |
777 | 764 | ||
778 | audit_log_format(ab, " audit_enabled=%d res=0", | 765 | audit_log_format(ab, " audit_enabled=%d res=0", |
779 | audit_enabled); | 766 | audit_enabled); |
@@ -782,8 +769,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
782 | } | 769 | } |
783 | /* fallthrough */ | 770 | /* fallthrough */ |
784 | case AUDIT_LIST: | 771 | case AUDIT_LIST: |
785 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, | 772 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, |
786 | uid, seq, data, nlmsg_len(nlh), | 773 | seq, data, nlmsg_len(nlh), |
787 | loginuid, sessionid, sid); | 774 | loginuid, sessionid, sid); |
788 | break; | 775 | break; |
789 | case AUDIT_ADD_RULE: | 776 | case AUDIT_ADD_RULE: |
@@ -791,8 +778,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
791 | if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) | 778 | if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) |
792 | return -EINVAL; | 779 | return -EINVAL; |
793 | if (audit_enabled == AUDIT_LOCKED) { | 780 | if (audit_enabled == AUDIT_LOCKED) { |
794 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 781 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, |
795 | uid, loginuid, sessionid, sid); | 782 | loginuid, sessionid, sid); |
796 | 783 | ||
797 | audit_log_format(ab, " audit_enabled=%d res=0", | 784 | audit_log_format(ab, " audit_enabled=%d res=0", |
798 | audit_enabled); | 785 | audit_enabled); |
@@ -801,15 +788,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
801 | } | 788 | } |
802 | /* fallthrough */ | 789 | /* fallthrough */ |
803 | case AUDIT_LIST_RULES: | 790 | case AUDIT_LIST_RULES: |
804 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, | 791 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, |
805 | uid, seq, data, nlmsg_len(nlh), | 792 | seq, data, nlmsg_len(nlh), |
806 | loginuid, sessionid, sid); | 793 | loginuid, sessionid, sid); |
807 | break; | 794 | break; |
808 | case AUDIT_TRIM: | 795 | case AUDIT_TRIM: |
809 | audit_trim_trees(); | 796 | audit_trim_trees(); |
810 | 797 | ||
811 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 798 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, |
812 | uid, loginuid, sessionid, sid); | 799 | loginuid, sessionid, sid); |
813 | 800 | ||
814 | audit_log_format(ab, " op=trim res=1"); | 801 | audit_log_format(ab, " op=trim res=1"); |
815 | audit_log_end(ab); | 802 | audit_log_end(ab); |
@@ -840,8 +827,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
840 | /* OK, here comes... */ | 827 | /* OK, here comes... */ |
841 | err = audit_tag_tree(old, new); | 828 | err = audit_tag_tree(old, new); |
842 | 829 | ||
843 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 830 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, |
844 | uid, loginuid, sessionid, sid); | 831 | loginuid, sessionid, sid); |
845 | 832 | ||
846 | audit_log_format(ab, " op=make_equiv old="); | 833 | audit_log_format(ab, " op=make_equiv old="); |
847 | audit_log_untrustedstring(ab, old); | 834 | audit_log_untrustedstring(ab, old); |
@@ -866,53 +853,41 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
866 | security_release_secctx(ctx, len); | 853 | security_release_secctx(ctx, len); |
867 | return -ENOMEM; | 854 | return -ENOMEM; |
868 | } | 855 | } |
869 | sig_data->uid = audit_sig_uid; | 856 | sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); |
870 | sig_data->pid = audit_sig_pid; | 857 | sig_data->pid = audit_sig_pid; |
871 | if (audit_sig_sid) { | 858 | if (audit_sig_sid) { |
872 | memcpy(sig_data->ctx, ctx, len); | 859 | memcpy(sig_data->ctx, ctx, len); |
873 | security_release_secctx(ctx, len); | 860 | security_release_secctx(ctx, len); |
874 | } | 861 | } |
875 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, | 862 | audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, |
876 | 0, 0, sig_data, sizeof(*sig_data) + len); | 863 | 0, 0, sig_data, sizeof(*sig_data) + len); |
877 | kfree(sig_data); | 864 | kfree(sig_data); |
878 | break; | 865 | break; |
879 | case AUDIT_TTY_GET: { | 866 | case AUDIT_TTY_GET: { |
880 | struct audit_tty_status s; | 867 | struct audit_tty_status s; |
881 | struct task_struct *tsk; | 868 | struct task_struct *tsk = current; |
882 | unsigned long flags; | 869 | |
883 | 870 | spin_lock_irq(&tsk->sighand->siglock); | |
884 | rcu_read_lock(); | 871 | s.enabled = tsk->signal->audit_tty != 0; |
885 | tsk = find_task_by_vpid(pid); | 872 | spin_unlock_irq(&tsk->sighand->siglock); |
886 | if (tsk && lock_task_sighand(tsk, &flags)) { | 873 | |
887 | s.enabled = tsk->signal->audit_tty != 0; | 874 | audit_send_reply(NETLINK_CB(skb).portid, seq, |
888 | unlock_task_sighand(tsk, &flags); | 875 | AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); |
889 | } else | ||
890 | err = -ESRCH; | ||
891 | rcu_read_unlock(); | ||
892 | |||
893 | if (!err) | ||
894 | audit_send_reply(NETLINK_CB(skb).pid, seq, | ||
895 | AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); | ||
896 | break; | 876 | break; |
897 | } | 877 | } |
898 | case AUDIT_TTY_SET: { | 878 | case AUDIT_TTY_SET: { |
899 | struct audit_tty_status *s; | 879 | struct audit_tty_status *s; |
900 | struct task_struct *tsk; | 880 | struct task_struct *tsk = current; |
901 | unsigned long flags; | ||
902 | 881 | ||
903 | if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) | 882 | if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) |
904 | return -EINVAL; | 883 | return -EINVAL; |
905 | s = data; | 884 | s = data; |
906 | if (s->enabled != 0 && s->enabled != 1) | 885 | if (s->enabled != 0 && s->enabled != 1) |
907 | return -EINVAL; | 886 | return -EINVAL; |
908 | rcu_read_lock(); | 887 | |
909 | tsk = find_task_by_vpid(pid); | 888 | spin_lock_irq(&tsk->sighand->siglock); |
910 | if (tsk && lock_task_sighand(tsk, &flags)) { | 889 | tsk->signal->audit_tty = s->enabled != 0; |
911 | tsk->signal->audit_tty = s->enabled != 0; | 890 | spin_unlock_irq(&tsk->sighand->siglock); |
912 | unlock_task_sighand(tsk, &flags); | ||
913 | } else | ||
914 | err = -ESRCH; | ||
915 | rcu_read_unlock(); | ||
916 | break; | 891 | break; |
917 | } | 892 | } |
918 | default: | 893 | default: |
@@ -971,8 +946,7 @@ static int __init audit_init(void) | |||
971 | 946 | ||
972 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | 947 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", |
973 | audit_default ? "enabled" : "disabled"); | 948 | audit_default ? "enabled" : "disabled"); |
974 | audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, | 949 | audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); |
975 | THIS_MODULE, &cfg); | ||
976 | if (!audit_sock) | 950 | if (!audit_sock) |
977 | audit_panic("cannot initialize netlink socket"); | 951 | audit_panic("cannot initialize netlink socket"); |
978 | else | 952 | else |
@@ -1466,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link) | |||
1466 | 1440 | ||
1467 | ab = audit_log_start(current->audit_context, GFP_KERNEL, | 1441 | ab = audit_log_start(current->audit_context, GFP_KERNEL, |
1468 | AUDIT_ANOM_LINK); | 1442 | AUDIT_ANOM_LINK); |
1443 | if (!ab) | ||
1444 | return; | ||
1469 | audit_log_format(ab, "op=%s action=denied", operation); | 1445 | audit_log_format(ab, "op=%s action=denied", operation); |
1470 | audit_log_format(ab, " pid=%d comm=", current->pid); | 1446 | audit_log_format(ab, " pid=%d comm=", current->pid); |
1471 | audit_log_untrustedstring(ab, current->comm); | 1447 | audit_log_untrustedstring(ab, current->comm); |
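One detail worth noting from the last audit.c hunk: audit_log_start() can return NULL (for instance when auditing is not initialized or the record has to be dropped), so callers need to check before formatting into the buffer. audit_log_link_denied() gains exactly that check, following the usual call sequence (sketch only):

	struct audit_buffer *ab;

	ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK);
	if (!ab)
		return;		/* nothing to log into */
	audit_log_format(ab, "op=%s action=denied", operation);
	audit_log_end(ab);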
diff --git a/kernel/audit.h b/kernel/audit.h
index 816766803371..d51cba868e1b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -74,10 +74,15 @@ static inline int audit_hash_ino(u32 ino) | |||
74 | return (ino & (AUDIT_INODE_BUCKETS-1)); | 74 | return (ino & (AUDIT_INODE_BUCKETS-1)); |
75 | } | 75 | } |
76 | 76 | ||
77 | /* Indicates that audit should log the full pathname. */ | ||
78 | #define AUDIT_NAME_FULL -1 | ||
79 | |||
77 | extern int audit_match_class(int class, unsigned syscall); | 80 | extern int audit_match_class(int class, unsigned syscall); |
78 | extern int audit_comparator(const u32 left, const u32 op, const u32 right); | 81 | extern int audit_comparator(const u32 left, const u32 op, const u32 right); |
79 | extern int audit_compare_dname_path(const char *dname, const char *path, | 82 | extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); |
80 | int *dirlen); | 83 | extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); |
84 | extern int parent_len(const char *path); | ||
85 | extern int audit_compare_dname_path(const char *dname, const char *path, int plen); | ||
81 | extern struct sk_buff * audit_make_reply(int pid, int seq, int type, | 86 | extern struct sk_buff * audit_make_reply(int pid, int seq, int type, |
82 | int done, int multi, | 87 | int done, int multi, |
83 | const void *payload, int size); | 88 | const void *payload, int size); |
@@ -144,7 +149,7 @@ extern void audit_kill_trees(struct list_head *); | |||
144 | extern char *audit_unpack_string(void **, size_t *, size_t); | 149 | extern char *audit_unpack_string(void **, size_t *, size_t); |
145 | 150 | ||
146 | extern pid_t audit_sig_pid; | 151 | extern pid_t audit_sig_pid; |
147 | extern uid_t audit_sig_uid; | 152 | extern kuid_t audit_sig_uid; |
148 | extern u32 audit_sig_sid; | 153 | extern u32 audit_sig_sid; |
149 | 154 | ||
150 | #ifdef CONFIG_AUDITSYSCALL | 155 | #ifdef CONFIG_AUDITSYSCALL |
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index a66affc1c12c..4a599f699adc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -241,7 +241,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc | |||
241 | struct audit_buffer *ab; | 241 | struct audit_buffer *ab; |
242 | ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); | 242 | ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); |
243 | audit_log_format(ab, "auid=%u ses=%u op=", | 243 | audit_log_format(ab, "auid=%u ses=%u op=", |
244 | audit_get_loginuid(current), | 244 | from_kuid(&init_user_ns, audit_get_loginuid(current)), |
245 | audit_get_sessionid(current)); | 245 | audit_get_sessionid(current)); |
246 | audit_log_string(ab, op); | 246 | audit_log_string(ab, op); |
247 | audit_log_format(ab, " path="); | 247 | audit_log_format(ab, " path="); |
@@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent, | |||
265 | /* Run all of the watches on this parent looking for the one that | 265 | /* Run all of the watches on this parent looking for the one that |
266 | * matches the given dname */ | 266 | * matches the given dname */ |
267 | list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { | 267 | list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { |
268 | if (audit_compare_dname_path(dname, owatch->path, NULL)) | 268 | if (audit_compare_dname_path(dname, owatch->path, |
269 | AUDIT_NAME_FULL)) | ||
269 | continue; | 270 | continue; |
270 | 271 | ||
271 | /* If the update involves invalidating rules, do the inode-based | 272 | /* If the update involves invalidating rules, do the inode-based |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a6c3f1abd206..7f19f23d38a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -342,6 +342,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
342 | 342 | ||
343 | f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); | 343 | f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); |
344 | f->val = rule->values[i]; | 344 | f->val = rule->values[i]; |
345 | f->uid = INVALID_UID; | ||
346 | f->gid = INVALID_GID; | ||
345 | 347 | ||
346 | err = -EINVAL; | 348 | err = -EINVAL; |
347 | if (f->op == Audit_bad) | 349 | if (f->op == Audit_bad) |
@@ -350,16 +352,32 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
350 | switch(f->type) { | 352 | switch(f->type) { |
351 | default: | 353 | default: |
352 | goto exit_free; | 354 | goto exit_free; |
353 | case AUDIT_PID: | ||
354 | case AUDIT_UID: | 355 | case AUDIT_UID: |
355 | case AUDIT_EUID: | 356 | case AUDIT_EUID: |
356 | case AUDIT_SUID: | 357 | case AUDIT_SUID: |
357 | case AUDIT_FSUID: | 358 | case AUDIT_FSUID: |
359 | case AUDIT_LOGINUID: | ||
360 | /* bit ops not implemented for uid comparisons */ | ||
361 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
362 | goto exit_free; | ||
363 | |||
364 | f->uid = make_kuid(current_user_ns(), f->val); | ||
365 | if (!uid_valid(f->uid)) | ||
366 | goto exit_free; | ||
367 | break; | ||
358 | case AUDIT_GID: | 368 | case AUDIT_GID: |
359 | case AUDIT_EGID: | 369 | case AUDIT_EGID: |
360 | case AUDIT_SGID: | 370 | case AUDIT_SGID: |
361 | case AUDIT_FSGID: | 371 | case AUDIT_FSGID: |
362 | case AUDIT_LOGINUID: | 372 | /* bit ops not implemented for gid comparisons */ |
373 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
374 | goto exit_free; | ||
375 | |||
376 | f->gid = make_kgid(current_user_ns(), f->val); | ||
377 | if (!gid_valid(f->gid)) | ||
378 | goto exit_free; | ||
379 | break; | ||
380 | case AUDIT_PID: | ||
363 | case AUDIT_PERS: | 381 | case AUDIT_PERS: |
364 | case AUDIT_MSGTYPE: | 382 | case AUDIT_MSGTYPE: |
365 | case AUDIT_PPID: | 383 | case AUDIT_PPID: |
@@ -437,19 +455,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
437 | 455 | ||
438 | f->type = data->fields[i]; | 456 | f->type = data->fields[i]; |
439 | f->val = data->values[i]; | 457 | f->val = data->values[i]; |
458 | f->uid = INVALID_UID; | ||
459 | f->gid = INVALID_GID; | ||
440 | f->lsm_str = NULL; | 460 | f->lsm_str = NULL; |
441 | f->lsm_rule = NULL; | 461 | f->lsm_rule = NULL; |
442 | switch(f->type) { | 462 | switch(f->type) { |
443 | case AUDIT_PID: | ||
444 | case AUDIT_UID: | 463 | case AUDIT_UID: |
445 | case AUDIT_EUID: | 464 | case AUDIT_EUID: |
446 | case AUDIT_SUID: | 465 | case AUDIT_SUID: |
447 | case AUDIT_FSUID: | 466 | case AUDIT_FSUID: |
467 | case AUDIT_LOGINUID: | ||
468 | case AUDIT_OBJ_UID: | ||
469 | /* bit ops not implemented for uid comparisons */ | ||
470 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
471 | goto exit_free; | ||
472 | |||
473 | f->uid = make_kuid(current_user_ns(), f->val); | ||
474 | if (!uid_valid(f->uid)) | ||
475 | goto exit_free; | ||
476 | break; | ||
448 | case AUDIT_GID: | 477 | case AUDIT_GID: |
449 | case AUDIT_EGID: | 478 | case AUDIT_EGID: |
450 | case AUDIT_SGID: | 479 | case AUDIT_SGID: |
451 | case AUDIT_FSGID: | 480 | case AUDIT_FSGID: |
452 | case AUDIT_LOGINUID: | 481 | case AUDIT_OBJ_GID: |
482 | /* bit ops not implemented for gid comparisons */ | ||
483 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
484 | goto exit_free; | ||
485 | |||
486 | f->gid = make_kgid(current_user_ns(), f->val); | ||
487 | if (!gid_valid(f->gid)) | ||
488 | goto exit_free; | ||
489 | break; | ||
490 | case AUDIT_PID: | ||
453 | case AUDIT_PERS: | 491 | case AUDIT_PERS: |
454 | case AUDIT_MSGTYPE: | 492 | case AUDIT_MSGTYPE: |
455 | case AUDIT_PPID: | 493 | case AUDIT_PPID: |
@@ -461,8 +499,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
461 | case AUDIT_ARG1: | 499 | case AUDIT_ARG1: |
462 | case AUDIT_ARG2: | 500 | case AUDIT_ARG2: |
463 | case AUDIT_ARG3: | 501 | case AUDIT_ARG3: |
464 | case AUDIT_OBJ_UID: | ||
465 | case AUDIT_OBJ_GID: | ||
466 | break; | 502 | break; |
467 | case AUDIT_ARCH: | 503 | case AUDIT_ARCH: |
468 | entry->rule.arch_f = f; | 504 | entry->rule.arch_f = f; |
@@ -707,6 +743,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
707 | if (strcmp(a->filterkey, b->filterkey)) | 743 | if (strcmp(a->filterkey, b->filterkey)) |
708 | return 1; | 744 | return 1; |
709 | break; | 745 | break; |
746 | case AUDIT_UID: | ||
747 | case AUDIT_EUID: | ||
748 | case AUDIT_SUID: | ||
749 | case AUDIT_FSUID: | ||
750 | case AUDIT_LOGINUID: | ||
751 | case AUDIT_OBJ_UID: | ||
752 | if (!uid_eq(a->fields[i].uid, b->fields[i].uid)) | ||
753 | return 1; | ||
754 | break; | ||
755 | case AUDIT_GID: | ||
756 | case AUDIT_EGID: | ||
757 | case AUDIT_SGID: | ||
758 | case AUDIT_FSGID: | ||
759 | case AUDIT_OBJ_GID: | ||
760 | if (!gid_eq(a->fields[i].gid, b->fields[i].gid)) | ||
761 | return 1; | ||
762 | break; | ||
710 | default: | 763 | default: |
711 | if (a->fields[i].val != b->fields[i].val) | 764 | if (a->fields[i].val != b->fields[i].val) |
712 | return 1; | 765 | return 1; |
@@ -1056,7 +1109,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) | |||
1056 | } | 1109 | } |
1057 | 1110 | ||
1058 | /* Log rule additions and removals */ | 1111 | /* Log rule additions and removals */ |
1059 | static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, | 1112 | static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, |
1060 | char *action, struct audit_krule *rule, | 1113 | char *action, struct audit_krule *rule, |
1061 | int res) | 1114 | int res) |
1062 | { | 1115 | { |
@@ -1068,7 +1121,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, | |||
1068 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 1121 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); |
1069 | if (!ab) | 1122 | if (!ab) |
1070 | return; | 1123 | return; |
1071 | audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); | 1124 | audit_log_format(ab, "auid=%u ses=%u", |
1125 | from_kuid(&init_user_ns, loginuid), sessionid); | ||
1072 | if (sid) { | 1126 | if (sid) { |
1073 | char *ctx = NULL; | 1127 | char *ctx = NULL; |
1074 | u32 len; | 1128 | u32 len; |
@@ -1098,8 +1152,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, | |||
1098 | * @sessionid: sessionid for netlink audit message | 1152 | * @sessionid: sessionid for netlink audit message |
1099 | * @sid: SE Linux Security ID of sender | 1153 | * @sid: SE Linux Security ID of sender |
1100 | */ | 1154 | */ |
1101 | int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | 1155 | int audit_receive_filter(int type, int pid, int seq, void *data, |
1102 | size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) | 1156 | size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) |
1103 | { | 1157 | { |
1104 | struct task_struct *tsk; | 1158 | struct task_struct *tsk; |
1105 | struct audit_netlink_list *dest; | 1159 | struct audit_netlink_list *dest; |
@@ -1198,46 +1252,110 @@ int audit_comparator(u32 left, u32 op, u32 right) | |||
1198 | } | 1252 | } |
1199 | } | 1253 | } |
1200 | 1254 | ||
1201 | /* Compare given dentry name with last component in given path, | 1255 | int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) |
1202 | * return of 0 indicates a match. */ | ||
1203 | int audit_compare_dname_path(const char *dname, const char *path, | ||
1204 | int *dirlen) | ||
1205 | { | 1256 | { |
1206 | int dlen, plen; | 1257 | switch (op) { |
1207 | const char *p; | 1258 | case Audit_equal: |
1259 | return uid_eq(left, right); | ||
1260 | case Audit_not_equal: | ||
1261 | return !uid_eq(left, right); | ||
1262 | case Audit_lt: | ||
1263 | return uid_lt(left, right); | ||
1264 | case Audit_le: | ||
1265 | return uid_lte(left, right); | ||
1266 | case Audit_gt: | ||
1267 | return uid_gt(left, right); | ||
1268 | case Audit_ge: | ||
1269 | return uid_gte(left, right); | ||
1270 | case Audit_bitmask: | ||
1271 | case Audit_bittest: | ||
1272 | default: | ||
1273 | BUG(); | ||
1274 | return 0; | ||
1275 | } | ||
1276 | } | ||
1208 | 1277 | ||
1209 | if (!dname || !path) | 1278 | int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) |
1210 | return 1; | 1279 | { |
1280 | switch (op) { | ||
1281 | case Audit_equal: | ||
1282 | return gid_eq(left, right); | ||
1283 | case Audit_not_equal: | ||
1284 | return !gid_eq(left, right); | ||
1285 | case Audit_lt: | ||
1286 | return gid_lt(left, right); | ||
1287 | case Audit_le: | ||
1288 | return gid_lte(left, right); | ||
1289 | case Audit_gt: | ||
1290 | return gid_gt(left, right); | ||
1291 | case Audit_ge: | ||
1292 | return gid_gte(left, right); | ||
1293 | case Audit_bitmask: | ||
1294 | case Audit_bittest: | ||
1295 | default: | ||
1296 | BUG(); | ||
1297 | return 0; | ||
1298 | } | ||
1299 | } | ||
1300 | |||
1301 | /** | ||
1302 | * parent_len - find the length of the parent portion of a pathname | ||
1303 | * @path: pathname of which to determine length | ||
1304 | */ | ||
1305 | int parent_len(const char *path) | ||
1306 | { | ||
1307 | int plen; | ||
1308 | const char *p; | ||
1211 | 1309 | ||
1212 | dlen = strlen(dname); | ||
1213 | plen = strlen(path); | 1310 | plen = strlen(path); |
1214 | if (plen < dlen) | 1311 | |
1215 | return 1; | 1312 | if (plen == 0) |
1313 | return plen; | ||
1216 | 1314 | ||
1217 | /* disregard trailing slashes */ | 1315 | /* disregard trailing slashes */ |
1218 | p = path + plen - 1; | 1316 | p = path + plen - 1; |
1219 | while ((*p == '/') && (p > path)) | 1317 | while ((*p == '/') && (p > path)) |
1220 | p--; | 1318 | p--; |
1221 | 1319 | ||
1222 | /* find last path component */ | 1320 | /* walk backward until we find the next slash or hit beginning */ |
1223 | p = p - dlen + 1; | 1321 | while ((*p != '/') && (p > path)) |
1224 | if (p < path) | 1322 | p--; |
1323 | |||
1324 | /* did we find a slash? Then increment to include it in path */ | ||
1325 | if (*p == '/') | ||
1326 | p++; | ||
1327 | |||
1328 | return p - path; | ||
1329 | } | ||
1330 | |||
1331 | /** | ||
1332 | * audit_compare_dname_path - compare given dentry name with last component in | ||
1333 | * given path. Return of 0 indicates a match. | ||
1334 | * @dname: dentry name that we're comparing | ||
1335 | * @path: full pathname that we're comparing | ||
1336 | * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL | ||
1337 | * here indicates that we must compute this value. | ||
1338 | */ | ||
1339 | int audit_compare_dname_path(const char *dname, const char *path, int parentlen) | ||
1340 | { | ||
1341 | int dlen, pathlen; | ||
1342 | const char *p; | ||
1343 | |||
1344 | dlen = strlen(dname); | ||
1345 | pathlen = strlen(path); | ||
1346 | if (pathlen < dlen) | ||
1225 | return 1; | 1347 | return 1; |
1226 | else if (p > path) { | ||
1227 | if (*--p != '/') | ||
1228 | return 1; | ||
1229 | else | ||
1230 | p++; | ||
1231 | } | ||
1232 | 1348 | ||
1233 | /* return length of path's directory component */ | 1349 | parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen; |
1234 | if (dirlen) | 1350 | if (pathlen - parentlen != dlen) |
1235 | *dirlen = p - path; | 1351 | return 1; |
1352 | |||
1353 | p = path + parentlen; | ||
1354 | |||
1236 | return strncmp(p, dname, dlen); | 1355 | return strncmp(p, dname, dlen); |
1237 | } | 1356 | } |
1238 | 1357 | ||
1239 | static int audit_filter_user_rules(struct netlink_skb_parms *cb, | 1358 | static int audit_filter_user_rules(struct audit_krule *rule, |
1240 | struct audit_krule *rule, | ||
1241 | enum audit_state *state) | 1359 | enum audit_state *state) |
1242 | { | 1360 | { |
1243 | int i; | 1361 | int i; |
@@ -1249,17 +1367,17 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, | |||
1249 | 1367 | ||
1250 | switch (f->type) { | 1368 | switch (f->type) { |
1251 | case AUDIT_PID: | 1369 | case AUDIT_PID: |
1252 | result = audit_comparator(cb->creds.pid, f->op, f->val); | 1370 | result = audit_comparator(task_pid_vnr(current), f->op, f->val); |
1253 | break; | 1371 | break; |
1254 | case AUDIT_UID: | 1372 | case AUDIT_UID: |
1255 | result = audit_comparator(cb->creds.uid, f->op, f->val); | 1373 | result = audit_uid_comparator(current_uid(), f->op, f->uid); |
1256 | break; | 1374 | break; |
1257 | case AUDIT_GID: | 1375 | case AUDIT_GID: |
1258 | result = audit_comparator(cb->creds.gid, f->op, f->val); | 1376 | result = audit_gid_comparator(current_gid(), f->op, f->gid); |
1259 | break; | 1377 | break; |
1260 | case AUDIT_LOGINUID: | 1378 | case AUDIT_LOGINUID: |
1261 | result = audit_comparator(audit_get_loginuid(current), | 1379 | result = audit_uid_comparator(audit_get_loginuid(current), |
1262 | f->op, f->val); | 1380 | f->op, f->uid); |
1263 | break; | 1381 | break; |
1264 | case AUDIT_SUBJ_USER: | 1382 | case AUDIT_SUBJ_USER: |
1265 | case AUDIT_SUBJ_ROLE: | 1383 | case AUDIT_SUBJ_ROLE: |
@@ -1287,7 +1405,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, | |||
1287 | return 1; | 1405 | return 1; |
1288 | } | 1406 | } |
1289 | 1407 | ||
1290 | int audit_filter_user(struct netlink_skb_parms *cb) | 1408 | int audit_filter_user(void) |
1291 | { | 1409 | { |
1292 | enum audit_state state = AUDIT_DISABLED; | 1410 | enum audit_state state = AUDIT_DISABLED; |
1293 | struct audit_entry *e; | 1411 | struct audit_entry *e; |
@@ -1295,7 +1413,7 @@ int audit_filter_user(struct netlink_skb_parms *cb) | |||
1295 | 1413 | ||
1296 | rcu_read_lock(); | 1414 | rcu_read_lock(); |
1297 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { | 1415 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { |
1298 | if (audit_filter_user_rules(cb, &e->rule, &state)) { | 1416 | if (audit_filter_user_rules(&e->rule, &state)) { |
1299 | if (state == AUDIT_DISABLED) | 1417 | if (state == AUDIT_DISABLED) |
1300 | ret = 0; | 1418 | ret = 0; |
1301 | break; | 1419 | break; |
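The rewritten name matching above separates two concerns: parent_len() computes how many leading characters of a pathname make up the parent directory (including the trailing slash), and audit_compare_dname_path() then compares a dentry name against only the remaining final component. A small userspace sketch of the same arithmetic, copied from the hunk with AUDIT_NAME_FULL as defined in audit.h earlier in this diff, shows how the pieces fit:

#include <stdio.h>
#include <string.h>

#define AUDIT_NAME_FULL -1	/* "compute the parent length yourself" */

/* Length of the parent portion of a pathname, trailing slashes ignored. */
static int parent_len(const char *path)
{
	int plen = strlen(path);
	const char *p;

	if (plen == 0)
		return plen;
	p = path + plen - 1;
	while ((*p == '/') && (p > path))	/* disregard trailing slashes */
		p--;
	while ((*p != '/') && (p > path))	/* back up to the previous slash */
		p--;
	if (*p == '/')				/* keep the slash in the parent */
		p++;
	return p - path;
}

/* Returns 0 when dname matches the final component of path. */
static int compare_dname_path(const char *dname, const char *path, int parentlen)
{
	int dlen = strlen(dname), pathlen = strlen(path);

	if (pathlen < dlen)
		return 1;
	if (parentlen == AUDIT_NAME_FULL)
		parentlen = parent_len(path);
	if (pathlen - parentlen != dlen)
		return 1;
	return strncmp(path + parentlen, dname, dlen);
}

int main(void)
{
	/* parent_len("/tmp/dir/file") is 9, i.e. the "/tmp/dir/" prefix. */
	printf("parent_len = %d\n", parent_len("/tmp/dir/file"));
	/* 0: "file" is the final component of the path. */
	printf("match = %d\n", compare_dname_path("file", "/tmp/dir/file", AUDIT_NAME_FULL));
	/* nonzero: "dir" is not the final component. */
	printf("mismatch = %d\n", compare_dname_path("dir", "/tmp/dir/file", AUDIT_NAME_FULL));
	return 0;
}

The kernel version does the same arithmetic; callers that already know the parent length can pass it in, while audit_watch.c above passes AUDIT_NAME_FULL so it is recomputed for each comparison.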
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4b96415527b8..e37e6a12c5e3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -81,9 +81,6 @@ | |||
81 | * a name dynamically and also add those to the list anchored by names_list. */ | 81 | * a name dynamically and also add those to the list anchored by names_list. */ |
82 | #define AUDIT_NAMES 5 | 82 | #define AUDIT_NAMES 5 |
83 | 83 | ||
84 | /* Indicates that audit should log the full pathname. */ | ||
85 | #define AUDIT_NAME_FULL -1 | ||
86 | |||
87 | /* no execve audit message should be longer than this (userspace limits) */ | 84 | /* no execve audit message should be longer than this (userspace limits) */ |
88 | #define MAX_EXECVE_AUDIT_LEN 7500 | 85 | #define MAX_EXECVE_AUDIT_LEN 7500 |
89 | 86 | ||
@@ -106,27 +103,29 @@ struct audit_cap_data { | |||
106 | * we don't let putname() free it (instead we free all of the saved | 103 | * we don't let putname() free it (instead we free all of the saved |
107 | * pointers at syscall exit time). | 104 | * pointers at syscall exit time). |
108 | * | 105 | * |
109 | * Further, in fs/namei.c:path_lookup() we store the inode and device. */ | 106 | * Further, in fs/namei.c:path_lookup() we store the inode and device. |
107 | */ | ||
110 | struct audit_names { | 108 | struct audit_names { |
111 | struct list_head list; /* audit_context->names_list */ | 109 | struct list_head list; /* audit_context->names_list */ |
112 | const char *name; | 110 | struct filename *name; |
113 | unsigned long ino; | 111 | unsigned long ino; |
114 | dev_t dev; | 112 | dev_t dev; |
115 | umode_t mode; | 113 | umode_t mode; |
116 | uid_t uid; | 114 | kuid_t uid; |
117 | gid_t gid; | 115 | kgid_t gid; |
118 | dev_t rdev; | 116 | dev_t rdev; |
119 | u32 osid; | 117 | u32 osid; |
120 | struct audit_cap_data fcap; | 118 | struct audit_cap_data fcap; |
121 | unsigned int fcap_ver; | 119 | unsigned int fcap_ver; |
122 | int name_len; /* number of name's characters to log */ | 120 | int name_len; /* number of name's characters to log */ |
123 | bool name_put; /* call __putname() for this name */ | 121 | unsigned char type; /* record type */ |
122 | bool name_put; /* call __putname() for this name */ | ||
124 | /* | 123 | /* |
125 | * This was an allocated audit_names and not from the array of | 124 | * This was an allocated audit_names and not from the array of |
126 | * names allocated in the task audit context. Thus this name | 125 | * names allocated in the task audit context. Thus this name |
127 | * should be freed on syscall exit | 126 | * should be freed on syscall exit |
128 | */ | 127 | */ |
129 | bool should_free; | 128 | bool should_free; |
130 | }; | 129 | }; |
131 | 130 | ||
132 | struct audit_aux_data { | 131 | struct audit_aux_data { |
@@ -149,8 +148,8 @@ struct audit_aux_data_execve { | |||
149 | struct audit_aux_data_pids { | 148 | struct audit_aux_data_pids { |
150 | struct audit_aux_data d; | 149 | struct audit_aux_data d; |
151 | pid_t target_pid[AUDIT_AUX_PIDS]; | 150 | pid_t target_pid[AUDIT_AUX_PIDS]; |
152 | uid_t target_auid[AUDIT_AUX_PIDS]; | 151 | kuid_t target_auid[AUDIT_AUX_PIDS]; |
153 | uid_t target_uid[AUDIT_AUX_PIDS]; | 152 | kuid_t target_uid[AUDIT_AUX_PIDS]; |
154 | unsigned int target_sessionid[AUDIT_AUX_PIDS]; | 153 | unsigned int target_sessionid[AUDIT_AUX_PIDS]; |
155 | u32 target_sid[AUDIT_AUX_PIDS]; | 154 | u32 target_sid[AUDIT_AUX_PIDS]; |
156 | char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; | 155 | char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; |
@@ -201,21 +200,20 @@ struct audit_context { | |||
201 | struct list_head names_list; /* anchor for struct audit_names->list */ | 200 | struct list_head names_list; /* anchor for struct audit_names->list */ |
202 | char * filterkey; /* key for rule that triggered record */ | 201 | char * filterkey; /* key for rule that triggered record */ |
203 | struct path pwd; | 202 | struct path pwd; |
204 | struct audit_context *previous; /* For nested syscalls */ | ||
205 | struct audit_aux_data *aux; | 203 | struct audit_aux_data *aux; |
206 | struct audit_aux_data *aux_pids; | 204 | struct audit_aux_data *aux_pids; |
207 | struct sockaddr_storage *sockaddr; | 205 | struct sockaddr_storage *sockaddr; |
208 | size_t sockaddr_len; | 206 | size_t sockaddr_len; |
209 | /* Save things to print about task_struct */ | 207 | /* Save things to print about task_struct */ |
210 | pid_t pid, ppid; | 208 | pid_t pid, ppid; |
211 | uid_t uid, euid, suid, fsuid; | 209 | kuid_t uid, euid, suid, fsuid; |
212 | gid_t gid, egid, sgid, fsgid; | 210 | kgid_t gid, egid, sgid, fsgid; |
213 | unsigned long personality; | 211 | unsigned long personality; |
214 | int arch; | 212 | int arch; |
215 | 213 | ||
216 | pid_t target_pid; | 214 | pid_t target_pid; |
217 | uid_t target_auid; | 215 | kuid_t target_auid; |
218 | uid_t target_uid; | 216 | kuid_t target_uid; |
219 | unsigned int target_sessionid; | 217 | unsigned int target_sessionid; |
220 | u32 target_sid; | 218 | u32 target_sid; |
221 | char target_comm[TASK_COMM_LEN]; | 219 | char target_comm[TASK_COMM_LEN]; |
@@ -231,8 +229,8 @@ struct audit_context { | |||
231 | long args[6]; | 229 | long args[6]; |
232 | } socketcall; | 230 | } socketcall; |
233 | struct { | 231 | struct { |
234 | uid_t uid; | 232 | kuid_t uid; |
235 | gid_t gid; | 233 | kgid_t gid; |
236 | umode_t mode; | 234 | umode_t mode; |
237 | u32 osid; | 235 | u32 osid; |
238 | int has_perm; | 236 | int has_perm; |
@@ -464,37 +462,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) | |||
464 | return 0; | 462 | return 0; |
465 | } | 463 | } |
466 | 464 | ||
467 | static int audit_compare_id(uid_t uid1, | 465 | static int audit_compare_uid(kuid_t uid, |
468 | struct audit_names *name, | 466 | struct audit_names *name, |
469 | unsigned long name_offset, | 467 | struct audit_field *f, |
470 | struct audit_field *f, | 468 | struct audit_context *ctx) |
471 | struct audit_context *ctx) | ||
472 | { | 469 | { |
473 | struct audit_names *n; | 470 | struct audit_names *n; |
474 | unsigned long addr; | ||
475 | uid_t uid2; | ||
476 | int rc; | 471 | int rc; |
477 | 472 | ||
478 | BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); | ||
479 | |||
480 | if (name) { | 473 | if (name) { |
481 | addr = (unsigned long)name; | 474 | rc = audit_uid_comparator(uid, f->op, name->uid); |
482 | addr += name_offset; | ||
483 | |||
484 | uid2 = *(uid_t *)addr; | ||
485 | rc = audit_comparator(uid1, f->op, uid2); | ||
486 | if (rc) | 475 | if (rc) |
487 | return rc; | 476 | return rc; |
488 | } | 477 | } |
489 | 478 | ||
490 | if (ctx) { | 479 | if (ctx) { |
491 | list_for_each_entry(n, &ctx->names_list, list) { | 480 | list_for_each_entry(n, &ctx->names_list, list) { |
492 | addr = (unsigned long)n; | 481 | rc = audit_uid_comparator(uid, f->op, n->uid); |
493 | addr += name_offset; | 482 | if (rc) |
494 | 483 | return rc; | |
495 | uid2 = *(uid_t *)addr; | 484 | } |
485 | } | ||
486 | return 0; | ||
487 | } | ||
496 | 488 | ||
497 | rc = audit_comparator(uid1, f->op, uid2); | 489 | static int audit_compare_gid(kgid_t gid, |
490 | struct audit_names *name, | ||
491 | struct audit_field *f, | ||
492 | struct audit_context *ctx) | ||
493 | { | ||
494 | struct audit_names *n; | ||
495 | int rc; | ||
496 | |||
497 | if (name) { | ||
498 | rc = audit_gid_comparator(gid, f->op, name->gid); | ||
499 | if (rc) | ||
500 | return rc; | ||
501 | } | ||
502 | |||
503 | if (ctx) { | ||
504 | list_for_each_entry(n, &ctx->names_list, list) { | ||
505 | rc = audit_gid_comparator(gid, f->op, n->gid); | ||
498 | if (rc) | 506 | if (rc) |
499 | return rc; | 507 | return rc; |
500 | } | 508 | } |
@@ -511,80 +519,62 @@ static int audit_field_compare(struct task_struct *tsk, | |||
511 | switch (f->val) { | 519 | switch (f->val) { |
512 | /* process to file object comparisons */ | 520 | /* process to file object comparisons */ |
513 | case AUDIT_COMPARE_UID_TO_OBJ_UID: | 521 | case AUDIT_COMPARE_UID_TO_OBJ_UID: |
514 | return audit_compare_id(cred->uid, | 522 | return audit_compare_uid(cred->uid, name, f, ctx); |
515 | name, offsetof(struct audit_names, uid), | ||
516 | f, ctx); | ||
517 | case AUDIT_COMPARE_GID_TO_OBJ_GID: | 523 | case AUDIT_COMPARE_GID_TO_OBJ_GID: |
518 | return audit_compare_id(cred->gid, | 524 | return audit_compare_gid(cred->gid, name, f, ctx); |
519 | name, offsetof(struct audit_names, gid), | ||
520 | f, ctx); | ||
521 | case AUDIT_COMPARE_EUID_TO_OBJ_UID: | 525 | case AUDIT_COMPARE_EUID_TO_OBJ_UID: |
522 | return audit_compare_id(cred->euid, | 526 | return audit_compare_uid(cred->euid, name, f, ctx); |
523 | name, offsetof(struct audit_names, uid), | ||
524 | f, ctx); | ||
525 | case AUDIT_COMPARE_EGID_TO_OBJ_GID: | 527 | case AUDIT_COMPARE_EGID_TO_OBJ_GID: |
526 | return audit_compare_id(cred->egid, | 528 | return audit_compare_gid(cred->egid, name, f, ctx); |
527 | name, offsetof(struct audit_names, gid), | ||
528 | f, ctx); | ||
529 | case AUDIT_COMPARE_AUID_TO_OBJ_UID: | 529 | case AUDIT_COMPARE_AUID_TO_OBJ_UID: |
530 | return audit_compare_id(tsk->loginuid, | 530 | return audit_compare_uid(tsk->loginuid, name, f, ctx); |
531 | name, offsetof(struct audit_names, uid), | ||
532 | f, ctx); | ||
533 | case AUDIT_COMPARE_SUID_TO_OBJ_UID: | 531 | case AUDIT_COMPARE_SUID_TO_OBJ_UID: |
534 | return audit_compare_id(cred->suid, | 532 | return audit_compare_uid(cred->suid, name, f, ctx); |
535 | name, offsetof(struct audit_names, uid), | ||
536 | f, ctx); | ||
537 | case AUDIT_COMPARE_SGID_TO_OBJ_GID: | 533 | case AUDIT_COMPARE_SGID_TO_OBJ_GID: |
538 | return audit_compare_id(cred->sgid, | 534 | return audit_compare_gid(cred->sgid, name, f, ctx); |
539 | name, offsetof(struct audit_names, gid), | ||
540 | f, ctx); | ||
541 | case AUDIT_COMPARE_FSUID_TO_OBJ_UID: | 535 | case AUDIT_COMPARE_FSUID_TO_OBJ_UID: |
542 | return audit_compare_id(cred->fsuid, | 536 | return audit_compare_uid(cred->fsuid, name, f, ctx); |
543 | name, offsetof(struct audit_names, uid), | ||
544 | f, ctx); | ||
545 | case AUDIT_COMPARE_FSGID_TO_OBJ_GID: | 537 | case AUDIT_COMPARE_FSGID_TO_OBJ_GID: |
546 | return audit_compare_id(cred->fsgid, | 538 | return audit_compare_gid(cred->fsgid, name, f, ctx); |
547 | name, offsetof(struct audit_names, gid), | ||
548 | f, ctx); | ||
549 | /* uid comparisons */ | 539 | /* uid comparisons */ |
550 | case AUDIT_COMPARE_UID_TO_AUID: | 540 | case AUDIT_COMPARE_UID_TO_AUID: |
551 | return audit_comparator(cred->uid, f->op, tsk->loginuid); | 541 | return audit_uid_comparator(cred->uid, f->op, tsk->loginuid); |
552 | case AUDIT_COMPARE_UID_TO_EUID: | 542 | case AUDIT_COMPARE_UID_TO_EUID: |
553 | return audit_comparator(cred->uid, f->op, cred->euid); | 543 | return audit_uid_comparator(cred->uid, f->op, cred->euid); |
554 | case AUDIT_COMPARE_UID_TO_SUID: | 544 | case AUDIT_COMPARE_UID_TO_SUID: |
555 | return audit_comparator(cred->uid, f->op, cred->suid); | 545 | return audit_uid_comparator(cred->uid, f->op, cred->suid); |
556 | case AUDIT_COMPARE_UID_TO_FSUID: | 546 | case AUDIT_COMPARE_UID_TO_FSUID: |
557 | return audit_comparator(cred->uid, f->op, cred->fsuid); | 547 | return audit_uid_comparator(cred->uid, f->op, cred->fsuid); |
558 | /* auid comparisons */ | 548 | /* auid comparisons */ |
559 | case AUDIT_COMPARE_AUID_TO_EUID: | 549 | case AUDIT_COMPARE_AUID_TO_EUID: |
560 | return audit_comparator(tsk->loginuid, f->op, cred->euid); | 550 | return audit_uid_comparator(tsk->loginuid, f->op, cred->euid); |
561 | case AUDIT_COMPARE_AUID_TO_SUID: | 551 | case AUDIT_COMPARE_AUID_TO_SUID: |
562 | return audit_comparator(tsk->loginuid, f->op, cred->suid); | 552 | return audit_uid_comparator(tsk->loginuid, f->op, cred->suid); |
563 | case AUDIT_COMPARE_AUID_TO_FSUID: | 553 | case AUDIT_COMPARE_AUID_TO_FSUID: |
564 | return audit_comparator(tsk->loginuid, f->op, cred->fsuid); | 554 | return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid); |
565 | /* euid comparisons */ | 555 | /* euid comparisons */ |
566 | case AUDIT_COMPARE_EUID_TO_SUID: | 556 | case AUDIT_COMPARE_EUID_TO_SUID: |
567 | return audit_comparator(cred->euid, f->op, cred->suid); | 557 | return audit_uid_comparator(cred->euid, f->op, cred->suid); |
568 | case AUDIT_COMPARE_EUID_TO_FSUID: | 558 | case AUDIT_COMPARE_EUID_TO_FSUID: |
569 | return audit_comparator(cred->euid, f->op, cred->fsuid); | 559 | return audit_uid_comparator(cred->euid, f->op, cred->fsuid); |
570 | /* suid comparisons */ | 560 | /* suid comparisons */ |
571 | case AUDIT_COMPARE_SUID_TO_FSUID: | 561 | case AUDIT_COMPARE_SUID_TO_FSUID: |
572 | return audit_comparator(cred->suid, f->op, cred->fsuid); | 562 | return audit_uid_comparator(cred->suid, f->op, cred->fsuid); |
573 | /* gid comparisons */ | 563 | /* gid comparisons */ |
574 | case AUDIT_COMPARE_GID_TO_EGID: | 564 | case AUDIT_COMPARE_GID_TO_EGID: |
575 | return audit_comparator(cred->gid, f->op, cred->egid); | 565 | return audit_gid_comparator(cred->gid, f->op, cred->egid); |
576 | case AUDIT_COMPARE_GID_TO_SGID: | 566 | case AUDIT_COMPARE_GID_TO_SGID: |
577 | return audit_comparator(cred->gid, f->op, cred->sgid); | 567 | return audit_gid_comparator(cred->gid, f->op, cred->sgid); |
578 | case AUDIT_COMPARE_GID_TO_FSGID: | 568 | case AUDIT_COMPARE_GID_TO_FSGID: |
579 | return audit_comparator(cred->gid, f->op, cred->fsgid); | 569 | return audit_gid_comparator(cred->gid, f->op, cred->fsgid); |
580 | /* egid comparisons */ | 570 | /* egid comparisons */ |
581 | case AUDIT_COMPARE_EGID_TO_SGID: | 571 | case AUDIT_COMPARE_EGID_TO_SGID: |
582 | return audit_comparator(cred->egid, f->op, cred->sgid); | 572 | return audit_gid_comparator(cred->egid, f->op, cred->sgid); |
583 | case AUDIT_COMPARE_EGID_TO_FSGID: | 573 | case AUDIT_COMPARE_EGID_TO_FSGID: |
584 | return audit_comparator(cred->egid, f->op, cred->fsgid); | 574 | return audit_gid_comparator(cred->egid, f->op, cred->fsgid); |
585 | /* sgid comparison */ | 575 | /* sgid comparison */ |
586 | case AUDIT_COMPARE_SGID_TO_FSGID: | 576 | case AUDIT_COMPARE_SGID_TO_FSGID: |
587 | return audit_comparator(cred->sgid, f->op, cred->fsgid); | 577 | return audit_gid_comparator(cred->sgid, f->op, cred->fsgid); |
588 | default: | 578 | default: |
589 | WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); | 579 | WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); |
590 | return 0; | 580 | return 0; |
@@ -630,28 +620,28 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
630 | } | 620 | } |
631 | break; | 621 | break; |
632 | case AUDIT_UID: | 622 | case AUDIT_UID: |
633 | result = audit_comparator(cred->uid, f->op, f->val); | 623 | result = audit_uid_comparator(cred->uid, f->op, f->uid); |
634 | break; | 624 | break; |
635 | case AUDIT_EUID: | 625 | case AUDIT_EUID: |
636 | result = audit_comparator(cred->euid, f->op, f->val); | 626 | result = audit_uid_comparator(cred->euid, f->op, f->uid); |
637 | break; | 627 | break; |
638 | case AUDIT_SUID: | 628 | case AUDIT_SUID: |
639 | result = audit_comparator(cred->suid, f->op, f->val); | 629 | result = audit_uid_comparator(cred->suid, f->op, f->uid); |
640 | break; | 630 | break; |
641 | case AUDIT_FSUID: | 631 | case AUDIT_FSUID: |
642 | result = audit_comparator(cred->fsuid, f->op, f->val); | 632 | result = audit_uid_comparator(cred->fsuid, f->op, f->uid); |
643 | break; | 633 | break; |
644 | case AUDIT_GID: | 634 | case AUDIT_GID: |
645 | result = audit_comparator(cred->gid, f->op, f->val); | 635 | result = audit_gid_comparator(cred->gid, f->op, f->gid); |
646 | break; | 636 | break; |
647 | case AUDIT_EGID: | 637 | case AUDIT_EGID: |
648 | result = audit_comparator(cred->egid, f->op, f->val); | 638 | result = audit_gid_comparator(cred->egid, f->op, f->gid); |
649 | break; | 639 | break; |
650 | case AUDIT_SGID: | 640 | case AUDIT_SGID: |
651 | result = audit_comparator(cred->sgid, f->op, f->val); | 641 | result = audit_gid_comparator(cred->sgid, f->op, f->gid); |
652 | break; | 642 | break; |
653 | case AUDIT_FSGID: | 643 | case AUDIT_FSGID: |
654 | result = audit_comparator(cred->fsgid, f->op, f->val); | 644 | result = audit_gid_comparator(cred->fsgid, f->op, f->gid); |
655 | break; | 645 | break; |
656 | case AUDIT_PERS: | 646 | case AUDIT_PERS: |
657 | result = audit_comparator(tsk->personality, f->op, f->val); | 647 | result = audit_comparator(tsk->personality, f->op, f->val); |
@@ -717,10 +707,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
717 | break; | 707 | break; |
718 | case AUDIT_OBJ_UID: | 708 | case AUDIT_OBJ_UID: |
719 | if (name) { | 709 | if (name) { |
720 | result = audit_comparator(name->uid, f->op, f->val); | 710 | result = audit_uid_comparator(name->uid, f->op, f->uid); |
721 | } else if (ctx) { | 711 | } else if (ctx) { |
722 | list_for_each_entry(n, &ctx->names_list, list) { | 712 | list_for_each_entry(n, &ctx->names_list, list) { |
723 | if (audit_comparator(n->uid, f->op, f->val)) { | 713 | if (audit_uid_comparator(n->uid, f->op, f->uid)) { |
724 | ++result; | 714 | ++result; |
725 | break; | 715 | break; |
726 | } | 716 | } |
@@ -729,10 +719,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
729 | break; | 719 | break; |
730 | case AUDIT_OBJ_GID: | 720 | case AUDIT_OBJ_GID: |
731 | if (name) { | 721 | if (name) { |
732 | result = audit_comparator(name->gid, f->op, f->val); | 722 | result = audit_gid_comparator(name->gid, f->op, f->gid); |
733 | } else if (ctx) { | 723 | } else if (ctx) { |
734 | list_for_each_entry(n, &ctx->names_list, list) { | 724 | list_for_each_entry(n, &ctx->names_list, list) { |
735 | if (audit_comparator(n->gid, f->op, f->val)) { | 725 | if (audit_gid_comparator(n->gid, f->op, f->gid)) { |
736 | ++result; | 726 | ++result; |
737 | break; | 727 | break; |
738 | } | 728 | } |
@@ -750,7 +740,7 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
750 | case AUDIT_LOGINUID: | 740 | case AUDIT_LOGINUID: |
751 | result = 0; | 741 | result = 0; |
752 | if (ctx) | 742 | if (ctx) |
753 | result = audit_comparator(tsk->loginuid, f->op, f->val); | 743 | result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); |
754 | break; | 744 | break; |
755 | case AUDIT_SUBJ_USER: | 745 | case AUDIT_SUBJ_USER: |
756 | case AUDIT_SUBJ_ROLE: | 746 | case AUDIT_SUBJ_ROLE: |
@@ -1006,7 +996,7 @@ static inline void audit_free_names(struct audit_context *context) | |||
1006 | context->ino_count); | 996 | context->ino_count); |
1007 | list_for_each_entry(n, &context->names_list, list) { | 997 | list_for_each_entry(n, &context->names_list, list) { |
1008 | printk(KERN_ERR "names[%d] = %p = %s\n", i, | 998 | printk(KERN_ERR "names[%d] = %p = %s\n", i, |
1009 | n->name, n->name ?: "(null)"); | 999 | n->name, n->name->name ?: "(null)"); |
1010 | } | 1000 | } |
1011 | dump_stack(); | 1001 | dump_stack(); |
1012 | return; | 1002 | return; |
@@ -1100,29 +1090,13 @@ int audit_alloc(struct task_struct *tsk) | |||
1100 | 1090 | ||
1101 | static inline void audit_free_context(struct audit_context *context) | 1091 | static inline void audit_free_context(struct audit_context *context) |
1102 | { | 1092 | { |
1103 | struct audit_context *previous; | 1093 | audit_free_names(context); |
1104 | int count = 0; | 1094 | unroll_tree_refs(context, NULL, 0); |
1105 | 1095 | free_tree_refs(context); | |
1106 | do { | 1096 | audit_free_aux(context); |
1107 | previous = context->previous; | 1097 | kfree(context->filterkey); |
1108 | if (previous || (count && count < 10)) { | 1098 | kfree(context->sockaddr); |
1109 | ++count; | 1099 | kfree(context); |
1110 | printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" | ||
1111 | " freeing multiple contexts (%d)\n", | ||
1112 | context->serial, context->major, | ||
1113 | context->name_count, count); | ||
1114 | } | ||
1115 | audit_free_names(context); | ||
1116 | unroll_tree_refs(context, NULL, 0); | ||
1117 | free_tree_refs(context); | ||
1118 | audit_free_aux(context); | ||
1119 | kfree(context->filterkey); | ||
1120 | kfree(context->sockaddr); | ||
1121 | kfree(context); | ||
1122 | context = previous; | ||
1123 | } while (context); | ||
1124 | if (count >= 10) | ||
1125 | printk(KERN_ERR "audit: freed %d contexts\n", count); | ||
1126 | } | 1100 | } |
1127 | 1101 | ||
1128 | void audit_log_task_context(struct audit_buffer *ab) | 1102 | void audit_log_task_context(struct audit_buffer *ab) |
@@ -1154,13 +1128,43 @@ error_path: | |||
1154 | 1128 | ||
1155 | EXPORT_SYMBOL(audit_log_task_context); | 1129 | EXPORT_SYMBOL(audit_log_task_context); |
1156 | 1130 | ||
1157 | static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | 1131 | void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) |
1158 | { | 1132 | { |
1133 | const struct cred *cred; | ||
1159 | char name[sizeof(tsk->comm)]; | 1134 | char name[sizeof(tsk->comm)]; |
1160 | struct mm_struct *mm = tsk->mm; | 1135 | struct mm_struct *mm = tsk->mm; |
1161 | struct vm_area_struct *vma; | 1136 | char *tty; |
1137 | |||
1138 | if (!ab) | ||
1139 | return; | ||
1162 | 1140 | ||
1163 | /* tsk == current */ | 1141 | /* tsk == current */ |
1142 | cred = current_cred(); | ||
1143 | |||
1144 | spin_lock_irq(&tsk->sighand->siglock); | ||
1145 | if (tsk->signal && tsk->signal->tty) | ||
1146 | tty = tsk->signal->tty->name; | ||
1147 | else | ||
1148 | tty = "(none)"; | ||
1149 | spin_unlock_irq(&tsk->sighand->siglock); | ||
1150 | |||
1151 | |||
1152 | audit_log_format(ab, | ||
1153 | " ppid=%ld pid=%d auid=%u uid=%u gid=%u" | ||
1154 | " euid=%u suid=%u fsuid=%u" | ||
1155 | " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", | ||
1156 | sys_getppid(), | ||
1157 | tsk->pid, | ||
1158 | from_kuid(&init_user_ns, tsk->loginuid), | ||
1159 | from_kuid(&init_user_ns, cred->uid), | ||
1160 | from_kgid(&init_user_ns, cred->gid), | ||
1161 | from_kuid(&init_user_ns, cred->euid), | ||
1162 | from_kuid(&init_user_ns, cred->suid), | ||
1163 | from_kuid(&init_user_ns, cred->fsuid), | ||
1164 | from_kgid(&init_user_ns, cred->egid), | ||
1165 | from_kgid(&init_user_ns, cred->sgid), | ||
1166 | from_kgid(&init_user_ns, cred->fsgid), | ||
1167 | tsk->sessionid, tty); | ||
1164 | 1168 | ||
1165 | get_task_comm(name, tsk); | 1169 | get_task_comm(name, tsk); |
1166 | audit_log_format(ab, " comm="); | 1170 | audit_log_format(ab, " comm="); |
@@ -1168,23 +1172,17 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk | |||
1168 | 1172 | ||
1169 | if (mm) { | 1173 | if (mm) { |
1170 | down_read(&mm->mmap_sem); | 1174 | down_read(&mm->mmap_sem); |
1171 | vma = mm->mmap; | 1175 | if (mm->exe_file) |
1172 | while (vma) { | 1176 | audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); |
1173 | if ((vma->vm_flags & VM_EXECUTABLE) && | ||
1174 | vma->vm_file) { | ||
1175 | audit_log_d_path(ab, " exe=", | ||
1176 | &vma->vm_file->f_path); | ||
1177 | break; | ||
1178 | } | ||
1179 | vma = vma->vm_next; | ||
1180 | } | ||
1181 | up_read(&mm->mmap_sem); | 1177 | up_read(&mm->mmap_sem); |
1182 | } | 1178 | } |
1183 | audit_log_task_context(ab); | 1179 | audit_log_task_context(ab); |
1184 | } | 1180 | } |
1185 | 1181 | ||
1182 | EXPORT_SYMBOL(audit_log_task_info); | ||
1183 | |||
1186 | static int audit_log_pid_context(struct audit_context *context, pid_t pid, | 1184 | static int audit_log_pid_context(struct audit_context *context, pid_t pid, |
1187 | uid_t auid, uid_t uid, unsigned int sessionid, | 1185 | kuid_t auid, kuid_t uid, unsigned int sessionid, |
1188 | u32 sid, char *comm) | 1186 | u32 sid, char *comm) |
1189 | { | 1187 | { |
1190 | struct audit_buffer *ab; | 1188 | struct audit_buffer *ab; |
@@ -1196,8 +1194,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
1196 | if (!ab) | 1194 | if (!ab) |
1197 | return rc; | 1195 | return rc; |
1198 | 1196 | ||
1199 | audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, | 1197 | audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, |
1200 | uid, sessionid); | 1198 | from_kuid(&init_user_ns, auid), |
1199 | from_kuid(&init_user_ns, uid), sessionid); | ||
1201 | if (security_secid_to_secctx(sid, &ctx, &len)) { | 1200 | if (security_secid_to_secctx(sid, &ctx, &len)) { |
1202 | audit_log_format(ab, " obj=(none)"); | 1201 | audit_log_format(ab, " obj=(none)"); |
1203 | rc = 1; | 1202 | rc = 1; |
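
Throughout the logging paths above, the opaque kuid_t/kgid_t values are converted back to plain numeric IDs with from_kuid(&init_user_ns, ...)/from_kgid(&init_user_ns, ...) at the moment the record is formatted, since the raw struct can no longer be handed to a %u format. The toy program below models that conversion with a single offset-based mapping; the real translation walks the per-namespace extent tables in kernel/user_namespace.c.

#include <stdio.h>

typedef struct { unsigned int val; } kuid_t;		/* kernel-internal uid, toy version */
struct user_namespace { unsigned int first, lower_first, count; };

static const struct user_namespace init_user_ns = { 0, 0, ~0u };

/* map a kernel uid to the numeric uid seen inside @ns (illustrative only) */
static unsigned int from_kuid(const struct user_namespace *ns, kuid_t kuid)
{
	if (kuid.val < ns->lower_first || kuid.val - ns->lower_first >= ns->count)
		return (unsigned int)-1;	/* "no mapping" stand-in */
	return ns->first + (kuid.val - ns->lower_first);
}

int main(void)
{
	kuid_t loginuid = { 1000 };

	printf("auid=%u\n", from_kuid(&init_user_ns, loginuid));
	return 0;
}
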
@@ -1447,7 +1446,9 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
1447 | u32 osid = context->ipc.osid; | 1446 | u32 osid = context->ipc.osid; |
1448 | 1447 | ||
1449 | audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", | 1448 | audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", |
1450 | context->ipc.uid, context->ipc.gid, context->ipc.mode); | 1449 | from_kuid(&init_user_ns, context->ipc.uid), |
1450 | from_kgid(&init_user_ns, context->ipc.gid), | ||
1451 | context->ipc.mode); | ||
1451 | if (osid) { | 1452 | if (osid) { |
1452 | char *ctx = NULL; | 1453 | char *ctx = NULL; |
1453 | u32 len; | 1454 | u32 len; |
@@ -1536,7 +1537,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, | |||
1536 | case AUDIT_NAME_FULL: | 1537 | case AUDIT_NAME_FULL: |
1537 | /* log the full path */ | 1538 | /* log the full path */ |
1538 | audit_log_format(ab, " name="); | 1539 | audit_log_format(ab, " name="); |
1539 | audit_log_untrustedstring(ab, n->name); | 1540 | audit_log_untrustedstring(ab, n->name->name); |
1540 | break; | 1541 | break; |
1541 | case 0: | 1542 | case 0: |
1542 | /* name was specified as a relative path and the | 1543 | /* name was specified as a relative path and the |
@@ -1546,7 +1547,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, | |||
1546 | default: | 1547 | default: |
1547 | /* log the name's directory component */ | 1548 | /* log the name's directory component */ |
1548 | audit_log_format(ab, " name="); | 1549 | audit_log_format(ab, " name="); |
1549 | audit_log_n_untrustedstring(ab, n->name, | 1550 | audit_log_n_untrustedstring(ab, n->name->name, |
1550 | n->name_len); | 1551 | n->name_len); |
1551 | } | 1552 | } |
1552 | } else | 1553 | } else |
@@ -1560,8 +1561,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, | |||
1560 | MAJOR(n->dev), | 1561 | MAJOR(n->dev), |
1561 | MINOR(n->dev), | 1562 | MINOR(n->dev), |
1562 | n->mode, | 1563 | n->mode, |
1563 | n->uid, | 1564 | from_kuid(&init_user_ns, n->uid), |
1564 | n->gid, | 1565 | from_kgid(&init_user_ns, n->gid), |
1565 | MAJOR(n->rdev), | 1566 | MAJOR(n->rdev), |
1566 | MINOR(n->rdev)); | 1567 | MINOR(n->rdev)); |
1567 | } | 1568 | } |
@@ -1585,26 +1586,12 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, | |||
1585 | 1586 | ||
1586 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) | 1587 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) |
1587 | { | 1588 | { |
1588 | const struct cred *cred; | ||
1589 | int i, call_panic = 0; | 1589 | int i, call_panic = 0; |
1590 | struct audit_buffer *ab; | 1590 | struct audit_buffer *ab; |
1591 | struct audit_aux_data *aux; | 1591 | struct audit_aux_data *aux; |
1592 | const char *tty; | ||
1593 | struct audit_names *n; | 1592 | struct audit_names *n; |
1594 | 1593 | ||
1595 | /* tsk == current */ | 1594 | /* tsk == current */ |
1596 | context->pid = tsk->pid; | ||
1597 | if (!context->ppid) | ||
1598 | context->ppid = sys_getppid(); | ||
1599 | cred = current_cred(); | ||
1600 | context->uid = cred->uid; | ||
1601 | context->gid = cred->gid; | ||
1602 | context->euid = cred->euid; | ||
1603 | context->suid = cred->suid; | ||
1604 | context->fsuid = cred->fsuid; | ||
1605 | context->egid = cred->egid; | ||
1606 | context->sgid = cred->sgid; | ||
1607 | context->fsgid = cred->fsgid; | ||
1608 | context->personality = tsk->personality; | 1595 | context->personality = tsk->personality; |
1609 | 1596 | ||
1610 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); | 1597 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); |
@@ -1619,32 +1606,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1619 | (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", | 1606 | (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", |
1620 | context->return_code); | 1607 | context->return_code); |
1621 | 1608 | ||
1622 | spin_lock_irq(&tsk->sighand->siglock); | ||
1623 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) | ||
1624 | tty = tsk->signal->tty->name; | ||
1625 | else | ||
1626 | tty = "(none)"; | ||
1627 | spin_unlock_irq(&tsk->sighand->siglock); | ||
1628 | |||
1629 | audit_log_format(ab, | 1609 | audit_log_format(ab, |
1630 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" | 1610 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", |
1631 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" | 1611 | context->argv[0], |
1632 | " euid=%u suid=%u fsuid=%u" | 1612 | context->argv[1], |
1633 | " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", | 1613 | context->argv[2], |
1634 | context->argv[0], | 1614 | context->argv[3], |
1635 | context->argv[1], | 1615 | context->name_count); |
1636 | context->argv[2], | ||
1637 | context->argv[3], | ||
1638 | context->name_count, | ||
1639 | context->ppid, | ||
1640 | context->pid, | ||
1641 | tsk->loginuid, | ||
1642 | context->uid, | ||
1643 | context->gid, | ||
1644 | context->euid, context->suid, context->fsuid, | ||
1645 | context->egid, context->sgid, context->fsgid, tty, | ||
1646 | tsk->sessionid); | ||
1647 | |||
1648 | 1616 | ||
1649 | audit_log_task_info(ab, tsk); | 1617 | audit_log_task_info(ab, tsk); |
1650 | audit_log_key(ab, context->filterkey); | 1618 | audit_log_key(ab, context->filterkey); |
@@ -1798,42 +1766,6 @@ void __audit_syscall_entry(int arch, int major, | |||
1798 | if (!context) | 1766 | if (!context) |
1799 | return; | 1767 | return; |
1800 | 1768 | ||
1801 | /* | ||
1802 | * This happens only on certain architectures that make system | ||
1803 | * calls in kernel_thread via the entry.S interface, instead of | ||
1804 | * with direct calls. (If you are porting to a new | ||
1805 | * architecture, hitting this condition can indicate that you | ||
1806 | * got the _exit/_leave calls backward in entry.S.) | ||
1807 | * | ||
1808 | * i386 no | ||
1809 | * x86_64 no | ||
1810 | * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S) | ||
1811 | * | ||
1812 | * This also happens with vm86 emulation in a non-nested manner | ||
1813 | * (entries without exits), so this case must be caught. | ||
1814 | */ | ||
1815 | if (context->in_syscall) { | ||
1816 | struct audit_context *newctx; | ||
1817 | |||
1818 | #if AUDIT_DEBUG | ||
1819 | printk(KERN_ERR | ||
1820 | "audit(:%d) pid=%d in syscall=%d;" | ||
1821 | " entering syscall=%d\n", | ||
1822 | context->serial, tsk->pid, context->major, major); | ||
1823 | #endif | ||
1824 | newctx = audit_alloc_context(context->state); | ||
1825 | if (newctx) { | ||
1826 | newctx->previous = context; | ||
1827 | context = newctx; | ||
1828 | tsk->audit_context = newctx; | ||
1829 | } else { | ||
1830 | /* If we can't alloc a new context, the best we | ||
1831 | * can do is to leak memory (any pending putname | ||
1832 | * will be lost). The only other alternative is | ||
1833 | * to abandon auditing. */ | ||
1834 | audit_zero_context(context, context->state); | ||
1835 | } | ||
1836 | } | ||
1837 | BUG_ON(context->in_syscall || context->name_count); | 1769 | BUG_ON(context->in_syscall || context->name_count); |
1838 | 1770 | ||
1839 | if (!audit_enabled) | 1771 | if (!audit_enabled) |
@@ -1896,28 +1828,21 @@ void __audit_syscall_exit(int success, long return_code) | |||
1896 | if (!list_empty(&context->killed_trees)) | 1828 | if (!list_empty(&context->killed_trees)) |
1897 | audit_kill_trees(&context->killed_trees); | 1829 | audit_kill_trees(&context->killed_trees); |
1898 | 1830 | ||
1899 | if (context->previous) { | 1831 | audit_free_names(context); |
1900 | struct audit_context *new_context = context->previous; | 1832 | unroll_tree_refs(context, NULL, 0); |
1901 | context->previous = NULL; | 1833 | audit_free_aux(context); |
1902 | audit_free_context(context); | 1834 | context->aux = NULL; |
1903 | tsk->audit_context = new_context; | 1835 | context->aux_pids = NULL; |
1904 | } else { | 1836 | context->target_pid = 0; |
1905 | audit_free_names(context); | 1837 | context->target_sid = 0; |
1906 | unroll_tree_refs(context, NULL, 0); | 1838 | context->sockaddr_len = 0; |
1907 | audit_free_aux(context); | 1839 | context->type = 0; |
1908 | context->aux = NULL; | 1840 | context->fds[0] = -1; |
1909 | context->aux_pids = NULL; | 1841 | if (context->state != AUDIT_RECORD_CONTEXT) { |
1910 | context->target_pid = 0; | 1842 | kfree(context->filterkey); |
1911 | context->target_sid = 0; | 1843 | context->filterkey = NULL; |
1912 | context->sockaddr_len = 0; | ||
1913 | context->type = 0; | ||
1914 | context->fds[0] = -1; | ||
1915 | if (context->state != AUDIT_RECORD_CONTEXT) { | ||
1916 | kfree(context->filterkey); | ||
1917 | context->filterkey = NULL; | ||
1918 | } | ||
1919 | tsk->audit_context = context; | ||
1920 | } | 1844 | } |
1845 | tsk->audit_context = context; | ||
1921 | } | 1846 | } |
1922 | 1847 | ||
1923 | static inline void handle_one(const struct inode *inode) | 1848 | static inline void handle_one(const struct inode *inode) |
@@ -2009,7 +1934,8 @@ retry: | |||
2009 | #endif | 1934 | #endif |
2010 | } | 1935 | } |
2011 | 1936 | ||
2012 | static struct audit_names *audit_alloc_name(struct audit_context *context) | 1937 | static struct audit_names *audit_alloc_name(struct audit_context *context, |
1938 | unsigned char type) | ||
2013 | { | 1939 | { |
2014 | struct audit_names *aname; | 1940 | struct audit_names *aname; |
2015 | 1941 | ||
@@ -2024,6 +1950,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context) | |||
2024 | } | 1950 | } |
2025 | 1951 | ||
2026 | aname->ino = (unsigned long)-1; | 1952 | aname->ino = (unsigned long)-1; |
1953 | aname->type = type; | ||
2027 | list_add_tail(&aname->list, &context->names_list); | 1954 | list_add_tail(&aname->list, &context->names_list); |
2028 | 1955 | ||
2029 | context->name_count++; | 1956 | context->name_count++; |
@@ -2034,13 +1961,36 @@ static struct audit_names *audit_alloc_name(struct audit_context *context) | |||
2034 | } | 1961 | } |
2035 | 1962 | ||
2036 | /** | 1963 | /** |
1964 | * audit_reusename - fill out filename with info from existing entry | ||
1965 | * @uptr: userland ptr to pathname | ||
1966 | * | ||
1967 | * Search the audit_names list for the current audit context. If there is an | ||
1968 | * existing entry with a matching "uptr" then return the filename | ||
1969 | * associated with that audit_name. If not, return NULL. | ||
1970 | */ | ||
1971 | struct filename * | ||
1972 | __audit_reusename(const __user char *uptr) | ||
1973 | { | ||
1974 | struct audit_context *context = current->audit_context; | ||
1975 | struct audit_names *n; | ||
1976 | |||
1977 | list_for_each_entry(n, &context->names_list, list) { | ||
1978 | if (!n->name) | ||
1979 | continue; | ||
1980 | if (n->name->uptr == uptr) | ||
1981 | return n->name; | ||
1982 | } | ||
1983 | return NULL; | ||
1984 | } | ||
1985 | |||
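
__audit_reusename() above keys its search on the userland pointer, so a second lookup of the same pathname within one syscall can return the struct filename that was already copied instead of copying it again. The self-contained model below illustrates that reuse-by-pointer idea; the structures and helpers are userspace stand-ins, not the kernel's getname() path, and cleanup is deliberately omitted.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct filename { const char *uptr; char *name; };

#define MAX_NAMES 5
static struct filename *names[MAX_NAMES];
static int name_count;

/* match on pointer identity, not string contents */
static struct filename *reusename(const char *uptr)
{
	for (int i = 0; i < name_count; i++)
		if (names[i]->uptr == uptr)
			return names[i];
	return NULL;
}

static struct filename *getname(const char *uptr)
{
	struct filename *fn = reusename(uptr);

	if (fn)
		return fn;		/* second hit: no extra copy */

	fn = malloc(sizeof(*fn));
	fn->uptr = uptr;
	fn->name = strdup(uptr);	/* stands in for the copy from user space */
	if (name_count < MAX_NAMES)
		names[name_count++] = fn;
	return fn;
}

int main(void)
{
	const char *user_path = "/etc/passwd";
	struct filename *a = getname(user_path);
	struct filename *b = getname(user_path);

	printf("same object: %s\n", a == b ? "yes" : "no");
	return 0;
}
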
1986 | /** | ||
2037 | * audit_getname - add a name to the list | 1987 | * audit_getname - add a name to the list |
2038 | * @name: name to add | 1988 | * @name: name to add |
2039 | * | 1989 | * |
2040 | * Add a name to the list of audit names for this context. | 1990 | * Add a name to the list of audit names for this context. |
2041 | * Called from fs/namei.c:getname(). | 1991 | * Called from fs/namei.c:getname(). |
2042 | */ | 1992 | */ |
2043 | void __audit_getname(const char *name) | 1993 | void __audit_getname(struct filename *name) |
2044 | { | 1994 | { |
2045 | struct audit_context *context = current->audit_context; | 1995 | struct audit_context *context = current->audit_context; |
2046 | struct audit_names *n; | 1996 | struct audit_names *n; |
@@ -2054,13 +2004,19 @@ void __audit_getname(const char *name) | |||
2054 | return; | 2004 | return; |
2055 | } | 2005 | } |
2056 | 2006 | ||
2057 | n = audit_alloc_name(context); | 2007 | #if AUDIT_DEBUG |
2008 | /* The filename _must_ have a populated ->name */ | ||
2009 | BUG_ON(!name->name); | ||
2010 | #endif | ||
2011 | |||
2012 | n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); | ||
2058 | if (!n) | 2013 | if (!n) |
2059 | return; | 2014 | return; |
2060 | 2015 | ||
2061 | n->name = name; | 2016 | n->name = name; |
2062 | n->name_len = AUDIT_NAME_FULL; | 2017 | n->name_len = AUDIT_NAME_FULL; |
2063 | n->name_put = true; | 2018 | n->name_put = true; |
2019 | name->aname = n; | ||
2064 | 2020 | ||
2065 | if (!context->pwd.dentry) | 2021 | if (!context->pwd.dentry) |
2066 | get_fs_pwd(current->fs, &context->pwd); | 2022 | get_fs_pwd(current->fs, &context->pwd); |
@@ -2073,7 +2029,7 @@ void __audit_getname(const char *name) | |||
2073 | * then we delay the putname until syscall exit. | 2029 | * then we delay the putname until syscall exit. |
2074 | * Called from include/linux/fs.h:putname(). | 2030 | * Called from include/linux/fs.h:putname(). |
2075 | */ | 2031 | */ |
2076 | void audit_putname(const char *name) | 2032 | void audit_putname(struct filename *name) |
2077 | { | 2033 | { |
2078 | struct audit_context *context = current->audit_context; | 2034 | struct audit_context *context = current->audit_context; |
2079 | 2035 | ||
@@ -2088,7 +2044,7 @@ void audit_putname(const char *name) | |||
2088 | 2044 | ||
2089 | list_for_each_entry(n, &context->names_list, list) | 2045 | list_for_each_entry(n, &context->names_list, list) |
2090 | printk(KERN_ERR "name[%d] = %p = %s\n", i, | 2046 | printk(KERN_ERR "name[%d] = %p = %s\n", i, |
2091 | n->name, n->name ?: "(null)"); | 2047 | n->name, n->name->name ?: "(null)"); |
2092 | } | 2048 | } |
2093 | #endif | 2049 | #endif |
2094 | __putname(name); | 2050 | __putname(name); |
@@ -2102,8 +2058,8 @@ void audit_putname(const char *name) | |||
2102 | " put_count=%d\n", | 2058 | " put_count=%d\n", |
2103 | __FILE__, __LINE__, | 2059 | __FILE__, __LINE__, |
2104 | context->serial, context->major, | 2060 | context->serial, context->major, |
2105 | context->in_syscall, name, context->name_count, | 2061 | context->in_syscall, name->name, |
2106 | context->put_count); | 2062 | context->name_count, context->put_count); |
2107 | dump_stack(); | 2063 | dump_stack(); |
2108 | } | 2064 | } |
2109 | } | 2065 | } |
@@ -2146,13 +2102,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent | |||
2146 | } | 2102 | } |
2147 | 2103 | ||
2148 | /** | 2104 | /** |
2149 | * audit_inode - store the inode and device from a lookup | 2105 | * __audit_inode - store the inode and device from a lookup |
2150 | * @name: name being audited | 2106 | * @name: name being audited |
2151 | * @dentry: dentry being audited | 2107 | * @dentry: dentry being audited |
2152 | * | 2108 | * @parent: does this dentry represent the parent? |
2153 | * Called from fs/namei.c:path_lookup(). | ||
2154 | */ | 2109 | */ |
2155 | void __audit_inode(const char *name, const struct dentry *dentry) | 2110 | void __audit_inode(struct filename *name, const struct dentry *dentry, |
2111 | unsigned int parent) | ||
2156 | { | 2112 | { |
2157 | struct audit_context *context = current->audit_context; | 2113 | struct audit_context *context = current->audit_context; |
2158 | const struct inode *inode = dentry->d_inode; | 2114 | const struct inode *inode = dentry->d_inode; |
@@ -2161,24 +2117,69 @@ void __audit_inode(const char *name, const struct dentry *dentry) | |||
2161 | if (!context->in_syscall) | 2117 | if (!context->in_syscall) |
2162 | return; | 2118 | return; |
2163 | 2119 | ||
2120 | if (!name) | ||
2121 | goto out_alloc; | ||
2122 | |||
2123 | #if AUDIT_DEBUG | ||
2124 | /* The struct filename _must_ have a populated ->name */ | ||
2125 | BUG_ON(!name->name); | ||
2126 | #endif | ||
2127 | /* | ||
2128 | * If we have a pointer to an audit_names entry already, then we can | ||
2129 | * just use it directly if the type is correct. | ||
2130 | */ | ||
2131 | n = name->aname; | ||
2132 | if (n) { | ||
2133 | if (parent) { | ||
2134 | if (n->type == AUDIT_TYPE_PARENT || | ||
2135 | n->type == AUDIT_TYPE_UNKNOWN) | ||
2136 | goto out; | ||
2137 | } else { | ||
2138 | if (n->type != AUDIT_TYPE_PARENT) | ||
2139 | goto out; | ||
2140 | } | ||
2141 | } | ||
2142 | |||
2164 | list_for_each_entry_reverse(n, &context->names_list, list) { | 2143 | list_for_each_entry_reverse(n, &context->names_list, list) { |
2165 | if (n->name && (n->name == name)) | 2144 | /* does the name pointer match? */ |
2166 | goto out; | 2145 | if (!n->name || n->name->name != name->name) |
2146 | continue; | ||
2147 | |||
2148 | /* match the correct record type */ | ||
2149 | if (parent) { | ||
2150 | if (n->type == AUDIT_TYPE_PARENT || | ||
2151 | n->type == AUDIT_TYPE_UNKNOWN) | ||
2152 | goto out; | ||
2153 | } else { | ||
2154 | if (n->type != AUDIT_TYPE_PARENT) | ||
2155 | goto out; | ||
2156 | } | ||
2167 | } | 2157 | } |
2168 | 2158 | ||
2169 | /* unable to find the name from a previous getname() */ | 2159 | out_alloc: |
2170 | n = audit_alloc_name(context); | 2160 | /* unable to find the name from a previous getname(). Allocate a new |
2161 | * anonymous entry. | ||
2162 | */ | ||
2163 | n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); | ||
2171 | if (!n) | 2164 | if (!n) |
2172 | return; | 2165 | return; |
2173 | out: | 2166 | out: |
2167 | if (parent) { | ||
2168 | n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; | ||
2169 | n->type = AUDIT_TYPE_PARENT; | ||
2170 | } else { | ||
2171 | n->name_len = AUDIT_NAME_FULL; | ||
2172 | n->type = AUDIT_TYPE_NORMAL; | ||
2173 | } | ||
2174 | handle_path(dentry); | 2174 | handle_path(dentry); |
2175 | audit_copy_inode(n, dentry, inode); | 2175 | audit_copy_inode(n, dentry, inode); |
2176 | } | 2176 | } |
2177 | 2177 | ||
2178 | /** | 2178 | /** |
2179 | * audit_inode_child - collect inode info for created/removed objects | 2179 | * __audit_inode_child - collect inode info for created/removed objects |
2180 | * @dentry: dentry being audited | ||
2181 | * @parent: inode of dentry parent | 2180 | * @parent: inode of dentry parent |
2181 | * @dentry: dentry being audited | ||
2182 | * @type: AUDIT_TYPE_* value that we're looking for | ||
2182 | * | 2183 | * |
2183 | * For syscalls that create or remove filesystem objects, audit_inode | 2184 | * For syscalls that create or remove filesystem objects, audit_inode |
2184 | * can only collect information for the filesystem object's parent. | 2185 | * can only collect information for the filesystem object's parent. |
@@ -2188,15 +2189,14 @@ out: | |||
2188 | * must be hooked prior, in order to capture the target inode during | 2189 | * must be hooked prior, in order to capture the target inode during |
2189 | * unsuccessful attempts. | 2190 | * unsuccessful attempts. |
2190 | */ | 2191 | */ |
2191 | void __audit_inode_child(const struct dentry *dentry, | 2192 | void __audit_inode_child(const struct inode *parent, |
2192 | const struct inode *parent) | 2193 | const struct dentry *dentry, |
2194 | const unsigned char type) | ||
2193 | { | 2195 | { |
2194 | struct audit_context *context = current->audit_context; | 2196 | struct audit_context *context = current->audit_context; |
2195 | const char *found_parent = NULL, *found_child = NULL; | ||
2196 | const struct inode *inode = dentry->d_inode; | 2197 | const struct inode *inode = dentry->d_inode; |
2197 | const char *dname = dentry->d_name.name; | 2198 | const char *dname = dentry->d_name.name; |
2198 | struct audit_names *n; | 2199 | struct audit_names *n, *found_parent = NULL, *found_child = NULL; |
2199 | int dirlen = 0; | ||
2200 | 2200 | ||
2201 | if (!context->in_syscall) | 2201 | if (!context->in_syscall) |
2202 | return; | 2202 | return; |
@@ -2204,62 +2204,65 @@ void __audit_inode_child(const struct dentry *dentry, | |||
2204 | if (inode) | 2204 | if (inode) |
2205 | handle_one(inode); | 2205 | handle_one(inode); |
2206 | 2206 | ||
2207 | /* parent is more likely, look for it first */ | 2207 | /* look for a parent entry first */ |
2208 | list_for_each_entry(n, &context->names_list, list) { | 2208 | list_for_each_entry(n, &context->names_list, list) { |
2209 | if (!n->name) | 2209 | if (!n->name || n->type != AUDIT_TYPE_PARENT) |
2210 | continue; | 2210 | continue; |
2211 | 2211 | ||
2212 | if (n->ino == parent->i_ino && | 2212 | if (n->ino == parent->i_ino && |
2213 | !audit_compare_dname_path(dname, n->name, &dirlen)) { | 2213 | !audit_compare_dname_path(dname, n->name->name, n->name_len)) { |
2214 | n->name_len = dirlen; /* update parent data in place */ | 2214 | found_parent = n; |
2215 | found_parent = n->name; | 2215 | break; |
2216 | goto add_names; | ||
2217 | } | 2216 | } |
2218 | } | 2217 | } |
2219 | 2218 | ||
2220 | /* no matching parent, look for matching child */ | 2219 | /* is there a matching child entry? */ |
2221 | list_for_each_entry(n, &context->names_list, list) { | 2220 | list_for_each_entry(n, &context->names_list, list) { |
2222 | if (!n->name) | 2221 | /* can only match entries that have a name */ |
2222 | if (!n->name || n->type != type) | ||
2223 | continue; | 2223 | continue; |
2224 | 2224 | ||
2225 | /* strcmp() is the more likely scenario */ | 2225 | /* if we found a parent, make sure this one is a child of it */ |
2226 | if (!strcmp(dname, n->name) || | 2226 | if (found_parent && (n->name != found_parent->name)) |
2227 | !audit_compare_dname_path(dname, n->name, &dirlen)) { | 2227 | continue; |
2228 | if (inode) | 2228 | |
2229 | audit_copy_inode(n, NULL, inode); | 2229 | if (!strcmp(dname, n->name->name) || |
2230 | else | 2230 | !audit_compare_dname_path(dname, n->name->name, |
2231 | n->ino = (unsigned long)-1; | 2231 | found_parent ? |
2232 | found_child = n->name; | 2232 | found_parent->name_len : |
2233 | goto add_names; | 2233 | AUDIT_NAME_FULL)) { |
2234 | found_child = n; | ||
2235 | break; | ||
2234 | } | 2236 | } |
2235 | } | 2237 | } |
2236 | 2238 | ||
2237 | add_names: | ||
2238 | if (!found_parent) { | 2239 | if (!found_parent) { |
2239 | n = audit_alloc_name(context); | 2240 | /* create a new, "anonymous" parent record */ |
2241 | n = audit_alloc_name(context, AUDIT_TYPE_PARENT); | ||
2240 | if (!n) | 2242 | if (!n) |
2241 | return; | 2243 | return; |
2242 | audit_copy_inode(n, NULL, parent); | 2244 | audit_copy_inode(n, NULL, parent); |
2243 | } | 2245 | } |
2244 | 2246 | ||
2245 | if (!found_child) { | 2247 | if (!found_child) { |
2246 | n = audit_alloc_name(context); | 2248 | found_child = audit_alloc_name(context, type); |
2247 | if (!n) | 2249 | if (!found_child) |
2248 | return; | 2250 | return; |
2249 | 2251 | ||
2250 | /* Re-use the name belonging to the slot for a matching parent | 2252 | /* Re-use the name belonging to the slot for a matching parent |
2251 | * directory. All names for this context are relinquished in | 2253 | * directory. All names for this context are relinquished in |
2252 | * audit_free_names() */ | 2254 | * audit_free_names() */ |
2253 | if (found_parent) { | 2255 | if (found_parent) { |
2254 | n->name = found_parent; | 2256 | found_child->name = found_parent->name; |
2255 | n->name_len = AUDIT_NAME_FULL; | 2257 | found_child->name_len = AUDIT_NAME_FULL; |
2256 | /* don't call __putname() */ | 2258 | /* don't call __putname() */ |
2257 | n->name_put = false; | 2259 | found_child->name_put = false; |
2258 | } | 2260 | } |
2259 | |||
2260 | if (inode) | ||
2261 | audit_copy_inode(n, NULL, inode); | ||
2262 | } | 2261 | } |
2262 | if (inode) | ||
2263 | audit_copy_inode(found_child, dentry, inode); | ||
2264 | else | ||
2265 | found_child->ino = (unsigned long)-1; | ||
2263 | } | 2266 | } |
2264 | EXPORT_SYMBOL_GPL(__audit_inode_child); | 2267 | EXPORT_SYMBOL_GPL(__audit_inode_child); |
2265 | 2268 | ||
@@ -2299,14 +2302,14 @@ static atomic_t session_id = ATOMIC_INIT(0); | |||
2299 | * | 2302 | * |
2300 | * Called (set) from fs/proc/base.c::proc_loginuid_write(). | 2303 | * Called (set) from fs/proc/base.c::proc_loginuid_write(). |
2301 | */ | 2304 | */ |
2302 | int audit_set_loginuid(uid_t loginuid) | 2305 | int audit_set_loginuid(kuid_t loginuid) |
2303 | { | 2306 | { |
2304 | struct task_struct *task = current; | 2307 | struct task_struct *task = current; |
2305 | struct audit_context *context = task->audit_context; | 2308 | struct audit_context *context = task->audit_context; |
2306 | unsigned int sessionid; | 2309 | unsigned int sessionid; |
2307 | 2310 | ||
2308 | #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE | 2311 | #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE |
2309 | if (task->loginuid != -1) | 2312 | if (uid_valid(task->loginuid)) |
2310 | return -EPERM; | 2313 | return -EPERM; |
2311 | #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ | 2314 | #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ |
2312 | if (!capable(CAP_AUDIT_CONTROL)) | 2315 | if (!capable(CAP_AUDIT_CONTROL)) |
@@ -2322,8 +2325,10 @@ int audit_set_loginuid(uid_t loginuid) | |||
2322 | audit_log_format(ab, "login pid=%d uid=%u " | 2325 | audit_log_format(ab, "login pid=%d uid=%u " |
2323 | "old auid=%u new auid=%u" | 2326 | "old auid=%u new auid=%u" |
2324 | " old ses=%u new ses=%u", | 2327 | " old ses=%u new ses=%u", |
2325 | task->pid, task_uid(task), | 2328 | task->pid, |
2326 | task->loginuid, loginuid, | 2329 | from_kuid(&init_user_ns, task_uid(task)), |
2330 | from_kuid(&init_user_ns, task->loginuid), | ||
2331 | from_kuid(&init_user_ns, loginuid), | ||
2327 | task->sessionid, sessionid); | 2332 | task->sessionid, sessionid); |
2328 | audit_log_end(ab); | 2333 | audit_log_end(ab); |
2329 | } | 2334 | } |
@@ -2546,12 +2551,12 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
2546 | struct audit_aux_data_pids *axp; | 2551 | struct audit_aux_data_pids *axp; |
2547 | struct task_struct *tsk = current; | 2552 | struct task_struct *tsk = current; |
2548 | struct audit_context *ctx = tsk->audit_context; | 2553 | struct audit_context *ctx = tsk->audit_context; |
2549 | uid_t uid = current_uid(), t_uid = task_uid(t); | 2554 | kuid_t uid = current_uid(), t_uid = task_uid(t); |
2550 | 2555 | ||
2551 | if (audit_pid && t->tgid == audit_pid) { | 2556 | if (audit_pid && t->tgid == audit_pid) { |
2552 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { | 2557 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { |
2553 | audit_sig_pid = tsk->pid; | 2558 | audit_sig_pid = tsk->pid; |
2554 | if (tsk->loginuid != -1) | 2559 | if (uid_valid(tsk->loginuid)) |
2555 | audit_sig_uid = tsk->loginuid; | 2560 | audit_sig_uid = tsk->loginuid; |
2556 | else | 2561 | else |
2557 | audit_sig_uid = uid; | 2562 | audit_sig_uid = uid; |
@@ -2672,8 +2677,8 @@ void __audit_mmap_fd(int fd, int flags) | |||
2672 | 2677 | ||
2673 | static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) | 2678 | static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) |
2674 | { | 2679 | { |
2675 | uid_t auid, uid; | 2680 | kuid_t auid, uid; |
2676 | gid_t gid; | 2681 | kgid_t gid; |
2677 | unsigned int sessionid; | 2682 | unsigned int sessionid; |
2678 | 2683 | ||
2679 | auid = audit_get_loginuid(current); | 2684 | auid = audit_get_loginuid(current); |
@@ -2681,7 +2686,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) | |||
2681 | current_uid_gid(&uid, &gid); | 2686 | current_uid_gid(&uid, &gid); |
2682 | 2687 | ||
2683 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", | 2688 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", |
2684 | auid, uid, gid, sessionid); | 2689 | from_kuid(&init_user_ns, auid), |
2690 | from_kuid(&init_user_ns, uid), | ||
2691 | from_kgid(&init_user_ns, gid), | ||
2692 | sessionid); | ||
2685 | audit_log_task_context(ab); | 2693 | audit_log_task_context(ab); |
2686 | audit_log_format(ab, " pid=%d comm=", current->pid); | 2694 | audit_log_format(ab, " pid=%d comm=", current->pid); |
2687 | audit_log_untrustedstring(ab, current->comm); | 2695 | audit_log_untrustedstring(ab, current->comm); |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 79818507e444..4855892798fd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -88,11 +88,12 @@ static DEFINE_MUTEX(cgroup_root_mutex); | |||
88 | 88 | ||
89 | /* | 89 | /* |
90 | * Generate an array of cgroup subsystem pointers. At boot time, this is | 90 | * Generate an array of cgroup subsystem pointers. At boot time, this is |
91 | * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are | 91 | * populated with the built in subsystems, and modular subsystems are |
92 | * registered after that. The mutable section of this array is protected by | 92 | * registered after that. The mutable section of this array is protected by |
93 | * cgroup_mutex. | 93 | * cgroup_mutex. |
94 | */ | 94 | */ |
95 | #define SUBSYS(_x) &_x ## _subsys, | 95 | #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, |
96 | #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) | ||
96 | static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { | 97 | static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { |
97 | #include <linux/cgroup_subsys.h> | 98 | #include <linux/cgroup_subsys.h> |
98 | }; | 99 | }; |
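
The new SUBSYS() definition above switches the subsys[] array to designated initializers, so each pointer lands at the index of its _subsys_id even when modular subsystems leave empty slots at boot. Below is a standalone illustration of the pattern; the subsystem names and the FOR_EACH_* list macros are invented for the example, whereas the kernel generates the list by re-including linux/cgroup_subsys.h under different SUBSYS() definitions.

#include <stdio.h>

struct cgroup_subsys { const char *name; };

/* every subsystem gets an ID slot... */
#define FOR_EACH_SUBSYS		SUBSYS(cpu) SUBSYS(memory) SUBSYS(blkio)
/* ...but only the built-in ones are registered at "boot" */
#define FOR_EACH_BUILTIN	SUBSYS(cpu) SUBSYS(memory)

/* first expansion: the slot IDs */
#define SUBSYS(_x) _x##_subsys_id,
enum { FOR_EACH_SUBSYS CGROUP_SUBSYS_COUNT };
#undef SUBSYS

static struct cgroup_subsys cpu_subsys    = { "cpu" };
static struct cgroup_subsys memory_subsys = { "memory" };

/* second expansion: designated initializers pin each pointer to its ID */
#define SUBSYS(_x) [_x##_subsys_id] = &_x##_subsys,
static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { FOR_EACH_BUILTIN };
#undef SUBSYS

int main(void)
{
	for (int i = 0; i < CGROUP_SUBSYS_COUNT; i++)
		printf("%d: %s\n", i,
		       subsys[i] ? subsys[i]->name : "(modular, registered later)");
	return 0;
}
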
@@ -111,13 +112,13 @@ struct cgroupfs_root { | |||
111 | * The bitmask of subsystems intended to be attached to this | 112 | * The bitmask of subsystems intended to be attached to this |
112 | * hierarchy | 113 | * hierarchy |
113 | */ | 114 | */ |
114 | unsigned long subsys_bits; | 115 | unsigned long subsys_mask; |
115 | 116 | ||
116 | /* Unique id for this hierarchy. */ | 117 | /* Unique id for this hierarchy. */ |
117 | int hierarchy_id; | 118 | int hierarchy_id; |
118 | 119 | ||
119 | /* The bitmask of subsystems currently attached to this hierarchy */ | 120 | /* The bitmask of subsystems currently attached to this hierarchy */ |
120 | unsigned long actual_subsys_bits; | 121 | unsigned long actual_subsys_mask; |
121 | 122 | ||
122 | /* A list running through the attached subsystems */ | 123 | /* A list running through the attached subsystems */ |
123 | struct list_head subsys_list; | 124 | struct list_head subsys_list; |
@@ -137,6 +138,9 @@ struct cgroupfs_root { | |||
137 | /* Hierarchy-specific flags */ | 138 | /* Hierarchy-specific flags */ |
138 | unsigned long flags; | 139 | unsigned long flags; |
139 | 140 | ||
141 | /* IDs for cgroups in this hierarchy */ | ||
142 | struct ida cgroup_ida; | ||
143 | |||
140 | /* The path to use for release notifications. */ | 144 | /* The path to use for release notifications. */ |
141 | char release_agent_path[PATH_MAX]; | 145 | char release_agent_path[PATH_MAX]; |
142 | 146 | ||
@@ -170,8 +174,8 @@ struct css_id { | |||
170 | * The css to which this ID points. This pointer is set to valid value | 174 | * The css to which this ID points. This pointer is set to valid value |
171 | * after cgroup is populated. If cgroup is removed, this will be NULL. | 175 | * after cgroup is populated. If cgroup is removed, this will be NULL. |
172 | * This pointer is expected to be RCU-safe because destroy() | 176 | * This pointer is expected to be RCU-safe because destroy() |
173 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | 177 | * is called after synchronize_rcu(). But for safe use, css_tryget() |
174 | * css_tryget() should be used for avoiding race. | 178 | * should be used for avoiding race. |
175 | */ | 179 | */ |
176 | struct cgroup_subsys_state __rcu *css; | 180 | struct cgroup_subsys_state __rcu *css; |
177 | /* | 181 | /* |
@@ -241,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); | |||
241 | */ | 245 | */ |
242 | static int need_forkexit_callback __read_mostly; | 246 | static int need_forkexit_callback __read_mostly; |
243 | 247 | ||
248 | static int cgroup_destroy_locked(struct cgroup *cgrp); | ||
249 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | ||
250 | struct cftype cfts[], bool is_add); | ||
251 | |||
244 | #ifdef CONFIG_PROVE_LOCKING | 252 | #ifdef CONFIG_PROVE_LOCKING |
245 | int cgroup_lock_is_held(void) | 253 | int cgroup_lock_is_held(void) |
246 | { | 254 | { |
@@ -276,7 +284,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) | |||
276 | 284 | ||
277 | /* bits in struct cgroupfs_root flags field */ | 285 | /* bits in struct cgroupfs_root flags field */ |
278 | enum { | 286 | enum { |
279 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ | 287 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ |
288 | ROOT_XATTR, /* supports extended attributes */ | ||
280 | }; | 289 | }; |
281 | 290 | ||
282 | static int cgroup_is_releasable(const struct cgroup *cgrp) | 291 | static int cgroup_is_releasable(const struct cgroup *cgrp) |
@@ -292,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
292 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 301 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
293 | } | 302 | } |
294 | 303 | ||
295 | static int clone_children(const struct cgroup *cgrp) | ||
296 | { | ||
297 | return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
298 | } | ||
299 | |||
300 | /* | 304 | /* |
301 | * for_each_subsys() allows you to iterate on each subsystem attached to | 305 | * for_each_subsys() allows you to iterate on each subsystem attached to |
302 | * an active hierarchy | 306 | * an active hierarchy |
@@ -556,7 +560,7 @@ static struct css_set *find_existing_css_set( | |||
556 | * won't change, so no need for locking. | 560 | * won't change, so no need for locking. |
557 | */ | 561 | */ |
558 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 562 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
559 | if (root->subsys_bits & (1UL << i)) { | 563 | if (root->subsys_mask & (1UL << i)) { |
560 | /* Subsystem is in this hierarchy. So we want | 564 | /* Subsystem is in this hierarchy. So we want |
561 | * the subsystem state from the new | 565 | * the subsystem state from the new |
562 | * cgroup */ | 566 | * cgroup */ |
@@ -780,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
780 | * The task_lock() exception | 784 | * The task_lock() exception |
781 | * | 785 | * |
782 | * The need for this exception arises from the action of | 786 | * The need for this exception arises from the action of |
783 | * cgroup_attach_task(), which overwrites one tasks cgroup pointer with | 787 | * cgroup_attach_task(), which overwrites one task's cgroup pointer with |
784 | * another. It does so using cgroup_mutex, however there are | 788 | * another. It does so using cgroup_mutex, however there are |
785 | * several performance critical places that need to reference | 789 | * several performance critical places that need to reference |
786 | * task->cgroup without the expense of grabbing a system global | 790 | * task->cgroup without the expense of grabbing a system global |
787 | * mutex. Therefore except as noted below, when dereferencing or, as | 791 | * mutex. Therefore except as noted below, when dereferencing or, as |
788 | * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use | 792 | * in cgroup_attach_task(), modifying a task's cgroup pointer we use |
789 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in | 793 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in |
790 | * the task_struct routinely used for such matters. | 794 | * the task_struct routinely used for such matters. |
791 | * | 795 | * |
@@ -824,7 +828,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); | |||
824 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); | 828 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); |
825 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); | 829 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); |
826 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 830 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
827 | static int cgroup_populate_dir(struct cgroup *cgrp); | 831 | static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, |
832 | unsigned long subsys_mask); | ||
828 | static const struct inode_operations cgroup_dir_inode_operations; | 833 | static const struct inode_operations cgroup_dir_inode_operations; |
829 | static const struct file_operations proc_cgroupstats_operations; | 834 | static const struct file_operations proc_cgroupstats_operations; |
830 | 835 | ||
@@ -851,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | |||
851 | return inode; | 856 | return inode; |
852 | } | 857 | } |
853 | 858 | ||
854 | /* | ||
855 | * Call subsys's pre_destroy handler. | ||
856 | * This is called before css refcnt check. | ||
857 | */ | ||
858 | static int cgroup_call_pre_destroy(struct cgroup *cgrp) | ||
859 | { | ||
860 | struct cgroup_subsys *ss; | ||
861 | int ret = 0; | ||
862 | |||
863 | for_each_subsys(cgrp->root, ss) { | ||
864 | if (!ss->pre_destroy) | ||
865 | continue; | ||
866 | |||
867 | ret = ss->pre_destroy(cgrp); | ||
868 | if (ret) { | ||
869 | /* ->pre_destroy() failure is being deprecated */ | ||
870 | WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); | ||
871 | break; | ||
872 | } | ||
873 | } | ||
874 | |||
875 | return ret; | ||
876 | } | ||
877 | |||
878 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 859 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
879 | { | 860 | { |
880 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 861 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
@@ -895,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
895 | * Release the subsystem state objects. | 876 | * Release the subsystem state objects. |
896 | */ | 877 | */ |
897 | for_each_subsys(cgrp->root, ss) | 878 | for_each_subsys(cgrp->root, ss) |
898 | ss->destroy(cgrp); | 879 | ss->css_free(cgrp); |
899 | 880 | ||
900 | cgrp->root->number_of_cgroups--; | 881 | cgrp->root->number_of_cgroups--; |
901 | mutex_unlock(&cgroup_mutex); | 882 | mutex_unlock(&cgroup_mutex); |
@@ -912,15 +893,20 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
912 | */ | 893 | */ |
913 | BUG_ON(!list_empty(&cgrp->pidlists)); | 894 | BUG_ON(!list_empty(&cgrp->pidlists)); |
914 | 895 | ||
896 | simple_xattrs_free(&cgrp->xattrs); | ||
897 | |||
898 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
915 | kfree_rcu(cgrp, rcu_head); | 899 | kfree_rcu(cgrp, rcu_head); |
916 | } else { | 900 | } else { |
917 | struct cfent *cfe = __d_cfe(dentry); | 901 | struct cfent *cfe = __d_cfe(dentry); |
918 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; | 902 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; |
903 | struct cftype *cft = cfe->type; | ||
919 | 904 | ||
920 | WARN_ONCE(!list_empty(&cfe->node) && | 905 | WARN_ONCE(!list_empty(&cfe->node) && |
921 | cgrp != &cgrp->root->top_cgroup, | 906 | cgrp != &cgrp->root->top_cgroup, |
922 | "cfe still linked for %s\n", cfe->type->name); | 907 | "cfe still linked for %s\n", cfe->type->name); |
923 | kfree(cfe); | 908 | kfree(cfe); |
909 | simple_xattrs_free(&cft->xattrs); | ||
924 | } | 910 | } |
925 | iput(inode); | 911 | iput(inode); |
926 | } | 912 | } |
@@ -963,12 +949,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
963 | return -ENOENT; | 949 | return -ENOENT; |
964 | } | 950 | } |
965 | 951 | ||
966 | static void cgroup_clear_directory(struct dentry *dir) | 952 | /** |
953 | * cgroup_clear_directory - selective removal of base and subsystem files | ||
954 | * @dir: directory containing the files | ||
955 | * @base_files: true if the base files should be removed | ||
956 | * @subsys_mask: mask of the subsystem ids whose files should be removed | ||
957 | */ | ||
958 | static void cgroup_clear_directory(struct dentry *dir, bool base_files, | ||
959 | unsigned long subsys_mask) | ||
967 | { | 960 | { |
968 | struct cgroup *cgrp = __d_cgrp(dir); | 961 | struct cgroup *cgrp = __d_cgrp(dir); |
962 | struct cgroup_subsys *ss; | ||
969 | 963 | ||
970 | while (!list_empty(&cgrp->files)) | 964 | for_each_subsys(cgrp->root, ss) { |
971 | cgroup_rm_file(cgrp, NULL); | 965 | struct cftype_set *set; |
966 | if (!test_bit(ss->subsys_id, &subsys_mask)) | ||
967 | continue; | ||
968 | list_for_each_entry(set, &ss->cftsets, node) | ||
969 | cgroup_addrm_files(cgrp, NULL, set->cfts, false); | ||
970 | } | ||
971 | if (base_files) { | ||
972 | while (!list_empty(&cgrp->files)) | ||
973 | cgroup_rm_file(cgrp, NULL); | ||
974 | } | ||
972 | } | 975 | } |
973 | 976 | ||
974 | /* | 977 | /* |
@@ -977,8 +980,9 @@ static void cgroup_clear_directory(struct dentry *dir) | |||
977 | static void cgroup_d_remove_dir(struct dentry *dentry) | 980 | static void cgroup_d_remove_dir(struct dentry *dentry) |
978 | { | 981 | { |
979 | struct dentry *parent; | 982 | struct dentry *parent; |
983 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; | ||
980 | 984 | ||
981 | cgroup_clear_directory(dentry); | 985 | cgroup_clear_directory(dentry, true, root->subsys_mask); |
982 | 986 | ||
983 | parent = dentry->d_parent; | 987 | parent = dentry->d_parent; |
984 | spin_lock(&parent->d_lock); | 988 | spin_lock(&parent->d_lock); |
@@ -990,54 +994,27 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
990 | } | 994 | } |
991 | 995 | ||
992 | /* | 996 | /* |
993 | * A queue for waiters to do rmdir() cgroup. A tasks will sleep when | ||
994 | * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some | ||
995 | * reference to css->refcnt. In general, this refcnt is expected to goes down | ||
996 | * to zero, soon. | ||
997 | * | ||
998 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; | ||
999 | */ | ||
1000 | static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | ||
1001 | |||
1002 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) | ||
1003 | { | ||
1004 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | ||
1005 | wake_up_all(&cgroup_rmdir_waitq); | ||
1006 | } | ||
1007 | |||
1008 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
1009 | { | ||
1010 | css_get(css); | ||
1011 | } | ||
1012 | |||
1013 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
1014 | { | ||
1015 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
1016 | css_put(css); | ||
1017 | } | ||
1018 | |||
1019 | /* | ||
1020 | * Call with cgroup_mutex held. Drops reference counts on modules, including | 997 | * Call with cgroup_mutex held. Drops reference counts on modules, including |
1021 | * any duplicate ones that parse_cgroupfs_options took. If this function | 998 | * any duplicate ones that parse_cgroupfs_options took. If this function |
1022 | * returns an error, no reference counts are touched. | 999 | * returns an error, no reference counts are touched. |
1023 | */ | 1000 | */ |
1024 | static int rebind_subsystems(struct cgroupfs_root *root, | 1001 | static int rebind_subsystems(struct cgroupfs_root *root, |
1025 | unsigned long final_bits) | 1002 | unsigned long final_subsys_mask) |
1026 | { | 1003 | { |
1027 | unsigned long added_bits, removed_bits; | 1004 | unsigned long added_mask, removed_mask; |
1028 | struct cgroup *cgrp = &root->top_cgroup; | 1005 | struct cgroup *cgrp = &root->top_cgroup; |
1029 | int i; | 1006 | int i; |
1030 | 1007 | ||
1031 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 1008 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
1032 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); | 1009 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); |
1033 | 1010 | ||
1034 | removed_bits = root->actual_subsys_bits & ~final_bits; | 1011 | removed_mask = root->actual_subsys_mask & ~final_subsys_mask; |
1035 | added_bits = final_bits & ~root->actual_subsys_bits; | 1012 | added_mask = final_subsys_mask & ~root->actual_subsys_mask; |
1036 | /* Check that any added subsystems are currently free */ | 1013 | /* Check that any added subsystems are currently free */ |
1037 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1014 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1038 | unsigned long bit = 1UL << i; | 1015 | unsigned long bit = 1UL << i; |
1039 | struct cgroup_subsys *ss = subsys[i]; | 1016 | struct cgroup_subsys *ss = subsys[i]; |
1040 | if (!(bit & added_bits)) | 1017 | if (!(bit & added_mask)) |
1041 | continue; | 1018 | continue; |
1042 | /* | 1019 | /* |
1043 | * Nobody should tell us to do a subsys that doesn't exist: | 1020 | * Nobody should tell us to do a subsys that doesn't exist: |
@@ -1062,7 +1039,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1062 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1039 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1063 | struct cgroup_subsys *ss = subsys[i]; | 1040 | struct cgroup_subsys *ss = subsys[i]; |
1064 | unsigned long bit = 1UL << i; | 1041 | unsigned long bit = 1UL << i; |
1065 | if (bit & added_bits) { | 1042 | if (bit & added_mask) { |
1066 | /* We're binding this subsystem to this hierarchy */ | 1043 | /* We're binding this subsystem to this hierarchy */ |
1067 | BUG_ON(ss == NULL); | 1044 | BUG_ON(ss == NULL); |
1068 | BUG_ON(cgrp->subsys[i]); | 1045 | BUG_ON(cgrp->subsys[i]); |
@@ -1075,7 +1052,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1075 | if (ss->bind) | 1052 | if (ss->bind) |
1076 | ss->bind(cgrp); | 1053 | ss->bind(cgrp); |
1077 | /* refcount was already taken, and we're keeping it */ | 1054 | /* refcount was already taken, and we're keeping it */ |
1078 | } else if (bit & removed_bits) { | 1055 | } else if (bit & removed_mask) { |
1079 | /* We're removing this subsystem */ | 1056 | /* We're removing this subsystem */ |
1080 | BUG_ON(ss == NULL); | 1057 | BUG_ON(ss == NULL); |
1081 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); | 1058 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); |
@@ -1088,7 +1065,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1088 | list_move(&ss->sibling, &rootnode.subsys_list); | 1065 | list_move(&ss->sibling, &rootnode.subsys_list); |
1089 | /* subsystem is now free - drop reference on module */ | 1066 | /* subsystem is now free - drop reference on module */ |
1090 | module_put(ss->module); | 1067 | module_put(ss->module); |
1091 | } else if (bit & final_bits) { | 1068 | } else if (bit & final_subsys_mask) { |
1092 | /* Subsystem state should already exist */ | 1069 | /* Subsystem state should already exist */ |
1093 | BUG_ON(ss == NULL); | 1070 | BUG_ON(ss == NULL); |
1094 | BUG_ON(!cgrp->subsys[i]); | 1071 | BUG_ON(!cgrp->subsys[i]); |
@@ -1105,7 +1082,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1105 | BUG_ON(cgrp->subsys[i]); | 1082 | BUG_ON(cgrp->subsys[i]); |
1106 | } | 1083 | } |
1107 | } | 1084 | } |
1108 | root->subsys_bits = root->actual_subsys_bits = final_bits; | 1085 | root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; |
1109 | synchronize_rcu(); | 1086 | synchronize_rcu(); |
1110 | 1087 | ||
1111 | return 0; | 1088 | return 0; |
@@ -1121,9 +1098,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | |||
1121 | seq_printf(seq, ",%s", ss->name); | 1098 | seq_printf(seq, ",%s", ss->name); |
1122 | if (test_bit(ROOT_NOPREFIX, &root->flags)) | 1099 | if (test_bit(ROOT_NOPREFIX, &root->flags)) |
1123 | seq_puts(seq, ",noprefix"); | 1100 | seq_puts(seq, ",noprefix"); |
1101 | if (test_bit(ROOT_XATTR, &root->flags)) | ||
1102 | seq_puts(seq, ",xattr"); | ||
1124 | if (strlen(root->release_agent_path)) | 1103 | if (strlen(root->release_agent_path)) |
1125 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1104 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); |
1126 | if (clone_children(&root->top_cgroup)) | 1105 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) |
1127 | seq_puts(seq, ",clone_children"); | 1106 | seq_puts(seq, ",clone_children"); |
1128 | if (strlen(root->name)) | 1107 | if (strlen(root->name)) |
1129 | seq_printf(seq, ",name=%s", root->name); | 1108 | seq_printf(seq, ",name=%s", root->name); |
@@ -1132,10 +1111,10 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | |||
1132 | } | 1111 | } |
1133 | 1112 | ||
1134 | struct cgroup_sb_opts { | 1113 | struct cgroup_sb_opts { |
1135 | unsigned long subsys_bits; | 1114 | unsigned long subsys_mask; |
1136 | unsigned long flags; | 1115 | unsigned long flags; |
1137 | char *release_agent; | 1116 | char *release_agent; |
1138 | bool clone_children; | 1117 | bool cpuset_clone_children; |
1139 | char *name; | 1118 | char *name; |
1140 | /* User explicitly requested empty subsystem */ | 1119 | /* User explicitly requested empty subsystem */ |
1141 | bool none; | 1120 | bool none; |
@@ -1186,7 +1165,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1186 | continue; | 1165 | continue; |
1187 | } | 1166 | } |
1188 | if (!strcmp(token, "clone_children")) { | 1167 | if (!strcmp(token, "clone_children")) { |
1189 | opts->clone_children = true; | 1168 | opts->cpuset_clone_children = true; |
1169 | continue; | ||
1170 | } | ||
1171 | if (!strcmp(token, "xattr")) { | ||
1172 | set_bit(ROOT_XATTR, &opts->flags); | ||
1190 | continue; | 1173 | continue; |
1191 | } | 1174 | } |
1192 | if (!strncmp(token, "release_agent=", 14)) { | 1175 | if (!strncmp(token, "release_agent=", 14)) { |
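As a quick illustration of the new "xattr" token parsed above, here is a minimal userspace sketch that mounts a hierarchy with that option. The mount point and the choice of the cpuset controller are assumptions, and CAP_SYS_ADMIN is required; this is only how the option string reaches parse_cgroupfs_options(), not part of the patch itself.

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /*
             * "xattr" is turned into ROOT_XATTR by parse_cgroupfs_options();
             * "/mnt/cgroup-test" is a hypothetical, pre-existing directory.
             */
            if (mount("cgroup", "/mnt/cgroup-test", "cgroup", 0,
                      "cpuset,xattr")) {
                    perror("mount cgroup");
                    return 1;
            }
            return 0;
    }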
@@ -1237,7 +1220,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1237 | /* Mutually exclusive option 'all' + subsystem name */ | 1220 | /* Mutually exclusive option 'all' + subsystem name */ |
1238 | if (all_ss) | 1221 | if (all_ss) |
1239 | return -EINVAL; | 1222 | return -EINVAL; |
1240 | set_bit(i, &opts->subsys_bits); | 1223 | set_bit(i, &opts->subsys_mask); |
1241 | one_ss = true; | 1224 | one_ss = true; |
1242 | 1225 | ||
1243 | break; | 1226 | break; |
@@ -1258,7 +1241,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1258 | continue; | 1241 | continue; |
1259 | if (ss->disabled) | 1242 | if (ss->disabled) |
1260 | continue; | 1243 | continue; |
1261 | set_bit(i, &opts->subsys_bits); | 1244 | set_bit(i, &opts->subsys_mask); |
1262 | } | 1245 | } |
1263 | } | 1246 | } |
1264 | 1247 | ||
@@ -1270,19 +1253,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1270 | * the cpuset subsystem. | 1253 | * the cpuset subsystem. |
1271 | */ | 1254 | */ |
1272 | if (test_bit(ROOT_NOPREFIX, &opts->flags) && | 1255 | if (test_bit(ROOT_NOPREFIX, &opts->flags) && |
1273 | (opts->subsys_bits & mask)) | 1256 | (opts->subsys_mask & mask)) |
1274 | return -EINVAL; | 1257 | return -EINVAL; |
1275 | 1258 | ||
1276 | 1259 | ||
1277 | /* Can't specify "none" and some subsystems */ | 1260 | /* Can't specify "none" and some subsystems */ |
1278 | if (opts->subsys_bits && opts->none) | 1261 | if (opts->subsys_mask && opts->none) |
1279 | return -EINVAL; | 1262 | return -EINVAL; |
1280 | 1263 | ||
1281 | /* | 1264 | /* |
1282 | * We either have to specify by name or by subsystems. (So all | 1265 | * We either have to specify by name or by subsystems. (So all |
1283 | * empty hierarchies must have a name). | 1266 | * empty hierarchies must have a name). |
1284 | */ | 1267 | */ |
1285 | if (!opts->subsys_bits && !opts->name) | 1268 | if (!opts->subsys_mask && !opts->name) |
1286 | return -EINVAL; | 1269 | return -EINVAL; |
1287 | 1270 | ||
1288 | /* | 1271 | /* |
@@ -1291,10 +1274,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1291 | * take duplicate reference counts on a subsystem that's already used, | 1274 | * take duplicate reference counts on a subsystem that's already used, |
1292 | * but rebind_subsystems handles this case. | 1275 | * but rebind_subsystems handles this case. |
1293 | */ | 1276 | */ |
1294 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | 1277 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1295 | unsigned long bit = 1UL << i; | 1278 | unsigned long bit = 1UL << i; |
1296 | 1279 | ||
1297 | if (!(bit & opts->subsys_bits)) | 1280 | if (!(bit & opts->subsys_mask)) |
1298 | continue; | 1281 | continue; |
1299 | if (!try_module_get(subsys[i]->module)) { | 1282 | if (!try_module_get(subsys[i]->module)) { |
1300 | module_pin_failed = true; | 1283 | module_pin_failed = true; |
@@ -1307,11 +1290,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1307 | * raced with a module_delete call, and to the user this is | 1290 | * raced with a module_delete call, and to the user this is |
1308 | * essentially a "subsystem doesn't exist" case. | 1291 | * essentially a "subsystem doesn't exist" case. |
1309 | */ | 1292 | */ |
1310 | for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { | 1293 | for (i--; i >= 0; i--) { |
1311 | /* drop refcounts only on the ones we took */ | 1294 | /* drop refcounts only on the ones we took */ |
1312 | unsigned long bit = 1UL << i; | 1295 | unsigned long bit = 1UL << i; |
1313 | 1296 | ||
1314 | if (!(bit & opts->subsys_bits)) | 1297 | if (!(bit & opts->subsys_mask)) |
1315 | continue; | 1298 | continue; |
1316 | module_put(subsys[i]->module); | 1299 | module_put(subsys[i]->module); |
1317 | } | 1300 | } |
@@ -1321,13 +1304,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1321 | return 0; | 1304 | return 0; |
1322 | } | 1305 | } |
1323 | 1306 | ||
1324 | static void drop_parsed_module_refcounts(unsigned long subsys_bits) | 1307 | static void drop_parsed_module_refcounts(unsigned long subsys_mask) |
1325 | { | 1308 | { |
1326 | int i; | 1309 | int i; |
1327 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | 1310 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1328 | unsigned long bit = 1UL << i; | 1311 | unsigned long bit = 1UL << i; |
1329 | 1312 | ||
1330 | if (!(bit & subsys_bits)) | 1313 | if (!(bit & subsys_mask)) |
1331 | continue; | 1314 | continue; |
1332 | module_put(subsys[i]->module); | 1315 | module_put(subsys[i]->module); |
1333 | } | 1316 | } |
@@ -1339,6 +1322,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1339 | struct cgroupfs_root *root = sb->s_fs_info; | 1322 | struct cgroupfs_root *root = sb->s_fs_info; |
1340 | struct cgroup *cgrp = &root->top_cgroup; | 1323 | struct cgroup *cgrp = &root->top_cgroup; |
1341 | struct cgroup_sb_opts opts; | 1324 | struct cgroup_sb_opts opts; |
1325 | unsigned long added_mask, removed_mask; | ||
1342 | 1326 | ||
1343 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | 1327 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); |
1344 | mutex_lock(&cgroup_mutex); | 1328 | mutex_lock(&cgroup_mutex); |
@@ -1349,28 +1333,38 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1349 | if (ret) | 1333 | if (ret) |
1350 | goto out_unlock; | 1334 | goto out_unlock; |
1351 | 1335 | ||
1352 | /* See feature-removal-schedule.txt */ | 1336 | if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) |
1353 | if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) | ||
1354 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", | 1337 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", |
1355 | task_tgid_nr(current), current->comm); | 1338 | task_tgid_nr(current), current->comm); |
1356 | 1339 | ||
1340 | added_mask = opts.subsys_mask & ~root->subsys_mask; | ||
1341 | removed_mask = root->subsys_mask & ~opts.subsys_mask; | ||
1342 | |||
1357 | /* Don't allow flags or name to change at remount */ | 1343 | /* Don't allow flags or name to change at remount */ |
1358 | if (opts.flags != root->flags || | 1344 | if (opts.flags != root->flags || |
1359 | (opts.name && strcmp(opts.name, root->name))) { | 1345 | (opts.name && strcmp(opts.name, root->name))) { |
1360 | ret = -EINVAL; | 1346 | ret = -EINVAL; |
1361 | drop_parsed_module_refcounts(opts.subsys_bits); | 1347 | drop_parsed_module_refcounts(opts.subsys_mask); |
1362 | goto out_unlock; | 1348 | goto out_unlock; |
1363 | } | 1349 | } |
1364 | 1350 | ||
1365 | ret = rebind_subsystems(root, opts.subsys_bits); | 1351 | /* |
1352 | * Clear out the files of subsystems that should be removed.  Do | ||
1353 | * this before rebind_subsystems(), since rebind_subsystems() may | ||
1354 | * change this hierarchy's subsys_list. | ||
1355 | */ | ||
1356 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); | ||
1357 | |||
1358 | ret = rebind_subsystems(root, opts.subsys_mask); | ||
1366 | if (ret) { | 1359 | if (ret) { |
1367 | drop_parsed_module_refcounts(opts.subsys_bits); | 1360 | /* rebind_subsystems failed, re-populate the removed files */ |
1361 | cgroup_populate_dir(cgrp, false, removed_mask); | ||
1362 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
1368 | goto out_unlock; | 1363 | goto out_unlock; |
1369 | } | 1364 | } |
1370 | 1365 | ||
1371 | /* clear out any existing files and repopulate subsystem files */ | 1366 | /* re-populate subsystem files */ |
1372 | cgroup_clear_directory(cgrp->dentry); | 1367 | cgroup_populate_dir(cgrp, false, added_mask); |
1373 | cgroup_populate_dir(cgrp); | ||
1374 | 1368 | ||
1375 | if (opts.release_agent) | 1369 | if (opts.release_agent) |
1376 | strcpy(root->release_agent_path, opts.release_agent); | 1370 | strcpy(root->release_agent_path, opts.release_agent); |
@@ -1396,11 +1390,13 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1396 | INIT_LIST_HEAD(&cgrp->children); | 1390 | INIT_LIST_HEAD(&cgrp->children); |
1397 | INIT_LIST_HEAD(&cgrp->files); | 1391 | INIT_LIST_HEAD(&cgrp->files); |
1398 | INIT_LIST_HEAD(&cgrp->css_sets); | 1392 | INIT_LIST_HEAD(&cgrp->css_sets); |
1393 | INIT_LIST_HEAD(&cgrp->allcg_node); | ||
1399 | INIT_LIST_HEAD(&cgrp->release_list); | 1394 | INIT_LIST_HEAD(&cgrp->release_list); |
1400 | INIT_LIST_HEAD(&cgrp->pidlists); | 1395 | INIT_LIST_HEAD(&cgrp->pidlists); |
1401 | mutex_init(&cgrp->pidlist_mutex); | 1396 | mutex_init(&cgrp->pidlist_mutex); |
1402 | INIT_LIST_HEAD(&cgrp->event_list); | 1397 | INIT_LIST_HEAD(&cgrp->event_list); |
1403 | spin_lock_init(&cgrp->event_list_lock); | 1398 | spin_lock_init(&cgrp->event_list_lock); |
1399 | simple_xattrs_init(&cgrp->xattrs); | ||
1404 | } | 1400 | } |
1405 | 1401 | ||
1406 | static void init_cgroup_root(struct cgroupfs_root *root) | 1402 | static void init_cgroup_root(struct cgroupfs_root *root) |
@@ -1413,8 +1409,8 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
1413 | root->number_of_cgroups = 1; | 1409 | root->number_of_cgroups = 1; |
1414 | cgrp->root = root; | 1410 | cgrp->root = root; |
1415 | cgrp->top_cgroup = cgrp; | 1411 | cgrp->top_cgroup = cgrp; |
1416 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1417 | init_cgroup_housekeeping(cgrp); | 1412 | init_cgroup_housekeeping(cgrp); |
1413 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1418 | } | 1414 | } |
1419 | 1415 | ||
1420 | static bool init_root_id(struct cgroupfs_root *root) | 1416 | static bool init_root_id(struct cgroupfs_root *root) |
@@ -1455,8 +1451,8 @@ static int cgroup_test_super(struct super_block *sb, void *data) | |||
1455 | * If we asked for subsystems (or explicitly for no | 1451 | * If we asked for subsystems (or explicitly for no |
1456 | * subsystems) then they must match | 1452 | * subsystems) then they must match |
1457 | */ | 1453 | */ |
1458 | if ((opts->subsys_bits || opts->none) | 1454 | if ((opts->subsys_mask || opts->none) |
1459 | && (opts->subsys_bits != root->subsys_bits)) | 1455 | && (opts->subsys_mask != root->subsys_mask)) |
1460 | return 0; | 1456 | return 0; |
1461 | 1457 | ||
1462 | return 1; | 1458 | return 1; |
@@ -1466,7 +1462,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1466 | { | 1462 | { |
1467 | struct cgroupfs_root *root; | 1463 | struct cgroupfs_root *root; |
1468 | 1464 | ||
1469 | if (!opts->subsys_bits && !opts->none) | 1465 | if (!opts->subsys_mask && !opts->none) |
1470 | return NULL; | 1466 | return NULL; |
1471 | 1467 | ||
1472 | root = kzalloc(sizeof(*root), GFP_KERNEL); | 1468 | root = kzalloc(sizeof(*root), GFP_KERNEL); |
@@ -1479,14 +1475,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1479 | } | 1475 | } |
1480 | init_cgroup_root(root); | 1476 | init_cgroup_root(root); |
1481 | 1477 | ||
1482 | root->subsys_bits = opts->subsys_bits; | 1478 | root->subsys_mask = opts->subsys_mask; |
1483 | root->flags = opts->flags; | 1479 | root->flags = opts->flags; |
1480 | ida_init(&root->cgroup_ida); | ||
1484 | if (opts->release_agent) | 1481 | if (opts->release_agent) |
1485 | strcpy(root->release_agent_path, opts->release_agent); | 1482 | strcpy(root->release_agent_path, opts->release_agent); |
1486 | if (opts->name) | 1483 | if (opts->name) |
1487 | strcpy(root->name, opts->name); | 1484 | strcpy(root->name, opts->name); |
1488 | if (opts->clone_children) | 1485 | if (opts->cpuset_clone_children) |
1489 | set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); | 1486 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); |
1490 | return root; | 1487 | return root; |
1491 | } | 1488 | } |
1492 | 1489 | ||
@@ -1499,6 +1496,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root) | |||
1499 | spin_lock(&hierarchy_id_lock); | 1496 | spin_lock(&hierarchy_id_lock); |
1500 | ida_remove(&hierarchy_ida, root->hierarchy_id); | 1497 | ida_remove(&hierarchy_ida, root->hierarchy_id); |
1501 | spin_unlock(&hierarchy_id_lock); | 1498 | spin_unlock(&hierarchy_id_lock); |
1499 | ida_destroy(&root->cgroup_ida); | ||
1502 | kfree(root); | 1500 | kfree(root); |
1503 | } | 1501 | } |
1504 | 1502 | ||
@@ -1511,7 +1509,7 @@ static int cgroup_set_super(struct super_block *sb, void *data) | |||
1511 | if (!opts->new_root) | 1509 | if (!opts->new_root) |
1512 | return -EINVAL; | 1510 | return -EINVAL; |
1513 | 1511 | ||
1514 | BUG_ON(!opts->subsys_bits && !opts->none); | 1512 | BUG_ON(!opts->subsys_mask && !opts->none); |
1515 | 1513 | ||
1516 | ret = set_anon_super(sb, NULL); | 1514 | ret = set_anon_super(sb, NULL); |
1517 | if (ret) | 1515 | if (ret) |
@@ -1629,7 +1627,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1629 | if (ret) | 1627 | if (ret) |
1630 | goto unlock_drop; | 1628 | goto unlock_drop; |
1631 | 1629 | ||
1632 | ret = rebind_subsystems(root, root->subsys_bits); | 1630 | ret = rebind_subsystems(root, root->subsys_mask); |
1633 | if (ret == -EBUSY) { | 1631 | if (ret == -EBUSY) { |
1634 | free_cg_links(&tmp_cg_links); | 1632 | free_cg_links(&tmp_cg_links); |
1635 | goto unlock_drop; | 1633 | goto unlock_drop; |
@@ -1664,12 +1662,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1664 | 1662 | ||
1665 | free_cg_links(&tmp_cg_links); | 1663 | free_cg_links(&tmp_cg_links); |
1666 | 1664 | ||
1667 | BUG_ON(!list_empty(&root_cgrp->sibling)); | ||
1668 | BUG_ON(!list_empty(&root_cgrp->children)); | 1665 | BUG_ON(!list_empty(&root_cgrp->children)); |
1669 | BUG_ON(root->number_of_cgroups != 1); | 1666 | BUG_ON(root->number_of_cgroups != 1); |
1670 | 1667 | ||
1671 | cred = override_creds(&init_cred); | 1668 | cred = override_creds(&init_cred); |
1672 | cgroup_populate_dir(root_cgrp); | 1669 | cgroup_populate_dir(root_cgrp, true, root->subsys_mask); |
1673 | revert_creds(cred); | 1670 | revert_creds(cred); |
1674 | mutex_unlock(&cgroup_root_mutex); | 1671 | mutex_unlock(&cgroup_root_mutex); |
1675 | mutex_unlock(&cgroup_mutex); | 1672 | mutex_unlock(&cgroup_mutex); |
@@ -1681,7 +1678,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1681 | */ | 1678 | */ |
1682 | cgroup_drop_root(opts.new_root); | 1679 | cgroup_drop_root(opts.new_root); |
1683 | /* no subsys rebinding, so refcounts don't change */ | 1680 | /* no subsys rebinding, so refcounts don't change */ |
1684 | drop_parsed_module_refcounts(opts.subsys_bits); | 1681 | drop_parsed_module_refcounts(opts.subsys_mask); |
1685 | } | 1682 | } |
1686 | 1683 | ||
1687 | kfree(opts.release_agent); | 1684 | kfree(opts.release_agent); |
@@ -1695,7 +1692,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1695 | drop_new_super: | 1692 | drop_new_super: |
1696 | deactivate_locked_super(sb); | 1693 | deactivate_locked_super(sb); |
1697 | drop_modules: | 1694 | drop_modules: |
1698 | drop_parsed_module_refcounts(opts.subsys_bits); | 1695 | drop_parsed_module_refcounts(opts.subsys_mask); |
1699 | out_err: | 1696 | out_err: |
1700 | kfree(opts.release_agent); | 1697 | kfree(opts.release_agent); |
1701 | kfree(opts.name); | 1698 | kfree(opts.name); |
@@ -1713,7 +1710,6 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1713 | 1710 | ||
1714 | BUG_ON(root->number_of_cgroups != 1); | 1711 | BUG_ON(root->number_of_cgroups != 1); |
1715 | BUG_ON(!list_empty(&cgrp->children)); | 1712 | BUG_ON(!list_empty(&cgrp->children)); |
1716 | BUG_ON(!list_empty(&cgrp->sibling)); | ||
1717 | 1713 | ||
1718 | mutex_lock(&cgroup_mutex); | 1714 | mutex_lock(&cgroup_mutex); |
1719 | mutex_lock(&cgroup_root_mutex); | 1715 | mutex_lock(&cgroup_root_mutex); |
@@ -1745,6 +1741,8 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1745 | mutex_unlock(&cgroup_root_mutex); | 1741 | mutex_unlock(&cgroup_root_mutex); |
1746 | mutex_unlock(&cgroup_mutex); | 1742 | mutex_unlock(&cgroup_mutex); |
1747 | 1743 | ||
1744 | simple_xattrs_free(&cgrp->xattrs); | ||
1745 | |||
1748 | kill_litter_super(sb); | 1746 | kill_litter_super(sb); |
1749 | cgroup_drop_root(root); | 1747 | cgroup_drop_root(root); |
1750 | } | 1748 | } |
@@ -1769,9 +1767,11 @@ static struct kobject *cgroup_kobj; | |||
1769 | */ | 1767 | */ |
1770 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | 1768 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) |
1771 | { | 1769 | { |
1770 | struct dentry *dentry = cgrp->dentry; | ||
1772 | char *start; | 1771 | char *start; |
1773 | struct dentry *dentry = rcu_dereference_check(cgrp->dentry, | 1772 | |
1774 | cgroup_lock_is_held()); | 1773 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), |
1774 | "cgroup_path() called without proper locking"); | ||
1775 | 1775 | ||
1776 | if (!dentry || cgrp == dummytop) { | 1776 | if (!dentry || cgrp == dummytop) { |
1777 | /* | 1777 | /* |
@@ -1782,9 +1782,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1782 | return 0; | 1782 | return 0; |
1783 | } | 1783 | } |
1784 | 1784 | ||
1785 | start = buf + buflen; | 1785 | start = buf + buflen - 1; |
1786 | 1786 | ||
1787 | *--start = '\0'; | 1787 | *start = '\0'; |
1788 | for (;;) { | 1788 | for (;;) { |
1789 | int len = dentry->d_name.len; | 1789 | int len = dentry->d_name.len; |
1790 | 1790 | ||
@@ -1795,8 +1795,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1795 | if (!cgrp) | 1795 | if (!cgrp) |
1796 | break; | 1796 | break; |
1797 | 1797 | ||
1798 | dentry = rcu_dereference_check(cgrp->dentry, | 1798 | dentry = cgrp->dentry; |
1799 | cgroup_lock_is_held()); | ||
1800 | if (!cgrp->parent) | 1799 | if (!cgrp->parent) |
1801 | continue; | 1800 | continue; |
1802 | if (--start < buf) | 1801 | if (--start < buf) |
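Since cgroup_path() now asserts its locking rules via rcu_lockdep_assert() instead of taking RCU-checked dereferences of the dentry, a caller sketch may help. The helper below is hypothetical and only shows the expected calling convention: hold rcu_read_lock() (or cgroup_mutex) around the call, and note that the buffer is filled from the end backwards.

    #include <linux/cgroup.h>
    #include <linux/kernel.h>
    #include <linux/limits.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    /* Hypothetical helper: log the hierarchy-relative path of @cgrp. */
    static void log_cgroup_path(struct cgroup *cgrp)
    {
            char *buf = kmalloc(PATH_MAX, GFP_KERNEL);

            if (!buf)
                    return;

            rcu_read_lock();
            if (!cgroup_path(cgrp, buf, PATH_MAX))
                    pr_info("cgroup: %s\n", buf);
            rcu_read_unlock();

            kfree(buf);
    }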
@@ -1891,9 +1890,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); | |||
1891 | /* | 1890 | /* |
1892 | * cgroup_task_migrate - move a task from one cgroup to another. | 1891 | * cgroup_task_migrate - move a task from one cgroup to another. |
1893 | * | 1892 | * |
1894 | * 'guarantee' is set if the caller promises that a new css_set for the task | 1893 | * Must be called with cgroup_mutex and threadgroup locked. |
1895 | * will already exist. If not set, this function might sleep, and can fail with | ||
1896 | * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. | ||
1897 | */ | 1894 | */ |
1898 | static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | 1895 | static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, |
1899 | struct task_struct *tsk, struct css_set *newcg) | 1896 | struct task_struct *tsk, struct css_set *newcg) |
@@ -1923,9 +1920,8 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | |||
1923 | * trading it for newcg is protected by cgroup_mutex, we're safe to drop | 1920 | * trading it for newcg is protected by cgroup_mutex, we're safe to drop |
1924 | * it here; it will be freed under RCU. | 1921 | * it here; it will be freed under RCU. |
1925 | */ | 1922 | */ |
1926 | put_css_set(oldcg); | ||
1927 | |||
1928 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); | 1923 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); |
1924 | put_css_set(oldcg); | ||
1929 | } | 1925 | } |
1930 | 1926 | ||
1931 | /** | 1927 | /** |
@@ -1987,12 +1983,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1987 | } | 1983 | } |
1988 | 1984 | ||
1989 | synchronize_rcu(); | 1985 | synchronize_rcu(); |
1990 | |||
1991 | /* | ||
1992 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | ||
1993 | * is no longer empty. | ||
1994 | */ | ||
1995 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
1996 | out: | 1986 | out: |
1997 | if (retval) { | 1987 | if (retval) { |
1998 | for_each_subsys(root, ss) { | 1988 | for_each_subsys(root, ss) { |
@@ -2162,7 +2152,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2162 | * step 5: success! and cleanup | 2152 | * step 5: success! and cleanup |
2163 | */ | 2153 | */ |
2164 | synchronize_rcu(); | 2154 | synchronize_rcu(); |
2165 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2166 | retval = 0; | 2155 | retval = 0; |
2167 | out_put_css_set_refs: | 2156 | out_put_css_set_refs: |
2168 | if (retval) { | 2157 | if (retval) { |
@@ -2551,6 +2540,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2551 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | 2540 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); |
2552 | } | 2541 | } |
2553 | 2542 | ||
2543 | static struct simple_xattrs *__d_xattrs(struct dentry *dentry) | ||
2544 | { | ||
2545 | if (S_ISDIR(dentry->d_inode->i_mode)) | ||
2546 | return &__d_cgrp(dentry)->xattrs; | ||
2547 | else | ||
2548 | return &__d_cft(dentry)->xattrs; | ||
2549 | } | ||
2550 | |||
2551 | static inline int xattr_enabled(struct dentry *dentry) | ||
2552 | { | ||
2553 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; | ||
2554 | return test_bit(ROOT_XATTR, &root->flags); | ||
2555 | } | ||
2556 | |||
2557 | static bool is_valid_xattr(const char *name) | ||
2558 | { | ||
2559 | if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || | ||
2560 | !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) | ||
2561 | return true; | ||
2562 | return false; | ||
2563 | } | ||
2564 | |||
2565 | static int cgroup_setxattr(struct dentry *dentry, const char *name, | ||
2566 | const void *val, size_t size, int flags) | ||
2567 | { | ||
2568 | if (!xattr_enabled(dentry)) | ||
2569 | return -EOPNOTSUPP; | ||
2570 | if (!is_valid_xattr(name)) | ||
2571 | return -EINVAL; | ||
2572 | return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); | ||
2573 | } | ||
2574 | |||
2575 | static int cgroup_removexattr(struct dentry *dentry, const char *name) | ||
2576 | { | ||
2577 | if (!xattr_enabled(dentry)) | ||
2578 | return -EOPNOTSUPP; | ||
2579 | if (!is_valid_xattr(name)) | ||
2580 | return -EINVAL; | ||
2581 | return simple_xattr_remove(__d_xattrs(dentry), name); | ||
2582 | } | ||
2583 | |||
2584 | static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, | ||
2585 | void *buf, size_t size) | ||
2586 | { | ||
2587 | if (!xattr_enabled(dentry)) | ||
2588 | return -EOPNOTSUPP; | ||
2589 | if (!is_valid_xattr(name)) | ||
2590 | return -EINVAL; | ||
2591 | return simple_xattr_get(__d_xattrs(dentry), name, buf, size); | ||
2592 | } | ||
2593 | |||
2594 | static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) | ||
2595 | { | ||
2596 | if (!xattr_enabled(dentry)) | ||
2597 | return -EOPNOTSUPP; | ||
2598 | return simple_xattr_list(__d_xattrs(dentry), buf, size); | ||
2599 | } | ||
2600 | |||
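For completeness, a hedged userspace sketch of how these handlers are reached once a hierarchy is mounted with the "xattr" option. The cgroup directory path is an assumption; only "trusted." and "security." prefixed names pass is_valid_xattr(), and the "trusted." namespace needs CAP_SYS_ADMIN.

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/xattr.h>

    int main(void)
    {
            const char *dir = "/mnt/cgroup-test/mygroup"; /* hypothetical */
            char buf[64];
            ssize_t len;

            /* stored in the per-cgroup simple_xattrs added above */
            if (setxattr(dir, "trusted.comment", "build-farm", 10, 0)) {
                    perror("setxattr");
                    return 1;
            }

            len = getxattr(dir, "trusted.comment", buf, sizeof(buf) - 1);
            if (len < 0) {
                    perror("getxattr");
                    return 1;
            }
            buf[len] = '\0';
            printf("trusted.comment = %s\n", buf);
            return 0;
    }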
2554 | static const struct file_operations cgroup_file_operations = { | 2601 | static const struct file_operations cgroup_file_operations = { |
2555 | .read = cgroup_file_read, | 2602 | .read = cgroup_file_read, |
2556 | .write = cgroup_file_write, | 2603 | .write = cgroup_file_write, |
@@ -2559,11 +2606,22 @@ static const struct file_operations cgroup_file_operations = { | |||
2559 | .release = cgroup_file_release, | 2606 | .release = cgroup_file_release, |
2560 | }; | 2607 | }; |
2561 | 2608 | ||
2609 | static const struct inode_operations cgroup_file_inode_operations = { | ||
2610 | .setxattr = cgroup_setxattr, | ||
2611 | .getxattr = cgroup_getxattr, | ||
2612 | .listxattr = cgroup_listxattr, | ||
2613 | .removexattr = cgroup_removexattr, | ||
2614 | }; | ||
2615 | |||
2562 | static const struct inode_operations cgroup_dir_inode_operations = { | 2616 | static const struct inode_operations cgroup_dir_inode_operations = { |
2563 | .lookup = cgroup_lookup, | 2617 | .lookup = cgroup_lookup, |
2564 | .mkdir = cgroup_mkdir, | 2618 | .mkdir = cgroup_mkdir, |
2565 | .rmdir = cgroup_rmdir, | 2619 | .rmdir = cgroup_rmdir, |
2566 | .rename = cgroup_rename, | 2620 | .rename = cgroup_rename, |
2621 | .setxattr = cgroup_setxattr, | ||
2622 | .getxattr = cgroup_getxattr, | ||
2623 | .listxattr = cgroup_listxattr, | ||
2624 | .removexattr = cgroup_removexattr, | ||
2567 | }; | 2625 | }; |
2568 | 2626 | ||
2569 | static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) | 2627 | static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) |
@@ -2604,45 +2662,27 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, | |||
2604 | 2662 | ||
2605 | /* start off with i_nlink == 2 (for "." entry) */ | 2663 | /* start off with i_nlink == 2 (for "." entry) */ |
2606 | inc_nlink(inode); | 2664 | inc_nlink(inode); |
2665 | inc_nlink(dentry->d_parent->d_inode); | ||
2607 | 2666 | ||
2608 | /* start with the directory inode held, so that we can | 2667 | /* |
2609 | * populate it without racing with another mkdir */ | 2668 | * Control reaches here with cgroup_mutex held. |
2610 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); | 2669 | * @inode->i_mutex should nest outside cgroup_mutex but we |
2670 | * want to populate it immediately without releasing | ||
2671 | * cgroup_mutex. As @inode isn't visible to anyone else | ||
2672 | * yet, trylock will always succeed without affecting | ||
2673 | * lockdep checks. | ||
2674 | */ | ||
2675 | WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); | ||
2611 | } else if (S_ISREG(mode)) { | 2676 | } else if (S_ISREG(mode)) { |
2612 | inode->i_size = 0; | 2677 | inode->i_size = 0; |
2613 | inode->i_fop = &cgroup_file_operations; | 2678 | inode->i_fop = &cgroup_file_operations; |
2679 | inode->i_op = &cgroup_file_inode_operations; | ||
2614 | } | 2680 | } |
2615 | d_instantiate(dentry, inode); | 2681 | d_instantiate(dentry, inode); |
2616 | dget(dentry); /* Extra count - pin the dentry in core */ | 2682 | dget(dentry); /* Extra count - pin the dentry in core */ |
2617 | return 0; | 2683 | return 0; |
2618 | } | 2684 | } |
2619 | 2685 | ||
2620 | /* | ||
2621 | * cgroup_create_dir - create a directory for an object. | ||
2622 | * @cgrp: the cgroup we create the directory for. It must have a valid | ||
2623 | * ->parent field. And we are going to fill its ->dentry field. | ||
2624 | * @dentry: dentry of the new cgroup | ||
2625 | * @mode: mode to set on new directory. | ||
2626 | */ | ||
2627 | static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, | ||
2628 | umode_t mode) | ||
2629 | { | ||
2630 | struct dentry *parent; | ||
2631 | int error = 0; | ||
2632 | |||
2633 | parent = cgrp->parent->dentry; | ||
2634 | error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); | ||
2635 | if (!error) { | ||
2636 | dentry->d_fsdata = cgrp; | ||
2637 | inc_nlink(parent->d_inode); | ||
2638 | rcu_assign_pointer(cgrp->dentry, dentry); | ||
2639 | dget(dentry); | ||
2640 | } | ||
2641 | dput(dentry); | ||
2642 | |||
2643 | return error; | ||
2644 | } | ||
2645 | |||
2646 | /** | 2686 | /** |
2647 | * cgroup_file_mode - deduce file mode of a control file | 2687 | * cgroup_file_mode - deduce file mode of a control file |
2648 | * @cft: the control file in question | 2688 | * @cft: the control file in question |
@@ -2671,7 +2711,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
2671 | } | 2711 | } |
2672 | 2712 | ||
2673 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2713 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2674 | const struct cftype *cft) | 2714 | struct cftype *cft) |
2675 | { | 2715 | { |
2676 | struct dentry *dir = cgrp->dentry; | 2716 | struct dentry *dir = cgrp->dentry; |
2677 | struct cgroup *parent = __d_cgrp(dir); | 2717 | struct cgroup *parent = __d_cgrp(dir); |
@@ -2681,11 +2721,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2681 | umode_t mode; | 2721 | umode_t mode; |
2682 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2722 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2683 | 2723 | ||
2684 | /* does @cft->flags tell us to skip creation on @cgrp? */ | 2724 | simple_xattrs_init(&cft->xattrs); |
2685 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2686 | return 0; | ||
2687 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2688 | return 0; | ||
2689 | 2725 | ||
2690 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2726 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
2691 | strcpy(name, subsys->name); | 2727 | strcpy(name, subsys->name); |
@@ -2721,12 +2757,18 @@ out: | |||
2721 | } | 2757 | } |
2722 | 2758 | ||
2723 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2759 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2724 | const struct cftype cfts[], bool is_add) | 2760 | struct cftype cfts[], bool is_add) |
2725 | { | 2761 | { |
2726 | const struct cftype *cft; | 2762 | struct cftype *cft; |
2727 | int err, ret = 0; | 2763 | int err, ret = 0; |
2728 | 2764 | ||
2729 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2765 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2766 | /* does cft->flags tell us to skip this file on @cgrp? */ | ||
2767 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2768 | continue; | ||
2769 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2770 | continue; | ||
2771 | |||
2730 | if (is_add) | 2772 | if (is_add) |
2731 | err = cgroup_add_file(cgrp, subsys, cft); | 2773 | err = cgroup_add_file(cgrp, subsys, cft); |
2732 | else | 2774 | else |
@@ -2757,7 +2799,7 @@ static void cgroup_cfts_prepare(void) | |||
2757 | } | 2799 | } |
2758 | 2800 | ||
2759 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | 2801 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, |
2760 | const struct cftype *cfts, bool is_add) | 2802 | struct cftype *cfts, bool is_add) |
2761 | __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) | 2803 | __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) |
2762 | { | 2804 | { |
2763 | LIST_HEAD(pending); | 2805 | LIST_HEAD(pending); |
@@ -2808,7 +2850,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
2808 | * function currently returns 0 as long as @cfts registration is successful | 2850 | * function currently returns 0 as long as @cfts registration is successful |
2809 | * even if some file creation attempts on existing cgroups fail. | 2851 | * even if some file creation attempts on existing cgroups fail. |
2810 | */ | 2852 | */ |
2811 | int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | 2853 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) |
2812 | { | 2854 | { |
2813 | struct cftype_set *set; | 2855 | struct cftype_set *set; |
2814 | 2856 | ||
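To show how the per-file flags interact with the filtering that moved into cgroup_addrm_files() above, here is a hypothetical controller registering one file that is skipped on the hierarchy root. The "demo" names are made up; only the cftype fields and the cgroup_add_cftypes() call visible in this code are relied on.

    #include <linux/cgroup.h>
    #include <linux/init.h>

    /* hypothetical controller object, defined elsewhere */
    extern struct cgroup_subsys demo_subsys;

    static u64 demo_weight_read(struct cgroup *cgrp, struct cftype *cft)
    {
            return 10;      /* placeholder value */
    }

    static struct cftype demo_files[] = {
            {
                    .name = "weight",
                    /* skipped on the root cgroup by cgroup_addrm_files() */
                    .flags = CFTYPE_NOT_ON_ROOT,
                    .read_u64 = demo_weight_read,
            },
            { }     /* terminate */
    };

    static int __init demo_register_files(void)
    {
            /* the file appears in existing and future cgroups of any
             * hierarchy the controller is attached to */
            return cgroup_add_cftypes(&demo_subsys, demo_files);
    }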
@@ -2838,7 +2880,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); | |||
2838 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not | 2880 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not |
2839 | * registered with @ss. | 2881 | * registered with @ss. |
2840 | */ | 2882 | */ |
2841 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | 2883 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) |
2842 | { | 2884 | { |
2843 | struct cftype_set *set; | 2885 | struct cftype_set *set; |
2844 | 2886 | ||
@@ -2934,6 +2976,92 @@ static void cgroup_enable_task_cg_lists(void) | |||
2934 | write_unlock(&css_set_lock); | 2976 | write_unlock(&css_set_lock); |
2935 | } | 2977 | } |
2936 | 2978 | ||
2979 | /** | ||
2980 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk | ||
2981 | * @pos: the current position (%NULL to initiate traversal) | ||
2982 | * @cgroup: cgroup whose descendants to walk | ||
2983 | * | ||
2984 | * To be used by cgroup_for_each_descendant_pre(). Find the next | ||
2985 | * descendant to visit for pre-order traversal of @cgroup's descendants. | ||
2986 | */ | ||
2987 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | ||
2988 | struct cgroup *cgroup) | ||
2989 | { | ||
2990 | struct cgroup *next; | ||
2991 | |||
2992 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
2993 | |||
2994 | /* if first iteration, pretend we just visited @cgroup */ | ||
2995 | if (!pos) { | ||
2996 | if (list_empty(&cgroup->children)) | ||
2997 | return NULL; | ||
2998 | pos = cgroup; | ||
2999 | } | ||
3000 | |||
3001 | /* visit the first child if it exists */ ||
3002 | next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); | ||
3003 | if (next) | ||
3004 | return next; | ||
3005 | |||
3006 | /* no child, visit my or the closest ancestor's next sibling */ | ||
3007 | do { | ||
3008 | next = list_entry_rcu(pos->sibling.next, struct cgroup, | ||
3009 | sibling); | ||
3010 | if (&next->sibling != &pos->parent->children) | ||
3011 | return next; | ||
3012 | |||
3013 | pos = pos->parent; | ||
3014 | } while (pos != cgroup); | ||
3015 | |||
3016 | return NULL; | ||
3017 | } | ||
3018 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | ||
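A short sketch of how the pre-order iterator is meant to be driven; the counting helper is hypothetical, and the cgroup_for_each_descendant_pre() wrapper named in the comment packages the same loop.

    #include <linux/cgroup.h>
    #include <linux/rcupdate.h>

    /* Hypothetical: count all descendants of @parent.  The whole walk
     * must stay inside one RCU read-side critical section, as the
     * WARN_ON_ONCE() in the iterator insists. */
    static int count_descendants(struct cgroup *parent)
    {
            struct cgroup *pos = NULL;
            int n = 0;

            rcu_read_lock();
            while ((pos = cgroup_next_descendant_pre(pos, parent)))
                    n++;
            rcu_read_unlock();

            return n;
    }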
3019 | |||
3020 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | ||
3021 | { | ||
3022 | struct cgroup *last; | ||
3023 | |||
3024 | do { | ||
3025 | last = pos; | ||
3026 | pos = list_first_or_null_rcu(&pos->children, struct cgroup, | ||
3027 | sibling); | ||
3028 | } while (pos); | ||
3029 | |||
3030 | return last; | ||
3031 | } | ||
3032 | |||
3033 | /** | ||
3034 | * cgroup_next_descendant_post - find the next descendant for post-order walk | ||
3035 | * @pos: the current position (%NULL to initiate traversal) | ||
3036 | * @cgroup: cgroup whose descendants to walk | ||
3037 | * | ||
3038 | * To be used by cgroup_for_each_descendant_post(). Find the next | ||
3039 | * descendant to visit for post-order traversal of @cgroup's descendants. | ||
3040 | */ | ||
3041 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | ||
3042 | struct cgroup *cgroup) | ||
3043 | { | ||
3044 | struct cgroup *next; | ||
3045 | |||
3046 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
3047 | |||
3048 | /* if first iteration, visit the leftmost descendant */ | ||
3049 | if (!pos) { | ||
3050 | next = cgroup_leftmost_descendant(cgroup); | ||
3051 | return next != cgroup ? next : NULL; | ||
3052 | } | ||
3053 | |||
3054 | /* if there's an unvisited sibling, visit its leftmost descendant */ | ||
3055 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | ||
3056 | if (&next->sibling != &pos->parent->children) | ||
3057 | return cgroup_leftmost_descendant(next); | ||
3058 | |||
3059 | /* no sibling left, visit parent */ | ||
3060 | next = pos->parent; | ||
3061 | return next != cgroup ? next : NULL; | ||
3062 | } | ||
3063 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); | ||
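And the matching post-order pattern, useful when children must be handled before their parent (teardown-style work); again a hypothetical helper under the same RCU rule.

    #include <linux/cgroup.h>
    #include <linux/kernel.h>
    #include <linux/rcupdate.h>

    static void demo_walk_post(struct cgroup *parent)
    {
            struct cgroup *pos = NULL;

            rcu_read_lock();
            while ((pos = cgroup_next_descendant_post(pos, parent))) {
                    /* every descendant of @pos has already been visited */
                    pr_debug("visiting %s\n", pos->dentry->d_name.name);
            }
            rcu_read_unlock();
    }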
3064 | |||
2937 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 3065 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) |
2938 | __acquires(css_set_lock) | 3066 | __acquires(css_set_lock) |
2939 | { | 3067 | { |
@@ -3280,7 +3408,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3280 | { | 3408 | { |
3281 | struct cgroup_pidlist *l; | 3409 | struct cgroup_pidlist *l; |
3282 | /* don't need task_nsproxy() if we're looking at ourself */ | 3410 | /* don't need task_nsproxy() if we're looking at ourself */ |
3283 | struct pid_namespace *ns = current->nsproxy->pid_ns; | 3411 | struct pid_namespace *ns = task_active_pid_ns(current); |
3284 | 3412 | ||
3285 | /* | 3413 | /* |
3286 | * We can't drop the pidlist_mutex before taking the l->mutex in case | 3414 | * We can't drop the pidlist_mutex before taking the l->mutex in case |
@@ -3647,7 +3775,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
3647 | if (flags & POLLHUP) { | 3775 | if (flags & POLLHUP) { |
3648 | __remove_wait_queue(event->wqh, &event->wait); | 3776 | __remove_wait_queue(event->wqh, &event->wait); |
3649 | spin_lock(&cgrp->event_list_lock); | 3777 | spin_lock(&cgrp->event_list_lock); |
3650 | list_del(&event->list); | 3778 | list_del_init(&event->list); |
3651 | spin_unlock(&cgrp->event_list_lock); | 3779 | spin_unlock(&cgrp->event_list_lock); |
3652 | /* | 3780 | /* |
3653 | * We are in atomic context, but cgroup_event_remove() may | 3781 | * We are in atomic context, but cgroup_event_remove() may |
@@ -3784,7 +3912,7 @@ fail: | |||
3784 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | 3912 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, |
3785 | struct cftype *cft) | 3913 | struct cftype *cft) |
3786 | { | 3914 | { |
3787 | return clone_children(cgrp); | 3915 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3788 | } | 3916 | } |
3789 | 3917 | ||
3790 | static int cgroup_clone_children_write(struct cgroup *cgrp, | 3918 | static int cgroup_clone_children_write(struct cgroup *cgrp, |
@@ -3792,9 +3920,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, | |||
3792 | u64 val) | 3920 | u64 val) |
3793 | { | 3921 | { |
3794 | if (val) | 3922 | if (val) |
3795 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3923 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3796 | else | 3924 | else |
3797 | clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3925 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3798 | return 0; | 3926 | return 0; |
3799 | } | 3927 | } |
3800 | 3928 | ||
@@ -3843,18 +3971,29 @@ static struct cftype files[] = { | |||
3843 | { } /* terminate */ | 3971 | { } /* terminate */ |
3844 | }; | 3972 | }; |
3845 | 3973 | ||
3846 | static int cgroup_populate_dir(struct cgroup *cgrp) | 3974 | /** |
3975 | * cgroup_populate_dir - selective creation of files in a directory ||
3976 | * @cgrp: target cgroup | ||
3977 | * @base_files: true if the base files should be added | ||
3978 | * @subsys_mask: mask of the subsystem ids whose files should be added | ||
3979 | */ | ||
3980 | static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | ||
3981 | unsigned long subsys_mask) | ||
3847 | { | 3982 | { |
3848 | int err; | 3983 | int err; |
3849 | struct cgroup_subsys *ss; | 3984 | struct cgroup_subsys *ss; |
3850 | 3985 | ||
3851 | err = cgroup_addrm_files(cgrp, NULL, files, true); | 3986 | if (base_files) { |
3852 | if (err < 0) | 3987 | err = cgroup_addrm_files(cgrp, NULL, files, true); |
3853 | return err; | 3988 | if (err < 0) |
3989 | return err; | ||
3990 | } | ||
3854 | 3991 | ||
3855 | /* process cftsets of each subsystem */ | 3992 | /* process cftsets of each subsystem */ |
3856 | for_each_subsys(cgrp->root, ss) { | 3993 | for_each_subsys(cgrp->root, ss) { |
3857 | struct cftype_set *set; | 3994 | struct cftype_set *set; |
3995 | if (!test_bit(ss->subsys_id, &subsys_mask)) | ||
3996 | continue; | ||
3858 | 3997 | ||
3859 | list_for_each_entry(set, &ss->cftsets, node) | 3998 | list_for_each_entry(set, &ss->cftsets, node) |
3860 | cgroup_addrm_files(cgrp, ss, set->cfts, true); | 3999 | cgroup_addrm_files(cgrp, ss, set->cfts, true); |
@@ -3896,19 +4035,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
3896 | css->flags = 0; | 4035 | css->flags = 0; |
3897 | css->id = NULL; | 4036 | css->id = NULL; |
3898 | if (cgrp == dummytop) | 4037 | if (cgrp == dummytop) |
3899 | set_bit(CSS_ROOT, &css->flags); | 4038 | css->flags |= CSS_ROOT; |
3900 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 4039 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
3901 | cgrp->subsys[ss->subsys_id] = css; | 4040 | cgrp->subsys[ss->subsys_id] = css; |
3902 | 4041 | ||
3903 | /* | 4042 | /* |
3904 | * If !clear_css_refs, css holds an extra ref to @cgrp->dentry | 4043 | * css holds an extra ref to @cgrp->dentry which is put on the last |
3905 | * which is put on the last css_put(). dput() requires process | 4044 | * css_put(). dput() requires process context, which css_put() may |
3906 | * context, which css_put() may be called without. @css->dput_work | 4045 | * be called without. @css->dput_work will be used to invoke |
3907 | * will be used to invoke dput() asynchronously from css_put(). | 4046 | * dput() asynchronously from css_put(). |
3908 | */ | 4047 | */ |
3909 | INIT_WORK(&css->dput_work, css_dput_fn); | 4048 | INIT_WORK(&css->dput_work, css_dput_fn); |
3910 | if (ss->__DEPRECATED_clear_css_refs) | 4049 | } |
3911 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | 4050 | |
4051 | /* invoke ->post_create() on a new CSS and mark it online if successful */ | ||
4052 | static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
4053 | { | ||
4054 | int ret = 0; | ||
4055 | |||
4056 | lockdep_assert_held(&cgroup_mutex); | ||
4057 | |||
4058 | if (ss->css_online) | ||
4059 | ret = ss->css_online(cgrp); | ||
4060 | if (!ret) | ||
4061 | cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; | ||
4062 | return ret; | ||
4063 | } | ||
4064 | |||
4051 | /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ | ||
4066 | static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
4067 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | ||
4068 | { | ||
4069 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4070 | |||
4071 | lockdep_assert_held(&cgroup_mutex); | ||
4072 | |||
4073 | if (!(css->flags & CSS_ONLINE)) | ||
4074 | return; | ||
4075 | |||
4076 | /* | ||
4077 | * css_offline() should be called with cgroup_mutex unlocked. See | ||
4078 | * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for | ||
4079 | * details. This temporary unlocking should go away once | ||
4080 | * cgroup_mutex is unexported from controllers. | ||
4081 | */ | ||
4082 | if (ss->css_offline) { | ||
4083 | mutex_unlock(&cgroup_mutex); | ||
4084 | ss->css_offline(cgrp); | ||
4085 | mutex_lock(&cgroup_mutex); | ||
4086 | } | ||
4087 | |||
4088 | cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; | ||
3912 | } | 4089 | } |
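online_css()/offline_css() turn the css lifecycle into an explicit flag-guarded handshake: CSS_ONLINE is set only after ->css_online() succeeds, ->css_offline() runs at most once, and cgroup_mutex is temporarily dropped around the offline callback. A minimal pthread model of that discipline, with made-up names standing in for the kernel structures, might look like this:

#include <pthread.h>
#include <stdio.h>

#define ONLINE 0x1

struct state {
    unsigned int flags;
    int (*on_online)(struct state *);
    void (*on_offline)(struct state *);
};

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

static int bring_online(struct state *s)      /* caller holds big_lock */
{
    int ret = 0;

    if (s->on_online)
        ret = s->on_online(s);
    if (!ret)
        s->flags |= ONLINE;                   /* only mark online on success */
    return ret;
}

static void take_offline(struct state *s)     /* caller holds big_lock */
{
    if (!(s->flags & ONLINE))                 /* offline at most once */
        return;

    if (s->on_offline) {
        /* mirror the temporary unlock: the callback must not run under
         * the big lock, so drop and retake it around the call */
        pthread_mutex_unlock(&big_lock);
        s->on_offline(s);
        pthread_mutex_lock(&big_lock);
    }
    s->flags &= ~ONLINE;
}

static int hello_online(struct state *s)   { (void)s; puts("online");  return 0; }
static void hello_offline(struct state *s) { (void)s; puts("offline"); }

int main(void)
{
    struct state s = { 0, hello_online, hello_offline };

    pthread_mutex_lock(&big_lock);
    if (!bring_online(&s))
        take_offline(&s);
    pthread_mutex_unlock(&big_lock);
    return 0;
}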
3913 | 4090 | ||
3914 | /* | 4091 | /* |
@@ -3928,10 +4105,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3928 | struct cgroup_subsys *ss; | 4105 | struct cgroup_subsys *ss; |
3929 | struct super_block *sb = root->sb; | 4106 | struct super_block *sb = root->sb; |
3930 | 4107 | ||
4108 | /* allocate the cgroup and its ID, 0 is reserved for the root */ | ||
3931 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); | 4109 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); |
3932 | if (!cgrp) | 4110 | if (!cgrp) |
3933 | return -ENOMEM; | 4111 | return -ENOMEM; |
3934 | 4112 | ||
4113 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | ||
4114 | if (cgrp->id < 0) | ||
4115 | goto err_free_cgrp; | ||
4116 | |||
4117 | /* | ||
4118 | * Only live parents can have children. Note that the liveliness | ||
4119 | * check isn't strictly necessary because cgroup_mkdir() and | ||
4120 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it | ||
4121 | * anyway so that locking is contained inside cgroup proper and we | ||
4122 | * don't get nasty surprises if we ever grow another caller. | ||
4123 | */ | ||
4124 | if (!cgroup_lock_live_group(parent)) { | ||
4125 | err = -ENODEV; | ||
4126 | goto err_free_id; | ||
4127 | } | ||
4128 | |||
3935 | /* Grab a reference on the superblock so the hierarchy doesn't | 4129 | /* Grab a reference on the superblock so the hierarchy doesn't |
3936 | * get deleted on unmount if there are child cgroups. This | 4130 | * get deleted on unmount if there are child cgroups. This |
3937 | * can be done outside cgroup_mutex, since the sb can't | 4131 | * can be done outside cgroup_mutex, since the sb can't |
@@ -3939,8 +4133,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3939 | * fs */ | 4133 | * fs */ |
3940 | atomic_inc(&sb->s_active); | 4134 | atomic_inc(&sb->s_active); |
3941 | 4135 | ||
3942 | mutex_lock(&cgroup_mutex); | ||
3943 | |||
3944 | init_cgroup_housekeeping(cgrp); | 4136 | init_cgroup_housekeeping(cgrp); |
3945 | 4137 | ||
3946 | cgrp->parent = parent; | 4138 | cgrp->parent = parent; |
@@ -3950,71 +4142,90 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3950 | if (notify_on_release(parent)) | 4142 | if (notify_on_release(parent)) |
3951 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 4143 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
3952 | 4144 | ||
3953 | if (clone_children(parent)) | 4145 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) |
3954 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 4146 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3955 | 4147 | ||
3956 | for_each_subsys(root, ss) { | 4148 | for_each_subsys(root, ss) { |
3957 | struct cgroup_subsys_state *css = ss->create(cgrp); | 4149 | struct cgroup_subsys_state *css; |
3958 | 4150 | ||
4151 | css = ss->css_alloc(cgrp); | ||
3959 | if (IS_ERR(css)) { | 4152 | if (IS_ERR(css)) { |
3960 | err = PTR_ERR(css); | 4153 | err = PTR_ERR(css); |
3961 | goto err_destroy; | 4154 | goto err_free_all; |
3962 | } | 4155 | } |
3963 | init_cgroup_css(css, ss, cgrp); | 4156 | init_cgroup_css(css, ss, cgrp); |
3964 | if (ss->use_id) { | 4157 | if (ss->use_id) { |
3965 | err = alloc_css_id(ss, parent, cgrp); | 4158 | err = alloc_css_id(ss, parent, cgrp); |
3966 | if (err) | 4159 | if (err) |
3967 | goto err_destroy; | 4160 | goto err_free_all; |
3968 | } | 4161 | } |
3969 | /* At error, ->destroy() callback has to free assigned ID. */ | ||
3970 | if (clone_children(parent) && ss->post_clone) | ||
3971 | ss->post_clone(cgrp); | ||
3972 | } | 4162 | } |
3973 | 4163 | ||
3974 | list_add(&cgrp->sibling, &cgrp->parent->children); | 4164 | /* |
3975 | root->number_of_cgroups++; | 4165 | * Create directory. cgroup_create_file() returns with the new |
3976 | 4166 | * directory locked on success so that it can be populated without | |
3977 | err = cgroup_create_dir(cgrp, dentry, mode); | 4167 | * dropping cgroup_mutex. |
4168 | */ | ||
4169 | err = cgroup_create_file(dentry, S_IFDIR | mode, sb); | ||
3978 | if (err < 0) | 4170 | if (err < 0) |
3979 | goto err_remove; | 4171 | goto err_free_all; |
4172 | lockdep_assert_held(&dentry->d_inode->i_mutex); | ||
4173 | |||
4174 | /* allocation complete, commit to creation */ | ||
4175 | dentry->d_fsdata = cgrp; | ||
4176 | cgrp->dentry = dentry; | ||
4177 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4178 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | ||
4179 | root->number_of_cgroups++; | ||
3980 | 4180 | ||
3981 | /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ | 4181 | /* each css holds a ref to the cgroup's dentry */ |
3982 | for_each_subsys(root, ss) | 4182 | for_each_subsys(root, ss) |
3983 | if (!ss->__DEPRECATED_clear_css_refs) | 4183 | dget(dentry); |
3984 | dget(dentry); | ||
3985 | 4184 | ||
3986 | /* The cgroup directory was pre-locked for us */ | 4185 | /* creation succeeded, notify subsystems */ |
3987 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | 4186 | for_each_subsys(root, ss) { |
4187 | err = online_css(ss, cgrp); | ||
4188 | if (err) | ||
4189 | goto err_destroy; | ||
3988 | 4190 | ||
3989 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | 4191 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && |
4192 | parent->parent) { | ||
4193 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", | ||
4194 | current->comm, current->pid, ss->name); | ||
4195 | if (!strcmp(ss->name, "memory")) | ||
4196 | pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); | ||
4197 | ss->warned_broken_hierarchy = true; | ||
4198 | } | ||
4199 | } | ||
3990 | 4200 | ||
3991 | err = cgroup_populate_dir(cgrp); | 4201 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); |
3992 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 4202 | if (err) |
4203 | goto err_destroy; | ||
3993 | 4204 | ||
3994 | mutex_unlock(&cgroup_mutex); | 4205 | mutex_unlock(&cgroup_mutex); |
3995 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 4206 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
3996 | 4207 | ||
3997 | return 0; | 4208 | return 0; |
3998 | 4209 | ||
3999 | err_remove: | 4210 | err_free_all: |
4000 | |||
4001 | list_del(&cgrp->sibling); | ||
4002 | root->number_of_cgroups--; | ||
4003 | |||
4004 | err_destroy: | ||
4005 | |||
4006 | for_each_subsys(root, ss) { | 4211 | for_each_subsys(root, ss) { |
4007 | if (cgrp->subsys[ss->subsys_id]) | 4212 | if (cgrp->subsys[ss->subsys_id]) |
4008 | ss->destroy(cgrp); | 4213 | ss->css_free(cgrp); |
4009 | } | 4214 | } |
4010 | |||
4011 | mutex_unlock(&cgroup_mutex); | 4215 | mutex_unlock(&cgroup_mutex); |
4012 | |||
4013 | /* Release the reference count that we took on the superblock */ | 4216 | /* Release the reference count that we took on the superblock */ |
4014 | deactivate_super(sb); | 4217 | deactivate_super(sb); |
4015 | 4218 | err_free_id: | |
4219 | ida_simple_remove(&root->cgroup_ida, cgrp->id); | ||
4220 | err_free_cgrp: | ||
4016 | kfree(cgrp); | 4221 | kfree(cgrp); |
4017 | return err; | 4222 | return err; |
4223 | |||
4224 | err_destroy: | ||
4225 | cgroup_destroy_locked(cgrp); | ||
4226 | mutex_unlock(&cgroup_mutex); | ||
4227 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
4228 | return err; | ||
4018 | } | 4229 | } |
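The reworked error handling above is the usual reverse-order goto unwind: each label frees exactly what was acquired before the failing step (err_free_all, then err_free_id, then err_free_cgrp), and err_destroy takes over once the cgroup has been committed to the hierarchy. A generic userspace sketch of the same pattern, with plain allocations standing in for the cgroup, its ID and its files:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int create_thing(int fail_at)
{
    char *obj, *id;
    int err;

    obj = malloc(32);                 /* first resource */
    if (!obj)
        return -ENOMEM;

    id = malloc(16);                  /* second resource */
    if (!id) {
        err = -ENOMEM;
        goto err_free_obj;
    }

    if (fail_at) {                    /* simulated later failure */
        err = -EINVAL;
        goto err_free_id;
    }

    printf("created\n");
    free(id);
    free(obj);
    return 0;

err_free_id:                          /* unwind in reverse order */
    free(id);
err_free_obj:
    free(obj);
    return err;
}

int main(void)
{
    printf("%d %d\n", create_thing(0), create_thing(1));
    return 0;
}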
4019 | 4230 | ||
4020 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | 4231 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) |
@@ -4066,153 +4277,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
4066 | return 0; | 4277 | return 0; |
4067 | } | 4278 | } |
4068 | 4279 | ||
4069 | /* | 4280 | static int cgroup_destroy_locked(struct cgroup *cgrp) |
4070 | * Atomically mark all (or else none) of the cgroup's CSS objects as | 4281 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4071 | * CSS_REMOVED. Return true on success, or false if the cgroup has | ||
4072 | * busy subsystems. Call with cgroup_mutex held | ||
4073 | * | ||
4074 | * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or | ||
4075 | * not, cgroup removal behaves differently. | ||
4076 | * | ||
4077 | * If clear is set, css refcnt for the subsystem should be zero before | ||
4078 | * cgroup removal can be committed. This is implemented by | ||
4079 | * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be | ||
4080 | * called multiple times until all css refcnts reach zero and is allowed to | ||
4081 | * veto removal on any invocation. This behavior is deprecated and will be | ||
4082 | * removed as soon as the existing user (memcg) is updated. | ||
4083 | * | ||
4084 | * If clear is not set, each css holds an extra reference to the cgroup's | ||
4085 | * dentry and cgroup removal proceeds regardless of css refs. | ||
4086 | * ->pre_destroy() will be called at least once and is not allowed to fail. | ||
4087 | * On the last put of each css, whenever that may be, the extra dentry ref | ||
4088 | * is put so that dentry destruction happens only after all css's are | ||
4089 | * released. | ||
4090 | */ | ||
4091 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | ||
4092 | { | 4282 | { |
4283 | struct dentry *d = cgrp->dentry; | ||
4284 | struct cgroup *parent = cgrp->parent; | ||
4285 | DEFINE_WAIT(wait); | ||
4286 | struct cgroup_event *event, *tmp; | ||
4093 | struct cgroup_subsys *ss; | 4287 | struct cgroup_subsys *ss; |
4094 | unsigned long flags; | 4288 | LIST_HEAD(tmp_list); |
4095 | bool failed = false; | 4289 | |
4290 | lockdep_assert_held(&d->d_inode->i_mutex); | ||
4291 | lockdep_assert_held(&cgroup_mutex); | ||
4096 | 4292 | ||
4097 | local_irq_save(flags); | 4293 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) |
4294 | return -EBUSY; | ||
4098 | 4295 | ||
4099 | /* | 4296 | /* |
4100 | * Block new css_tryget() by deactivating refcnt. If all refcnts | 4297 | * Block new css_tryget() by deactivating refcnt and mark @cgrp |
4101 | * for subsystems w/ clear_css_refs set were 1 at the moment of | 4298 | * removed. This makes future css_tryget() and child creation |
4102 | * deactivation, we succeeded. | 4299 | * attempts fail thus maintaining the removal conditions verified |
4300 | * above. | ||
4103 | */ | 4301 | */ |
4104 | for_each_subsys(cgrp->root, ss) { | 4302 | for_each_subsys(cgrp->root, ss) { |
4105 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4303 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4106 | 4304 | ||
4107 | WARN_ON(atomic_read(&css->refcnt) < 0); | 4305 | WARN_ON(atomic_read(&css->refcnt) < 0); |
4108 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); | 4306 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); |
4109 | |||
4110 | if (ss->__DEPRECATED_clear_css_refs) | ||
4111 | failed |= css_refcnt(css) != 1; | ||
4112 | } | 4307 | } |
4308 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4113 | 4309 | ||
4114 | /* | 4310 | /* tell subsystems to initiate destruction */ |
4115 | * If succeeded, set REMOVED and put all the base refs; otherwise, | 4311 | for_each_subsys(cgrp->root, ss) |
4116 | * restore refcnts to positive values. Either way, all in-progress | 4312 | offline_css(ss, cgrp); |
4117 | * css_tryget() will be released. | ||
4118 | */ | ||
4119 | for_each_subsys(cgrp->root, ss) { | ||
4120 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4121 | |||
4122 | if (!failed) { | ||
4123 | set_bit(CSS_REMOVED, &css->flags); | ||
4124 | css_put(css); | ||
4125 | } else { | ||
4126 | atomic_sub(CSS_DEACT_BIAS, &css->refcnt); | ||
4127 | } | ||
4128 | } | ||
4129 | |||
4130 | local_irq_restore(flags); | ||
4131 | return !failed; | ||
4132 | } | ||
4133 | |||
4134 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | ||
4135 | { | ||
4136 | struct cgroup *cgrp = dentry->d_fsdata; | ||
4137 | struct dentry *d; | ||
4138 | struct cgroup *parent; | ||
4139 | DEFINE_WAIT(wait); | ||
4140 | struct cgroup_event *event, *tmp; | ||
4141 | int ret; | ||
4142 | |||
4143 | /* the vfs holds both inode->i_mutex already */ | ||
4144 | again: | ||
4145 | mutex_lock(&cgroup_mutex); | ||
4146 | if (atomic_read(&cgrp->count) != 0) { | ||
4147 | mutex_unlock(&cgroup_mutex); | ||
4148 | return -EBUSY; | ||
4149 | } | ||
4150 | if (!list_empty(&cgrp->children)) { | ||
4151 | mutex_unlock(&cgroup_mutex); | ||
4152 | return -EBUSY; | ||
4153 | } | ||
4154 | mutex_unlock(&cgroup_mutex); | ||
4155 | 4313 | ||
4156 | /* | 4314 | /* |
4157 | * In general, subsystem has no css->refcnt after pre_destroy(). But | 4315 | * Put all the base refs. Each css holds an extra reference to the |
4158 | * in racy cases, subsystem may have to get css->refcnt after | 4316 | * cgroup's dentry and cgroup removal proceeds regardless of css |
4159 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes | 4317 | * refs. On the last put of each css, whenever that may be, the |
4160 | * make rmdir return -EBUSY too often. To avoid that, we use waitqueue | 4318 | * extra dentry ref is put so that dentry destruction happens only |
4161 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir | 4319 | * after all css's are released. |
4162 | * and subsystem's reference count handling. Please see css_get/put | ||
4163 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. | ||
4164 | */ | 4320 | */ |
4165 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 4321 | for_each_subsys(cgrp->root, ss) |
4166 | 4322 | css_put(cgrp->subsys[ss->subsys_id]); | |
4167 | /* | ||
4168 | * Call pre_destroy handlers of subsys. Notify subsystems | ||
4169 | * that rmdir() request comes. | ||
4170 | */ | ||
4171 | ret = cgroup_call_pre_destroy(cgrp); | ||
4172 | if (ret) { | ||
4173 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4174 | return ret; | ||
4175 | } | ||
4176 | |||
4177 | mutex_lock(&cgroup_mutex); | ||
4178 | parent = cgrp->parent; | ||
4179 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | ||
4180 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4181 | mutex_unlock(&cgroup_mutex); | ||
4182 | return -EBUSY; | ||
4183 | } | ||
4184 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); | ||
4185 | if (!cgroup_clear_css_refs(cgrp)) { | ||
4186 | mutex_unlock(&cgroup_mutex); | ||
4187 | /* | ||
4188 | * Because someone may call cgroup_wakeup_rmdir_waiter() before | ||
4189 | * prepare_to_wait(), we need to check this flag. | ||
4190 | */ | ||
4191 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) | ||
4192 | schedule(); | ||
4193 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4194 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4195 | if (signal_pending(current)) | ||
4196 | return -EINTR; | ||
4197 | goto again; | ||
4198 | } | ||
4199 | /* NO css_tryget() can success after here. */ | ||
4200 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4201 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4202 | 4323 | ||
4203 | raw_spin_lock(&release_list_lock); | 4324 | raw_spin_lock(&release_list_lock); |
4204 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4205 | if (!list_empty(&cgrp->release_list)) | 4325 | if (!list_empty(&cgrp->release_list)) |
4206 | list_del_init(&cgrp->release_list); | 4326 | list_del_init(&cgrp->release_list); |
4207 | raw_spin_unlock(&release_list_lock); | 4327 | raw_spin_unlock(&release_list_lock); |
4208 | 4328 | ||
4209 | /* delete this cgroup from parent->children */ | 4329 | /* delete this cgroup from parent->children */ |
4210 | list_del_init(&cgrp->sibling); | 4330 | list_del_rcu(&cgrp->sibling); |
4211 | |||
4212 | list_del_init(&cgrp->allcg_node); | 4331 | list_del_init(&cgrp->allcg_node); |
4213 | 4332 | ||
4214 | d = dget(cgrp->dentry); | 4333 | dget(d); |
4215 | |||
4216 | cgroup_d_remove_dir(d); | 4334 | cgroup_d_remove_dir(d); |
4217 | dput(d); | 4335 | dput(d); |
4218 | 4336 | ||
@@ -4222,21 +4340,35 @@ again: | |||
4222 | /* | 4340 | /* |
4223 | * Unregister events and notify userspace. | 4341 | * Unregister events and notify userspace. |
4224 | * Notify userspace about cgroup removing only after rmdir of cgroup | 4342 | * Notify userspace about cgroup removing only after rmdir of cgroup |
4225 | * directory to avoid race between userspace and kernelspace | 4343 | * directory to avoid race between userspace and kernelspace. Use |
4344 | * a temporary list to avoid a deadlock with cgroup_event_wake(). Since | ||
4345 | * cgroup_event_wake() is called with the wait queue head locked, | ||
4346 | * remove_wait_queue() cannot be called while holding event_list_lock. | ||
4226 | */ | 4347 | */ |
4227 | spin_lock(&cgrp->event_list_lock); | 4348 | spin_lock(&cgrp->event_list_lock); |
4228 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | 4349 | list_splice_init(&cgrp->event_list, &tmp_list); |
4229 | list_del(&event->list); | 4350 | spin_unlock(&cgrp->event_list_lock); |
4351 | list_for_each_entry_safe(event, tmp, &tmp_list, list) { | ||
4352 | list_del_init(&event->list); | ||
4230 | remove_wait_queue(event->wqh, &event->wait); | 4353 | remove_wait_queue(event->wqh, &event->wait); |
4231 | eventfd_signal(event->eventfd, 1); | 4354 | eventfd_signal(event->eventfd, 1); |
4232 | schedule_work(&event->remove); | 4355 | schedule_work(&event->remove); |
4233 | } | 4356 | } |
4234 | spin_unlock(&cgrp->event_list_lock); | ||
4235 | 4357 | ||
4236 | mutex_unlock(&cgroup_mutex); | ||
4237 | return 0; | 4358 | return 0; |
4238 | } | 4359 | } |
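Unregistering the events uses a splice-then-process pattern: the whole event list is detached onto a private list while event_list_lock is held, and remove_wait_queue()/eventfd_signal() run only after the lock is dropped, which avoids the lock-order problem with cgroup_event_wake() described in the comment. A simplified userspace sketch of the same idea, with a hand-rolled singly-linked list and a pthread mutex standing in for list_splice_init() and the spinlock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct event {
    int id;
    struct event *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct event *event_list;

static void add_event(int id)
{
    struct event *ev = malloc(sizeof(*ev));

    if (!ev)
        return;
    ev->id = id;
    pthread_mutex_lock(&list_lock);
    ev->next = event_list;
    event_list = ev;
    pthread_mutex_unlock(&list_lock);
}

static void flush_events(void)
{
    struct event *tmp_list;

    /* detach the whole list while holding the lock ... */
    pthread_mutex_lock(&list_lock);
    tmp_list = event_list;
    event_list = NULL;
    pthread_mutex_unlock(&list_lock);

    /* ... and do the potentially lock-taking work outside it */
    while (tmp_list) {
        struct event *ev = tmp_list;

        tmp_list = ev->next;
        printf("signalling event %d\n", ev->id);
        free(ev);
    }
}

int main(void)
{
    add_event(1);
    add_event(2);
    flush_events();
    return 0;
}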
4239 | 4360 | ||
4361 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | ||
4362 | { | ||
4363 | int ret; | ||
4364 | |||
4365 | mutex_lock(&cgroup_mutex); | ||
4366 | ret = cgroup_destroy_locked(dentry->d_fsdata); | ||
4367 | mutex_unlock(&cgroup_mutex); | ||
4368 | |||
4369 | return ret; | ||
4370 | } | ||
4371 | |||
4240 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | 4372 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) |
4241 | { | 4373 | { |
4242 | INIT_LIST_HEAD(&ss->cftsets); | 4374 | INIT_LIST_HEAD(&ss->cftsets); |
@@ -4257,13 +4389,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4257 | 4389 | ||
4258 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4390 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4259 | 4391 | ||
4392 | mutex_lock(&cgroup_mutex); | ||
4393 | |||
4260 | /* init base cftset */ | 4394 | /* init base cftset */ |
4261 | cgroup_init_cftsets(ss); | 4395 | cgroup_init_cftsets(ss); |
4262 | 4396 | ||
4263 | /* Create the top cgroup state for this subsystem */ | 4397 | /* Create the top cgroup state for this subsystem */ |
4264 | list_add(&ss->sibling, &rootnode.subsys_list); | 4398 | list_add(&ss->sibling, &rootnode.subsys_list); |
4265 | ss->root = &rootnode; | 4399 | ss->root = &rootnode; |
4266 | css = ss->create(dummytop); | 4400 | css = ss->css_alloc(dummytop); |
4267 | /* We don't handle early failures gracefully */ | 4401 | /* We don't handle early failures gracefully */ |
4268 | BUG_ON(IS_ERR(css)); | 4402 | BUG_ON(IS_ERR(css)); |
4269 | init_cgroup_css(css, ss, dummytop); | 4403 | init_cgroup_css(css, ss, dummytop); |
@@ -4272,7 +4406,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4272 | * pointer to this state - since the subsystem is | 4406 | * pointer to this state - since the subsystem is |
4273 | * newly registered, all tasks and hence the | 4407 | * newly registered, all tasks and hence the |
4274 | * init_css_set is in the subsystem's top cgroup. */ | 4408 | * init_css_set is in the subsystem's top cgroup. */ |
4275 | init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; | 4409 | init_css_set.subsys[ss->subsys_id] = css; |
4276 | 4410 | ||
4277 | need_forkexit_callback |= ss->fork || ss->exit; | 4411 | need_forkexit_callback |= ss->fork || ss->exit; |
4278 | 4412 | ||
@@ -4282,6 +4416,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4282 | BUG_ON(!list_empty(&init_task.tasks)); | 4416 | BUG_ON(!list_empty(&init_task.tasks)); |
4283 | 4417 | ||
4284 | ss->active = 1; | 4418 | ss->active = 1; |
4419 | BUG_ON(online_css(ss, dummytop)); | ||
4420 | |||
4421 | mutex_unlock(&cgroup_mutex); | ||
4285 | 4422 | ||
4286 | /* this function shouldn't be used with modular subsystems, since they | 4423 | /* this function shouldn't be used with modular subsystems, since they |
4287 | * need to register a subsys_id, among other things */ | 4424 | * need to register a subsys_id, among other things */ |
@@ -4299,12 +4436,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4299 | */ | 4436 | */ |
4300 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | 4437 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) |
4301 | { | 4438 | { |
4302 | int i; | ||
4303 | struct cgroup_subsys_state *css; | 4439 | struct cgroup_subsys_state *css; |
4440 | int i, ret; | ||
4304 | 4441 | ||
4305 | /* check name and function validity */ | 4442 | /* check name and function validity */ |
4306 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || | 4443 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || |
4307 | ss->create == NULL || ss->destroy == NULL) | 4444 | ss->css_alloc == NULL || ss->css_free == NULL) |
4308 | return -EINVAL; | 4445 | return -EINVAL; |
4309 | 4446 | ||
4310 | /* | 4447 | /* |
@@ -4321,8 +4458,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4321 | * since cgroup_init_subsys will have already taken care of it. | 4458 | * since cgroup_init_subsys will have already taken care of it. |
4322 | */ | 4459 | */ |
4323 | if (ss->module == NULL) { | 4460 | if (ss->module == NULL) { |
4324 | /* a few sanity checks */ | 4461 | /* a sanity check */ |
4325 | BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); | ||
4326 | BUG_ON(subsys[ss->subsys_id] != ss); | 4462 | BUG_ON(subsys[ss->subsys_id] != ss); |
4327 | return 0; | 4463 | return 0; |
4328 | } | 4464 | } |
@@ -4330,33 +4466,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4330 | /* init base cftset */ | 4466 | /* init base cftset */ |
4331 | cgroup_init_cftsets(ss); | 4467 | cgroup_init_cftsets(ss); |
4332 | 4468 | ||
4333 | /* | ||
4334 | * need to register a subsys id before anything else - for example, | ||
4335 | * init_cgroup_css needs it. | ||
4336 | */ | ||
4337 | mutex_lock(&cgroup_mutex); | 4469 | mutex_lock(&cgroup_mutex); |
4338 | /* find the first empty slot in the array */ | 4470 | subsys[ss->subsys_id] = ss; |
4339 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4340 | if (subsys[i] == NULL) | ||
4341 | break; | ||
4342 | } | ||
4343 | if (i == CGROUP_SUBSYS_COUNT) { | ||
4344 | /* maximum number of subsystems already registered! */ | ||
4345 | mutex_unlock(&cgroup_mutex); | ||
4346 | return -EBUSY; | ||
4347 | } | ||
4348 | /* assign ourselves the subsys_id */ | ||
4349 | ss->subsys_id = i; | ||
4350 | subsys[i] = ss; | ||
4351 | 4471 | ||
4352 | /* | 4472 | /* |
4353 | * no ss->create seems to need anything important in the ss struct, so | 4473 | * no ss->css_alloc seems to need anything important in the ss |
4354 | * this can happen first (i.e. before the rootnode attachment). | 4474 | * struct, so this can happen first (i.e. before the rootnode |
4475 | * attachment). | ||
4355 | */ | 4476 | */ |
4356 | css = ss->create(dummytop); | 4477 | css = ss->css_alloc(dummytop); |
4357 | if (IS_ERR(css)) { | 4478 | if (IS_ERR(css)) { |
4358 | /* failure case - need to deassign the subsys[] slot. */ | 4479 | /* failure case - need to deassign the subsys[] slot. */ |
4359 | subsys[i] = NULL; | 4480 | subsys[ss->subsys_id] = NULL; |
4360 | mutex_unlock(&cgroup_mutex); | 4481 | mutex_unlock(&cgroup_mutex); |
4361 | return PTR_ERR(css); | 4482 | return PTR_ERR(css); |
4362 | } | 4483 | } |
@@ -4368,14 +4489,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4368 | init_cgroup_css(css, ss, dummytop); | 4489 | init_cgroup_css(css, ss, dummytop); |
4369 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | 4490 | /* init_idr must be after init_cgroup_css because it sets css->id. */ |
4370 | if (ss->use_id) { | 4491 | if (ss->use_id) { |
4371 | int ret = cgroup_init_idr(ss, css); | 4492 | ret = cgroup_init_idr(ss, css); |
4372 | if (ret) { | 4493 | if (ret) |
4373 | dummytop->subsys[ss->subsys_id] = NULL; | 4494 | goto err_unload; |
4374 | ss->destroy(dummytop); | ||
4375 | subsys[i] = NULL; | ||
4376 | mutex_unlock(&cgroup_mutex); | ||
4377 | return ret; | ||
4378 | } | ||
4379 | } | 4495 | } |
4380 | 4496 | ||
4381 | /* | 4497 | /* |
@@ -4408,10 +4524,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4408 | write_unlock(&css_set_lock); | 4524 | write_unlock(&css_set_lock); |
4409 | 4525 | ||
4410 | ss->active = 1; | 4526 | ss->active = 1; |
4527 | ret = online_css(ss, dummytop); | ||
4528 | if (ret) | ||
4529 | goto err_unload; | ||
4411 | 4530 | ||
4412 | /* success! */ | 4531 | /* success! */ |
4413 | mutex_unlock(&cgroup_mutex); | 4532 | mutex_unlock(&cgroup_mutex); |
4414 | return 0; | 4533 | return 0; |
4534 | |||
4535 | err_unload: | ||
4536 | mutex_unlock(&cgroup_mutex); | ||
4537 | /* @ss can't be mounted here as try_module_get() would fail */ | ||
4538 | cgroup_unload_subsys(ss); | ||
4539 | return ret; | ||
4415 | } | 4540 | } |
4416 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); | 4541 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); |
4417 | 4542 | ||
@@ -4438,8 +4563,16 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4438 | BUG_ON(ss->root != &rootnode); | 4563 | BUG_ON(ss->root != &rootnode); |
4439 | 4564 | ||
4440 | mutex_lock(&cgroup_mutex); | 4565 | mutex_lock(&cgroup_mutex); |
4566 | |||
4567 | offline_css(ss, dummytop); | ||
4568 | ss->active = 0; | ||
4569 | |||
4570 | if (ss->use_id) { | ||
4571 | idr_remove_all(&ss->idr); | ||
4572 | idr_destroy(&ss->idr); | ||
4573 | } | ||
4574 | |||
4441 | /* deassign the subsys_id */ | 4575 | /* deassign the subsys_id */ |
4442 | BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); | ||
4443 | subsys[ss->subsys_id] = NULL; | 4576 | subsys[ss->subsys_id] = NULL; |
4444 | 4577 | ||
4445 | /* remove subsystem from rootnode's list of subsystems */ | 4578 | /* remove subsystem from rootnode's list of subsystems */ |
@@ -4454,7 +4587,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4454 | struct css_set *cg = link->cg; | 4587 | struct css_set *cg = link->cg; |
4455 | 4588 | ||
4456 | hlist_del(&cg->hlist); | 4589 | hlist_del(&cg->hlist); |
4457 | BUG_ON(!cg->subsys[ss->subsys_id]); | ||
4458 | cg->subsys[ss->subsys_id] = NULL; | 4590 | cg->subsys[ss->subsys_id] = NULL; |
4459 | hhead = css_set_hash(cg->subsys); | 4591 | hhead = css_set_hash(cg->subsys); |
4460 | hlist_add_head(&cg->hlist, hhead); | 4592 | hlist_add_head(&cg->hlist, hhead); |
@@ -4462,12 +4594,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4462 | write_unlock(&css_set_lock); | 4594 | write_unlock(&css_set_lock); |
4463 | 4595 | ||
4464 | /* | 4596 | /* |
4465 | * remove subsystem's css from the dummytop and free it - need to free | 4597 | * remove subsystem's css from the dummytop and free it - need to |
4466 | * before marking as null because ss->destroy needs the cgrp->subsys | 4598 | * free before marking as null because ss->css_free needs the |
4467 | * pointer to find their state. note that this also takes care of | 4599 | * cgrp->subsys pointer to find their state. note that this also |
4468 | * freeing the css_id. | 4600 | * takes care of freeing the css_id. |
4469 | */ | 4601 | */ |
4470 | ss->destroy(dummytop); | 4602 | ss->css_free(dummytop); |
4471 | dummytop->subsys[ss->subsys_id] = NULL; | 4603 | dummytop->subsys[ss->subsys_id] = NULL; |
4472 | 4604 | ||
4473 | mutex_unlock(&cgroup_mutex); | 4605 | mutex_unlock(&cgroup_mutex); |
@@ -4502,14 +4634,17 @@ int __init cgroup_init_early(void) | |||
4502 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) | 4634 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) |
4503 | INIT_HLIST_HEAD(&css_set_table[i]); | 4635 | INIT_HLIST_HEAD(&css_set_table[i]); |
4504 | 4636 | ||
4505 | /* at bootup time, we don't worry about modular subsystems */ | 4637 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
4506 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
4507 | struct cgroup_subsys *ss = subsys[i]; | 4638 | struct cgroup_subsys *ss = subsys[i]; |
4508 | 4639 | ||
4640 | /* at bootup time, we don't worry about modular subsystems */ | ||
4641 | if (!ss || ss->module) | ||
4642 | continue; | ||
4643 | |||
4509 | BUG_ON(!ss->name); | 4644 | BUG_ON(!ss->name); |
4510 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); | 4645 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); |
4511 | BUG_ON(!ss->create); | 4646 | BUG_ON(!ss->css_alloc); |
4512 | BUG_ON(!ss->destroy); | 4647 | BUG_ON(!ss->css_free); |
4513 | if (ss->subsys_id != i) { | 4648 | if (ss->subsys_id != i) { |
4514 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", | 4649 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", |
4515 | ss->name, ss->subsys_id); | 4650 | ss->name, ss->subsys_id); |
@@ -4538,9 +4673,12 @@ int __init cgroup_init(void) | |||
4538 | if (err) | 4673 | if (err) |
4539 | return err; | 4674 | return err; |
4540 | 4675 | ||
4541 | /* at bootup time, we don't worry about modular subsystems */ | 4676 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
4542 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
4543 | struct cgroup_subsys *ss = subsys[i]; | 4677 | struct cgroup_subsys *ss = subsys[i]; |
4678 | |||
4679 | /* at bootup time, we don't worry about modular subsystems */ | ||
4680 | if (!ss || ss->module) | ||
4681 | continue; | ||
4544 | if (!ss->early_init) | 4682 | if (!ss->early_init) |
4545 | cgroup_init_subsys(ss); | 4683 | cgroup_init_subsys(ss); |
4546 | if (ss->use_id) | 4684 | if (ss->use_id) |
@@ -4695,70 +4833,37 @@ static const struct file_operations proc_cgroupstats_operations = { | |||
4695 | * | 4833 | * |
4696 | * A pointer to the shared css_set was automatically copied in | 4834 | * A pointer to the shared css_set was automatically copied in |
4697 | * fork.c by dup_task_struct(). However, we ignore that copy, since | 4835 | * fork.c by dup_task_struct(). However, we ignore that copy, since |
4698 | * it was not made under the protection of RCU, cgroup_mutex or | 4836 | * it was not made under the protection of RCU or cgroup_mutex, so |
4699 | * threadgroup_change_begin(), so it might no longer be a valid | 4837 | * might no longer be a valid cgroup pointer. cgroup_attach_task() might |
4700 | * cgroup pointer. cgroup_attach_task() might have already changed | 4838 | * have already changed current->cgroups, allowing the previously |
4701 | * current->cgroups, allowing the previously referenced cgroup | 4839 | * referenced cgroup group to be removed and freed. |
4702 | * group to be removed and freed. | ||
4703 | * | ||
4704 | * Outside the pointer validity we also need to process the css_set | ||
4705 | * inheritance between threadgoup_change_begin() and | ||
4706 | * threadgoup_change_end(), this way there is no leak in any process | ||
4707 | * wide migration performed by cgroup_attach_proc() that could otherwise | ||
4708 | * miss a thread because it is too early or too late in the fork stage. | ||
4709 | * | 4840 | * |
4710 | * At the point that cgroup_fork() is called, 'current' is the parent | 4841 | * At the point that cgroup_fork() is called, 'current' is the parent |
4711 | * task, and the passed argument 'child' points to the child task. | 4842 | * task, and the passed argument 'child' points to the child task. |
4712 | */ | 4843 | */ |
4713 | void cgroup_fork(struct task_struct *child) | 4844 | void cgroup_fork(struct task_struct *child) |
4714 | { | 4845 | { |
4715 | /* | 4846 | task_lock(current); |
4716 | * We don't need to task_lock() current because current->cgroups | ||
4717 | * can't be changed concurrently here. The parent obviously hasn't | ||
4718 | * exited and called cgroup_exit(), and we are synchronized against | ||
4719 | * cgroup migration through threadgroup_change_begin(). | ||
4720 | */ | ||
4721 | child->cgroups = current->cgroups; | 4847 | child->cgroups = current->cgroups; |
4722 | get_css_set(child->cgroups); | 4848 | get_css_set(child->cgroups); |
4849 | task_unlock(current); | ||
4723 | INIT_LIST_HEAD(&child->cg_list); | 4850 | INIT_LIST_HEAD(&child->cg_list); |
4724 | } | 4851 | } |
4725 | 4852 | ||
4726 | /** | 4853 | /** |
4727 | * cgroup_fork_callbacks - run fork callbacks | ||
4728 | * @child: the new task | ||
4729 | * | ||
4730 | * Called on a new task very soon before adding it to the | ||
4731 | * tasklist. No need to take any locks since no-one can | ||
4732 | * be operating on this task. | ||
4733 | */ | ||
4734 | void cgroup_fork_callbacks(struct task_struct *child) | ||
4735 | { | ||
4736 | if (need_forkexit_callback) { | ||
4737 | int i; | ||
4738 | /* | ||
4739 | * forkexit callbacks are only supported for builtin | ||
4740 | * subsystems, and the builtin section of the subsys array is | ||
4741 | * immutable, so we don't need to lock the subsys array here. | ||
4742 | */ | ||
4743 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
4744 | struct cgroup_subsys *ss = subsys[i]; | ||
4745 | if (ss->fork) | ||
4746 | ss->fork(child); | ||
4747 | } | ||
4748 | } | ||
4749 | } | ||
4750 | |||
4751 | /** | ||
4752 | * cgroup_post_fork - called on a new task after adding it to the task list | 4854 | * cgroup_post_fork - called on a new task after adding it to the task list |
4753 | * @child: the task in question | 4855 | * @child: the task in question |
4754 | * | 4856 | * |
4755 | * Adds the task to the list running through its css_set if necessary. | 4857 | * Adds the task to the list running through its css_set if necessary and |
4756 | * Has to be after the task is visible on the task list in case we race | 4858 | * call the subsystem fork() callbacks. Has to be after the task is |
4757 | * with the first call to cgroup_iter_start() - to guarantee that the | 4859 | * visible on the task list in case we race with the first call to |
4758 | * new task ends up on its list. | 4860 | * cgroup_iter_start() - to guarantee that the new task ends up on its |
4861 | * list. | ||
4759 | */ | 4862 | */ |
4760 | void cgroup_post_fork(struct task_struct *child) | 4863 | void cgroup_post_fork(struct task_struct *child) |
4761 | { | 4864 | { |
4865 | int i; | ||
4866 | |||
4762 | /* | 4867 | /* |
4763 | * use_task_css_set_links is set to 1 before we walk the tasklist | 4868 | * use_task_css_set_links is set to 1 before we walk the tasklist |
4764 | * under the tasklist_lock and we read it here after we added the child | 4869 | * under the tasklist_lock and we read it here after we added the child |
@@ -4772,22 +4877,36 @@ void cgroup_post_fork(struct task_struct *child) | |||
4772 | */ | 4877 | */ |
4773 | if (use_task_css_set_links) { | 4878 | if (use_task_css_set_links) { |
4774 | write_lock(&css_set_lock); | 4879 | write_lock(&css_set_lock); |
4775 | if (list_empty(&child->cg_list)) { | 4880 | task_lock(child); |
4881 | if (list_empty(&child->cg_list)) | ||
4882 | list_add(&child->cg_list, &child->cgroups->tasks); | ||
4883 | task_unlock(child); | ||
4884 | write_unlock(&css_set_lock); | ||
4885 | } | ||
4886 | |||
4887 | /* | ||
4888 | * Call ss->fork(). This must happen after @child is linked on | ||
4889 | * css_set; otherwise, @child might change state between ->fork() | ||
4890 | * and addition to css_set. | ||
4891 | */ | ||
4892 | if (need_forkexit_callback) { | ||
4893 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4894 | struct cgroup_subsys *ss = subsys[i]; | ||
4895 | |||
4776 | /* | 4896 | /* |
4777 | * It's safe to use child->cgroups without task_lock() | 4897 | * fork/exit callbacks are supported only for |
4778 | * here because we are protected through | 4898 | * builtin subsystems and we don't need further |
4779 | * threadgroup_change_begin() against concurrent | 4899 | * synchronization as they never go away. |
4780 | * css_set change in cgroup_task_migrate(). Also | ||
4781 | * the task can't exit at that point until | ||
4782 | * wake_up_new_task() is called, so we are protected | ||
4783 | * against cgroup_exit() setting child->cgroup to | ||
4784 | * init_css_set. | ||
4785 | */ | 4900 | */ |
4786 | list_add(&child->cg_list, &child->cgroups->tasks); | 4901 | if (!ss || ss->module) |
4902 | continue; | ||
4903 | |||
4904 | if (ss->fork) | ||
4905 | ss->fork(child); | ||
4787 | } | 4906 | } |
4788 | write_unlock(&css_set_lock); | ||
4789 | } | 4907 | } |
4790 | } | 4908 | } |
4909 | |||
4791 | /** | 4910 | /** |
4792 | * cgroup_exit - detach cgroup from exiting task | 4911 | * cgroup_exit - detach cgroup from exiting task |
4793 | * @tsk: pointer to task_struct of exiting process | 4912 | * @tsk: pointer to task_struct of exiting process |
@@ -4846,12 +4965,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
4846 | tsk->cgroups = &init_css_set; | 4965 | tsk->cgroups = &init_css_set; |
4847 | 4966 | ||
4848 | if (run_callbacks && need_forkexit_callback) { | 4967 | if (run_callbacks && need_forkexit_callback) { |
4849 | /* | 4968 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
4850 | * modular subsystems can't use callbacks, so no need to lock | ||
4851 | * the subsys array | ||
4852 | */ | ||
4853 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
4854 | struct cgroup_subsys *ss = subsys[i]; | 4969 | struct cgroup_subsys *ss = subsys[i]; |
4970 | |||
4971 | /* modular subsystems can't use callbacks */ | ||
4972 | if (!ss || ss->module) | ||
4973 | continue; | ||
4974 | |||
4855 | if (ss->exit) { | 4975 | if (ss->exit) { |
4856 | struct cgroup *old_cgrp = | 4976 | struct cgroup *old_cgrp = |
4857 | rcu_dereference_raw(cg->subsys[i])->cgroup; | 4977 | rcu_dereference_raw(cg->subsys[i])->cgroup; |
@@ -4919,15 +5039,17 @@ static void check_for_release(struct cgroup *cgrp) | |||
4919 | /* Caller must verify that the css is not for root cgroup */ | 5039 | /* Caller must verify that the css is not for root cgroup */ |
4920 | bool __css_tryget(struct cgroup_subsys_state *css) | 5040 | bool __css_tryget(struct cgroup_subsys_state *css) |
4921 | { | 5041 | { |
4922 | do { | 5042 | while (true) { |
4923 | int v = css_refcnt(css); | 5043 | int t, v; |
4924 | 5044 | ||
4925 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | 5045 | v = css_refcnt(css); |
5046 | t = atomic_cmpxchg(&css->refcnt, v, v + 1); | ||
5047 | if (likely(t == v)) | ||
4926 | return true; | 5048 | return true; |
5049 | else if (t < 0) | ||
5050 | return false; | ||
4927 | cpu_relax(); | 5051 | cpu_relax(); |
4928 | } while (!test_bit(CSS_REMOVED, &css->flags)); | 5052 | } |
4929 | |||
4930 | return false; | ||
4931 | } | 5053 | } |
4932 | EXPORT_SYMBOL_GPL(__css_tryget); | 5054 | EXPORT_SYMBOL_GPL(__css_tryget); |
4933 | 5055 | ||
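__css_tryget() now relies purely on the CSS_DEACT_BIAS trick: once the owner adds the large negative bias, the value seen by the cmpxchg goes negative and the tryget fails without any CSS_REMOVED flag. The sketch below is a simplified C11-atomics model of such a biased refcount; it checks the loaded value rather than the cmpxchg result, and the bias constant is arbitrary, so treat it as an illustration of the idea rather than the kernel implementation.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define DEACT_BIAS (INT_MIN / 2)    /* arbitrary large negative bias */

struct ref {
    atomic_int refcnt;
};

/* Take a reference unless the owner has already deactivated the counter
 * by adding DEACT_BIAS, which makes the visible value negative. */
static bool ref_tryget(struct ref *r)
{
    int v = atomic_load(&r->refcnt);

    while (v >= 0) {
        if (atomic_compare_exchange_weak(&r->refcnt, &v, v + 1))
            return true;
        /* lost a race; v now holds the current value, retry */
    }
    return false;
}

static void ref_deactivate(struct ref *r)
{
    atomic_fetch_add(&r->refcnt, DEACT_BIAS);
}

int main(void)
{
    struct ref r;

    atomic_init(&r.refcnt, 1);
    printf("before deactivate: %d\n", ref_tryget(&r));  /* 1 */
    ref_deactivate(&r);
    printf("after deactivate:  %d\n", ref_tryget(&r));  /* 0 */
    return 0;
}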
@@ -4946,11 +5068,9 @@ void __css_put(struct cgroup_subsys_state *css) | |||
4946 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 5068 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
4947 | check_for_release(cgrp); | 5069 | check_for_release(cgrp); |
4948 | } | 5070 | } |
4949 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
4950 | break; | 5071 | break; |
4951 | case 0: | 5072 | case 0: |
4952 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | 5073 | schedule_work(&css->dput_work); |
4953 | schedule_work(&css->dput_work); | ||
4954 | break; | 5074 | break; |
4955 | } | 5075 | } |
4956 | rcu_read_unlock(); | 5076 | rcu_read_unlock(); |
@@ -5037,13 +5157,17 @@ static int __init cgroup_disable(char *str) | |||
5037 | while ((token = strsep(&str, ",")) != NULL) { | 5157 | while ((token = strsep(&str, ",")) != NULL) { |
5038 | if (!*token) | 5158 | if (!*token) |
5039 | continue; | 5159 | continue; |
5040 | /* | 5160 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
5041 | * cgroup_disable, being at boot time, can't know about module | ||
5042 | * subsystems, so we don't worry about them. | ||
5043 | */ | ||
5044 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
5045 | struct cgroup_subsys *ss = subsys[i]; | 5161 | struct cgroup_subsys *ss = subsys[i]; |
5046 | 5162 | ||
5163 | /* | ||
5164 | * cgroup_disable, being at boot time, can't | ||
5165 | * know about module subsystems, so we don't | ||
5166 | * worry about them. | ||
5167 | */ | ||
5168 | if (!ss || ss->module) | ||
5169 | continue; | ||
5170 | |||
5047 | if (!strcmp(token, ss->name)) { | 5171 | if (!strcmp(token, ss->name)) { |
5048 | ss->disabled = 1; | 5172 | ss->disabled = 1; |
5049 | printk(KERN_INFO "Disabling %s control group" | 5173 | printk(KERN_INFO "Disabling %s control group" |
@@ -5332,7 +5456,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | |||
5332 | } | 5456 | } |
5333 | 5457 | ||
5334 | #ifdef CONFIG_CGROUP_DEBUG | 5458 | #ifdef CONFIG_CGROUP_DEBUG |
5335 | static struct cgroup_subsys_state *debug_create(struct cgroup *cont) | 5459 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) |
5336 | { | 5460 | { |
5337 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5461 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5338 | 5462 | ||
@@ -5342,7 +5466,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont) | |||
5342 | return css; | 5466 | return css; |
5343 | } | 5467 | } |
5344 | 5468 | ||
5345 | static void debug_destroy(struct cgroup *cont) | 5469 | static void debug_css_free(struct cgroup *cont) |
5346 | { | 5470 | { |
5347 | kfree(cont->subsys[debug_subsys_id]); | 5471 | kfree(cont->subsys[debug_subsys_id]); |
5348 | } | 5472 | } |
@@ -5471,8 +5595,8 @@ static struct cftype debug_files[] = { | |||
5471 | 5595 | ||
5472 | struct cgroup_subsys debug_subsys = { | 5596 | struct cgroup_subsys debug_subsys = { |
5473 | .name = "debug", | 5597 | .name = "debug", |
5474 | .create = debug_create, | 5598 | .css_alloc = debug_css_alloc, |
5475 | .destroy = debug_destroy, | 5599 | .css_free = debug_css_free, |
5476 | .subsys_id = debug_subsys_id, | 5600 | .subsys_id = debug_subsys_id, |
5477 | .base_cftypes = debug_files, | 5601 | .base_cftypes = debug_files, |
5478 | }; | 5602 | }; |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 3649fc6b3eaa..75dda1ea5026 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -22,24 +22,33 @@ | |||
22 | #include <linux/freezer.h> | 22 | #include <linux/freezer.h> |
23 | #include <linux/seq_file.h> | 23 | #include <linux/seq_file.h> |
24 | 24 | ||
25 | enum freezer_state { | 25 | /* |
26 | CGROUP_THAWED = 0, | 26 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is |
27 | CGROUP_FREEZING, | 27 | * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared |
28 | CGROUP_FROZEN, | 28 | * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING |
29 | * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of | ||
30 | * its ancestors has FREEZING_SELF set. | ||
31 | */ | ||
32 | enum freezer_state_flags { | ||
33 | CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ | ||
34 | CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ | ||
35 | CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ | ||
36 | CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ | ||
37 | |||
38 | /* mask for all FREEZING flags */ | ||
39 | CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, | ||
29 | }; | 40 | }; |
30 | 41 | ||
31 | struct freezer { | 42 | struct freezer { |
32 | struct cgroup_subsys_state css; | 43 | struct cgroup_subsys_state css; |
33 | enum freezer_state state; | 44 | unsigned int state; |
34 | spinlock_t lock; /* protects _writes_ to state */ | 45 | spinlock_t lock; |
35 | }; | 46 | }; |
36 | 47 | ||
37 | static inline struct freezer *cgroup_freezer( | 48 | static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) |
38 | struct cgroup *cgroup) | ||
39 | { | 49 | { |
40 | return container_of( | 50 | return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), |
41 | cgroup_subsys_state(cgroup, freezer_subsys_id), | 51 | struct freezer, css); |
42 | struct freezer, css); | ||
43 | } | 52 | } |
44 | 53 | ||
45 | static inline struct freezer *task_freezer(struct task_struct *task) | 54 | static inline struct freezer *task_freezer(struct task_struct *task) |
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
48 | struct freezer, css); | 57 | struct freezer, css); |
49 | } | 58 | } |
50 | 59 | ||
60 | static struct freezer *parent_freezer(struct freezer *freezer) | ||
61 | { | ||
62 | struct cgroup *pcg = freezer->css.cgroup->parent; | ||
63 | |||
64 | if (pcg) | ||
65 | return cgroup_freezer(pcg); | ||
66 | return NULL; | ||
67 | } | ||
68 | |||
51 | bool cgroup_freezing(struct task_struct *task) | 69 | bool cgroup_freezing(struct task_struct *task) |
52 | { | 70 | { |
53 | enum freezer_state state; | ||
54 | bool ret; | 71 | bool ret; |
55 | 72 | ||
56 | rcu_read_lock(); | 73 | rcu_read_lock(); |
57 | state = task_freezer(task)->state; | 74 | ret = task_freezer(task)->state & CGROUP_FREEZING; |
58 | ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; | ||
59 | rcu_read_unlock(); | 75 | rcu_read_unlock(); |
60 | 76 | ||
61 | return ret; | 77 | return ret; |
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task) | |||
65 | * cgroups_write_string() limits the size of freezer state strings to | 81 | * cgroups_write_string() limits the size of freezer state strings to |
66 | * CGROUP_LOCAL_BUFFER_SIZE | 82 | * CGROUP_LOCAL_BUFFER_SIZE |
67 | */ | 83 | */ |
68 | static const char *freezer_state_strs[] = { | 84 | static const char *freezer_state_strs(unsigned int state) |
69 | "THAWED", | 85 | { |
70 | "FREEZING", | 86 | if (state & CGROUP_FROZEN) |
71 | "FROZEN", | 87 | return "FROZEN"; |
88 | if (state & CGROUP_FREEZING) | ||
89 | return "FREEZING"; | ||
90 | return "THAWED"; | ||
72 | }; | 91 | }; |
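With the three-state enum replaced by flags, the reported string becomes a precedence check (FROZEN first, then any FREEZING bit, then THAWED) and cgroup_freezing() reduces to a mask test. A stand-alone copy of that scheme, with shortened flag names, can be exercised directly:

#include <stdio.h>

enum {
    FREEZER_ONLINE  = 1 << 0,
    FREEZING_SELF   = 1 << 1,
    FREEZING_PARENT = 1 << 2,
    FROZEN          = 1 << 3,
    FREEZING        = FREEZING_SELF | FREEZING_PARENT,  /* mask */
};

static const char *state_str(unsigned int state)
{
    if (state & FROZEN)
        return "FROZEN";
    if (state & FREEZING)
        return "FREEZING";
    return "THAWED";
}

int main(void)
{
    /* a child whose parent is freezing but which has not frozen yet */
    unsigned int state = FREEZER_ONLINE | FREEZING_PARENT;

    printf("%s freezing=%d\n", state_str(state), !!(state & FREEZING));

    state |= FROZEN;                     /* all tasks now frozen */
    printf("%s freezing=%d\n", state_str(state), !!(state & FREEZING));
    return 0;
}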
73 | 92 | ||
74 | /* | ||
75 | * State diagram | ||
76 | * Transitions are caused by userspace writes to the freezer.state file. | ||
77 | * The values in parenthesis are state labels. The rest are edge labels. | ||
78 | * | ||
79 | * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) | ||
80 | * ^ ^ | | | ||
81 | * | \_______THAWED_______/ | | ||
82 | * \__________________________THAWED____________/ | ||
83 | */ | ||
84 | |||
85 | struct cgroup_subsys freezer_subsys; | 93 | struct cgroup_subsys freezer_subsys; |
86 | 94 | ||
87 | /* Locks taken and their ordering | 95 | static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) |
88 | * ------------------------------ | ||
89 | * cgroup_mutex (AKA cgroup_lock) | ||
90 | * freezer->lock | ||
91 | * css_set_lock | ||
92 | * task->alloc_lock (AKA task_lock) | ||
93 | * task->sighand->siglock | ||
94 | * | ||
95 | * cgroup code forces css_set_lock to be taken before task->alloc_lock | ||
96 | * | ||
97 | * freezer_create(), freezer_destroy(): | ||
98 | * cgroup_mutex [ by cgroup core ] | ||
99 | * | ||
100 | * freezer_can_attach(): | ||
101 | * cgroup_mutex (held by caller of can_attach) | ||
102 | * | ||
103 | * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): | ||
104 | * freezer->lock | ||
105 | * sighand->siglock (if the cgroup is freezing) | ||
106 | * | ||
107 | * freezer_read(): | ||
108 | * cgroup_mutex | ||
109 | * freezer->lock | ||
110 | * write_lock css_set_lock (cgroup iterator start) | ||
111 | * task->alloc_lock | ||
112 | * read_lock css_set_lock (cgroup iterator start) | ||
113 | * | ||
114 | * freezer_write() (freeze): | ||
115 | * cgroup_mutex | ||
116 | * freezer->lock | ||
117 | * write_lock css_set_lock (cgroup iterator start) | ||
118 | * task->alloc_lock | ||
119 | * read_lock css_set_lock (cgroup iterator start) | ||
120 | * sighand->siglock (fake signal delivery inside freeze_task()) | ||
121 | * | ||
122 | * freezer_write() (unfreeze): | ||
123 | * cgroup_mutex | ||
124 | * freezer->lock | ||
125 | * write_lock css_set_lock (cgroup iterator start) | ||
126 | * task->alloc_lock | ||
127 | * read_lock css_set_lock (cgroup iterator start) | ||
128 | * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) | ||
129 | * sighand->siglock | ||
130 | */ | ||
131 | static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) | ||
132 | { | 96 | { |
133 | struct freezer *freezer; | 97 | struct freezer *freezer; |
134 | 98 | ||
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) | |||
137 | return ERR_PTR(-ENOMEM); | 101 | return ERR_PTR(-ENOMEM); |
138 | 102 | ||
139 | spin_lock_init(&freezer->lock); | 103 | spin_lock_init(&freezer->lock); |
140 | freezer->state = CGROUP_THAWED; | ||
141 | return &freezer->css; | 104 | return &freezer->css; |
142 | } | 105 | } |
143 | 106 | ||
144 | static void freezer_destroy(struct cgroup *cgroup) | 107 | /** |
108 | * freezer_css_online - commit creation of a freezer cgroup | ||
109 | * @cgroup: cgroup being created | ||
110 | * | ||
111 | * We're committing to creation of @cgroup. Mark it online and inherit | ||
112 | * parent's freezing state while holding both parent's and our | ||
113 | * freezer->lock. | ||
114 | */ | ||
115 | static int freezer_css_online(struct cgroup *cgroup) | ||
116 | { | ||
117 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
118 | struct freezer *parent = parent_freezer(freezer); | ||
119 | |||
120 | /* | ||
121 | * The following double locking and freezing state inheritance | ||
122 | * guarantee that @cgroup can never escape ancestors' freezing | ||
123 | * states. See cgroup_for_each_descendant_pre() for details. | ||
124 | */ | ||
125 | if (parent) | ||
126 | spin_lock_irq(&parent->lock); | ||
127 | spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); | ||
128 | |||
129 | freezer->state |= CGROUP_FREEZER_ONLINE; | ||
130 | |||
131 | if (parent && (parent->state & CGROUP_FREEZING)) { | ||
132 | freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; | ||
133 | atomic_inc(&system_freezing_cnt); | ||
134 | } | ||
135 | |||
136 | spin_unlock(&freezer->lock); | ||
137 | if (parent) | ||
138 | spin_unlock_irq(&parent->lock); | ||
139 | |||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | /** | ||
144 | * freezer_css_offline - initiate destruction of @cgroup | ||
145 | * @cgroup: cgroup being destroyed | ||
146 | * | ||
147 | * @cgroup is going away. Mark it dead and decrement system_freezing_count | ||
148 | * if it was holding one. | ||
149 | */ | ||
150 | static void freezer_css_offline(struct cgroup *cgroup) | ||
145 | { | 151 | { |
146 | struct freezer *freezer = cgroup_freezer(cgroup); | 152 | struct freezer *freezer = cgroup_freezer(cgroup); |
147 | 153 | ||
148 | if (freezer->state != CGROUP_THAWED) | 154 | spin_lock_irq(&freezer->lock); |
155 | |||
156 | if (freezer->state & CGROUP_FREEZING) | ||
149 | atomic_dec(&system_freezing_cnt); | 157 | atomic_dec(&system_freezing_cnt); |
150 | kfree(freezer); | 158 | |
159 | freezer->state = 0; | ||
160 | |||
161 | spin_unlock_irq(&freezer->lock); | ||
151 | } | 162 | } |
152 | 163 | ||
153 | /* task is frozen or will freeze immediately when next it gets woken */ | 164 | static void freezer_css_free(struct cgroup *cgroup) |
154 | static bool is_task_frozen_enough(struct task_struct *task) | ||
155 | { | 165 | { |
156 | return frozen(task) || | 166 | kfree(cgroup_freezer(cgroup)); |
157 | (task_is_stopped_or_traced(task) && freezing(task)); | ||
158 | } | 167 | } |
159 | 168 | ||
160 | /* | 169 | /* |
161 | * The call to cgroup_lock() in the freezer.state write method prevents | 170 | * Tasks can be migrated into a different freezer anytime regardless of its |
162 | * a write to that file racing against an attach, and hence the | 171 | * current state. freezer_attach() is responsible for making new tasks |
163 | * can_attach() result will remain valid until the attach completes. | 172 | * conform to the current state. |
173 | * | ||
174 | * Freezer state changes and task migration are synchronized via | ||
175 | * @freezer->lock. freezer_attach() makes the new tasks conform to the | ||
176 | * current state and all following state changes can see the new tasks. | ||
164 | */ | 177 | */ |
165 | static int freezer_can_attach(struct cgroup *new_cgroup, | 178 | static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) |
166 | struct cgroup_taskset *tset) | ||
167 | { | 179 | { |
168 | struct freezer *freezer; | 180 | struct freezer *freezer = cgroup_freezer(new_cgrp); |
169 | struct task_struct *task; | 181 | struct task_struct *task; |
182 | bool clear_frozen = false; | ||
183 | |||
184 | spin_lock_irq(&freezer->lock); | ||
170 | 185 | ||
171 | /* | 186 | /* |
172 | * Anything frozen can't move or be moved to/from. | 187 | * Make the new tasks conform to the current state of @new_cgrp. |
188 | * For simplicity, when migrating any task to a FROZEN cgroup, we | ||
189 | * revert it to FREEZING and let update_if_frozen() determine the | ||
190 | * correct state later. | ||
191 | * | ||
192 | * Tasks in @tset are on @new_cgrp but may not conform to its | ||
193 | * current state before executing the following - !frozen tasks may | ||
194 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. | ||
173 | */ | 195 | */ |
174 | cgroup_taskset_for_each(task, new_cgroup, tset) | 196 | cgroup_taskset_for_each(task, new_cgrp, tset) { |
175 | if (cgroup_freezing(task)) | 197 | if (!(freezer->state & CGROUP_FREEZING)) { |
176 | return -EBUSY; | 198 | __thaw_task(task); |
199 | } else { | ||
200 | freeze_task(task); | ||
201 | freezer->state &= ~CGROUP_FROZEN; | ||
202 | clear_frozen = true; | ||
203 | } | ||
204 | } | ||
177 | 205 | ||
178 | freezer = cgroup_freezer(new_cgroup); | 206 | spin_unlock_irq(&freezer->lock); |
179 | if (freezer->state != CGROUP_THAWED) | ||
180 | return -EBUSY; | ||
181 | 207 | ||
182 | return 0; | 208 | /* |
209 | * Propagate FROZEN clearing upwards. We may race with | ||
210 | * update_if_frozen(), but as long as both work bottom-up, either | ||
211 | * update_if_frozen() sees child's FROZEN cleared or we clear the | ||
212 | * parent's FROZEN later. No parent w/ !FROZEN children can be | ||
213 | * left FROZEN. | ||
214 | */ | ||
215 | while (clear_frozen && (freezer = parent_freezer(freezer))) { | ||
216 | spin_lock_irq(&freezer->lock); | ||
217 | freezer->state &= ~CGROUP_FROZEN; | ||
218 | clear_frozen = freezer->state & CGROUP_FREEZING; | ||
219 | spin_unlock_irq(&freezer->lock); | ||
220 | } | ||
183 | } | 221 | } |
184 | 222 | ||
185 | static void freezer_fork(struct task_struct *task) | 223 | static void freezer_fork(struct task_struct *task) |
186 | { | 224 | { |
187 | struct freezer *freezer; | 225 | struct freezer *freezer; |
188 | 226 | ||
189 | /* | ||
190 | * No lock is needed, since the task isn't on tasklist yet, | ||
191 | * so it can't be moved to another cgroup, which means the | ||
192 | * freezer won't be removed and will be valid during this | ||
193 | * function call. Nevertheless, apply RCU read-side critical | ||
194 | * section to suppress RCU lockdep false positives. | ||
195 | */ | ||
196 | rcu_read_lock(); | 227 | rcu_read_lock(); |
197 | freezer = task_freezer(task); | 228 | freezer = task_freezer(task); |
198 | rcu_read_unlock(); | ||
199 | 229 | ||
200 | /* | 230 | /* |
201 | * The root cgroup is non-freezable, so we can skip the | 231 | * The root cgroup is non-freezable, so we can skip the |
202 | * following check. | 232 | * following check. |
203 | */ | 233 | */ |
204 | if (!freezer->css.cgroup->parent) | 234 | if (!freezer->css.cgroup->parent) |
205 | return; | 235 | goto out; |
206 | 236 | ||
207 | spin_lock_irq(&freezer->lock); | 237 | spin_lock_irq(&freezer->lock); |
208 | BUG_ON(freezer->state == CGROUP_FROZEN); | 238 | if (freezer->state & CGROUP_FREEZING) |
209 | |||
210 | /* Locking avoids race with FREEZING -> THAWED transitions. */ | ||
211 | if (freezer->state == CGROUP_FREEZING) | ||
212 | freeze_task(task); | 239 | freeze_task(task); |
213 | spin_unlock_irq(&freezer->lock); | 240 | spin_unlock_irq(&freezer->lock); |
241 | out: | ||
242 | rcu_read_unlock(); | ||
214 | } | 243 | } |
215 | 244 | ||
216 | /* | 245 | /** |
217 | * caller must hold freezer->lock | 246 | * update_if_frozen - update whether a cgroup finished freezing |
247 | * @cgroup: cgroup of interest | ||
248 | * | ||
249 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by | ||
250 | * calling this function. If the current state is FREEZING but not FROZEN, | ||
251 | * this function checks whether all tasks of this cgroup and the descendant | ||
252 | * cgroups finished freezing and, if so, sets FROZEN. | ||
253 | * | ||
254 | * The caller is responsible for grabbing RCU read lock and calling | ||
255 | * update_if_frozen() on all descendants prior to invoking this function. | ||
256 | * | ||
257 | * Task states and freezer state might disagree while tasks are being | ||
258 | * migrated into or out of @cgroup, so we can't verify task states against | ||
259 | * @freezer state here. See freezer_attach() for details. | ||
218 | */ | 260 | */ |
219 | static void update_if_frozen(struct cgroup *cgroup, | 261 | static void update_if_frozen(struct cgroup *cgroup) |
220 | struct freezer *freezer) | ||
221 | { | 262 | { |
263 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
264 | struct cgroup *pos; | ||
222 | struct cgroup_iter it; | 265 | struct cgroup_iter it; |
223 | struct task_struct *task; | 266 | struct task_struct *task; |
224 | unsigned int nfrozen = 0, ntotal = 0; | ||
225 | enum freezer_state old_state = freezer->state; | ||
226 | 267 | ||
227 | cgroup_iter_start(cgroup, &it); | 268 | WARN_ON_ONCE(!rcu_read_lock_held()); |
228 | while ((task = cgroup_iter_next(cgroup, &it))) { | 269 | |
229 | ntotal++; | 270 | spin_lock_irq(&freezer->lock); |
230 | if (freezing(task) && is_task_frozen_enough(task)) | 271 | |
231 | nfrozen++; | 272 | if (!(freezer->state & CGROUP_FREEZING) || |
273 | (freezer->state & CGROUP_FROZEN)) | ||
274 | goto out_unlock; | ||
275 | |||
276 | /* are all (live) children frozen? */ | ||
277 | cgroup_for_each_child(pos, cgroup) { | ||
278 | struct freezer *child = cgroup_freezer(pos); | ||
279 | |||
280 | if ((child->state & CGROUP_FREEZER_ONLINE) && | ||
281 | !(child->state & CGROUP_FROZEN)) | ||
282 | goto out_unlock; | ||
232 | } | 283 | } |
233 | 284 | ||
234 | if (old_state == CGROUP_THAWED) { | 285 | /* are all tasks frozen? */ |
235 | BUG_ON(nfrozen > 0); | 286 | cgroup_iter_start(cgroup, &it); |
236 | } else if (old_state == CGROUP_FREEZING) { | 287 | |
237 | if (nfrozen == ntotal) | 288 | while ((task = cgroup_iter_next(cgroup, &it))) { |
238 | freezer->state = CGROUP_FROZEN; | 289 | if (freezing(task)) { |
239 | } else { /* old_state == CGROUP_FROZEN */ | 290 | /* |
240 | BUG_ON(nfrozen != ntotal); | 291 | * freezer_should_skip() indicates that the task |
292 | * should be skipped when determining freezing | ||
293 | * completion. Consider it frozen in addition to | ||
294 | * the usual frozen condition. | ||
295 | */ | ||
296 | if (!frozen(task) && !freezer_should_skip(task)) | ||
297 | goto out_iter_end; | ||
298 | } | ||
241 | } | 299 | } |
242 | 300 | ||
301 | freezer->state |= CGROUP_FROZEN; | ||
302 | out_iter_end: | ||
243 | cgroup_iter_end(cgroup, &it); | 303 | cgroup_iter_end(cgroup, &it); |
304 | out_unlock: | ||
305 | spin_unlock_irq(&freezer->lock); | ||
244 | } | 306 | } |
245 | 307 | ||
246 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | 308 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, |
247 | struct seq_file *m) | 309 | struct seq_file *m) |
248 | { | 310 | { |
249 | struct freezer *freezer; | 311 | struct cgroup *pos; |
250 | enum freezer_state state; | ||
251 | 312 | ||
252 | if (!cgroup_lock_live_group(cgroup)) | 313 | rcu_read_lock(); |
253 | return -ENODEV; | ||
254 | 314 | ||
255 | freezer = cgroup_freezer(cgroup); | 315 | /* update states bottom-up */ |
256 | spin_lock_irq(&freezer->lock); | 316 | cgroup_for_each_descendant_post(pos, cgroup) |
257 | state = freezer->state; | 317 | update_if_frozen(pos); |
258 | if (state == CGROUP_FREEZING) { | 318 | update_if_frozen(cgroup); |
259 | /* We change from FREEZING to FROZEN lazily if the cgroup was | ||
260 | * only partially frozen when we exitted write. */ | ||
261 | update_if_frozen(cgroup, freezer); | ||
262 | state = freezer->state; | ||
263 | } | ||
264 | spin_unlock_irq(&freezer->lock); | ||
265 | cgroup_unlock(); | ||
266 | 319 | ||
267 | seq_puts(m, freezer_state_strs[state]); | 320 | rcu_read_unlock(); |
321 | |||
322 | seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); | ||
268 | seq_putc(m, '\n'); | 323 | seq_putc(m, '\n'); |
269 | return 0; | 324 | return 0; |
270 | } | 325 | } |
271 | 326 | ||
272 | static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | 327 | static void freeze_cgroup(struct freezer *freezer) |
273 | { | 328 | { |
329 | struct cgroup *cgroup = freezer->css.cgroup; | ||
274 | struct cgroup_iter it; | 330 | struct cgroup_iter it; |
275 | struct task_struct *task; | 331 | struct task_struct *task; |
276 | unsigned int num_cant_freeze_now = 0; | ||
277 | 332 | ||
278 | cgroup_iter_start(cgroup, &it); | 333 | cgroup_iter_start(cgroup, &it); |
279 | while ((task = cgroup_iter_next(cgroup, &it))) { | 334 | while ((task = cgroup_iter_next(cgroup, &it))) |
280 | if (!freeze_task(task)) | 335 | freeze_task(task); |
281 | continue; | ||
282 | if (is_task_frozen_enough(task)) | ||
283 | continue; | ||
284 | if (!freezing(task) && !freezer_should_skip(task)) | ||
285 | num_cant_freeze_now++; | ||
286 | } | ||
287 | cgroup_iter_end(cgroup, &it); | 336 | cgroup_iter_end(cgroup, &it); |
288 | |||
289 | return num_cant_freeze_now ? -EBUSY : 0; | ||
290 | } | 337 | } |
291 | 338 | ||
292 | static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | 339 | static void unfreeze_cgroup(struct freezer *freezer) |
293 | { | 340 | { |
341 | struct cgroup *cgroup = freezer->css.cgroup; | ||
294 | struct cgroup_iter it; | 342 | struct cgroup_iter it; |
295 | struct task_struct *task; | 343 | struct task_struct *task; |
296 | 344 | ||
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
300 | cgroup_iter_end(cgroup, &it); | 348 | cgroup_iter_end(cgroup, &it); |
301 | } | 349 | } |
302 | 350 | ||
303 | static int freezer_change_state(struct cgroup *cgroup, | 351 | /** |
304 | enum freezer_state goal_state) | 352 | * freezer_apply_state - apply state change to a single cgroup_freezer |
353 | * @freezer: freezer to apply state change to | ||
354 | * @freeze: whether to freeze or unfreeze | ||
355 | * @state: CGROUP_FREEZING_* flag to set or clear | ||
356 | * | ||
357 | * Set or clear @state on @cgroup according to @freeze, and perform | ||
358 | * freezing or thawing as necessary. | ||
359 | */ | ||
360 | static void freezer_apply_state(struct freezer *freezer, bool freeze, | ||
361 | unsigned int state) | ||
305 | { | 362 | { |
306 | struct freezer *freezer; | 363 | /* also synchronizes against task migration, see freezer_attach() */ |
307 | int retval = 0; | 364 | lockdep_assert_held(&freezer->lock); |
308 | |||
309 | freezer = cgroup_freezer(cgroup); | ||
310 | 365 | ||
311 | spin_lock_irq(&freezer->lock); | 366 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) |
367 | return; | ||
312 | 368 | ||
313 | update_if_frozen(cgroup, freezer); | 369 | if (freeze) { |
314 | 370 | if (!(freezer->state & CGROUP_FREEZING)) | |
315 | switch (goal_state) { | ||
316 | case CGROUP_THAWED: | ||
317 | if (freezer->state != CGROUP_THAWED) | ||
318 | atomic_dec(&system_freezing_cnt); | ||
319 | freezer->state = CGROUP_THAWED; | ||
320 | unfreeze_cgroup(cgroup, freezer); | ||
321 | break; | ||
322 | case CGROUP_FROZEN: | ||
323 | if (freezer->state == CGROUP_THAWED) | ||
324 | atomic_inc(&system_freezing_cnt); | 371 | atomic_inc(&system_freezing_cnt); |
325 | freezer->state = CGROUP_FREEZING; | 372 | freezer->state |= state; |
326 | retval = try_to_freeze_cgroup(cgroup, freezer); | 373 | freeze_cgroup(freezer); |
327 | break; | 374 | } else { |
328 | default: | 375 | bool was_freezing = freezer->state & CGROUP_FREEZING; |
329 | BUG(); | 376 | |
377 | freezer->state &= ~state; | ||
378 | |||
379 | if (!(freezer->state & CGROUP_FREEZING)) { | ||
380 | if (was_freezing) | ||
381 | atomic_dec(&system_freezing_cnt); | ||
382 | freezer->state &= ~CGROUP_FROZEN; | ||
383 | unfreeze_cgroup(freezer); | ||
384 | } | ||
330 | } | 385 | } |
386 | } | ||
387 | |||
388 | /** | ||
389 | * freezer_change_state - change the freezing state of a cgroup_freezer | ||
390 | * @freezer: freezer of interest | ||
391 | * @freeze: whether to freeze or thaw | ||
392 | * | ||
393 | * Freeze or thaw @freezer according to @freeze. The operations are | ||
394 | * recursive - all descendants of @freezer will be affected. | ||
395 | */ | ||
396 | static void freezer_change_state(struct freezer *freezer, bool freeze) | ||
397 | { | ||
398 | struct cgroup *pos; | ||
331 | 399 | ||
400 | /* update @freezer */ | ||
401 | spin_lock_irq(&freezer->lock); | ||
402 | freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); | ||
332 | spin_unlock_irq(&freezer->lock); | 403 | spin_unlock_irq(&freezer->lock); |
333 | 404 | ||
334 | return retval; | 405 | /* |
406 | * Update all its descendants in pre-order traversal. Each | ||
407 | * descendant will try to inherit its parent's FREEZING state as | ||
408 | * CGROUP_FREEZING_PARENT. | ||
409 | */ | ||
410 | rcu_read_lock(); | ||
411 | cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { | ||
412 | struct freezer *pos_f = cgroup_freezer(pos); | ||
413 | struct freezer *parent = parent_freezer(pos_f); | ||
414 | |||
415 | /* | ||
416 | * Our update to @parent->state is already visible which is | ||
417 | * all we need. No need to lock @parent. For more info on | ||
418 | * synchronization, see freezer_post_create(). | ||
419 | */ | ||
420 | spin_lock_irq(&pos_f->lock); | ||
421 | freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, | ||
422 | CGROUP_FREEZING_PARENT); | ||
423 | spin_unlock_irq(&pos_f->lock); | ||
424 | } | ||
425 | rcu_read_unlock(); | ||
335 | } | 426 | } |
336 | 427 | ||
337 | static int freezer_write(struct cgroup *cgroup, | 428 | static int freezer_write(struct cgroup *cgroup, struct cftype *cft, |
338 | struct cftype *cft, | ||
339 | const char *buffer) | 429 | const char *buffer) |
340 | { | 430 | { |
341 | int retval; | 431 | bool freeze; |
342 | enum freezer_state goal_state; | ||
343 | 432 | ||
344 | if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) | 433 | if (strcmp(buffer, freezer_state_strs(0)) == 0) |
345 | goal_state = CGROUP_THAWED; | 434 | freeze = false; |
346 | else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) | 435 | else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) |
347 | goal_state = CGROUP_FROZEN; | 436 | freeze = true; |
348 | else | 437 | else |
349 | return -EINVAL; | 438 | return -EINVAL; |
350 | 439 | ||
351 | if (!cgroup_lock_live_group(cgroup)) | 440 | freezer_change_state(cgroup_freezer(cgroup), freeze); |
352 | return -ENODEV; | 441 | return 0; |
353 | retval = freezer_change_state(cgroup, goal_state); | 442 | } |
354 | cgroup_unlock(); | 443 | |
355 | return retval; | 444 | static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) |
445 | { | ||
446 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
447 | |||
448 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); | ||
449 | } | ||
450 | |||
451 | static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) | ||
452 | { | ||
453 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
454 | |||
455 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); | ||
356 | } | 456 | } |
357 | 457 | ||
358 | static struct cftype files[] = { | 458 | static struct cftype files[] = { |
@@ -362,15 +462,27 @@ static struct cftype files[] = { | |||
362 | .read_seq_string = freezer_read, | 462 | .read_seq_string = freezer_read, |
363 | .write_string = freezer_write, | 463 | .write_string = freezer_write, |
364 | }, | 464 | }, |
465 | { | ||
466 | .name = "self_freezing", | ||
467 | .flags = CFTYPE_NOT_ON_ROOT, | ||
468 | .read_u64 = freezer_self_freezing_read, | ||
469 | }, | ||
470 | { | ||
471 | .name = "parent_freezing", | ||
472 | .flags = CFTYPE_NOT_ON_ROOT, | ||
473 | .read_u64 = freezer_parent_freezing_read, | ||
474 | }, | ||
365 | { } /* terminate */ | 475 | { } /* terminate */ |
366 | }; | 476 | }; |
367 | 477 | ||
368 | struct cgroup_subsys freezer_subsys = { | 478 | struct cgroup_subsys freezer_subsys = { |
369 | .name = "freezer", | 479 | .name = "freezer", |
370 | .create = freezer_create, | 480 | .css_alloc = freezer_css_alloc, |
371 | .destroy = freezer_destroy, | 481 | .css_online = freezer_css_online, |
482 | .css_offline = freezer_css_offline, | ||
483 | .css_free = freezer_css_free, | ||
372 | .subsys_id = freezer_subsys_id, | 484 | .subsys_id = freezer_subsys_id, |
373 | .can_attach = freezer_can_attach, | 485 | .attach = freezer_attach, |
374 | .fork = freezer_fork, | 486 | .fork = freezer_fork, |
375 | .base_cftypes = files, | 487 | .base_cftypes = files, |
376 | }; | 488 | }; |
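The cgroup_freezer changes above turn the freezer state into a bitmask: CGROUP_FREEZING_SELF records a freeze requested on the cgroup itself, CGROUP_FREEZING_PARENT one inherited from an ancestor, and CGROUP_FROZEN is only kept while some FREEZING bit remains. Below is a minimal stand-alone C sketch of just that bit bookkeeping in freezer_apply_state(); the flag values are invented for illustration, and the real definitions, locking and actual task freezing are omitted.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag values only - the real ones live in the freezer header. */
enum {
	CGROUP_FREEZER_ONLINE	= 1 << 0,
	CGROUP_FREEZING_SELF	= 1 << 1,
	CGROUP_FREEZING_PARENT	= 1 << 2,
	CGROUP_FROZEN		= 1 << 3,
};
#define CGROUP_FREEZING	(CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT)

/* Models the state bookkeeping done by freezer_apply_state(). */
static unsigned int apply_state(unsigned int state, bool freeze, unsigned int bit)
{
	if (!(state & CGROUP_FREEZER_ONLINE))
		return state;
	if (freeze)
		return state | bit;
	state &= ~bit;
	if (!(state & CGROUP_FREEZING))
		state &= ~CGROUP_FROZEN;	/* fully thawed again */
	return state;
}

int main(void)
{
	unsigned int s = CGROUP_FREEZER_ONLINE;

	s = apply_state(s, true, CGROUP_FREEZING_SELF);		/* write to freezer.state */
	s = apply_state(s, true, CGROUP_FREEZING_PARENT);	/* an ancestor froze as well */
	s = apply_state(s, false, CGROUP_FREEZING_SELF);	/* still freezing via parent */
	printf("state: %#x\n", s);
	return 0;
}

The two new read-only files declared at the bottom of the cftype table, self_freezing and parent_freezing, simply report whether the corresponding FREEZING bit is currently set.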
diff --git a/kernel/compat.c b/kernel/compat.c index c28a306ae05c..f6150e92dfc9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -1215,6 +1215,23 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) | |||
1215 | return 0; | 1215 | return 0; |
1216 | } | 1216 | } |
1217 | 1217 | ||
1218 | #ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL | ||
1219 | asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, | ||
1220 | struct compat_timespec __user *interval) | ||
1221 | { | ||
1222 | struct timespec t; | ||
1223 | int ret; | ||
1224 | mm_segment_t old_fs = get_fs(); | ||
1225 | |||
1226 | set_fs(KERNEL_DS); | ||
1227 | ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); | ||
1228 | set_fs(old_fs); | ||
1229 | if (put_compat_timespec(&t, interval)) | ||
1230 | return -EFAULT; | ||
1231 | return ret; | ||
1232 | } | ||
1233 | #endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */ | ||
1234 | |||
1218 | /* | 1235 | /* |
1219 | * Allocate user-space memory for the duration of a single system call, | 1236 | * Allocate user-space memory for the duration of a single system call, |
1220 | * in order to marshall parameters inside a compat thunk. | 1237 | * in order to marshall parameters inside a compat thunk. |
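The new compat_sys_sched_rr_get_interval() follows the usual compat pattern: temporarily widen the address limit with set_fs(KERNEL_DS), let the native syscall fill a kernel struct timespec, then copy the result out in the 32-bit layout. A rough stand-alone sketch of that final conversion step, with field widths chosen only for illustration:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Toy stand-in for the 32-bit userspace timespec layout. */
struct compat_timespec32 {
	int32_t tv_sec;
	int32_t tv_nsec;
};

static int put_compat_timespec_model(const struct timespec *ts,
				     struct compat_timespec32 *cts)
{
	cts->tv_sec = (int32_t)ts->tv_sec;	/* narrowed for the 32-bit ABI */
	cts->tv_nsec = (int32_t)ts->tv_nsec;
	return 0;
}

int main(void)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 };	/* 100ms timeslice */
	struct compat_timespec32 cts;

	put_compat_timespec_model(&ts, &cts);
	printf("%d.%09d\n", (int)cts.tv_sec, (int)cts.tv_nsec);
	return 0;
}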
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c new file mode 100644 index 000000000000..e0e07fd55508 --- /dev/null +++ b/kernel/context_tracking.c | |||
@@ -0,0 +1,83 @@ | |||
1 | #include <linux/context_tracking.h> | ||
2 | #include <linux/rcupdate.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/percpu.h> | ||
5 | #include <linux/hardirq.h> | ||
6 | |||
7 | struct context_tracking { | ||
8 | /* | ||
9 | * When active is false, hooks are not set, in order to | ||
10 | * minimize overhead: TIF flags are cleared | ||
11 | * and calls to user_enter/exit are ignored. This | ||
12 | * may be further optimized using static keys. | ||
13 | */ | ||
14 | bool active; | ||
15 | enum { | ||
16 | IN_KERNEL = 0, | ||
17 | IN_USER, | ||
18 | } state; | ||
19 | }; | ||
20 | |||
21 | static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { | ||
22 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | ||
23 | .active = true, | ||
24 | #endif | ||
25 | }; | ||
26 | |||
27 | void user_enter(void) | ||
28 | { | ||
29 | unsigned long flags; | ||
30 | |||
31 | /* | ||
31 | * Some contexts may involve an exception occurring in an irq, | ||
33 | * leading to that nesting: | ||
34 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
35 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
36 | * helpers are enough to protect RCU uses inside the exception. So | ||
37 | * just return immediately if we detect we are in an IRQ. | ||
38 | */ | ||
39 | if (in_interrupt()) | ||
40 | return; | ||
41 | |||
42 | WARN_ON_ONCE(!current->mm); | ||
43 | |||
44 | local_irq_save(flags); | ||
45 | if (__this_cpu_read(context_tracking.active) && | ||
46 | __this_cpu_read(context_tracking.state) != IN_USER) { | ||
47 | __this_cpu_write(context_tracking.state, IN_USER); | ||
48 | rcu_user_enter(); | ||
49 | } | ||
50 | local_irq_restore(flags); | ||
51 | } | ||
52 | |||
53 | void user_exit(void) | ||
54 | { | ||
55 | unsigned long flags; | ||
56 | |||
57 | /* | ||
58 | * Some contexts may involve an exception occurring in an irq, | ||
59 | * leading to that nesting: | ||
60 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
61 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
62 | * helpers are enough to protect RCU uses inside the exception. So | ||
63 | * just return immediately if we detect we are in an IRQ. | ||
64 | */ | ||
65 | if (in_interrupt()) | ||
66 | return; | ||
67 | |||
68 | local_irq_save(flags); | ||
69 | if (__this_cpu_read(context_tracking.state) == IN_USER) { | ||
70 | __this_cpu_write(context_tracking.state, IN_KERNEL); | ||
71 | rcu_user_exit(); | ||
72 | } | ||
73 | local_irq_restore(flags); | ||
74 | } | ||
75 | |||
76 | void context_tracking_task_switch(struct task_struct *prev, | ||
77 | struct task_struct *next) | ||
78 | { | ||
79 | if (__this_cpu_read(context_tracking.active)) { | ||
80 | clear_tsk_thread_flag(prev, TIF_NOHZ); | ||
81 | set_tsk_thread_flag(next, TIF_NOHZ); | ||
82 | } | ||
83 | } | ||
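The new context_tracking code keeps a per-CPU IN_KERNEL/IN_USER flag so RCU can be told when a CPU runs in userspace (an extended quiescent state) and woken up again on kernel entry; context_tracking_task_switch() only migrates the TIF_NOHZ flag from the outgoing to the incoming task. A minimal user-space model of the state transitions, ignoring the irq-nesting checks and interrupt disabling the real code needs:

#include <stdbool.h>
#include <stdio.h>

/* Toy copy of the per-CPU structure from context_tracking.c. */
struct context_tracking_model {
	bool active;
	enum { IN_KERNEL = 0, IN_USER } state;
};

static struct context_tracking_model ct = { .active = true };

/* Mirrors user_enter(): switch to IN_USER only when tracking is active. */
static void model_user_enter(void)
{
	if (ct.active && ct.state != IN_USER) {
		ct.state = IN_USER;
		puts("-> rcu_user_enter()");
	}
}

/* Mirrors user_exit(): leave the userspace state on any kernel entry. */
static void model_user_exit(void)
{
	if (ct.state == IN_USER) {
		ct.state = IN_KERNEL;
		puts("-> rcu_user_exit()");
	}
}

int main(void)
{
	model_user_enter();	/* returning to userspace */
	model_user_exit();	/* syscall or exception entry */
	model_user_exit();	/* duplicate entry is ignored, as in the kernel */
	return 0;
}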
diff --git a/kernel/cpu.c b/kernel/cpu.c index 14d32588cccd..3046a503242c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -80,6 +80,10 @@ void put_online_cpus(void) | |||
80 | if (cpu_hotplug.active_writer == current) | 80 | if (cpu_hotplug.active_writer == current) |
81 | return; | 81 | return; |
82 | mutex_lock(&cpu_hotplug.lock); | 82 | mutex_lock(&cpu_hotplug.lock); |
83 | |||
84 | if (WARN_ON(!cpu_hotplug.refcount)) | ||
85 | cpu_hotplug.refcount++; /* try to fix things up */ | ||
86 | |||
83 | if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) | 87 | if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) |
84 | wake_up_process(cpu_hotplug.active_writer); | 88 | wake_up_process(cpu_hotplug.active_writer); |
85 | mutex_unlock(&cpu_hotplug.lock); | 89 | mutex_unlock(&cpu_hotplug.lock); |
@@ -280,12 +284,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
280 | __func__, cpu); | 284 | __func__, cpu); |
281 | goto out_release; | 285 | goto out_release; |
282 | } | 286 | } |
287 | smpboot_park_threads(cpu); | ||
283 | 288 | ||
284 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); | 289 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); |
285 | if (err) { | 290 | if (err) { |
286 | /* CPU didn't die: tell everyone. Can't complain. */ | 291 | /* CPU didn't die: tell everyone. Can't complain. */ |
292 | smpboot_unpark_threads(cpu); | ||
287 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); | 293 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); |
288 | |||
289 | goto out_release; | 294 | goto out_release; |
290 | } | 295 | } |
291 | BUG_ON(cpu_online(cpu)); | 296 | BUG_ON(cpu_online(cpu)); |
@@ -343,17 +348,23 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
343 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 348 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
344 | struct task_struct *idle; | 349 | struct task_struct *idle; |
345 | 350 | ||
346 | if (cpu_online(cpu) || !cpu_present(cpu)) | ||
347 | return -EINVAL; | ||
348 | |||
349 | cpu_hotplug_begin(); | 351 | cpu_hotplug_begin(); |
350 | 352 | ||
353 | if (cpu_online(cpu) || !cpu_present(cpu)) { | ||
354 | ret = -EINVAL; | ||
355 | goto out; | ||
356 | } | ||
357 | |||
351 | idle = idle_thread_get(cpu); | 358 | idle = idle_thread_get(cpu); |
352 | if (IS_ERR(idle)) { | 359 | if (IS_ERR(idle)) { |
353 | ret = PTR_ERR(idle); | 360 | ret = PTR_ERR(idle); |
354 | goto out; | 361 | goto out; |
355 | } | 362 | } |
356 | 363 | ||
364 | ret = smpboot_create_threads(cpu); | ||
365 | if (ret) | ||
366 | goto out; | ||
367 | |||
357 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 368 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
358 | if (ret) { | 369 | if (ret) { |
359 | nr_calls--; | 370 | nr_calls--; |
@@ -368,6 +379,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
368 | goto out_notify; | 379 | goto out_notify; |
369 | BUG_ON(!cpu_online(cpu)); | 380 | BUG_ON(!cpu_online(cpu)); |
370 | 381 | ||
382 | /* Wake the per cpu threads */ | ||
383 | smpboot_unpark_threads(cpu); | ||
384 | |||
371 | /* Now call notifier in preparation. */ | 385 | /* Now call notifier in preparation. */ |
372 | cpu_notify(CPU_ONLINE | mod, hcpu); | 386 | cpu_notify(CPU_ONLINE | mod, hcpu); |
373 | 387 | ||
@@ -439,14 +453,6 @@ EXPORT_SYMBOL_GPL(cpu_up); | |||
439 | #ifdef CONFIG_PM_SLEEP_SMP | 453 | #ifdef CONFIG_PM_SLEEP_SMP |
440 | static cpumask_var_t frozen_cpus; | 454 | static cpumask_var_t frozen_cpus; |
441 | 455 | ||
442 | void __weak arch_disable_nonboot_cpus_begin(void) | ||
443 | { | ||
444 | } | ||
445 | |||
446 | void __weak arch_disable_nonboot_cpus_end(void) | ||
447 | { | ||
448 | } | ||
449 | |||
450 | int disable_nonboot_cpus(void) | 456 | int disable_nonboot_cpus(void) |
451 | { | 457 | { |
452 | int cpu, first_cpu, error = 0; | 458 | int cpu, first_cpu, error = 0; |
@@ -458,7 +464,6 @@ int disable_nonboot_cpus(void) | |||
458 | * with the userspace trying to use the CPU hotplug at the same time | 464 | * with the userspace trying to use the CPU hotplug at the same time |
459 | */ | 465 | */ |
460 | cpumask_clear(frozen_cpus); | 466 | cpumask_clear(frozen_cpus); |
461 | arch_disable_nonboot_cpus_begin(); | ||
462 | 467 | ||
463 | printk("Disabling non-boot CPUs ...\n"); | 468 | printk("Disabling non-boot CPUs ...\n"); |
464 | for_each_online_cpu(cpu) { | 469 | for_each_online_cpu(cpu) { |
@@ -474,8 +479,6 @@ int disable_nonboot_cpus(void) | |||
474 | } | 479 | } |
475 | } | 480 | } |
476 | 481 | ||
477 | arch_disable_nonboot_cpus_end(); | ||
478 | |||
479 | if (!error) { | 482 | if (!error) { |
480 | BUG_ON(num_online_cpus() > 1); | 483 | BUG_ON(num_online_cpus() > 1); |
481 | /* Make sure the CPUs won't be enabled by someone else */ | 484 | /* Make sure the CPUs won't be enabled by someone else */ |
@@ -600,6 +603,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, | |||
600 | 603 | ||
601 | static int __init cpu_hotplug_pm_sync_init(void) | 604 | static int __init cpu_hotplug_pm_sync_init(void) |
602 | { | 605 | { |
606 | /* | ||
607 | * cpu_hotplug_pm_callback has higher priority than x86's | ||
608 | * bsp_pm_callback, which depends on cpu_hotplug_pm_callback | ||
609 | * disabling cpu hotplug first to avoid a cpu hotplug race. | ||
610 | */ | ||
603 | pm_notifier(cpu_hotplug_pm_callback, 0); | 611 | pm_notifier(cpu_hotplug_pm_callback, 0); |
604 | return 0; | 612 | return 0; |
605 | } | 613 | } |
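Taken together, the cpu.c hunks reorder CPU bring-up so that the online/present check happens under cpu_hotplug_begin(), the smpboot per-CPU threads are created before the CPU_UP_PREPARE notifiers run, and those threads are unparked only once the CPU is online (and parked again on the down path). A stand-alone sketch that just prints the new ordering; every step is a stub named after the call it represents, and steps not visible in these hunks are described rather than named:

#include <stdio.h>

static void step(const char *what)
{
	printf("%s\n", what);
}

/* Mirrors the order of operations in _cpu_up() after this series. */
static int cpu_up_model(void)
{
	step("cpu_hotplug_begin()");
	step("check cpu_online()/cpu_present()");	/* now done under the lock */
	step("idle_thread_get()");
	step("smpboot_create_threads()");		/* new: before the notifiers */
	step("__cpu_notify(CPU_UP_PREPARE)");
	step("bring the CPU up (arch code)");
	step("smpboot_unpark_threads()");		/* new: wake the per-cpu threads */
	step("cpu_notify(CPU_ONLINE)");
	return 0;
}

int main(void)
{
	return cpu_up_model();
}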
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f33c7153b6d7..7bb63eea6eb8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
302 | * are online, with memory. If none are online with memory, walk | 302 | * are online, with memory. If none are online with memory, walk |
303 | * up the cpuset hierarchy until we find one that does have some | 303 | * up the cpuset hierarchy until we find one that does have some |
304 | * online mems. If we get all the way to the top and still haven't | 304 | * online mems. If we get all the way to the top and still haven't |
305 | * found any online mems, return node_states[N_HIGH_MEMORY]. | 305 | * found any online mems, return node_states[N_MEMORY]. |
306 | * | 306 | * |
307 | * One way or another, we guarantee to return some non-empty subset | 307 | * One way or another, we guarantee to return some non-empty subset |
308 | * of node_states[N_HIGH_MEMORY]. | 308 | * of node_states[N_MEMORY]. |
309 | * | 309 | * |
310 | * Call with callback_mutex held. | 310 | * Call with callback_mutex held. |
311 | */ | 311 | */ |
@@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
313 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 313 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
314 | { | 314 | { |
315 | while (cs && !nodes_intersects(cs->mems_allowed, | 315 | while (cs && !nodes_intersects(cs->mems_allowed, |
316 | node_states[N_HIGH_MEMORY])) | 316 | node_states[N_MEMORY])) |
317 | cs = cs->parent; | 317 | cs = cs->parent; |
318 | if (cs) | 318 | if (cs) |
319 | nodes_and(*pmask, cs->mems_allowed, | 319 | nodes_and(*pmask, cs->mems_allowed, |
320 | node_states[N_HIGH_MEMORY]); | 320 | node_states[N_MEMORY]); |
321 | else | 321 | else |
322 | *pmask = node_states[N_HIGH_MEMORY]; | 322 | *pmask = node_states[N_MEMORY]; |
323 | BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); | 323 | BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); |
324 | } | 324 | } |
325 | 325 | ||
326 | /* | 326 | /* |
@@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1100 | return -ENOMEM; | 1100 | return -ENOMEM; |
1101 | 1101 | ||
1102 | /* | 1102 | /* |
1103 | * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; | 1103 | * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; |
1104 | * it's read-only | 1104 | * it's read-only |
1105 | */ | 1105 | */ |
1106 | if (cs == &top_cpuset) { | 1106 | if (cs == &top_cpuset) { |
@@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1122 | goto done; | 1122 | goto done; |
1123 | 1123 | ||
1124 | if (!nodes_subset(trialcs->mems_allowed, | 1124 | if (!nodes_subset(trialcs->mems_allowed, |
1125 | node_states[N_HIGH_MEMORY])) { | 1125 | node_states[N_MEMORY])) { |
1126 | retval = -EINVAL; | 1126 | retval = -EINVAL; |
1127 | goto done; | 1127 | goto done; |
1128 | } | 1128 | } |
@@ -1784,56 +1784,20 @@ static struct cftype files[] = { | |||
1784 | }; | 1784 | }; |
1785 | 1785 | ||
1786 | /* | 1786 | /* |
1787 | * post_clone() is called during cgroup_create() when the | 1787 | * cpuset_css_alloc - allocate a cpuset css |
1788 | * clone_children mount argument was specified. The cgroup | ||
1789 | * can not yet have any tasks. | ||
1790 | * | ||
1791 | * Currently we refuse to set up the cgroup - thereby | ||
1792 | * refusing the task to be entered, and as a result refusing | ||
1793 | * the sys_unshare() or clone() which initiated it - if any | ||
1794 | * sibling cpusets have exclusive cpus or mem. | ||
1795 | * | ||
1796 | * If this becomes a problem for some users who wish to | ||
1797 | * allow that scenario, then cpuset_post_clone() could be | ||
1798 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | ||
1799 | * (and likewise for mems) to the new cgroup. Called with cgroup_mutex | ||
1800 | * held. | ||
1801 | */ | ||
1802 | static void cpuset_post_clone(struct cgroup *cgroup) | ||
1803 | { | ||
1804 | struct cgroup *parent, *child; | ||
1805 | struct cpuset *cs, *parent_cs; | ||
1806 | |||
1807 | parent = cgroup->parent; | ||
1808 | list_for_each_entry(child, &parent->children, sibling) { | ||
1809 | cs = cgroup_cs(child); | ||
1810 | if (is_mem_exclusive(cs) || is_cpu_exclusive(cs)) | ||
1811 | return; | ||
1812 | } | ||
1813 | cs = cgroup_cs(cgroup); | ||
1814 | parent_cs = cgroup_cs(parent); | ||
1815 | |||
1816 | mutex_lock(&callback_mutex); | ||
1817 | cs->mems_allowed = parent_cs->mems_allowed; | ||
1818 | cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); | ||
1819 | mutex_unlock(&callback_mutex); | ||
1820 | return; | ||
1821 | } | ||
1822 | |||
1823 | /* | ||
1824 | * cpuset_create - create a cpuset | ||
1825 | * cont: control group that the new cpuset will be part of | 1788 | * cont: control group that the new cpuset will be part of |
1826 | */ | 1789 | */ |
1827 | 1790 | ||
1828 | static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | 1791 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) |
1829 | { | 1792 | { |
1830 | struct cpuset *cs; | 1793 | struct cgroup *parent_cg = cont->parent; |
1831 | struct cpuset *parent; | 1794 | struct cgroup *tmp_cg; |
1795 | struct cpuset *parent, *cs; | ||
1832 | 1796 | ||
1833 | if (!cont->parent) { | 1797 | if (!parent_cg) |
1834 | return &top_cpuset.css; | 1798 | return &top_cpuset.css; |
1835 | } | 1799 | parent = cgroup_cs(parent_cg); |
1836 | parent = cgroup_cs(cont->parent); | 1800 | |
1837 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); | 1801 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); |
1838 | if (!cs) | 1802 | if (!cs) |
1839 | return ERR_PTR(-ENOMEM); | 1803 | return ERR_PTR(-ENOMEM); |
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | |||
1855 | 1819 | ||
1856 | cs->parent = parent; | 1820 | cs->parent = parent; |
1857 | number_of_cpusets++; | 1821 | number_of_cpusets++; |
1858 | return &cs->css ; | 1822 | |
1823 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) | ||
1824 | goto skip_clone; | ||
1825 | |||
1826 | /* | ||
1827 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is | ||
1828 | * set. This flag handling is implemented in cgroup core for | ||
1829 | * historical reasons - the flag may be specified during mount. | ||
1830 | * | ||
1831 | * Currently, if any sibling cpusets have exclusive cpus or mem, we | ||
1832 | * refuse to clone the configuration - thereby refusing the task to | ||
1833 | * be entered, and as a result refusing the sys_unshare() or | ||
1834 | * clone() which initiated it. If this becomes a problem for some | ||
1835 | * users who wish to allow that scenario, then this could be | ||
1836 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | ||
1837 | * (and likewise for mems) to the new cgroup. | ||
1838 | */ | ||
1839 | list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { | ||
1840 | struct cpuset *tmp_cs = cgroup_cs(tmp_cg); | ||
1841 | |||
1842 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) | ||
1843 | goto skip_clone; | ||
1844 | } | ||
1845 | |||
1846 | mutex_lock(&callback_mutex); | ||
1847 | cs->mems_allowed = parent->mems_allowed; | ||
1848 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); | ||
1849 | mutex_unlock(&callback_mutex); | ||
1850 | skip_clone: | ||
1851 | return &cs->css; | ||
1859 | } | 1852 | } |
1860 | 1853 | ||
1861 | /* | 1854 | /* |
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | |||
1864 | * will call async_rebuild_sched_domains(). | 1857 | * will call async_rebuild_sched_domains(). |
1865 | */ | 1858 | */ |
1866 | 1859 | ||
1867 | static void cpuset_destroy(struct cgroup *cont) | 1860 | static void cpuset_css_free(struct cgroup *cont) |
1868 | { | 1861 | { |
1869 | struct cpuset *cs = cgroup_cs(cont); | 1862 | struct cpuset *cs = cgroup_cs(cont); |
1870 | 1863 | ||
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont) | |||
1878 | 1871 | ||
1879 | struct cgroup_subsys cpuset_subsys = { | 1872 | struct cgroup_subsys cpuset_subsys = { |
1880 | .name = "cpuset", | 1873 | .name = "cpuset", |
1881 | .create = cpuset_create, | 1874 | .css_alloc = cpuset_css_alloc, |
1882 | .destroy = cpuset_destroy, | 1875 | .css_free = cpuset_css_free, |
1883 | .can_attach = cpuset_can_attach, | 1876 | .can_attach = cpuset_can_attach, |
1884 | .attach = cpuset_attach, | 1877 | .attach = cpuset_attach, |
1885 | .post_clone = cpuset_post_clone, | ||
1886 | .subsys_id = cpuset_subsys_id, | 1878 | .subsys_id = cpuset_subsys_id, |
1887 | .base_cftypes = files, | 1879 | .base_cftypes = files, |
1888 | .early_init = 1, | 1880 | .early_init = 1, |
@@ -2034,7 +2026,7 @@ static struct cpuset *cpuset_next(struct list_head *queue) | |||
2034 | * before dropping down to the next. It always processes a node before | 2026 | * before dropping down to the next. It always processes a node before |
2035 | * any of its children. | 2027 | * any of its children. |
2036 | * | 2028 | * |
2037 | * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY | 2029 | * In the case of memory hot-unplug, it will remove nodes from N_MEMORY |
2038 | * if all present pages from a node are offlined. | 2030 | * if all present pages from a node are offlined. |
2039 | */ | 2031 | */ |
2040 | static void | 2032 | static void |
@@ -2073,7 +2065,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | |||
2073 | 2065 | ||
2074 | /* Continue past cpusets with all mems online */ | 2066 | /* Continue past cpusets with all mems online */ |
2075 | if (nodes_subset(cp->mems_allowed, | 2067 | if (nodes_subset(cp->mems_allowed, |
2076 | node_states[N_HIGH_MEMORY])) | 2068 | node_states[N_MEMORY])) |
2077 | continue; | 2069 | continue; |
2078 | 2070 | ||
2079 | oldmems = cp->mems_allowed; | 2071 | oldmems = cp->mems_allowed; |
@@ -2081,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | |||
2081 | /* Remove offline mems from this cpuset. */ | 2073 | /* Remove offline mems from this cpuset. */ |
2082 | mutex_lock(&callback_mutex); | 2074 | mutex_lock(&callback_mutex); |
2083 | nodes_and(cp->mems_allowed, cp->mems_allowed, | 2075 | nodes_and(cp->mems_allowed, cp->mems_allowed, |
2084 | node_states[N_HIGH_MEMORY]); | 2076 | node_states[N_MEMORY]); |
2085 | mutex_unlock(&callback_mutex); | 2077 | mutex_unlock(&callback_mutex); |
2086 | 2078 | ||
2087 | /* Move tasks from the empty cpuset to a parent */ | 2079 | /* Move tasks from the empty cpuset to a parent */ |
@@ -2134,8 +2126,8 @@ void cpuset_update_active_cpus(bool cpu_online) | |||
2134 | 2126 | ||
2135 | #ifdef CONFIG_MEMORY_HOTPLUG | 2127 | #ifdef CONFIG_MEMORY_HOTPLUG |
2136 | /* | 2128 | /* |
2137 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2129 | * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. |
2138 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. | 2130 | * Call this routine anytime after node_states[N_MEMORY] changes. |
2139 | * See cpuset_update_active_cpus() for CPU hotplug handling. | 2131 | * See cpuset_update_active_cpus() for CPU hotplug handling. |
2140 | */ | 2132 | */ |
2141 | static int cpuset_track_online_nodes(struct notifier_block *self, | 2133 | static int cpuset_track_online_nodes(struct notifier_block *self, |
@@ -2148,7 +2140,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2148 | case MEM_ONLINE: | 2140 | case MEM_ONLINE: |
2149 | oldmems = top_cpuset.mems_allowed; | 2141 | oldmems = top_cpuset.mems_allowed; |
2150 | mutex_lock(&callback_mutex); | 2142 | mutex_lock(&callback_mutex); |
2151 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2143 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
2152 | mutex_unlock(&callback_mutex); | 2144 | mutex_unlock(&callback_mutex); |
2153 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); | 2145 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); |
2154 | break; | 2146 | break; |
@@ -2177,7 +2169,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2177 | void __init cpuset_init_smp(void) | 2169 | void __init cpuset_init_smp(void) |
2178 | { | 2170 | { |
2179 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2171 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
2180 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2172 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
2181 | 2173 | ||
2182 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); | 2174 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); |
2183 | 2175 | ||
@@ -2245,7 +2237,7 @@ void cpuset_init_current_mems_allowed(void) | |||
2245 | * | 2237 | * |
2246 | * Description: Returns the nodemask_t mems_allowed of the cpuset | 2238 | * Description: Returns the nodemask_t mems_allowed of the cpuset |
2247 | * attached to the specified @tsk. Guaranteed to return some non-empty | 2239 | * attached to the specified @tsk. Guaranteed to return some non-empty |
2248 | * subset of node_states[N_HIGH_MEMORY], even if this means going outside the | 2240 | * subset of node_states[N_MEMORY], even if this means going outside the |
2249 | * tasks cpuset. | 2241 | * tasks cpuset. |
2250 | **/ | 2242 | **/ |
2251 | 2243 | ||
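In cpuset.c the old post_clone() callback is folded into cpuset_css_alloc(): when CGRP_CPUSET_CLONE_CHILDREN is set on the cgroup, the new cpuset copies its parent's cpus_allowed and mems_allowed, unless any sibling is cpu- or mem-exclusive, in which case the clone is skipped. A rough stand-alone model of that decision, with a toy data layout invented for illustration:

#include <stdbool.h>
#include <stdio.h>

struct toy_cpuset {
	bool cpu_exclusive;
	bool mem_exclusive;
	unsigned long cpus_allowed;	/* toy bitmask instead of cpumask_t */
	unsigned long mems_allowed;	/* toy bitmask instead of nodemask_t */
};

/* Mirrors the clone-children branch of cpuset_css_alloc(). */
static bool clone_parent_config(struct toy_cpuset *cs,
				const struct toy_cpuset *parent,
				const struct toy_cpuset *siblings, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		if (siblings[i].cpu_exclusive || siblings[i].mem_exclusive)
			return false;	/* skip_clone: leave the new cpuset empty */

	cs->cpus_allowed = parent->cpus_allowed;
	cs->mems_allowed = parent->mems_allowed;
	return true;
}

int main(void)
{
	struct toy_cpuset parent = { .cpus_allowed = 0xf, .mems_allowed = 0x3 };
	struct toy_cpuset sibling = { .cpu_exclusive = true };
	struct toy_cpuset child = { 0 };

	printf("cloned: %d\n", clone_parent_config(&child, &parent, &sibling, 1));
	return 0;
}

The remaining cpuset hunks are a mechanical rename of node_states[N_HIGH_MEMORY] to node_states[N_MEMORY].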
diff --git a/kernel/cred.c b/kernel/cred.c index de728ac50d82..e0573a43c7df 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -30,17 +30,6 @@ | |||
30 | static struct kmem_cache *cred_jar; | 30 | static struct kmem_cache *cred_jar; |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * The common credentials for the initial task's thread group | ||
34 | */ | ||
35 | #ifdef CONFIG_KEYS | ||
36 | static struct thread_group_cred init_tgcred = { | ||
37 | .usage = ATOMIC_INIT(2), | ||
38 | .tgid = 0, | ||
39 | .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), | ||
40 | }; | ||
41 | #endif | ||
42 | |||
43 | /* | ||
44 | * The initial credentials for the initial task | 33 | * The initial credentials for the initial task |
45 | */ | 34 | */ |
46 | struct cred init_cred = { | 35 | struct cred init_cred = { |
@@ -65,9 +54,6 @@ struct cred init_cred = { | |||
65 | .user = INIT_USER, | 54 | .user = INIT_USER, |
66 | .user_ns = &init_user_ns, | 55 | .user_ns = &init_user_ns, |
67 | .group_info = &init_groups, | 56 | .group_info = &init_groups, |
68 | #ifdef CONFIG_KEYS | ||
69 | .tgcred = &init_tgcred, | ||
70 | #endif | ||
71 | }; | 57 | }; |
72 | 58 | ||
73 | static inline void set_cred_subscribers(struct cred *cred, int n) | 59 | static inline void set_cred_subscribers(struct cred *cred, int n) |
@@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n) | |||
96 | } | 82 | } |
97 | 83 | ||
98 | /* | 84 | /* |
99 | * Dispose of the shared task group credentials | ||
100 | */ | ||
101 | #ifdef CONFIG_KEYS | ||
102 | static void release_tgcred_rcu(struct rcu_head *rcu) | ||
103 | { | ||
104 | struct thread_group_cred *tgcred = | ||
105 | container_of(rcu, struct thread_group_cred, rcu); | ||
106 | |||
107 | BUG_ON(atomic_read(&tgcred->usage) != 0); | ||
108 | |||
109 | key_put(tgcred->session_keyring); | ||
110 | key_put(tgcred->process_keyring); | ||
111 | kfree(tgcred); | ||
112 | } | ||
113 | #endif | ||
114 | |||
115 | /* | ||
116 | * Release a set of thread group credentials. | ||
117 | */ | ||
118 | static void release_tgcred(struct cred *cred) | ||
119 | { | ||
120 | #ifdef CONFIG_KEYS | ||
121 | struct thread_group_cred *tgcred = cred->tgcred; | ||
122 | |||
123 | if (atomic_dec_and_test(&tgcred->usage)) | ||
124 | call_rcu(&tgcred->rcu, release_tgcred_rcu); | ||
125 | #endif | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * The RCU callback to actually dispose of a set of credentials | 85 | * The RCU callback to actually dispose of a set of credentials |
130 | */ | 86 | */ |
131 | static void put_cred_rcu(struct rcu_head *rcu) | 87 | static void put_cred_rcu(struct rcu_head *rcu) |
@@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu) | |||
150 | #endif | 106 | #endif |
151 | 107 | ||
152 | security_cred_free(cred); | 108 | security_cred_free(cred); |
109 | key_put(cred->session_keyring); | ||
110 | key_put(cred->process_keyring); | ||
153 | key_put(cred->thread_keyring); | 111 | key_put(cred->thread_keyring); |
154 | key_put(cred->request_key_auth); | 112 | key_put(cred->request_key_auth); |
155 | release_tgcred(cred); | ||
156 | if (cred->group_info) | 113 | if (cred->group_info) |
157 | put_group_info(cred->group_info); | 114 | put_group_info(cred->group_info); |
158 | free_uid(cred->user); | 115 | free_uid(cred->user); |
@@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void) | |||
246 | if (!new) | 203 | if (!new) |
247 | return NULL; | 204 | return NULL; |
248 | 205 | ||
249 | #ifdef CONFIG_KEYS | ||
250 | new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); | ||
251 | if (!new->tgcred) { | ||
252 | kmem_cache_free(cred_jar, new); | ||
253 | return NULL; | ||
254 | } | ||
255 | atomic_set(&new->tgcred->usage, 1); | ||
256 | #endif | ||
257 | |||
258 | atomic_set(&new->usage, 1); | 206 | atomic_set(&new->usage, 1); |
259 | #ifdef CONFIG_DEBUG_CREDENTIALS | 207 | #ifdef CONFIG_DEBUG_CREDENTIALS |
260 | new->magic = CRED_MAGIC; | 208 | new->magic = CRED_MAGIC; |
@@ -308,9 +256,10 @@ struct cred *prepare_creds(void) | |||
308 | get_user_ns(new->user_ns); | 256 | get_user_ns(new->user_ns); |
309 | 257 | ||
310 | #ifdef CONFIG_KEYS | 258 | #ifdef CONFIG_KEYS |
259 | key_get(new->session_keyring); | ||
260 | key_get(new->process_keyring); | ||
311 | key_get(new->thread_keyring); | 261 | key_get(new->thread_keyring); |
312 | key_get(new->request_key_auth); | 262 | key_get(new->request_key_auth); |
313 | atomic_inc(&new->tgcred->usage); | ||
314 | #endif | 263 | #endif |
315 | 264 | ||
316 | #ifdef CONFIG_SECURITY | 265 | #ifdef CONFIG_SECURITY |
@@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds); | |||
334 | */ | 283 | */ |
335 | struct cred *prepare_exec_creds(void) | 284 | struct cred *prepare_exec_creds(void) |
336 | { | 285 | { |
337 | struct thread_group_cred *tgcred = NULL; | ||
338 | struct cred *new; | 286 | struct cred *new; |
339 | 287 | ||
340 | #ifdef CONFIG_KEYS | ||
341 | tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); | ||
342 | if (!tgcred) | ||
343 | return NULL; | ||
344 | #endif | ||
345 | |||
346 | new = prepare_creds(); | 288 | new = prepare_creds(); |
347 | if (!new) { | 289 | if (!new) |
348 | kfree(tgcred); | ||
349 | return new; | 290 | return new; |
350 | } | ||
351 | 291 | ||
352 | #ifdef CONFIG_KEYS | 292 | #ifdef CONFIG_KEYS |
353 | /* newly exec'd tasks don't get a thread keyring */ | 293 | /* newly exec'd tasks don't get a thread keyring */ |
354 | key_put(new->thread_keyring); | 294 | key_put(new->thread_keyring); |
355 | new->thread_keyring = NULL; | 295 | new->thread_keyring = NULL; |
356 | 296 | ||
357 | /* create a new per-thread-group creds for all this set of threads to | ||
358 | * share */ | ||
359 | memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); | ||
360 | |||
361 | atomic_set(&tgcred->usage, 1); | ||
362 | spin_lock_init(&tgcred->lock); | ||
363 | |||
364 | /* inherit the session keyring; new process keyring */ | 297 | /* inherit the session keyring; new process keyring */ |
365 | key_get(tgcred->session_keyring); | 298 | key_put(new->process_keyring); |
366 | tgcred->process_keyring = NULL; | 299 | new->process_keyring = NULL; |
367 | |||
368 | release_tgcred(new); | ||
369 | new->tgcred = tgcred; | ||
370 | #endif | 300 | #endif |
371 | 301 | ||
372 | return new; | 302 | return new; |
@@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void) | |||
383 | */ | 313 | */ |
384 | int copy_creds(struct task_struct *p, unsigned long clone_flags) | 314 | int copy_creds(struct task_struct *p, unsigned long clone_flags) |
385 | { | 315 | { |
386 | #ifdef CONFIG_KEYS | ||
387 | struct thread_group_cred *tgcred; | ||
388 | #endif | ||
389 | struct cred *new; | 316 | struct cred *new; |
390 | int ret; | 317 | int ret; |
391 | 318 | ||
@@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
425 | install_thread_keyring_to_cred(new); | 352 | install_thread_keyring_to_cred(new); |
426 | } | 353 | } |
427 | 354 | ||
428 | /* we share the process and session keyrings between all the threads in | 355 | /* The process keyring is only shared between the threads in a process; |
429 | * a process - this is slightly icky as we violate COW credentials a | 356 | * anything outside of those threads doesn't inherit. |
430 | * bit */ | 357 | */ |
431 | if (!(clone_flags & CLONE_THREAD)) { | 358 | if (!(clone_flags & CLONE_THREAD)) { |
432 | tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); | 359 | key_put(new->process_keyring); |
433 | if (!tgcred) { | 360 | new->process_keyring = NULL; |
434 | ret = -ENOMEM; | ||
435 | goto error_put; | ||
436 | } | ||
437 | atomic_set(&tgcred->usage, 1); | ||
438 | spin_lock_init(&tgcred->lock); | ||
439 | tgcred->process_keyring = NULL; | ||
440 | tgcred->session_keyring = key_get(new->tgcred->session_keyring); | ||
441 | |||
442 | release_tgcred(new); | ||
443 | new->tgcred = tgcred; | ||
444 | } | 361 | } |
445 | #endif | 362 | #endif |
446 | 363 | ||
@@ -455,6 +372,31 @@ error_put: | |||
455 | return ret; | 372 | return ret; |
456 | } | 373 | } |
457 | 374 | ||
375 | static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) | ||
376 | { | ||
377 | const struct user_namespace *set_ns = set->user_ns; | ||
378 | const struct user_namespace *subset_ns = subset->user_ns; | ||
379 | |||
380 | /* If the two credentials are in the same user namespace see if | ||
381 | * the capabilities of subset are a subset of set. | ||
382 | */ | ||
383 | if (set_ns == subset_ns) | ||
384 | return cap_issubset(subset->cap_permitted, set->cap_permitted); | ||
385 | |||
386 | /* The credentials are in different user namespaces, | ||
387 | * therefore one is a subset of the other only if set is an | ||
388 | * ancestor of subset and set->euid is the owner of subset or | ||
389 | * one of subset's ancestors. | ||
390 | */ | ||
391 | for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { | ||
392 | if ((set_ns == subset_ns->parent) && | ||
393 | uid_eq(subset_ns->owner, set->euid)) | ||
394 | return true; | ||
395 | } | ||
396 | |||
397 | return false; | ||
398 | } | ||
399 | |||
458 | /** | 400 | /** |
459 | * commit_creds - Install new credentials upon the current task | 401 | * commit_creds - Install new credentials upon the current task |
460 | * @new: The credentials to be assigned | 402 | * @new: The credentials to be assigned |
@@ -493,7 +435,7 @@ int commit_creds(struct cred *new) | |||
493 | !gid_eq(old->egid, new->egid) || | 435 | !gid_eq(old->egid, new->egid) || |
494 | !uid_eq(old->fsuid, new->fsuid) || | 436 | !uid_eq(old->fsuid, new->fsuid) || |
495 | !gid_eq(old->fsgid, new->fsgid) || | 437 | !gid_eq(old->fsgid, new->fsgid) || |
496 | !cap_issubset(new->cap_permitted, old->cap_permitted)) { | 438 | !cred_cap_issubset(old, new)) { |
497 | if (task->mm) | 439 | if (task->mm) |
498 | set_dumpable(task->mm, suid_dumpable); | 440 | set_dumpable(task->mm, suid_dumpable); |
499 | task->pdeath_signal = 0; | 441 | task->pdeath_signal = 0; |
@@ -643,9 +585,6 @@ void __init cred_init(void) | |||
643 | */ | 585 | */ |
644 | struct cred *prepare_kernel_cred(struct task_struct *daemon) | 586 | struct cred *prepare_kernel_cred(struct task_struct *daemon) |
645 | { | 587 | { |
646 | #ifdef CONFIG_KEYS | ||
647 | struct thread_group_cred *tgcred; | ||
648 | #endif | ||
649 | const struct cred *old; | 588 | const struct cred *old; |
650 | struct cred *new; | 589 | struct cred *new; |
651 | 590 | ||
@@ -653,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
653 | if (!new) | 592 | if (!new) |
654 | return NULL; | 593 | return NULL; |
655 | 594 | ||
656 | #ifdef CONFIG_KEYS | ||
657 | tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); | ||
658 | if (!tgcred) { | ||
659 | kmem_cache_free(cred_jar, new); | ||
660 | return NULL; | ||
661 | } | ||
662 | #endif | ||
663 | |||
664 | kdebug("prepare_kernel_cred() alloc %p", new); | 595 | kdebug("prepare_kernel_cred() alloc %p", new); |
665 | 596 | ||
666 | if (daemon) | 597 | if (daemon) |
@@ -678,13 +609,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
678 | get_group_info(new->group_info); | 609 | get_group_info(new->group_info); |
679 | 610 | ||
680 | #ifdef CONFIG_KEYS | 611 | #ifdef CONFIG_KEYS |
681 | atomic_set(&tgcred->usage, 1); | 612 | new->session_keyring = NULL; |
682 | spin_lock_init(&tgcred->lock); | 613 | new->process_keyring = NULL; |
683 | tgcred->process_keyring = NULL; | ||
684 | tgcred->session_keyring = NULL; | ||
685 | new->tgcred = tgcred; | ||
686 | new->request_key_auth = NULL; | ||
687 | new->thread_keyring = NULL; | 614 | new->thread_keyring = NULL; |
615 | new->request_key_auth = NULL; | ||
688 | new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; | 616 | new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; |
689 | #endif | 617 | #endif |
690 | 618 | ||
@@ -799,9 +727,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label, | |||
799 | atomic_read(&cred->usage), | 727 | atomic_read(&cred->usage), |
800 | read_cred_subscribers(cred)); | 728 | read_cred_subscribers(cred)); |
801 | printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", | 729 | printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", |
802 | cred->uid, cred->euid, cred->suid, cred->fsuid); | 730 | from_kuid_munged(&init_user_ns, cred->uid), |
731 | from_kuid_munged(&init_user_ns, cred->euid), | ||
732 | from_kuid_munged(&init_user_ns, cred->suid), | ||
733 | from_kuid_munged(&init_user_ns, cred->fsuid)); | ||
803 | printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", | 734 | printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", |
804 | cred->gid, cred->egid, cred->sgid, cred->fsgid); | 735 | from_kgid_munged(&init_user_ns, cred->gid), |
736 | from_kgid_munged(&init_user_ns, cred->egid), | ||
737 | from_kgid_munged(&init_user_ns, cred->sgid), | ||
738 | from_kgid_munged(&init_user_ns, cred->fsgid)); | ||
805 | #ifdef CONFIG_SECURITY | 739 | #ifdef CONFIG_SECURITY |
806 | printk(KERN_ERR "CRED: ->security is %p\n", cred->security); | 740 | printk(KERN_ERR "CRED: ->security is %p\n", cred->security); |
807 | if ((unsigned long) cred->security >= PAGE_SIZE && | 741 | if ((unsigned long) cred->security >= PAGE_SIZE && |
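commit_creds() now uses cred_cap_issubset() instead of a bare cap_issubset(), so the dumpable/pdeath_signal reset also behaves correctly across user namespaces: within one namespace the capability sets are compared directly, while across namespaces the new creds count as a subset only if their namespace is a child of the old creds' namespace and is owned by the old euid. A stand-alone model with toy structures standing in for the kernel ones:

#include <stdbool.h>
#include <stdio.h>

struct toy_userns {
	struct toy_userns *parent;
	unsigned int owner;		/* uid that created the namespace */
};

struct toy_cred {
	struct toy_userns *user_ns;
	unsigned int euid;
	unsigned long cap_permitted;	/* toy capability mask */
};

static struct toy_userns init_user_ns;	/* root namespace, parent == NULL */

/* Mirrors the logic of cred_cap_issubset(). */
static bool cred_cap_issubset_model(const struct toy_cred *set,
				    const struct toy_cred *subset)
{
	const struct toy_userns *subset_ns = subset->user_ns;

	if (set->user_ns == subset_ns)
		return !(subset->cap_permitted & ~set->cap_permitted);

	for (; subset_ns != &init_user_ns; subset_ns = subset_ns->parent)
		if (set->user_ns == subset_ns->parent &&
		    subset_ns->owner == set->euid)
			return true;

	return false;
}

int main(void)
{
	struct toy_userns child_ns = { .parent = &init_user_ns, .owner = 1000 };
	struct toy_cred old_cred = { .user_ns = &init_user_ns, .euid = 1000 };
	struct toy_cred new_cred = { .user_ns = &child_ns, .cap_permitted = ~0UL };

	/* Full caps inside an owned child namespace still count as a subset. */
	printf("subset: %d\n", cred_cap_issubset_model(&old_cred, &new_cred));
	return 0;
}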
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0557f24c6bca..9a61738cefc8 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -672,6 +672,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
672 | { | 672 | { |
673 | struct kgdb_state kgdb_var; | 673 | struct kgdb_state kgdb_var; |
674 | struct kgdb_state *ks = &kgdb_var; | 674 | struct kgdb_state *ks = &kgdb_var; |
675 | int ret = 0; | ||
676 | |||
677 | if (arch_kgdb_ops.enable_nmi) | ||
678 | arch_kgdb_ops.enable_nmi(0); | ||
675 | 679 | ||
676 | ks->cpu = raw_smp_processor_id(); | 680 | ks->cpu = raw_smp_processor_id(); |
677 | ks->ex_vector = evector; | 681 | ks->ex_vector = evector; |
@@ -681,13 +685,33 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
681 | ks->linux_regs = regs; | 685 | ks->linux_regs = regs; |
682 | 686 | ||
683 | if (kgdb_reenter_check(ks)) | 687 | if (kgdb_reenter_check(ks)) |
684 | return 0; /* Ouch, double exception ! */ | 688 | goto out; /* Ouch, double exception ! */ |
685 | if (kgdb_info[ks->cpu].enter_kgdb != 0) | 689 | if (kgdb_info[ks->cpu].enter_kgdb != 0) |
686 | return 0; | 690 | goto out; |
687 | 691 | ||
688 | return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); | 692 | ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); |
693 | out: | ||
694 | if (arch_kgdb_ops.enable_nmi) | ||
695 | arch_kgdb_ops.enable_nmi(1); | ||
696 | return ret; | ||
689 | } | 697 | } |
690 | 698 | ||
699 | /* | ||
700 | * GDB places a breakpoint at this function so it can learn about | ||
701 | * dynamically loaded objects. It's not defined static so that only one instance with this | ||
702 | * name exists in the kernel. | ||
703 | */ | ||
704 | |||
705 | static int module_event(struct notifier_block *self, unsigned long val, | ||
706 | void *data) | ||
707 | { | ||
708 | return 0; | ||
709 | } | ||
710 | |||
711 | static struct notifier_block dbg_module_load_nb = { | ||
712 | .notifier_call = module_event, | ||
713 | }; | ||
714 | |||
691 | int kgdb_nmicallback(int cpu, void *regs) | 715 | int kgdb_nmicallback(int cpu, void *regs) |
692 | { | 716 | { |
693 | #ifdef CONFIG_SMP | 717 | #ifdef CONFIG_SMP |
@@ -816,6 +840,7 @@ static void kgdb_register_callbacks(void) | |||
816 | kgdb_arch_init(); | 840 | kgdb_arch_init(); |
817 | if (!dbg_is_early) | 841 | if (!dbg_is_early) |
818 | kgdb_arch_late(); | 842 | kgdb_arch_late(); |
843 | register_module_notifier(&dbg_module_load_nb); | ||
819 | register_reboot_notifier(&dbg_reboot_notifier); | 844 | register_reboot_notifier(&dbg_reboot_notifier); |
820 | atomic_notifier_chain_register(&panic_notifier_list, | 845 | atomic_notifier_chain_register(&panic_notifier_list, |
821 | &kgdb_panic_event_nb); | 846 | &kgdb_panic_event_nb); |
@@ -839,6 +864,7 @@ static void kgdb_unregister_callbacks(void) | |||
839 | if (kgdb_io_module_registered) { | 864 | if (kgdb_io_module_registered) { |
840 | kgdb_io_module_registered = 0; | 865 | kgdb_io_module_registered = 0; |
841 | unregister_reboot_notifier(&dbg_reboot_notifier); | 866 | unregister_reboot_notifier(&dbg_reboot_notifier); |
867 | unregister_module_notifier(&dbg_module_load_nb); | ||
842 | atomic_notifier_chain_unregister(&panic_notifier_list, | 868 | atomic_notifier_chain_unregister(&panic_notifier_list, |
843 | &kgdb_panic_event_nb); | 869 | &kgdb_panic_event_nb); |
844 | kgdb_arch_exit(); | 870 | kgdb_arch_exit(); |
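kgdb_handle_exception() now brackets the whole debugger entry with arch_kgdb_ops.enable_nmi(0)/enable_nmi(1) when the architecture provides that hook, so a debug NMI cannot re-enter the debugger while it is already active, and every exit path re-enables it. A minimal sketch of that bracket pattern with a stand-in ops structure invented for illustration:

#include <stdio.h>

struct toy_kgdb_arch {
	void (*enable_nmi)(int on);	/* optional arch hook */
};

static void toy_enable_nmi(int on)
{
	printf("enable_nmi(%d)\n", on);
}

static struct toy_kgdb_arch arch_ops = { .enable_nmi = toy_enable_nmi };

static int handle_exception_model(void)
{
	int ret = 0;

	if (arch_ops.enable_nmi)
		arch_ops.enable_nmi(0);		/* mask debug NMIs on entry */

	/* ... reenter check and debugger core would run here ... */

	if (arch_ops.enable_nmi)
		arch_ops.enable_nmi(1);		/* unmask on every exit path */
	return ret;
}

int main(void)
{
	return handle_exception_model();
}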
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 07c9bbb94a0b..b03e0e814e43 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c | |||
@@ -129,6 +129,8 @@ kdb_bt(int argc, const char **argv) | |||
129 | } | 129 | } |
130 | /* Now the inactive tasks */ | 130 | /* Now the inactive tasks */ |
131 | kdb_do_each_thread(g, p) { | 131 | kdb_do_each_thread(g, p) { |
132 | if (KDB_FLAG(CMD_INTERRUPT)) | ||
133 | return 0; | ||
132 | if (task_curr(p)) | 134 | if (task_curr(p)) |
133 | continue; | 135 | continue; |
134 | if (kdb_bt1(p, mask, argcount, btaprompt)) | 136 | if (kdb_bt1(p, mask, argcount, btaprompt)) |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 0a69d2adc4f3..14ff4849262c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
@@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap) | |||
552 | { | 552 | { |
553 | int diag; | 553 | int diag; |
554 | int linecount; | 554 | int linecount; |
555 | int colcount; | ||
555 | int logging, saved_loglevel = 0; | 556 | int logging, saved_loglevel = 0; |
556 | int saved_trap_printk; | 557 | int saved_trap_printk; |
557 | int got_printf_lock = 0; | 558 | int got_printf_lock = 0; |
@@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap) | |||
584 | if (diag || linecount <= 1) | 585 | if (diag || linecount <= 1) |
585 | linecount = 24; | 586 | linecount = 24; |
586 | 587 | ||
588 | diag = kdbgetintenv("COLUMNS", &colcount); | ||
589 | if (diag || colcount <= 1) | ||
590 | colcount = 80; | ||
591 | |||
587 | diag = kdbgetintenv("LOGGING", &logging); | 592 | diag = kdbgetintenv("LOGGING", &logging); |
588 | if (diag) | 593 | if (diag) |
589 | logging = 0; | 594 | logging = 0; |
@@ -690,7 +695,7 @@ kdb_printit: | |||
690 | gdbstub_msg_write(kdb_buffer, retlen); | 695 | gdbstub_msg_write(kdb_buffer, retlen); |
691 | } else { | 696 | } else { |
692 | if (dbg_io_ops && !dbg_io_ops->is_console) { | 697 | if (dbg_io_ops && !dbg_io_ops->is_console) { |
693 | len = strlen(kdb_buffer); | 698 | len = retlen; |
694 | cp = kdb_buffer; | 699 | cp = kdb_buffer; |
695 | while (len--) { | 700 | while (len--) { |
696 | dbg_io_ops->write_char(*cp); | 701 | dbg_io_ops->write_char(*cp); |
@@ -709,11 +714,29 @@ kdb_printit: | |||
709 | printk(KERN_INFO "%s", kdb_buffer); | 714 | printk(KERN_INFO "%s", kdb_buffer); |
710 | } | 715 | } |
711 | 716 | ||
712 | if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) | 717 | if (KDB_STATE(PAGER)) { |
713 | kdb_nextline++; | 718 | /* |
719 | * Check printed string to decide how to bump the | ||
720 | * kdb_nextline to control when the more prompt should | ||
721 | * show up. | ||
722 | */ | ||
723 | int got = 0; | ||
724 | len = retlen; | ||
725 | while (len--) { | ||
726 | if (kdb_buffer[len] == '\n') { | ||
727 | kdb_nextline++; | ||
728 | got = 0; | ||
729 | } else if (kdb_buffer[len] == '\r') { | ||
730 | got = 0; | ||
731 | } else { | ||
732 | got++; | ||
733 | } | ||
734 | } | ||
735 | kdb_nextline += got / (colcount + 1); | ||
736 | } | ||
714 | 737 | ||
715 | /* check for having reached the LINES number of printed lines */ | 738 | /* check for having reached the LINES number of printed lines */ |
716 | if (kdb_nextline == linecount) { | 739 | if (kdb_nextline >= linecount) { |
717 | char buf1[16] = ""; | 740 | char buf1[16] = ""; |
718 | 741 | ||
719 | /* Watch out for recursion here. Any routine that calls | 742 | /* Watch out for recursion here. Any routine that calls |
@@ -765,7 +788,7 @@ kdb_printit: | |||
765 | kdb_grepping_flag = 0; | 788 | kdb_grepping_flag = 0; |
766 | kdb_printf("\n"); | 789 | kdb_printf("\n"); |
767 | } else if (buf1[0] == ' ') { | 790 | } else if (buf1[0] == ' ') { |
768 | kdb_printf("\n"); | 791 | kdb_printf("\r"); |
769 | suspend_grep = 1; /* for this recursion */ | 792 | suspend_grep = 1; /* for this recursion */ |
770 | } else if (buf1[0] == '\n') { | 793 | } else if (buf1[0] == '\n') { |
771 | kdb_nextline = linecount - 1; | 794 | kdb_nextline = linecount - 1; |
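The vkdb_printf changes above make the pager account for both explicit newlines and output that wraps at the COLUMNS width before deciding when to show the more prompt. A simplified user-space model of that accounting (lines_consumed is a made-up helper; the kernel walks kdb_buffer over retlen bytes rather than a C string, and only the trailing unterminated run contributes wrapped lines, mirroring the single got / (colcount + 1) adjustment above):

    #include <stdio.h>
    #include <string.h>

    /*
     * How many display lines a printed chunk consumes: each '\n' ends a
     * line, '\r' resets the column, and the trailing run of characters
     * adds wrapped lines once it exceeds the console width.
     */
    static int lines_consumed(const char *buf, size_t len, int colcount)
    {
            int lines = 0, col = 0;
            size_t i;

            for (i = 0; i < len; i++) {
                    if (buf[i] == '\n') {
                            lines++;
                            col = 0;
                    } else if (buf[i] == '\r') {
                            col = 0;
                    } else {
                            col++;
                    }
            }
            return lines + col / (colcount + 1);
    }

    int main(void)
    {
            const char *s = "a single very long line that keeps going and wraps on an 80 column console ...";
            printf("%d display lines\n", lines_consumed(s, strlen(s), 80));
            return 0;
    }

This is also why the check further down becomes kdb_nextline >= linecount: a wrapped chunk can jump past the exact LINES value in one step.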
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 31df1706b9a9..4d5f8d5612f3 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/smp.h> | 21 | #include <linux/smp.h> |
22 | #include <linux/utsname.h> | 22 | #include <linux/utsname.h> |
23 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> |
24 | #include <linux/atomic.h> | ||
24 | #include <linux/module.h> | 25 | #include <linux/module.h> |
25 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
26 | #include <linux/init.h> | 27 | #include <linux/init.h> |
@@ -2100,6 +2101,8 @@ static int kdb_dmesg(int argc, const char **argv) | |||
2100 | } | 2101 | } |
2101 | if (!lines--) | 2102 | if (!lines--) |
2102 | break; | 2103 | break; |
2104 | if (KDB_FLAG(CMD_INTERRUPT)) | ||
2105 | return 0; | ||
2103 | 2106 | ||
2104 | kdb_printf("%.*s\n", (int)len - 1, buf); | 2107 | kdb_printf("%.*s\n", (int)len - 1, buf); |
2105 | } | 2108 | } |
@@ -2107,6 +2110,32 @@ static int kdb_dmesg(int argc, const char **argv) | |||
2107 | return 0; | 2110 | return 0; |
2108 | } | 2111 | } |
2109 | #endif /* CONFIG_PRINTK */ | 2112 | #endif /* CONFIG_PRINTK */ |
2113 | |||
2114 | /* Make sure enable/disable calls are balanced; disable must come first. */ | ||
2115 | static atomic_t kdb_nmi_disabled; | ||
2116 | |||
2117 | static int kdb_disable_nmi(int argc, const char *argv[]) | ||
2118 | { | ||
2119 | if (atomic_read(&kdb_nmi_disabled)) | ||
2120 | return 0; | ||
2121 | atomic_set(&kdb_nmi_disabled, 1); | ||
2122 | arch_kgdb_ops.enable_nmi(0); | ||
2123 | return 0; | ||
2124 | } | ||
2125 | |||
2126 | static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp) | ||
2127 | { | ||
2128 | if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0)) | ||
2129 | return -EINVAL; | ||
2130 | arch_kgdb_ops.enable_nmi(1); | ||
2131 | return 0; | ||
2132 | } | ||
2133 | |||
2134 | static const struct kernel_param_ops kdb_param_ops_enable_nmi = { | ||
2135 | .set = kdb_param_enable_nmi, | ||
2136 | }; | ||
2137 | module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600); | ||
2138 | |||
2110 | /* | 2139 | /* |
2111 | * kdb_cpu - This function implements the 'cpu' command. | 2140 | * kdb_cpu - This function implements the 'cpu' command. |
2112 | * cpu [<cpunum>] | 2141 | * cpu [<cpunum>] |
@@ -2851,6 +2880,10 @@ static void __init kdb_inittab(void) | |||
2851 | kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", | 2880 | kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", |
2852 | "Display syslog buffer", 0, KDB_REPEAT_NONE); | 2881 | "Display syslog buffer", 0, KDB_REPEAT_NONE); |
2853 | #endif | 2882 | #endif |
2883 | if (arch_kgdb_ops.enable_nmi) { | ||
2884 | kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", | ||
2885 | "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); | ||
2886 | } | ||
2854 | kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", | 2887 | kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", |
2855 | "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); | 2888 | "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); |
2856 | kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", | 2889 | kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", |
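The disable_nmi command and the enable_nmi module parameter above form a balanced pair guarded by an atomic counter: enabling is refused unless a disable happened first. A minimal sketch of that pattern under assumed names (feature_disabled, feature_disable and feature_enable are illustrative, not the kdb symbols):

    #include <linux/atomic.h>
    #include <linux/errno.h>

    static atomic_t feature_disabled;       /* 0 = enabled, 1 = disabled */

    static void feature_disable(void)
    {
            if (atomic_read(&feature_disabled))
                    return;                 /* already disabled, nothing to do */
            atomic_set(&feature_disabled, 1);
            /* ... actually mask the NMI source, e.g. via an arch hook ... */
    }

    static int feature_enable(void)
    {
            /* go 1 -> 0 only if a disable happened first; reject otherwise */
            if (!atomic_add_unless(&feature_disabled, -1, 0))
                    return -EINVAL;
            /* ... unmask the NMI source again ... */
            return 0;
    }

In the kdb version the enable side is wired up as a 0600 module parameter, so only root can re-arm NMI entry after a disable_nmi issued from the debugger.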
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 98d4597f43d6..c77206184b8b 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c | |||
@@ -159,6 +159,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) | |||
159 | int rctx; | 159 | int rctx; |
160 | struct perf_callchain_entry *entry; | 160 | struct perf_callchain_entry *entry; |
161 | 161 | ||
162 | int kernel = !event->attr.exclude_callchain_kernel; | ||
163 | int user = !event->attr.exclude_callchain_user; | ||
164 | |||
165 | if (!kernel && !user) | ||
166 | return NULL; | ||
162 | 167 | ||
163 | entry = get_callchain_entry(&rctx); | 168 | entry = get_callchain_entry(&rctx); |
164 | if (rctx == -1) | 169 | if (rctx == -1) |
@@ -169,24 +174,29 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) | |||
169 | 174 | ||
170 | entry->nr = 0; | 175 | entry->nr = 0; |
171 | 176 | ||
172 | if (!user_mode(regs)) { | 177 | if (kernel && !user_mode(regs)) { |
173 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | 178 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); |
174 | perf_callchain_kernel(entry, regs); | 179 | perf_callchain_kernel(entry, regs); |
175 | if (current->mm) | ||
176 | regs = task_pt_regs(current); | ||
177 | else | ||
178 | regs = NULL; | ||
179 | } | 180 | } |
180 | 181 | ||
181 | if (regs) { | 182 | if (user) { |
182 | /* | 183 | if (!user_mode(regs)) { |
183 | * Disallow cross-task user callchains. | 184 | if (current->mm) |
184 | */ | 185 | regs = task_pt_regs(current); |
185 | if (event->ctx->task && event->ctx->task != current) | 186 | else |
186 | goto exit_put; | 187 | regs = NULL; |
187 | 188 | } | |
188 | perf_callchain_store(entry, PERF_CONTEXT_USER); | 189 | |
189 | perf_callchain_user(entry, regs); | 190 | if (regs) { |
191 | /* | ||
192 | * Disallow cross-task user callchains. | ||
193 | */ | ||
194 | if (event->ctx->task && event->ctx->task != current) | ||
195 | goto exit_put; | ||
196 | |||
197 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
198 | perf_callchain_user(entry, regs); | ||
199 | } | ||
190 | } | 200 | } |
191 | 201 | ||
192 | exit_put: | 202 | exit_put: |
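The callchain change honours the new exclude_callchain_kernel and exclude_callchain_user attribute bits and returns NULL when both halves are excluded. A hedged user-space sketch of requesting a user-only callchain (the perf_event_open wrapper is a hand-rolled helper; the attr fields are the ones referenced in the hunk above):

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <string.h>

    /* Thin wrapper around the raw syscall (glibc provides none). */
    static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                               int cpu, int group_fd, unsigned long flags)
    {
            return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int open_user_only_callchain_event(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000;
            attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN;
            attr.exclude_callchain_kernel = 1;      /* user frames only */
            /*
             * exclude_callchain_user = 1 would keep only kernel frames;
             * setting both makes perf_callchain() return NULL, as in the
             * hunk above.
             */

            return perf_event_open(&attr, 0, -1, -1, 0);
    }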
diff --git a/kernel/events/core.c b/kernel/events/core.c index 7fee567153f0..301079d06f24 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/perf_event.h> | 36 | #include <linux/perf_event.h> |
37 | #include <linux/ftrace_event.h> | 37 | #include <linux/ftrace_event.h> |
38 | #include <linux/hw_breakpoint.h> | 38 | #include <linux/hw_breakpoint.h> |
39 | #include <linux/mm_types.h> | ||
39 | 40 | ||
40 | #include "internal.h" | 41 | #include "internal.h" |
41 | 42 | ||
@@ -371,6 +372,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) | |||
371 | 372 | ||
372 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 373 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
373 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | 374 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
375 | if (cpuctx->unique_pmu != pmu) | ||
376 | continue; /* ensure we process each cpuctx once */ | ||
374 | 377 | ||
375 | /* | 378 | /* |
376 | * perf_cgroup_events says at least one | 379 | * perf_cgroup_events says at least one |
@@ -394,9 +397,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode) | |||
394 | 397 | ||
395 | if (mode & PERF_CGROUP_SWIN) { | 398 | if (mode & PERF_CGROUP_SWIN) { |
396 | WARN_ON_ONCE(cpuctx->cgrp); | 399 | WARN_ON_ONCE(cpuctx->cgrp); |
397 | /* set cgrp before ctxsw in to | 400 | /* |
398 | * allow event_filter_match() to not | 401 | * set cgrp before ctxsw in to allow |
399 | * have to pass task around | 402 | * event_filter_match() to not have to pass |
403 | * task around | ||
400 | */ | 404 | */ |
401 | cpuctx->cgrp = perf_cgroup_from_task(task); | 405 | cpuctx->cgrp = perf_cgroup_from_task(task); |
402 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); | 406 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); |
@@ -467,14 +471,13 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
467 | { | 471 | { |
468 | struct perf_cgroup *cgrp; | 472 | struct perf_cgroup *cgrp; |
469 | struct cgroup_subsys_state *css; | 473 | struct cgroup_subsys_state *css; |
470 | struct file *file; | 474 | struct fd f = fdget(fd); |
471 | int ret = 0, fput_needed; | 475 | int ret = 0; |
472 | 476 | ||
473 | file = fget_light(fd, &fput_needed); | 477 | if (!f.file) |
474 | if (!file) | ||
475 | return -EBADF; | 478 | return -EBADF; |
476 | 479 | ||
477 | css = cgroup_css_from_dir(file, perf_subsys_id); | 480 | css = cgroup_css_from_dir(f.file, perf_subsys_id); |
478 | if (IS_ERR(css)) { | 481 | if (IS_ERR(css)) { |
479 | ret = PTR_ERR(css); | 482 | ret = PTR_ERR(css); |
480 | goto out; | 483 | goto out; |
@@ -500,7 +503,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
500 | ret = -EINVAL; | 503 | ret = -EINVAL; |
501 | } | 504 | } |
502 | out: | 505 | out: |
503 | fput_light(file, fput_needed); | 506 | fdput(f); |
504 | return ret; | 507 | return ret; |
505 | } | 508 | } |
506 | 509 | ||
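Several call sites in this patch switch from fget_light()/fput_light() to the struct fd helpers; the shape of the conversion is the same everywhere. A schematic sketch (do_something_with_fd is a made-up call site):

    #include <linux/file.h>

    static int do_something_with_fd(int fd)
    {
            struct fd f = fdget(fd);        /* light-weight: may skip refcounting */
            int ret = 0;

            if (!f.file)
                    return -EBADF;

            /* ... use f.file exactly like the old 'struct file *' ... */

            fdput(f);                       /* undoes whatever fdget() took */
            return ret;
    }

The error paths also become simpler: fdput() on a zero-initialized struct fd is a no-op, which is why the perf_event_open() hunk below can funnel failures through a single err_group_fd label.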
@@ -3233,21 +3236,18 @@ unlock: | |||
3233 | 3236 | ||
3234 | static const struct file_operations perf_fops; | 3237 | static const struct file_operations perf_fops; |
3235 | 3238 | ||
3236 | static struct file *perf_fget_light(int fd, int *fput_needed) | 3239 | static inline int perf_fget_light(int fd, struct fd *p) |
3237 | { | 3240 | { |
3238 | struct file *file; | 3241 | struct fd f = fdget(fd); |
3239 | 3242 | if (!f.file) | |
3240 | file = fget_light(fd, fput_needed); | 3243 | return -EBADF; |
3241 | if (!file) | ||
3242 | return ERR_PTR(-EBADF); | ||
3243 | 3244 | ||
3244 | if (file->f_op != &perf_fops) { | 3245 | if (f.file->f_op != &perf_fops) { |
3245 | fput_light(file, *fput_needed); | 3246 | fdput(f); |
3246 | *fput_needed = 0; | 3247 | return -EBADF; |
3247 | return ERR_PTR(-EBADF); | ||
3248 | } | 3248 | } |
3249 | 3249 | *p = f; | |
3250 | return file; | 3250 | return 0; |
3251 | } | 3251 | } |
3252 | 3252 | ||
3253 | static int perf_event_set_output(struct perf_event *event, | 3253 | static int perf_event_set_output(struct perf_event *event, |
@@ -3279,22 +3279,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
3279 | 3279 | ||
3280 | case PERF_EVENT_IOC_SET_OUTPUT: | 3280 | case PERF_EVENT_IOC_SET_OUTPUT: |
3281 | { | 3281 | { |
3282 | struct file *output_file = NULL; | ||
3283 | struct perf_event *output_event = NULL; | ||
3284 | int fput_needed = 0; | ||
3285 | int ret; | 3282 | int ret; |
3286 | |||
3287 | if (arg != -1) { | 3283 | if (arg != -1) { |
3288 | output_file = perf_fget_light(arg, &fput_needed); | 3284 | struct perf_event *output_event; |
3289 | if (IS_ERR(output_file)) | 3285 | struct fd output; |
3290 | return PTR_ERR(output_file); | 3286 | ret = perf_fget_light(arg, &output); |
3291 | output_event = output_file->private_data; | 3287 | if (ret) |
3288 | return ret; | ||
3289 | output_event = output.file->private_data; | ||
3290 | ret = perf_event_set_output(event, output_event); | ||
3291 | fdput(output); | ||
3292 | } else { | ||
3293 | ret = perf_event_set_output(event, NULL); | ||
3292 | } | 3294 | } |
3293 | |||
3294 | ret = perf_event_set_output(event, output_event); | ||
3295 | if (output_event) | ||
3296 | fput_light(output_file, fput_needed); | ||
3297 | |||
3298 | return ret; | 3295 | return ret; |
3299 | } | 3296 | } |
3300 | 3297 | ||
@@ -3677,7 +3674,7 @@ unlock: | |||
3677 | atomic_inc(&event->mmap_count); | 3674 | atomic_inc(&event->mmap_count); |
3678 | mutex_unlock(&event->mmap_mutex); | 3675 | mutex_unlock(&event->mmap_mutex); |
3679 | 3676 | ||
3680 | vma->vm_flags |= VM_RESERVED; | 3677 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
3681 | vma->vm_ops = &perf_mmap_vmops; | 3678 | vma->vm_ops = &perf_mmap_vmops; |
3682 | 3679 | ||
3683 | return ret; | 3680 | return ret; |
@@ -3764,6 +3761,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) | |||
3764 | } | 3761 | } |
3765 | EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); | 3762 | EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); |
3766 | 3763 | ||
3764 | static void | ||
3765 | perf_output_sample_regs(struct perf_output_handle *handle, | ||
3766 | struct pt_regs *regs, u64 mask) | ||
3767 | { | ||
3768 | int bit; | ||
3769 | |||
3770 | for_each_set_bit(bit, (const unsigned long *) &mask, | ||
3771 | sizeof(mask) * BITS_PER_BYTE) { | ||
3772 | u64 val; | ||
3773 | |||
3774 | val = perf_reg_value(regs, bit); | ||
3775 | perf_output_put(handle, val); | ||
3776 | } | ||
3777 | } | ||
3778 | |||
3779 | static void perf_sample_regs_user(struct perf_regs_user *regs_user, | ||
3780 | struct pt_regs *regs) | ||
3781 | { | ||
3782 | if (!user_mode(regs)) { | ||
3783 | if (current->mm) | ||
3784 | regs = task_pt_regs(current); | ||
3785 | else | ||
3786 | regs = NULL; | ||
3787 | } | ||
3788 | |||
3789 | if (regs) { | ||
3790 | regs_user->regs = regs; | ||
3791 | regs_user->abi = perf_reg_abi(current); | ||
3792 | } | ||
3793 | } | ||
3794 | |||
3795 | /* | ||
3796 | * Get remaining task size from user stack pointer. | ||
3797 | * | ||
3798 | * It'd be better to take the stack vma map and limit this more | ||
3799 | * precisely, but there's no way to get it safely in interrupt | ||
3800 | * context, so use TASK_SIZE as the limit. | ||
3801 | */ | ||
3802 | static u64 perf_ustack_task_size(struct pt_regs *regs) | ||
3803 | { | ||
3804 | unsigned long addr = perf_user_stack_pointer(regs); | ||
3805 | |||
3806 | if (!addr || addr >= TASK_SIZE) | ||
3807 | return 0; | ||
3808 | |||
3809 | return TASK_SIZE - addr; | ||
3810 | } | ||
3811 | |||
3812 | static u16 | ||
3813 | perf_sample_ustack_size(u16 stack_size, u16 header_size, | ||
3814 | struct pt_regs *regs) | ||
3815 | { | ||
3816 | u64 task_size; | ||
3817 | |||
3818 | /* No regs, no stack pointer, no dump. */ | ||
3819 | if (!regs) | ||
3820 | return 0; | ||
3821 | |||
3822 | /* | ||
3823 | * Check whether the requested stack size fits within: | ||
3824 | * - TASK_SIZE | ||
3825 | * If it doesn't, limit the size to TASK_SIZE. | ||
3826 | * | ||
3827 | * - the remaining sample size | ||
3828 | * If it doesn't, shrink the stack size so that it | ||
3829 | * fits into the remaining sample size. | ||
3830 | */ | ||
3831 | |||
3832 | task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); | ||
3833 | stack_size = min(stack_size, (u16) task_size); | ||
3834 | |||
3835 | /* Current header size plus static size and dynamic size. */ | ||
3836 | header_size += 2 * sizeof(u64); | ||
3837 | |||
3838 | /* Do we fit in with the current stack dump size? */ | ||
3839 | if ((u16) (header_size + stack_size) < header_size) { | ||
3840 | /* | ||
3841 | * If we overflow the maximum size for the sample, | ||
3842 | * we customize the stack dump size to fit in. | ||
3843 | */ | ||
3844 | stack_size = USHRT_MAX - header_size - sizeof(u64); | ||
3845 | stack_size = round_up(stack_size, sizeof(u64)); | ||
3846 | } | ||
3847 | |||
3848 | return stack_size; | ||
3849 | } | ||
3850 | |||
3851 | static void | ||
3852 | perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, | ||
3853 | struct pt_regs *regs) | ||
3854 | { | ||
3855 | /* Case of a kernel thread, nothing to dump */ | ||
3856 | if (!regs) { | ||
3857 | u64 size = 0; | ||
3858 | perf_output_put(handle, size); | ||
3859 | } else { | ||
3860 | unsigned long sp; | ||
3861 | unsigned int rem; | ||
3862 | u64 dyn_size; | ||
3863 | |||
3864 | /* | ||
3865 | * We dump: | ||
3866 | * static size | ||
3867 | * - the size requested by the user, or the largest we can fit | ||
3868 | * into the maximum sample size | ||
3869 | * data | ||
3870 | * - user stack dump data | ||
3871 | * dynamic size | ||
3872 | * - the actual dumped size | ||
3873 | */ | ||
3874 | |||
3875 | /* Static size. */ | ||
3876 | perf_output_put(handle, dump_size); | ||
3877 | |||
3878 | /* Data. */ | ||
3879 | sp = perf_user_stack_pointer(regs); | ||
3880 | rem = __output_copy_user(handle, (void *) sp, dump_size); | ||
3881 | dyn_size = dump_size - rem; | ||
3882 | |||
3883 | perf_output_skip(handle, rem); | ||
3884 | |||
3885 | /* Dynamic size. */ | ||
3886 | perf_output_put(handle, dyn_size); | ||
3887 | } | ||
3888 | } | ||
3889 | |||
3767 | static void __perf_event_header__init_id(struct perf_event_header *header, | 3890 | static void __perf_event_header__init_id(struct perf_event_header *header, |
3768 | struct perf_sample_data *data, | 3891 | struct perf_sample_data *data, |
3769 | struct perf_event *event) | 3892 | struct perf_event *event) |
@@ -4024,6 +4147,28 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4024 | perf_output_put(handle, nr); | 4147 | perf_output_put(handle, nr); |
4025 | } | 4148 | } |
4026 | } | 4149 | } |
4150 | |||
4151 | if (sample_type & PERF_SAMPLE_REGS_USER) { | ||
4152 | u64 abi = data->regs_user.abi; | ||
4153 | |||
4154 | /* | ||
4155 | * If there are no regs to dump, signal it by making the | ||
4156 | * first u64 zero (PERF_SAMPLE_REGS_ABI_NONE). | ||
4157 | */ | ||
4158 | perf_output_put(handle, abi); | ||
4159 | |||
4160 | if (abi) { | ||
4161 | u64 mask = event->attr.sample_regs_user; | ||
4162 | perf_output_sample_regs(handle, | ||
4163 | data->regs_user.regs, | ||
4164 | mask); | ||
4165 | } | ||
4166 | } | ||
4167 | |||
4168 | if (sample_type & PERF_SAMPLE_STACK_USER) | ||
4169 | perf_output_sample_ustack(handle, | ||
4170 | data->stack_user_size, | ||
4171 | data->regs_user.regs); | ||
4027 | } | 4172 | } |
4028 | 4173 | ||
4029 | void perf_prepare_sample(struct perf_event_header *header, | 4174 | void perf_prepare_sample(struct perf_event_header *header, |
@@ -4075,6 +4220,49 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
4075 | } | 4220 | } |
4076 | header->size += size; | 4221 | header->size += size; |
4077 | } | 4222 | } |
4223 | |||
4224 | if (sample_type & PERF_SAMPLE_REGS_USER) { | ||
4225 | /* regs dump ABI info */ | ||
4226 | int size = sizeof(u64); | ||
4227 | |||
4228 | perf_sample_regs_user(&data->regs_user, regs); | ||
4229 | |||
4230 | if (data->regs_user.regs) { | ||
4231 | u64 mask = event->attr.sample_regs_user; | ||
4232 | size += hweight64(mask) * sizeof(u64); | ||
4233 | } | ||
4234 | |||
4235 | header->size += size; | ||
4236 | } | ||
4237 | |||
4238 | if (sample_type & PERF_SAMPLE_STACK_USER) { | ||
4239 | /* | ||
4240 | * PERF_SAMPLE_STACK_USER must either always be processed | ||
4241 | * last, or an additional check must be added whenever a new | ||
4242 | * sample type is introduced, because the stack dump could | ||
4243 | * eat up the rest of the sample size. | ||
4244 | */ | ||
4245 | struct perf_regs_user *uregs = &data->regs_user; | ||
4246 | u16 stack_size = event->attr.sample_stack_user; | ||
4247 | u16 size = sizeof(u64); | ||
4248 | |||
4249 | if (!uregs->abi) | ||
4250 | perf_sample_regs_user(uregs, regs); | ||
4251 | |||
4252 | stack_size = perf_sample_ustack_size(stack_size, header->size, | ||
4253 | uregs->regs); | ||
4254 | |||
4255 | /* | ||
4256 | * If there is something to dump, add space for the dump | ||
4257 | * itself and for the field that carries the dynamic size, | ||
4258 | * i.e. how many bytes were actually dumped. | ||
4259 | */ | ||
4260 | if (stack_size) | ||
4261 | size += sizeof(u64) + stack_size; | ||
4262 | |||
4263 | data->stack_user_size = stack_size; | ||
4264 | header->size += size; | ||
4265 | } | ||
4078 | } | 4266 | } |
4079 | 4267 | ||
4080 | static void perf_event_output(struct perf_event *event, | 4268 | static void perf_event_output(struct perf_event *event, |
@@ -4227,7 +4415,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) | |||
4227 | rcu_read_lock(); | 4415 | rcu_read_lock(); |
4228 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 4416 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
4229 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4417 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
4230 | if (cpuctx->active_pmu != pmu) | 4418 | if (cpuctx->unique_pmu != pmu) |
4231 | goto next; | 4419 | goto next; |
4232 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 4420 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
4233 | 4421 | ||
@@ -4373,7 +4561,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
4373 | rcu_read_lock(); | 4561 | rcu_read_lock(); |
4374 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 4562 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
4375 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4563 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
4376 | if (cpuctx->active_pmu != pmu) | 4564 | if (cpuctx->unique_pmu != pmu) |
4377 | goto next; | 4565 | goto next; |
4378 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); | 4566 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); |
4379 | 4567 | ||
@@ -4569,7 +4757,7 @@ got_name: | |||
4569 | rcu_read_lock(); | 4757 | rcu_read_lock(); |
4570 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 4758 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
4571 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4759 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
4572 | if (cpuctx->active_pmu != pmu) | 4760 | if (cpuctx->unique_pmu != pmu) |
4573 | goto next; | 4761 | goto next; |
4574 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, | 4762 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, |
4575 | vma->vm_flags & VM_EXEC); | 4763 | vma->vm_flags & VM_EXEC); |
@@ -5670,8 +5858,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) | |||
5670 | 5858 | ||
5671 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 5859 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
5672 | 5860 | ||
5673 | if (cpuctx->active_pmu == old_pmu) | 5861 | if (cpuctx->unique_pmu == old_pmu) |
5674 | cpuctx->active_pmu = pmu; | 5862 | cpuctx->unique_pmu = pmu; |
5675 | } | 5863 | } |
5676 | } | 5864 | } |
5677 | 5865 | ||
@@ -5806,7 +5994,7 @@ skip_type: | |||
5806 | cpuctx->ctx.pmu = pmu; | 5994 | cpuctx->ctx.pmu = pmu; |
5807 | cpuctx->jiffies_interval = 1; | 5995 | cpuctx->jiffies_interval = 1; |
5808 | INIT_LIST_HEAD(&cpuctx->rotation_list); | 5996 | INIT_LIST_HEAD(&cpuctx->rotation_list); |
5809 | cpuctx->active_pmu = pmu; | 5997 | cpuctx->unique_pmu = pmu; |
5810 | } | 5998 | } |
5811 | 5999 | ||
5812 | got_cpu_context: | 6000 | got_cpu_context: |
@@ -5967,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
5967 | 6155 | ||
5968 | event->parent = parent_event; | 6156 | event->parent = parent_event; |
5969 | 6157 | ||
5970 | event->ns = get_pid_ns(current->nsproxy->pid_ns); | 6158 | event->ns = get_pid_ns(task_active_pid_ns(current)); |
5971 | event->id = atomic64_inc_return(&perf_event_id); | 6159 | event->id = atomic64_inc_return(&perf_event_id); |
5972 | 6160 | ||
5973 | event->state = PERF_EVENT_STATE_INACTIVE; | 6161 | event->state = PERF_EVENT_STATE_INACTIVE; |
@@ -6151,6 +6339,28 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
6151 | attr->branch_sample_type = mask; | 6339 | attr->branch_sample_type = mask; |
6152 | } | 6340 | } |
6153 | } | 6341 | } |
6342 | |||
6343 | if (attr->sample_type & PERF_SAMPLE_REGS_USER) { | ||
6344 | ret = perf_reg_validate(attr->sample_regs_user); | ||
6345 | if (ret) | ||
6346 | return ret; | ||
6347 | } | ||
6348 | |||
6349 | if (attr->sample_type & PERF_SAMPLE_STACK_USER) { | ||
6350 | if (!arch_perf_have_user_stack_dump()) | ||
6351 | return -ENOSYS; | ||
6352 | |||
6353 | /* | ||
6354 | * We have __u32 type for the size, but so far | ||
6355 | * we can only use __u16 as maximum due to the | ||
6356 | * __u16 sample size limit. | ||
6357 | */ | ||
6358 | if (attr->sample_stack_user >= USHRT_MAX) | ||
6359 | ret = -EINVAL; | ||
6360 | else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) | ||
6361 | ret = -EINVAL; | ||
6362 | } | ||
6363 | |||
6154 | out: | 6364 | out: |
6155 | return ret; | 6365 | return ret; |
6156 | 6366 | ||
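perf_copy_attr() above accepts the new attributes only if sample_regs_user passes perf_reg_validate() and sample_stack_user is u64-aligned and below USHRT_MAX. A hedged sketch of filling those fields from user space (setup_user_dump is a made-up helper; the register mask value is arbitrary and arch-specific):

    #include <linux/perf_event.h>
    #include <string.h>

    /* Illustrative only: user registers plus an 8 KiB user stack dump. */
    static void setup_user_dump(struct perf_event_attr *attr)
    {
            memset(attr, 0, sizeof(*attr));
            attr->size = sizeof(*attr);
            attr->type = PERF_TYPE_HARDWARE;
            attr->config = PERF_COUNT_HW_CPU_CYCLES;
            attr->sample_period = 4000;
            attr->sample_type = PERF_SAMPLE_IP |
                                PERF_SAMPLE_REGS_USER |
                                PERF_SAMPLE_STACK_USER;
            attr->sample_regs_user = 0x1ff;   /* must pass perf_reg_validate() */
            attr->sample_stack_user = 8192;   /* < USHRT_MAX and 8-byte aligned */
    }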
@@ -6229,12 +6439,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6229 | struct perf_event_attr attr; | 6439 | struct perf_event_attr attr; |
6230 | struct perf_event_context *ctx; | 6440 | struct perf_event_context *ctx; |
6231 | struct file *event_file = NULL; | 6441 | struct file *event_file = NULL; |
6232 | struct file *group_file = NULL; | 6442 | struct fd group = {NULL, 0}; |
6233 | struct task_struct *task = NULL; | 6443 | struct task_struct *task = NULL; |
6234 | struct pmu *pmu; | 6444 | struct pmu *pmu; |
6235 | int event_fd; | 6445 | int event_fd; |
6236 | int move_group = 0; | 6446 | int move_group = 0; |
6237 | int fput_needed = 0; | ||
6238 | int err; | 6447 | int err; |
6239 | 6448 | ||
6240 | /* for future expandability... */ | 6449 | /* for future expandability... */ |
@@ -6264,17 +6473,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6264 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) | 6473 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) |
6265 | return -EINVAL; | 6474 | return -EINVAL; |
6266 | 6475 | ||
6267 | event_fd = get_unused_fd_flags(O_RDWR); | 6476 | event_fd = get_unused_fd(); |
6268 | if (event_fd < 0) | 6477 | if (event_fd < 0) |
6269 | return event_fd; | 6478 | return event_fd; |
6270 | 6479 | ||
6271 | if (group_fd != -1) { | 6480 | if (group_fd != -1) { |
6272 | group_file = perf_fget_light(group_fd, &fput_needed); | 6481 | err = perf_fget_light(group_fd, &group); |
6273 | if (IS_ERR(group_file)) { | 6482 | if (err) |
6274 | err = PTR_ERR(group_file); | ||
6275 | goto err_fd; | 6483 | goto err_fd; |
6276 | } | 6484 | group_leader = group.file->private_data; |
6277 | group_leader = group_file->private_data; | ||
6278 | if (flags & PERF_FLAG_FD_OUTPUT) | 6485 | if (flags & PERF_FLAG_FD_OUTPUT) |
6279 | output_event = group_leader; | 6486 | output_event = group_leader; |
6280 | if (flags & PERF_FLAG_FD_NO_GROUP) | 6487 | if (flags & PERF_FLAG_FD_NO_GROUP) |
@@ -6450,7 +6657,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6450 | * of the group leader will find the pointer to itself in | 6657 | * of the group leader will find the pointer to itself in |
6451 | * perf_group_detach(). | 6658 | * perf_group_detach(). |
6452 | */ | 6659 | */ |
6453 | fput_light(group_file, fput_needed); | 6660 | fdput(group); |
6454 | fd_install(event_fd, event_file); | 6661 | fd_install(event_fd, event_file); |
6455 | return event_fd; | 6662 | return event_fd; |
6456 | 6663 | ||
@@ -6464,7 +6671,7 @@ err_task: | |||
6464 | if (task) | 6671 | if (task) |
6465 | put_task_struct(task); | 6672 | put_task_struct(task); |
6466 | err_group_fd: | 6673 | err_group_fd: |
6467 | fput_light(group_file, fput_needed); | 6674 | fdput(group); |
6468 | err_fd: | 6675 | err_fd: |
6469 | put_unused_fd(event_fd); | 6676 | put_unused_fd(event_fd); |
6470 | return err; | 6677 | return err; |
@@ -7227,7 +7434,7 @@ unlock: | |||
7227 | device_initcall(perf_event_sysfs_init); | 7434 | device_initcall(perf_event_sysfs_init); |
7228 | 7435 | ||
7229 | #ifdef CONFIG_CGROUP_PERF | 7436 | #ifdef CONFIG_CGROUP_PERF |
7230 | static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) | 7437 | static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) |
7231 | { | 7438 | { |
7232 | struct perf_cgroup *jc; | 7439 | struct perf_cgroup *jc; |
7233 | 7440 | ||
@@ -7244,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) | |||
7244 | return &jc->css; | 7451 | return &jc->css; |
7245 | } | 7452 | } |
7246 | 7453 | ||
7247 | static void perf_cgroup_destroy(struct cgroup *cont) | 7454 | static void perf_cgroup_css_free(struct cgroup *cont) |
7248 | { | 7455 | { |
7249 | struct perf_cgroup *jc; | 7456 | struct perf_cgroup *jc; |
7250 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | 7457 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), |
@@ -7285,9 +7492,16 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | |||
7285 | struct cgroup_subsys perf_subsys = { | 7492 | struct cgroup_subsys perf_subsys = { |
7286 | .name = "perf_event", | 7493 | .name = "perf_event", |
7287 | .subsys_id = perf_subsys_id, | 7494 | .subsys_id = perf_subsys_id, |
7288 | .create = perf_cgroup_create, | 7495 | .css_alloc = perf_cgroup_css_alloc, |
7289 | .destroy = perf_cgroup_destroy, | 7496 | .css_free = perf_cgroup_css_free, |
7290 | .exit = perf_cgroup_exit, | 7497 | .exit = perf_cgroup_exit, |
7291 | .attach = perf_cgroup_attach, | 7498 | .attach = perf_cgroup_attach, |
7499 | |||
7500 | /* | ||
7501 | * perf_event cgroup doesn't handle nesting correctly. | ||
7502 | * ctx->nr_cgroups adjustments should be propagated through the | ||
7503 | * cgroup hierarchy. Fix it and remove the following. | ||
7504 | */ | ||
7505 | .broken_hierarchy = true, | ||
7292 | }; | 7506 | }; |
7293 | #endif /* CONFIG_CGROUP_PERF */ | 7507 | #endif /* CONFIG_CGROUP_PERF */ |
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9a7b487c6fe2..fe8a916507ed 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
@@ -111,14 +111,16 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | |||
111 | * Count the number of breakpoints of the same type and same task. | 111 | * Count the number of breakpoints of the same type and same task. |
112 | * The given event must be not on the list. | 112 | * The given event must be not on the list. |
113 | */ | 113 | */ |
114 | static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) | 114 | static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) |
115 | { | 115 | { |
116 | struct task_struct *tsk = bp->hw.bp_target; | 116 | struct task_struct *tsk = bp->hw.bp_target; |
117 | struct perf_event *iter; | 117 | struct perf_event *iter; |
118 | int count = 0; | 118 | int count = 0; |
119 | 119 | ||
120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
121 | if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) | 121 | if (iter->hw.bp_target == tsk && |
122 | find_slot_idx(iter) == type && | ||
123 | cpu == iter->cpu) | ||
122 | count += hw_breakpoint_weight(iter); | 124 | count += hw_breakpoint_weight(iter); |
123 | } | 125 | } |
124 | 126 | ||
@@ -141,7 +143,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
141 | if (!tsk) | 143 | if (!tsk) |
142 | slots->pinned += max_task_bp_pinned(cpu, type); | 144 | slots->pinned += max_task_bp_pinned(cpu, type); |
143 | else | 145 | else |
144 | slots->pinned += task_bp_pinned(bp, type); | 146 | slots->pinned += task_bp_pinned(cpu, bp, type); |
145 | slots->flexible = per_cpu(nr_bp_flexible[type], cpu); | 147 | slots->flexible = per_cpu(nr_bp_flexible[type], cpu); |
146 | 148 | ||
147 | return; | 149 | return; |
@@ -154,7 +156,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
154 | if (!tsk) | 156 | if (!tsk) |
155 | nr += max_task_bp_pinned(cpu, type); | 157 | nr += max_task_bp_pinned(cpu, type); |
156 | else | 158 | else |
157 | nr += task_bp_pinned(bp, type); | 159 | nr += task_bp_pinned(cpu, bp, type); |
158 | 160 | ||
159 | if (nr > slots->pinned) | 161 | if (nr > slots->pinned) |
160 | slots->pinned = nr; | 162 | slots->pinned = nr; |
@@ -188,7 +190,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, | |||
188 | int old_idx = 0; | 190 | int old_idx = 0; |
189 | int idx = 0; | 191 | int idx = 0; |
190 | 192 | ||
191 | old_count = task_bp_pinned(bp, type); | 193 | old_count = task_bp_pinned(cpu, bp, type); |
192 | old_idx = old_count - 1; | 194 | old_idx = old_count - 1; |
193 | idx = old_idx + weight; | 195 | idx = old_idx + weight; |
194 | 196 | ||
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index a096c19f2c2a..d56a64c99a8b 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _KERNEL_EVENTS_INTERNAL_H | 2 | #define _KERNEL_EVENTS_INTERNAL_H |
3 | 3 | ||
4 | #include <linux/hardirq.h> | 4 | #include <linux/hardirq.h> |
5 | #include <linux/uaccess.h> | ||
5 | 6 | ||
6 | /* Buffer handling */ | 7 | /* Buffer handling */ |
7 | 8 | ||
@@ -76,30 +77,53 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) | |||
76 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | 77 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); |
77 | } | 78 | } |
78 | 79 | ||
79 | static inline void | 80 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ |
80 | __output_copy(struct perf_output_handle *handle, | 81 | static inline unsigned int \ |
81 | const void *buf, unsigned int len) | 82 | func_name(struct perf_output_handle *handle, \ |
83 | const void *buf, unsigned int len) \ | ||
84 | { \ | ||
85 | unsigned long size, written; \ | ||
86 | \ | ||
87 | do { \ | ||
88 | size = min_t(unsigned long, handle->size, len); \ | ||
89 | \ | ||
90 | written = memcpy_func(handle->addr, buf, size); \ | ||
91 | \ | ||
92 | len -= written; \ | ||
93 | handle->addr += written; \ | ||
94 | buf += written; \ | ||
95 | handle->size -= written; \ | ||
96 | if (!handle->size) { \ | ||
97 | struct ring_buffer *rb = handle->rb; \ | ||
98 | \ | ||
99 | handle->page++; \ | ||
100 | handle->page &= rb->nr_pages - 1; \ | ||
101 | handle->addr = rb->data_pages[handle->page]; \ | ||
102 | handle->size = PAGE_SIZE << page_order(rb); \ | ||
103 | } \ | ||
104 | } while (len && written == size); \ | ||
105 | \ | ||
106 | return len; \ | ||
107 | } | ||
108 | |||
109 | static inline int memcpy_common(void *dst, const void *src, size_t n) | ||
82 | { | 110 | { |
83 | do { | 111 | memcpy(dst, src, n); |
84 | unsigned long size = min_t(unsigned long, handle->size, len); | 112 | return n; |
85 | |||
86 | memcpy(handle->addr, buf, size); | ||
87 | |||
88 | len -= size; | ||
89 | handle->addr += size; | ||
90 | buf += size; | ||
91 | handle->size -= size; | ||
92 | if (!handle->size) { | ||
93 | struct ring_buffer *rb = handle->rb; | ||
94 | |||
95 | handle->page++; | ||
96 | handle->page &= rb->nr_pages - 1; | ||
97 | handle->addr = rb->data_pages[handle->page]; | ||
98 | handle->size = PAGE_SIZE << page_order(rb); | ||
99 | } | ||
100 | } while (len); | ||
101 | } | 113 | } |
102 | 114 | ||
115 | DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) | ||
116 | |||
117 | #define MEMCPY_SKIP(dst, src, n) (n) | ||
118 | |||
119 | DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) | ||
120 | |||
121 | #ifndef arch_perf_out_copy_user | ||
122 | #define arch_perf_out_copy_user __copy_from_user_inatomic | ||
123 | #endif | ||
124 | |||
125 | DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) | ||
126 | |||
103 | /* Callchain handling */ | 127 | /* Callchain handling */ |
104 | extern struct perf_callchain_entry * | 128 | extern struct perf_callchain_entry * |
105 | perf_callchain(struct perf_event *event, struct pt_regs *regs); | 129 | perf_callchain(struct perf_event *event, struct pt_regs *regs); |
@@ -134,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx) | |||
134 | recursion[rctx]--; | 158 | recursion[rctx]--; |
135 | } | 159 | } |
136 | 160 | ||
161 | #ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP | ||
162 | static inline bool arch_perf_have_user_stack_dump(void) | ||
163 | { | ||
164 | return true; | ||
165 | } | ||
166 | |||
167 | #define perf_user_stack_pointer(regs) user_stack_pointer(regs) | ||
168 | #else | ||
169 | static inline bool arch_perf_have_user_stack_dump(void) | ||
170 | { | ||
171 | return false; | ||
172 | } | ||
173 | |||
174 | #define perf_user_stack_pointer(regs) 0 | ||
175 | #endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */ | ||
176 | |||
137 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ | 177 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ |
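DEFINE_OUTPUT_COPY above stamps out __output_copy, __output_skip and __output_copy_user from a memcpy-like primitive that reports how many bytes it handled; the generated helper returns how many of the requested bytes were left over. A hypothetical extra variant only needs another primitive with the same contract (memcpy_zero and __output_zero are invented for illustration):

    /*
     * Hypothetical extra variant: zero-fill the ring-buffer area instead
     * of copying. The primitive returns the number of bytes it handled,
     * matching memcpy_common() above.
     */
    static inline int memcpy_zero(void *dst, const void *src, size_t n)
    {
            (void)src;                      /* source is ignored on purpose */
            memset(dst, 0, n);
            return n;
    }

    DEFINE_OUTPUT_COPY(__output_zero, memcpy_zero)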
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6ddaba43fb7a..23cb34ff3973 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -182,10 +182,16 @@ out: | |||
182 | return -ENOSPC; | 182 | return -ENOSPC; |
183 | } | 183 | } |
184 | 184 | ||
185 | void perf_output_copy(struct perf_output_handle *handle, | 185 | unsigned int perf_output_copy(struct perf_output_handle *handle, |
186 | const void *buf, unsigned int len) | 186 | const void *buf, unsigned int len) |
187 | { | 187 | { |
188 | __output_copy(handle, buf, len); | 188 | return __output_copy(handle, buf, len); |
189 | } | ||
190 | |||
191 | unsigned int perf_output_skip(struct perf_output_handle *handle, | ||
192 | unsigned int len) | ||
193 | { | ||
194 | return __output_skip(handle, NULL, len); | ||
189 | } | 195 | } |
190 | 196 | ||
191 | void perf_output_end(struct perf_output_handle *handle) | 197 | void perf_output_end(struct perf_output_handle *handle) |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c08a22d02f72..dea7acfbb071 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/ptrace.h> /* user_enable_single_step */ | 33 | #include <linux/ptrace.h> /* user_enable_single_step */ |
34 | #include <linux/kdebug.h> /* notifier mechanism */ | 34 | #include <linux/kdebug.h> /* notifier mechanism */ |
35 | #include "../../mm/internal.h" /* munlock_vma_page */ | 35 | #include "../../mm/internal.h" /* munlock_vma_page */ |
36 | #include <linux/percpu-rwsem.h> | ||
36 | 37 | ||
37 | #include <linux/uprobes.h> | 38 | #include <linux/uprobes.h> |
38 | 39 | ||
@@ -71,6 +72,8 @@ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; | |||
71 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; | 72 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; |
72 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | 73 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) |
73 | 74 | ||
75 | static struct percpu_rw_semaphore dup_mmap_sem; | ||
76 | |||
74 | /* | 77 | /* |
75 | * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe | 78 | * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe |
76 | * events active at this time. Probably a fine grained per inode count is | 79 | * events active at this time. Probably a fine grained per inode count is |
@@ -78,15 +81,23 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; | |||
78 | */ | 81 | */ |
79 | static atomic_t uprobe_events = ATOMIC_INIT(0); | 82 | static atomic_t uprobe_events = ATOMIC_INIT(0); |
80 | 83 | ||
84 | /* Have a copy of the original instruction */ | ||
85 | #define UPROBE_COPY_INSN 0 | ||
86 | /* Don't run handlers while the first register / last unregister is in progress */ | ||
87 | #define UPROBE_RUN_HANDLER 1 | ||
88 | /* Can skip singlestep */ | ||
89 | #define UPROBE_SKIP_SSTEP 2 | ||
90 | |||
81 | struct uprobe { | 91 | struct uprobe { |
82 | struct rb_node rb_node; /* node in the rb tree */ | 92 | struct rb_node rb_node; /* node in the rb tree */ |
83 | atomic_t ref; | 93 | atomic_t ref; |
84 | struct rw_semaphore consumer_rwsem; | 94 | struct rw_semaphore consumer_rwsem; |
95 | struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ | ||
85 | struct list_head pending_list; | 96 | struct list_head pending_list; |
86 | struct uprobe_consumer *consumers; | 97 | struct uprobe_consumer *consumers; |
87 | struct inode *inode; /* Also hold a ref to inode */ | 98 | struct inode *inode; /* Also hold a ref to inode */ |
88 | loff_t offset; | 99 | loff_t offset; |
89 | int flags; | 100 | unsigned long flags; |
90 | struct arch_uprobe arch; | 101 | struct arch_uprobe arch; |
91 | }; | 102 | }; |
92 | 103 | ||
@@ -100,17 +111,12 @@ struct uprobe { | |||
100 | */ | 111 | */ |
101 | static bool valid_vma(struct vm_area_struct *vma, bool is_register) | 112 | static bool valid_vma(struct vm_area_struct *vma, bool is_register) |
102 | { | 113 | { |
103 | if (!vma->vm_file) | 114 | vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; |
104 | return false; | ||
105 | |||
106 | if (!is_register) | ||
107 | return true; | ||
108 | 115 | ||
109 | if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) | 116 | if (is_register) |
110 | == (VM_READ|VM_EXEC)) | 117 | flags |= VM_WRITE; |
111 | return true; | ||
112 | 118 | ||
113 | return false; | 119 | return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC; |
114 | } | 120 | } |
115 | 121 | ||
116 | static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) | 122 | static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) |
@@ -141,10 +147,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
141 | spinlock_t *ptl; | 147 | spinlock_t *ptl; |
142 | pte_t *ptep; | 148 | pte_t *ptep; |
143 | int err; | 149 | int err; |
150 | /* For mmu_notifiers */ | ||
151 | const unsigned long mmun_start = addr; | ||
152 | const unsigned long mmun_end = addr + PAGE_SIZE; | ||
144 | 153 | ||
145 | /* For try_to_free_swap() and munlock_vma_page() below */ | 154 | /* For try_to_free_swap() and munlock_vma_page() below */ |
146 | lock_page(page); | 155 | lock_page(page); |
147 | 156 | ||
157 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
148 | err = -EAGAIN; | 158 | err = -EAGAIN; |
149 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 159 | ptep = page_check_address(page, mm, addr, &ptl, 0); |
150 | if (!ptep) | 160 | if (!ptep) |
@@ -173,6 +183,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
173 | 183 | ||
174 | err = 0; | 184 | err = 0; |
175 | unlock: | 185 | unlock: |
186 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
176 | unlock_page(page); | 187 | unlock_page(page); |
177 | return err; | 188 | return err; |
178 | } | 189 | } |
@@ -188,19 +199,44 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) | |||
188 | return *insn == UPROBE_SWBP_INSN; | 199 | return *insn == UPROBE_SWBP_INSN; |
189 | } | 200 | } |
190 | 201 | ||
202 | static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) | ||
203 | { | ||
204 | void *kaddr = kmap_atomic(page); | ||
205 | memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); | ||
206 | kunmap_atomic(kaddr); | ||
207 | } | ||
208 | |||
209 | static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) | ||
210 | { | ||
211 | uprobe_opcode_t old_opcode; | ||
212 | bool is_swbp; | ||
213 | |||
214 | copy_opcode(page, vaddr, &old_opcode); | ||
215 | is_swbp = is_swbp_insn(&old_opcode); | ||
216 | |||
217 | if (is_swbp_insn(new_opcode)) { | ||
218 | if (is_swbp) /* register: already installed? */ | ||
219 | return 0; | ||
220 | } else { | ||
221 | if (!is_swbp) /* unregister: was it changed by us? */ | ||
222 | return 0; | ||
223 | } | ||
224 | |||
225 | return 1; | ||
226 | } | ||
227 | |||
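verify_opcode() above makes write_opcode() idempotent: installing a breakpoint that is already present, or restoring an instruction that is no longer a breakpoint, becomes a no-op. A small user-space model of that decision (verify, NOTHING_TO_DO and DO_WRITE are illustrative names):

    #include <stdbool.h>

    /* Model of the decision table verify_opcode() encodes above. */
    enum { NOTHING_TO_DO = 0, DO_WRITE = 1 };

    static int verify(bool old_is_swbp, bool new_is_swbp)
    {
            if (new_is_swbp)                /* registering a breakpoint */
                    return old_is_swbp ? NOTHING_TO_DO : DO_WRITE;
            /* unregistering: only restore if our breakpoint is still there */
            return old_is_swbp ? DO_WRITE : NOTHING_TO_DO;
    }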
191 | /* | 228 | /* |
192 | * NOTE: | 229 | * NOTE: |
193 | * Expect the breakpoint instruction to be the smallest size instruction for | 230 | * Expect the breakpoint instruction to be the smallest size instruction for |
194 | * the architecture. If an arch has variable length instruction and the | 231 | * the architecture. If an arch has variable length instruction and the |
195 | * breakpoint instruction is not of the smallest length instruction | 232 | * breakpoint instruction is not of the smallest length instruction |
196 | * supported by that architecture then we need to modify read_opcode / | 233 | * supported by that architecture then we need to modify is_swbp_at_addr and |
197 | * write_opcode accordingly. This would never be a problem for archs that | 234 | * write_opcode accordingly. This would never be a problem for archs that |
198 | * have fixed length instructions. | 235 | * have fixed length instructions. |
199 | */ | 236 | */ |
200 | 237 | ||
201 | /* | 238 | /* |
202 | * write_opcode - write the opcode at a given virtual address. | 239 | * write_opcode - write the opcode at a given virtual address. |
203 | * @auprobe: arch breakpointing information. | ||
204 | * @mm: the probed process address space. | 240 | * @mm: the probed process address space. |
205 | * @vaddr: the virtual address to store the opcode. | 241 | * @vaddr: the virtual address to store the opcode. |
206 | * @opcode: opcode to be written at @vaddr. | 242 | * @opcode: opcode to be written at @vaddr. |
@@ -211,8 +247,8 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) | |||
211 | * For mm @mm, write the opcode at @vaddr. | 247 | * For mm @mm, write the opcode at @vaddr. |
212 | * Return 0 (success) or a negative errno. | 248 | * Return 0 (success) or a negative errno. |
213 | */ | 249 | */ |
214 | static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, | 250 | static int write_opcode(struct mm_struct *mm, unsigned long vaddr, |
215 | unsigned long vaddr, uprobe_opcode_t opcode) | 251 | uprobe_opcode_t opcode) |
216 | { | 252 | { |
217 | struct page *old_page, *new_page; | 253 | struct page *old_page, *new_page; |
218 | void *vaddr_old, *vaddr_new; | 254 | void *vaddr_old, *vaddr_new; |
@@ -221,10 +257,14 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, | |||
221 | 257 | ||
222 | retry: | 258 | retry: |
223 | /* Read the page with vaddr into memory */ | 259 | /* Read the page with vaddr into memory */ |
224 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); | 260 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); |
225 | if (ret <= 0) | 261 | if (ret <= 0) |
226 | return ret; | 262 | return ret; |
227 | 263 | ||
264 | ret = verify_opcode(old_page, vaddr, &opcode); | ||
265 | if (ret <= 0) | ||
266 | goto put_old; | ||
267 | |||
228 | ret = -ENOMEM; | 268 | ret = -ENOMEM; |
229 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); | 269 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); |
230 | if (!new_page) | 270 | if (!new_page) |
@@ -259,65 +299,6 @@ put_old: | |||
259 | } | 299 | } |
260 | 300 | ||
261 | /** | 301 | /** |
262 | * read_opcode - read the opcode at a given virtual address. | ||
263 | * @mm: the probed process address space. | ||
264 | * @vaddr: the virtual address to read the opcode. | ||
265 | * @opcode: location to store the read opcode. | ||
266 | * | ||
267 | * Called with mm->mmap_sem held (for read and with a reference to | ||
268 | * mm. | ||
269 | * | ||
270 | * For mm @mm, read the opcode at @vaddr and store it in @opcode. | ||
271 | * Return 0 (success) or a negative errno. | ||
272 | */ | ||
273 | static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode) | ||
274 | { | ||
275 | struct page *page; | ||
276 | void *vaddr_new; | ||
277 | int ret; | ||
278 | |||
279 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); | ||
280 | if (ret <= 0) | ||
281 | return ret; | ||
282 | |||
283 | lock_page(page); | ||
284 | vaddr_new = kmap_atomic(page); | ||
285 | vaddr &= ~PAGE_MASK; | ||
286 | memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); | ||
287 | kunmap_atomic(vaddr_new); | ||
288 | unlock_page(page); | ||
289 | |||
290 | put_page(page); | ||
291 | |||
292 | return 0; | ||
293 | } | ||
294 | |||
295 | static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) | ||
296 | { | ||
297 | uprobe_opcode_t opcode; | ||
298 | int result; | ||
299 | |||
300 | if (current->mm == mm) { | ||
301 | pagefault_disable(); | ||
302 | result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, | ||
303 | sizeof(opcode)); | ||
304 | pagefault_enable(); | ||
305 | |||
306 | if (likely(result == 0)) | ||
307 | goto out; | ||
308 | } | ||
309 | |||
310 | result = read_opcode(mm, vaddr, &opcode); | ||
311 | if (result) | ||
312 | return result; | ||
313 | out: | ||
314 | if (is_swbp_insn(&opcode)) | ||
315 | return 1; | ||
316 | |||
317 | return 0; | ||
318 | } | ||
319 | |||
320 | /** | ||
321 | * set_swbp - store breakpoint at a given address. | 302 | * set_swbp - store breakpoint at a given address. |
322 | * @auprobe: arch specific probepoint information. | 303 | * @auprobe: arch specific probepoint information. |
323 | * @mm: the probed process address space. | 304 | * @mm: the probed process address space. |
@@ -328,18 +309,7 @@ out: | |||
328 | */ | 309 | */ |
329 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 310 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
330 | { | 311 | { |
331 | int result; | 312 | return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); |
332 | /* | ||
333 | * See the comment near uprobes_hash(). | ||
334 | */ | ||
335 | result = is_swbp_at_addr(mm, vaddr); | ||
336 | if (result == 1) | ||
337 | return -EEXIST; | ||
338 | |||
339 | if (result) | ||
340 | return result; | ||
341 | |||
342 | return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); | ||
343 | } | 313 | } |
344 | 314 | ||
345 | /** | 315 | /** |
@@ -347,25 +317,14 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned | |||
347 | * @mm: the probed process address space. | 317 | * @mm: the probed process address space. |
348 | * @auprobe: arch specific probepoint information. | 318 | * @auprobe: arch specific probepoint information. |
349 | * @vaddr: the virtual address to insert the opcode. | 319 | * @vaddr: the virtual address to insert the opcode. |
350 | * @verify: if true, verify existance of breakpoint instruction. | ||
351 | * | 320 | * |
352 | * For mm @mm, restore the original opcode (opcode) at @vaddr. | 321 | * For mm @mm, restore the original opcode (opcode) at @vaddr. |
353 | * Return 0 (success) or a negative errno. | 322 | * Return 0 (success) or a negative errno. |
354 | */ | 323 | */ |
355 | int __weak | 324 | int __weak |
356 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) | 325 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
357 | { | 326 | { |
358 | if (verify) { | 327 | return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); |
359 | int result; | ||
360 | |||
361 | result = is_swbp_at_addr(mm, vaddr); | ||
362 | if (!result) | ||
363 | return -EINVAL; | ||
364 | |||
365 | if (result != 1) | ||
366 | return result; | ||
367 | } | ||
368 | return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); | ||
369 | } | 328 | } |
370 | 329 | ||
371 | static int match_uprobe(struct uprobe *l, struct uprobe *r) | 330 | static int match_uprobe(struct uprobe *l, struct uprobe *r) |
@@ -415,11 +374,10 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) | |||
415 | static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) | 374 | static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) |
416 | { | 375 | { |
417 | struct uprobe *uprobe; | 376 | struct uprobe *uprobe; |
418 | unsigned long flags; | ||
419 | 377 | ||
420 | spin_lock_irqsave(&uprobes_treelock, flags); | 378 | spin_lock(&uprobes_treelock); |
421 | uprobe = __find_uprobe(inode, offset); | 379 | uprobe = __find_uprobe(inode, offset); |
422 | spin_unlock_irqrestore(&uprobes_treelock, flags); | 380 | spin_unlock(&uprobes_treelock); |
423 | 381 | ||
424 | return uprobe; | 382 | return uprobe; |
425 | } | 383 | } |
@@ -466,15 +424,14 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) | |||
466 | */ | 424 | */ |
467 | static struct uprobe *insert_uprobe(struct uprobe *uprobe) | 425 | static struct uprobe *insert_uprobe(struct uprobe *uprobe) |
468 | { | 426 | { |
469 | unsigned long flags; | ||
470 | struct uprobe *u; | 427 | struct uprobe *u; |
471 | 428 | ||
472 | spin_lock_irqsave(&uprobes_treelock, flags); | 429 | spin_lock(&uprobes_treelock); |
473 | u = __insert_uprobe(uprobe); | 430 | u = __insert_uprobe(uprobe); |
474 | spin_unlock_irqrestore(&uprobes_treelock, flags); | 431 | spin_unlock(&uprobes_treelock); |
475 | 432 | ||
476 | /* For now assume that the instruction need not be single-stepped */ | 433 | /* For now assume that the instruction need not be single-stepped */ |
477 | uprobe->flags |= UPROBE_SKIP_SSTEP; | 434 | __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); |
478 | 435 | ||
479 | return u; | 436 | return u; |
480 | } | 437 | } |
@@ -496,6 +453,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) | |||
496 | uprobe->inode = igrab(inode); | 453 | uprobe->inode = igrab(inode); |
497 | uprobe->offset = offset; | 454 | uprobe->offset = offset; |
498 | init_rwsem(&uprobe->consumer_rwsem); | 455 | init_rwsem(&uprobe->consumer_rwsem); |
456 | mutex_init(&uprobe->copy_mutex); | ||
499 | 457 | ||
500 | /* add to uprobes_tree, sorted on inode:offset */ | 458 | /* add to uprobes_tree, sorted on inode:offset */ |
501 | cur_uprobe = insert_uprobe(uprobe); | 459 | cur_uprobe = insert_uprobe(uprobe); |
@@ -516,7 +474,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) | |||
516 | { | 474 | { |
517 | struct uprobe_consumer *uc; | 475 | struct uprobe_consumer *uc; |
518 | 476 | ||
519 | if (!(uprobe->flags & UPROBE_RUN_HANDLER)) | 477 | if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) |
520 | return; | 478 | return; |
521 | 479 | ||
522 | down_read(&uprobe->consumer_rwsem); | 480 | down_read(&uprobe->consumer_rwsem); |
@@ -622,33 +580,48 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) | |||
622 | return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); | 580 | return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); |
623 | } | 581 | } |
624 | 582 | ||
625 | /* | 583 | static int prepare_uprobe(struct uprobe *uprobe, struct file *file, |
626 | * How mm->uprobes_state.count gets updated | 584 | struct mm_struct *mm, unsigned long vaddr) |
627 | * uprobe_mmap() increments the count if | 585 | { |
628 | * - it successfully adds a breakpoint. | 586 | int ret = 0; |
629 | * - it cannot add a breakpoint, but sees that there is a underlying | 587 | |
630 | * breakpoint (via a is_swbp_at_addr()). | 588 | if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) |
631 | * | 589 | return ret; |
632 | * uprobe_munmap() decrements the count if | 590 | |
633 | * - it sees a underlying breakpoint, (via is_swbp_at_addr) | 591 | mutex_lock(&uprobe->copy_mutex); |
634 | * (Subsequent uprobe_unregister wouldnt find the breakpoint | 592 | if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) |
635 | * unless a uprobe_mmap kicks in, since the old vma would be | 593 | goto out; |
636 | * dropped just after uprobe_munmap.) | 594 | |
637 | * | 595 | ret = copy_insn(uprobe, file); |
638 | * uprobe_register increments the count if: | 596 | if (ret) |
639 | * - it successfully adds a breakpoint. | 597 | goto out; |
640 | * | 598 | |
641 | * uprobe_unregister decrements the count if: | 599 | ret = -ENOTSUPP; |
642 | * - it sees a underlying breakpoint and removes successfully. | 600 | if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) |
643 | * (via is_swbp_at_addr) | 601 | goto out; |
644 | * (Subsequent uprobe_munmap wouldnt find the breakpoint | 602 | |
645 | * since there is no underlying breakpoint after the | 603 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); |
646 | * breakpoint removal.) | 604 | if (ret) |
647 | */ | 605 | goto out; |
606 | |||
607 | /* write_opcode() assumes we don't cross page boundary */ | ||
608 | BUG_ON((uprobe->offset & ~PAGE_MASK) + | ||
609 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | ||
610 | |||
611 | smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ | ||
612 | set_bit(UPROBE_COPY_INSN, &uprobe->flags); | ||
613 | |||
614 | out: | ||
615 | mutex_unlock(&uprobe->copy_mutex); | ||
616 | |||
617 | return ret; | ||
618 | } | ||
619 | |||
648 | static int | 620 | static int |
649 | install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | 621 | install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, |
650 | struct vm_area_struct *vma, unsigned long vaddr) | 622 | struct vm_area_struct *vma, unsigned long vaddr) |
651 | { | 623 | { |
624 | bool first_uprobe; | ||
652 | int ret; | 625 | int ret; |
653 | 626 | ||
654 | /* | 627 | /* |
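The new prepare_uprobe() above is a double-checked, one-time initialization: UPROBE_COPY_INSN is tested locklessly, re-tested under copy_mutex, and only set after an smp_wmb() once the instruction has been copied and analyzed, so anyone who observes the bit also sees the copied instruction. A minimal userspace sketch of that pattern, with C11 atomics and a pthread mutex standing in for the kernel primitives (editor's illustration, not part of the patch):

    /* Build: gcc -O2 -pthread prepare_once.c -o prepare_once */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static pthread_mutex_t copy_mutex = PTHREAD_MUTEX_INITIALIZER;
    static atomic_int insn_copied;      /* stands in for UPROBE_COPY_INSN */
    static int insn;                    /* stands in for uprobe->arch.insn */

    static int prepare(void)
    {
        int ret = 0;

        /* fast path: already prepared, no lock taken */
        if (atomic_load_explicit(&insn_copied, memory_order_acquire))
            return 0;

        pthread_mutex_lock(&copy_mutex);
        /* re-check under the lock: another thread may have won the race */
        if (atomic_load_explicit(&insn_copied, memory_order_acquire))
            goto out;

        insn = 42;                      /* the "copy_insn + analyze" work */

        /*
         * Publish the flag only after the data is written; the release
         * store plays the role of smp_wmb() + set_bit() in the kernel,
         * the acquire loads above play the role of the pairing smp_rmb().
         */
        atomic_store_explicit(&insn_copied, 1, memory_order_release);
    out:
        pthread_mutex_unlock(&copy_mutex);
        return ret;
    }

    int main(void)
    {
        prepare();
        printf("insn=%d copied=%d\n", insn, atomic_load(&insn_copied));
        return 0;
    }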
@@ -659,48 +632,38 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
659 | * Hence behave as if probe already existed. | 632 | * Hence behave as if probe already existed. |
660 | */ | 633 | */ |
661 | if (!uprobe->consumers) | 634 | if (!uprobe->consumers) |
662 | return -EEXIST; | 635 | return 0; |
663 | |||
664 | if (!(uprobe->flags & UPROBE_COPY_INSN)) { | ||
665 | ret = copy_insn(uprobe, vma->vm_file); | ||
666 | if (ret) | ||
667 | return ret; | ||
668 | |||
669 | if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) | ||
670 | return -ENOTSUPP; | ||
671 | |||
672 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); | ||
673 | if (ret) | ||
674 | return ret; | ||
675 | |||
676 | /* write_opcode() assumes we don't cross page boundary */ | ||
677 | BUG_ON((uprobe->offset & ~PAGE_MASK) + | ||
678 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | ||
679 | 636 | ||
680 | uprobe->flags |= UPROBE_COPY_INSN; | 637 | ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); |
681 | } | 638 | if (ret) |
639 | return ret; | ||
682 | 640 | ||
683 | /* | 641 | /* |
684 | * Ideally, should be updating the probe count after the breakpoint | 642 | * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), |
685 | * has been successfully inserted. However a thread could hit the | 643 | * the task can hit this breakpoint right after __replace_page(). |
686 | * breakpoint we just inserted even before the probe count is | ||
687 | * incremented. If this is the first breakpoint placed, breakpoint | ||
688 | * notifier might ignore uprobes and pass the trap to the thread. | ||
689 | * Hence increment before and decrement on failure. | ||
690 | */ | 644 | */ |
691 | atomic_inc(&mm->uprobes_state.count); | 645 | first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); |
646 | if (first_uprobe) | ||
647 | set_bit(MMF_HAS_UPROBES, &mm->flags); | ||
648 | |||
692 | ret = set_swbp(&uprobe->arch, mm, vaddr); | 649 | ret = set_swbp(&uprobe->arch, mm, vaddr); |
693 | if (ret) | 650 | if (!ret) |
694 | atomic_dec(&mm->uprobes_state.count); | 651 | clear_bit(MMF_RECALC_UPROBES, &mm->flags); |
652 | else if (first_uprobe) | ||
653 | clear_bit(MMF_HAS_UPROBES, &mm->flags); | ||
695 | 654 | ||
696 | return ret; | 655 | return ret; |
697 | } | 656 | } |
698 | 657 | ||
699 | static void | 658 | static int |
700 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) | 659 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) |
701 | { | 660 | { |
702 | if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) | 661 | /* can happen if uprobe_register() fails */ |
703 | atomic_dec(&mm->uprobes_state.count); | 662 | if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) |
663 | return 0; | ||
664 | |||
665 | set_bit(MMF_RECALC_UPROBES, &mm->flags); | ||
666 | return set_orig_insn(&uprobe->arch, mm, vaddr); | ||
704 | } | 667 | } |
705 | 668 | ||
706 | /* | 669 | /* |
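install_breakpoint() now publishes a per-mm hint, MMF_HAS_UPROBES, before writing the trap, so a thread that hits the new breakpoint immediately is not dismissed by uprobe_pre_sstep_notifier(); the bit is rolled back only if this was the first probe and set_swbp() failed. remove_breakpoint() no longer keeps an exact count: it just marks the hint as possibly stale via MMF_RECALC_UPROBES. A compressed userspace sketch of that flag discipline (hypothetical helpers, not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    #define MMF_HAS_UPROBES    (1u << 0)
    #define MMF_RECALC_UPROBES (1u << 1)

    struct mm { unsigned int flags; };

    static int write_breakpoint(bool fail) { return fail ? -1 : 0; }

    static int install(struct mm *mm, bool fail)
    {
        bool first = !(mm->flags & MMF_HAS_UPROBES);
        int ret;

        if (first)
            mm->flags |= MMF_HAS_UPROBES;   /* visible before the swbp */

        ret = write_breakpoint(fail);
        if (!ret)
            mm->flags &= ~MMF_RECALC_UPROBES;
        else if (first)
            mm->flags &= ~MMF_HAS_UPROBES;  /* roll the hint back */

        return ret;
    }

    static int remove_bp(struct mm *mm)
    {
        if (!(mm->flags & MMF_HAS_UPROBES)) /* register() failed earlier */
            return 0;
        mm->flags |= MMF_RECALC_UPROBES;    /* recheck lazily on next hit */
        return 0;                           /* set_orig_insn() in the kernel */
    }

    int main(void)
    {
        struct mm mm = { 0 };

        install(&mm, false);
        remove_bp(&mm);
        printf("flags=%#x\n", mm.flags);
        return 0;
    }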
@@ -710,11 +673,9 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad | |||
710 | */ | 673 | */ |
711 | static void delete_uprobe(struct uprobe *uprobe) | 674 | static void delete_uprobe(struct uprobe *uprobe) |
712 | { | 675 | { |
713 | unsigned long flags; | 676 | spin_lock(&uprobes_treelock); |
714 | |||
715 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
716 | rb_erase(&uprobe->rb_node, &uprobes_tree); | 677 | rb_erase(&uprobe->rb_node, &uprobes_tree); |
717 | spin_unlock_irqrestore(&uprobes_treelock, flags); | 678 | spin_unlock(&uprobes_treelock); |
718 | iput(uprobe->inode); | 679 | iput(uprobe->inode); |
719 | put_uprobe(uprobe); | 680 | put_uprobe(uprobe); |
720 | atomic_dec(&uprobe_events); | 681 | atomic_dec(&uprobe_events); |
@@ -737,7 +698,6 @@ static struct map_info * | |||
737 | build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | 698 | build_map_info(struct address_space *mapping, loff_t offset, bool is_register) |
738 | { | 699 | { |
739 | unsigned long pgoff = offset >> PAGE_SHIFT; | 700 | unsigned long pgoff = offset >> PAGE_SHIFT; |
740 | struct prio_tree_iter iter; | ||
741 | struct vm_area_struct *vma; | 701 | struct vm_area_struct *vma; |
742 | struct map_info *curr = NULL; | 702 | struct map_info *curr = NULL; |
743 | struct map_info *prev = NULL; | 703 | struct map_info *prev = NULL; |
@@ -746,7 +706,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
746 | 706 | ||
747 | again: | 707 | again: |
748 | mutex_lock(&mapping->i_mmap_mutex); | 708 | mutex_lock(&mapping->i_mmap_mutex); |
749 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 709 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
750 | if (!valid_vma(vma, is_register)) | 710 | if (!valid_vma(vma, is_register)) |
751 | continue; | 711 | continue; |
752 | 712 | ||
@@ -809,16 +769,19 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | |||
809 | struct map_info *info; | 769 | struct map_info *info; |
810 | int err = 0; | 770 | int err = 0; |
811 | 771 | ||
772 | percpu_down_write(&dup_mmap_sem); | ||
812 | info = build_map_info(uprobe->inode->i_mapping, | 773 | info = build_map_info(uprobe->inode->i_mapping, |
813 | uprobe->offset, is_register); | 774 | uprobe->offset, is_register); |
814 | if (IS_ERR(info)) | 775 | if (IS_ERR(info)) { |
815 | return PTR_ERR(info); | 776 | err = PTR_ERR(info); |
777 | goto out; | ||
778 | } | ||
816 | 779 | ||
817 | while (info) { | 780 | while (info) { |
818 | struct mm_struct *mm = info->mm; | 781 | struct mm_struct *mm = info->mm; |
819 | struct vm_area_struct *vma; | 782 | struct vm_area_struct *vma; |
820 | 783 | ||
821 | if (err) | 784 | if (err && is_register) |
822 | goto free; | 785 | goto free; |
823 | 786 | ||
824 | down_write(&mm->mmap_sem); | 787 | down_write(&mm->mmap_sem); |
@@ -831,24 +794,19 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | |||
831 | vaddr_to_offset(vma, info->vaddr) != uprobe->offset) | 794 | vaddr_to_offset(vma, info->vaddr) != uprobe->offset) |
832 | goto unlock; | 795 | goto unlock; |
833 | 796 | ||
834 | if (is_register) { | 797 | if (is_register) |
835 | err = install_breakpoint(uprobe, mm, vma, info->vaddr); | 798 | err = install_breakpoint(uprobe, mm, vma, info->vaddr); |
836 | /* | 799 | else |
837 | * We can race against uprobe_mmap(), see the | 800 | err |= remove_breakpoint(uprobe, mm, info->vaddr); |
838 | * comment near uprobe_hash(). | 801 | |
839 | */ | ||
840 | if (err == -EEXIST) | ||
841 | err = 0; | ||
842 | } else { | ||
843 | remove_breakpoint(uprobe, mm, info->vaddr); | ||
844 | } | ||
845 | unlock: | 802 | unlock: |
846 | up_write(&mm->mmap_sem); | 803 | up_write(&mm->mmap_sem); |
847 | free: | 804 | free: |
848 | mmput(mm); | 805 | mmput(mm); |
849 | info = free_map_info(info); | 806 | info = free_map_info(info); |
850 | } | 807 | } |
851 | 808 | out: | |
809 | percpu_up_write(&dup_mmap_sem); | ||
852 | return err; | 810 | return err; |
853 | } | 811 | } |
854 | 812 | ||
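register_for_each_vma() now runs under the write side of dup_mmap_sem, while dup_mmap() (see the kernel/fork.c hunks further down) holds the read side via uprobe_start_dup_mmap()/uprobe_end_dup_mmap(), so a fork cannot duplicate an mm halfway through a register/unregister walk; removal errors are also accumulated with err |= instead of aborting, since unregister must visit every mm. A rough userspace analogue of the read/write exclusion, using an ordinary pthread rwlock in place of the per-CPU rwsem (illustrative sketch only):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t dup_mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

    /* fork path: many forks may run concurrently (shared/read side) */
    static void *dup_mmap_thread(void *arg)
    {
        (void)arg;
        pthread_rwlock_rdlock(&dup_mmap_sem);
        /* ... copy all vmas of the parent mm ... */
        pthread_rwlock_unlock(&dup_mmap_sem);
        return NULL;
    }

    /* register path: excludes every fork while the vma list is walked */
    static void register_for_each_vma(void)
    {
        pthread_rwlock_wrlock(&dup_mmap_sem);
        /* ... build_map_info() + install/remove breakpoints ... */
        pthread_rwlock_unlock(&dup_mmap_sem);
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, dup_mmap_thread, NULL);
        register_for_each_vma();
        pthread_join(t, NULL);
        puts("done");
        return 0;
    }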
@@ -897,18 +855,21 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * | |||
897 | mutex_lock(uprobes_hash(inode)); | 855 | mutex_lock(uprobes_hash(inode)); |
898 | uprobe = alloc_uprobe(inode, offset); | 856 | uprobe = alloc_uprobe(inode, offset); |
899 | 857 | ||
900 | if (uprobe && !consumer_add(uprobe, uc)) { | 858 | if (!uprobe) { |
859 | ret = -ENOMEM; | ||
860 | } else if (!consumer_add(uprobe, uc)) { | ||
901 | ret = __uprobe_register(uprobe); | 861 | ret = __uprobe_register(uprobe); |
902 | if (ret) { | 862 | if (ret) { |
903 | uprobe->consumers = NULL; | 863 | uprobe->consumers = NULL; |
904 | __uprobe_unregister(uprobe); | 864 | __uprobe_unregister(uprobe); |
905 | } else { | 865 | } else { |
906 | uprobe->flags |= UPROBE_RUN_HANDLER; | 866 | set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); |
907 | } | 867 | } |
908 | } | 868 | } |
909 | 869 | ||
910 | mutex_unlock(uprobes_hash(inode)); | 870 | mutex_unlock(uprobes_hash(inode)); |
911 | put_uprobe(uprobe); | 871 | if (uprobe) |
872 | put_uprobe(uprobe); | ||
912 | 873 | ||
913 | return ret; | 874 | return ret; |
914 | } | 875 | } |
@@ -935,7 +896,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume | |||
935 | if (consumer_del(uprobe, uc)) { | 896 | if (consumer_del(uprobe, uc)) { |
936 | if (!uprobe->consumers) { | 897 | if (!uprobe->consumers) { |
937 | __uprobe_unregister(uprobe); | 898 | __uprobe_unregister(uprobe); |
938 | uprobe->flags &= ~UPROBE_RUN_HANDLER; | 899 | clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); |
939 | } | 900 | } |
940 | } | 901 | } |
941 | 902 | ||
@@ -978,7 +939,6 @@ static void build_probe_list(struct inode *inode, | |||
978 | struct list_head *head) | 939 | struct list_head *head) |
979 | { | 940 | { |
980 | loff_t min, max; | 941 | loff_t min, max; |
981 | unsigned long flags; | ||
982 | struct rb_node *n, *t; | 942 | struct rb_node *n, *t; |
983 | struct uprobe *u; | 943 | struct uprobe *u; |
984 | 944 | ||
@@ -986,7 +946,7 @@ static void build_probe_list(struct inode *inode, | |||
986 | min = vaddr_to_offset(vma, start); | 946 | min = vaddr_to_offset(vma, start); |
987 | max = min + (end - start) - 1; | 947 | max = min + (end - start) - 1; |
988 | 948 | ||
989 | spin_lock_irqsave(&uprobes_treelock, flags); | 949 | spin_lock(&uprobes_treelock); |
990 | n = find_node_in_range(inode, min, max); | 950 | n = find_node_in_range(inode, min, max); |
991 | if (n) { | 951 | if (n) { |
992 | for (t = n; t; t = rb_prev(t)) { | 952 | for (t = n; t; t = rb_prev(t)) { |
@@ -1004,27 +964,20 @@ static void build_probe_list(struct inode *inode, | |||
1004 | atomic_inc(&u->ref); | 964 | atomic_inc(&u->ref); |
1005 | } | 965 | } |
1006 | } | 966 | } |
1007 | spin_unlock_irqrestore(&uprobes_treelock, flags); | 967 | spin_unlock(&uprobes_treelock); |
1008 | } | 968 | } |
1009 | 969 | ||
1010 | /* | 970 | /* |
1011 | * Called from mmap_region. | 971 | * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. |
1012 | * called with mm->mmap_sem acquired. | ||
1013 | * | 972 | * |
1014 | * Return -ve no if we fail to insert probes and we cannot | 973 | * Currently we ignore all errors and always return 0, the callers |
1015 | * bail-out. | 974 | * can't handle the failure anyway. |
1016 | * Return 0 otherwise. i.e: | ||
1017 | * | ||
1018 | * - successful insertion of probes | ||
1019 | * - (or) no possible probes to be inserted. | ||
1020 | * - (or) insertion of probes failed but we can bail-out. | ||
1021 | */ | 975 | */ |
1022 | int uprobe_mmap(struct vm_area_struct *vma) | 976 | int uprobe_mmap(struct vm_area_struct *vma) |
1023 | { | 977 | { |
1024 | struct list_head tmp_list; | 978 | struct list_head tmp_list; |
1025 | struct uprobe *uprobe, *u; | 979 | struct uprobe *uprobe, *u; |
1026 | struct inode *inode; | 980 | struct inode *inode; |
1027 | int ret, count; | ||
1028 | 981 | ||
1029 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) | 982 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) |
1030 | return 0; | 983 | return 0; |
@@ -1036,44 +989,35 @@ int uprobe_mmap(struct vm_area_struct *vma) | |||
1036 | mutex_lock(uprobes_mmap_hash(inode)); | 989 | mutex_lock(uprobes_mmap_hash(inode)); |
1037 | build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); | 990 | build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); |
1038 | 991 | ||
1039 | ret = 0; | ||
1040 | count = 0; | ||
1041 | |||
1042 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | 992 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { |
1043 | if (!ret) { | 993 | if (!fatal_signal_pending(current)) { |
1044 | unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); | 994 | unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); |
1045 | 995 | install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); | |
1046 | ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); | ||
1047 | /* | ||
1048 | * We can race against uprobe_register(), see the | ||
1049 | * comment near uprobe_hash(). | ||
1050 | */ | ||
1051 | if (ret == -EEXIST) { | ||
1052 | ret = 0; | ||
1053 | |||
1054 | if (!is_swbp_at_addr(vma->vm_mm, vaddr)) | ||
1055 | continue; | ||
1056 | |||
1057 | /* | ||
1058 | * Unable to insert a breakpoint, but | ||
1059 | * breakpoint lies underneath. Increment the | ||
1060 | * probe count. | ||
1061 | */ | ||
1062 | atomic_inc(&vma->vm_mm->uprobes_state.count); | ||
1063 | } | ||
1064 | |||
1065 | if (!ret) | ||
1066 | count++; | ||
1067 | } | 996 | } |
1068 | put_uprobe(uprobe); | 997 | put_uprobe(uprobe); |
1069 | } | 998 | } |
1070 | |||
1071 | mutex_unlock(uprobes_mmap_hash(inode)); | 999 | mutex_unlock(uprobes_mmap_hash(inode)); |
1072 | 1000 | ||
1073 | if (ret) | 1001 | return 0; |
1074 | atomic_sub(count, &vma->vm_mm->uprobes_state.count); | 1002 | } |
1075 | 1003 | ||
1076 | return ret; | 1004 | static bool |
1005 | vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) | ||
1006 | { | ||
1007 | loff_t min, max; | ||
1008 | struct inode *inode; | ||
1009 | struct rb_node *n; | ||
1010 | |||
1011 | inode = vma->vm_file->f_mapping->host; | ||
1012 | |||
1013 | min = vaddr_to_offset(vma, start); | ||
1014 | max = min + (end - start) - 1; | ||
1015 | |||
1016 | spin_lock(&uprobes_treelock); | ||
1017 | n = find_node_in_range(inode, min, max); | ||
1018 | spin_unlock(&uprobes_treelock); | ||
1019 | |||
1020 | return !!n; | ||
1077 | } | 1021 | } |
1078 | 1022 | ||
1079 | /* | 1023 | /* |
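With vma_has_uprobes() in place, uprobe_munmap() below no longer walks a probe list and pokes at text pages; it only asks whether the unmapped range of the file could contain a registered probe (a range lookup in the inode:offset rbtree) and, if so, marks the mm for lazy recalculation. The lookup is just an interval test against an ordered set; a toy userspace version over a sorted offset array (assumed data, not the kernel rbtree):

    #include <stdbool.h>
    #include <stdio.h>

    /* registered probe offsets within one inode, kept sorted */
    static const long long probe_offsets[] = { 0x400, 0x1230, 0x8000 };
    static const int nr_probes = 3;

    /* does any probe fall inside [min, max]? (stands in for find_node_in_range) */
    static bool range_has_probe(long long min, long long max)
    {
        int lo = 0, hi = nr_probes - 1;

        while (lo <= hi) {
            int mid = lo + (hi - lo) / 2;

            if (probe_offsets[mid] < min)
                lo = mid + 1;
            else if (probe_offsets[mid] > max)
                hi = mid - 1;
            else
                return true;        /* min <= offset <= max */
        }
        return false;
    }

    int main(void)
    {
        /* in the kernel: min = vaddr_to_offset(vma, start), max = min + len - 1 */
        printf("%d\n", range_has_probe(0x1000, 0x1fff));   /* 1 */
        printf("%d\n", range_has_probe(0x2000, 0x2fff));   /* 0 */
        return 0;
    }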
@@ -1081,37 +1025,18 @@ int uprobe_mmap(struct vm_area_struct *vma) | |||
1081 | */ | 1025 | */ |
1082 | void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) | 1026 | void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) |
1083 | { | 1027 | { |
1084 | struct list_head tmp_list; | ||
1085 | struct uprobe *uprobe, *u; | ||
1086 | struct inode *inode; | ||
1087 | |||
1088 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) | 1028 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) |
1089 | return; | 1029 | return; |
1090 | 1030 | ||
1091 | if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ | 1031 | if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ |
1092 | return; | 1032 | return; |
1093 | 1033 | ||
1094 | if (!atomic_read(&vma->vm_mm->uprobes_state.count)) | 1034 | if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || |
1035 | test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) | ||
1095 | return; | 1036 | return; |
1096 | 1037 | ||
1097 | inode = vma->vm_file->f_mapping->host; | 1038 | if (vma_has_uprobes(vma, start, end)) |
1098 | if (!inode) | 1039 | set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); |
1099 | return; | ||
1100 | |||
1101 | mutex_lock(uprobes_mmap_hash(inode)); | ||
1102 | build_probe_list(inode, vma, start, end, &tmp_list); | ||
1103 | |||
1104 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | ||
1105 | unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); | ||
1106 | /* | ||
1107 | * An unregister could have removed the probe before | ||
1108 | * unmap. So check before we decrement the count. | ||
1109 | */ | ||
1110 | if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) | ||
1111 | atomic_dec(&vma->vm_mm->uprobes_state.count); | ||
1112 | put_uprobe(uprobe); | ||
1113 | } | ||
1114 | mutex_unlock(uprobes_mmap_hash(inode)); | ||
1115 | } | 1040 | } |
1116 | 1041 | ||
1117 | /* Slot allocation for XOL */ | 1042 | /* Slot allocation for XOL */ |
@@ -1213,13 +1138,25 @@ void uprobe_clear_state(struct mm_struct *mm) | |||
1213 | kfree(area); | 1138 | kfree(area); |
1214 | } | 1139 | } |
1215 | 1140 | ||
1216 | /* | 1141 | void uprobe_start_dup_mmap(void) |
1217 | * uprobe_reset_state - Free the area allocated for slots. | 1142 | { |
1218 | */ | 1143 | percpu_down_read(&dup_mmap_sem); |
1219 | void uprobe_reset_state(struct mm_struct *mm) | 1144 | } |
1145 | |||
1146 | void uprobe_end_dup_mmap(void) | ||
1220 | { | 1147 | { |
1221 | mm->uprobes_state.xol_area = NULL; | 1148 | percpu_up_read(&dup_mmap_sem); |
1222 | atomic_set(&mm->uprobes_state.count, 0); | 1149 | } |
1150 | |||
1151 | void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) | ||
1152 | { | ||
1153 | newmm->uprobes_state.xol_area = NULL; | ||
1154 | |||
1155 | if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { | ||
1156 | set_bit(MMF_HAS_UPROBES, &newmm->flags); | ||
1157 | /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ | ||
1158 | set_bit(MMF_RECALC_UPROBES, &newmm->flags); | ||
1159 | } | ||
1223 | } | 1160 | } |
1224 | 1161 | ||
1225 | /* | 1162 | /* |
@@ -1279,6 +1216,11 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot | |||
1279 | vaddr = kmap_atomic(area->page); | 1216 | vaddr = kmap_atomic(area->page); |
1280 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); | 1217 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); |
1281 | kunmap_atomic(vaddr); | 1218 | kunmap_atomic(vaddr); |
1219 | /* | ||
1220 | * We probably need flush_icache_user_range() but it needs vma. | ||
1221 | * This should work on supported architectures too. | ||
1222 | */ | ||
1223 | flush_dcache_page(area->page); | ||
1282 | 1224 | ||
1283 | return current->utask->xol_vaddr; | 1225 | return current->utask->xol_vaddr; |
1284 | } | 1226 | } |
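The added flush_dcache_page() keeps the bytes written through the kernel mapping of the XOL slot coherent with what the task will later fetch as instructions; as the comment notes, flush_icache_user_range() would be the precise tool but needs a vma. The rough userspace analogue of "wrote code bytes, make them fetchable" is the compiler's cache-sync builtin, shown here without actually jumping to the buffer (editor's sketch; assumes GCC/Clang's __builtin___clear_cache and that the system allows an anonymous RWX mapping):

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        /* an executable scratch page, playing the role of the XOL area */
        unsigned char *slot = mmap(NULL, 4096,
                                   PROT_READ | PROT_WRITE | PROT_EXEC,
                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        unsigned char insn[4] = { 0x90, 0x90, 0x90, 0x90 }; /* placeholder bytes */

        if (slot == MAP_FAILED)
            return 1;

        memcpy(slot, insn, sizeof(insn));
        /*
         * Make the just-written bytes visible to instruction fetch before
         * anything executes them; a no-op on x86, a D-cache clean plus
         * I-cache invalidate on architectures that need it.
         */
        __builtin___clear_cache((char *)slot, (char *)slot + sizeof(insn));

        printf("slot at %p ready\n", (void *)slot);
        munmap(slot, 4096);
        return 0;
    }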
@@ -1430,13 +1372,57 @@ bool uprobe_deny_signal(void) | |||
1430 | */ | 1372 | */ |
1431 | static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) | 1373 | static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) |
1432 | { | 1374 | { |
1433 | if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) | 1375 | if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { |
1434 | return true; | 1376 | if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) |
1435 | 1377 | return true; | |
1436 | uprobe->flags &= ~UPROBE_SKIP_SSTEP; | 1378 | clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); |
1379 | } | ||
1437 | return false; | 1380 | return false; |
1438 | } | 1381 | } |
1439 | 1382 | ||
1383 | static void mmf_recalc_uprobes(struct mm_struct *mm) | ||
1384 | { | ||
1385 | struct vm_area_struct *vma; | ||
1386 | |||
1387 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
1388 | if (!valid_vma(vma, false)) | ||
1389 | continue; | ||
1390 | /* | ||
1391 | * This is not strictly accurate, we can race with | ||
1392 | * uprobe_unregister() and see the already removed | ||
1393 | * uprobe if delete_uprobe() was not yet called. | ||
1394 | */ | ||
1395 | if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) | ||
1396 | return; | ||
1397 | } | ||
1398 | |||
1399 | clear_bit(MMF_HAS_UPROBES, &mm->flags); | ||
1400 | } | ||
1401 | |||
1402 | static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) | ||
1403 | { | ||
1404 | struct page *page; | ||
1405 | uprobe_opcode_t opcode; | ||
1406 | int result; | ||
1407 | |||
1408 | pagefault_disable(); | ||
1409 | result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, | ||
1410 | sizeof(opcode)); | ||
1411 | pagefault_enable(); | ||
1412 | |||
1413 | if (likely(result == 0)) | ||
1414 | goto out; | ||
1415 | |||
1416 | result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); | ||
1417 | if (result < 0) | ||
1418 | return result; | ||
1419 | |||
1420 | copy_opcode(page, vaddr, &opcode); | ||
1421 | put_page(page); | ||
1422 | out: | ||
1423 | return is_swbp_insn(&opcode); | ||
1424 | } | ||
1425 | |||
1440 | static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) | 1426 | static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) |
1441 | { | 1427 | { |
1442 | struct mm_struct *mm = current->mm; | 1428 | struct mm_struct *mm = current->mm; |
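mmf_recalc_uprobes() is the other half of the lazy hint: when MMF_RECALC_UPROBES is seen on a breakpoint miss, every suitable vma is rechecked and MMF_HAS_UPROBES is cleared only if none of them still maps a registered probe. is_swbp_at_addr() also moves here and first tries a pagefault-disabled inline copy of the opcode before falling back to get_user_pages(). A small sketch of the recalculation over an assumed vma array (userspace illustration, not the kernel walker):

    #include <stdbool.h>
    #include <stdio.h>

    #define MMF_HAS_UPROBES    (1u << 0)
    #define MMF_RECALC_UPROBES (1u << 1)

    struct vma { bool has_probe; };

    struct mm {
        unsigned int flags;
        struct vma *vmas;
        int nr_vmas;
    };

    /* stands in for vma_has_uprobes(): does this vma still map a probe? */
    static bool vma_has_probe(const struct vma *vma)
    {
        return vma->has_probe;
    }

    static void mmf_recalc(struct mm *mm)
    {
        for (int i = 0; i < mm->nr_vmas; i++) {
            /* one surviving probe is enough to keep the hint */
            if (vma_has_probe(&mm->vmas[i]))
                return;
        }
        mm->flags &= ~MMF_HAS_UPROBES;
    }

    int main(void)
    {
        struct vma vmas[] = { { false }, { false } };
        struct mm mm = { MMF_HAS_UPROBES | MMF_RECALC_UPROBES, vmas, 2 };

        if (mm.flags & MMF_RECALC_UPROBES) {
            mm.flags &= ~MMF_RECALC_UPROBES;
            mmf_recalc(&mm);
        }
        printf("flags=%#x\n", mm.flags);    /* 0: no probes left */
        return 0;
    }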
@@ -1458,6 +1444,9 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) | |||
1458 | } else { | 1444 | } else { |
1459 | *is_swbp = -EFAULT; | 1445 | *is_swbp = -EFAULT; |
1460 | } | 1446 | } |
1447 | |||
1448 | if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) | ||
1449 | mmf_recalc_uprobes(mm); | ||
1461 | up_read(&mm->mmap_sem); | 1450 | up_read(&mm->mmap_sem); |
1462 | 1451 | ||
1463 | return uprobe; | 1452 | return uprobe; |
@@ -1494,41 +1483,41 @@ static void handle_swbp(struct pt_regs *regs) | |||
1494 | } | 1483 | } |
1495 | return; | 1484 | return; |
1496 | } | 1485 | } |
1486 | /* | ||
1487 | * TODO: move copy_insn/etc into _register and remove this hack. | ||
1488 | * After we hit the bp, _unregister + _register can install the | ||
1489 | * new and not-yet-analyzed uprobe at the same address, restart. | ||
1490 | */ | ||
1491 | smp_rmb(); /* pairs with wmb() in install_breakpoint() */ | ||
1492 | if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) | ||
1493 | goto restart; | ||
1497 | 1494 | ||
1498 | utask = current->utask; | 1495 | utask = current->utask; |
1499 | if (!utask) { | 1496 | if (!utask) { |
1500 | utask = add_utask(); | 1497 | utask = add_utask(); |
1501 | /* Cannot allocate; re-execute the instruction. */ | 1498 | /* Cannot allocate; re-execute the instruction. */ |
1502 | if (!utask) | 1499 | if (!utask) |
1503 | goto cleanup_ret; | 1500 | goto restart; |
1504 | } | 1501 | } |
1505 | utask->active_uprobe = uprobe; | 1502 | |
1506 | handler_chain(uprobe, regs); | 1503 | handler_chain(uprobe, regs); |
1507 | if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) | 1504 | if (can_skip_sstep(uprobe, regs)) |
1508 | goto cleanup_ret; | 1505 | goto out; |
1509 | 1506 | ||
1510 | utask->state = UTASK_SSTEP; | ||
1511 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { | 1507 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { |
1512 | user_enable_single_step(current); | 1508 | utask->active_uprobe = uprobe; |
1509 | utask->state = UTASK_SSTEP; | ||
1513 | return; | 1510 | return; |
1514 | } | 1511 | } |
1515 | 1512 | ||
1516 | cleanup_ret: | 1513 | restart: |
1517 | if (utask) { | 1514 | /* |
1518 | utask->active_uprobe = NULL; | 1515 | * cannot singlestep; cannot skip instruction; |
1519 | utask->state = UTASK_RUNNING; | 1516 | * re-execute the instruction. |
1520 | } | 1517 | */ |
1521 | if (uprobe) { | 1518 | instruction_pointer_set(regs, bp_vaddr); |
1522 | if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) | 1519 | out: |
1523 | 1520 | put_uprobe(uprobe); | |
1524 | /* | ||
1525 | * cannot singlestep; cannot skip instruction; | ||
1526 | * re-execute the instruction. | ||
1527 | */ | ||
1528 | instruction_pointer_set(regs, bp_vaddr); | ||
1529 | |||
1530 | put_uprobe(uprobe); | ||
1531 | } | ||
1532 | } | 1521 | } |
1533 | 1522 | ||
1534 | /* | 1523 | /* |
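handle_swbp() is reorganized around two labels: "restart" (the probe cannot be single-stepped right now, e.g. UPROBE_COPY_INSN is not yet set by a racing register, so rewind the IP to the breakpoint address and let the task retake the trap) and "out" (handlers ran and the step was skipped or could not be set up). utask->active_uprobe is only set once pre_ssout() succeeds, which is what uprobe_notify_resume() keys off later. A control-flow sketch with stubbed-out helpers (names mirror the patch, bodies are fake):

    #include <stdbool.h>
    #include <stdio.h>

    struct regs { unsigned long ip; };

    static bool insn_copied = true;     /* UPROBE_COPY_INSN */
    static bool can_skip    = false;    /* can_skip_sstep() result */
    static bool ssout_ok    = true;     /* pre_ssout() succeeded */
    static bool active;                 /* utask->active_uprobe != NULL */

    static void handle_swbp(struct regs *regs, unsigned long bp_vaddr)
    {
        if (!insn_copied)
            goto restart;       /* racing register: retake the trap */

        /* handler_chain(uprobe, regs); */

        if (can_skip)
            goto out;           /* emulated, nothing to single-step */

        if (ssout_ok) {
            active = true;      /* utask->active_uprobe = uprobe */
            return;             /* resume at the XOL slot */
        }

    restart:
        /* cannot single-step and cannot skip: re-execute the original insn */
        regs->ip = bp_vaddr;
    out:
        /* put_uprobe(uprobe); */
        ;
    }

    int main(void)
    {
        struct regs regs = { .ip = 0 };

        handle_swbp(&regs, 0x400123);
        printf("ip=%#lx active=%d\n", regs.ip, active);
        return 0;
    }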
@@ -1550,7 +1539,6 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | |||
1550 | put_uprobe(uprobe); | 1539 | put_uprobe(uprobe); |
1551 | utask->active_uprobe = NULL; | 1540 | utask->active_uprobe = NULL; |
1552 | utask->state = UTASK_RUNNING; | 1541 | utask->state = UTASK_RUNNING; |
1553 | user_disable_single_step(current); | ||
1554 | xol_free_insn_slot(current); | 1542 | xol_free_insn_slot(current); |
1555 | 1543 | ||
1556 | spin_lock_irq(¤t->sighand->siglock); | 1544 | spin_lock_irq(¤t->sighand->siglock); |
@@ -1559,13 +1547,12 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | |||
1559 | } | 1547 | } |
1560 | 1548 | ||
1561 | /* | 1549 | /* |
1562 | * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on | 1550 | * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and |
1563 | * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and | 1551 | * allows the thread to return from interrupt. After that handle_swbp() |
1564 | * allows the thread to return from interrupt. | 1552 | * sets utask->active_uprobe. |
1565 | * | 1553 | * |
1566 | * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and | 1554 | * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag |
1567 | * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from | 1555 | * and allows the thread to return from interrupt. |
1568 | * interrupt. | ||
1569 | * | 1556 | * |
1570 | * While returning to userspace, thread notices the TIF_UPROBE flag and calls | 1557 | * While returning to userspace, thread notices the TIF_UPROBE flag and calls |
1571 | * uprobe_notify_resume(). | 1558 | * uprobe_notify_resume(). |
@@ -1574,11 +1561,13 @@ void uprobe_notify_resume(struct pt_regs *regs) | |||
1574 | { | 1561 | { |
1575 | struct uprobe_task *utask; | 1562 | struct uprobe_task *utask; |
1576 | 1563 | ||
1564 | clear_thread_flag(TIF_UPROBE); | ||
1565 | |||
1577 | utask = current->utask; | 1566 | utask = current->utask; |
1578 | if (!utask || utask->state == UTASK_BP_HIT) | 1567 | if (utask && utask->active_uprobe) |
1579 | handle_swbp(regs); | ||
1580 | else | ||
1581 | handle_singlestep(utask, regs); | 1568 | handle_singlestep(utask, regs); |
1569 | else | ||
1570 | handle_swbp(regs); | ||
1582 | } | 1571 | } |
1583 | 1572 | ||
1584 | /* | 1573 | /* |
@@ -1587,18 +1576,10 @@ void uprobe_notify_resume(struct pt_regs *regs) | |||
1587 | */ | 1576 | */ |
1588 | int uprobe_pre_sstep_notifier(struct pt_regs *regs) | 1577 | int uprobe_pre_sstep_notifier(struct pt_regs *regs) |
1589 | { | 1578 | { |
1590 | struct uprobe_task *utask; | 1579 | if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) |
1591 | |||
1592 | if (!current->mm || !atomic_read(¤t->mm->uprobes_state.count)) | ||
1593 | /* task is currently not uprobed */ | ||
1594 | return 0; | 1580 | return 0; |
1595 | 1581 | ||
1596 | utask = current->utask; | ||
1597 | if (utask) | ||
1598 | utask->state = UTASK_BP_HIT; | ||
1599 | |||
1600 | set_thread_flag(TIF_UPROBE); | 1582 | set_thread_flag(TIF_UPROBE); |
1601 | |||
1602 | return 1; | 1583 | return 1; |
1603 | } | 1584 | } |
1604 | 1585 | ||
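The UTASK_BP_HIT state is gone: uprobe_pre_sstep_notifier() only checks MMF_HAS_UPROBES and raises TIF_UPROBE, and uprobe_notify_resume() clears the flag itself and chooses the path by whether utask->active_uprobe is set (single-step completion) or not (fresh breakpoint). A stripped-down sketch of that dispatch (stand-in types, not the kernel structures):

    #include <stdbool.h>
    #include <stdio.h>

    struct utask { void *active_uprobe; };

    struct task {
        bool tif_uprobe;        /* TIF_UPROBE */
        bool mm_has_uprobes;    /* MMF_HAS_UPROBES */
        struct utask *utask;
    };

    /* breakpoint/single-step notifier: just flag the task, do the work later */
    static int pre_sstep_notifier(struct task *t)
    {
        if (!t->mm_has_uprobes)
            return 0;           /* not our trap */
        t->tif_uprobe = true;
        return 1;
    }

    /* run on the way back to user space */
    static void notify_resume(struct task *t)
    {
        t->tif_uprobe = false;

        if (t->utask && t->utask->active_uprobe)
            puts("handle_singlestep()");
        else
            puts("handle_swbp()");
    }

    int main(void)
    {
        struct utask ut = { .active_uprobe = NULL };
        struct task t = { .mm_has_uprobes = true, .utask = &ut };

        if (pre_sstep_notifier(&t) && t.tif_uprobe)
            notify_resume(&t);  /* prints handle_swbp() */
        return 0;
    }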
@@ -1633,6 +1614,9 @@ static int __init init_uprobes(void) | |||
1633 | mutex_init(&uprobes_mmap_mutex[i]); | 1614 | mutex_init(&uprobes_mmap_mutex[i]); |
1634 | } | 1615 | } |
1635 | 1616 | ||
1617 | if (percpu_init_rwsem(&dup_mmap_sem)) | ||
1618 | return -ENOMEM; | ||
1619 | |||
1636 | return register_die_notifier(&uprobe_exception_nb); | 1620 | return register_die_notifier(&uprobe_exception_nb); |
1637 | } | 1621 | } |
1638 | module_init(init_uprobes); | 1622 | module_init(init_uprobes); |
diff --git a/kernel/exit.c b/kernel/exit.c index f65345f9e5bb..b4df21937216 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead) | |||
72 | list_del_rcu(&p->tasks); | 72 | list_del_rcu(&p->tasks); |
73 | list_del_init(&p->sibling); | 73 | list_del_init(&p->sibling); |
74 | __this_cpu_dec(process_counts); | 74 | __this_cpu_dec(process_counts); |
75 | /* | ||
76 | * If we are the last child process in a pid namespace to be | ||
77 | * reaped, notify the reaper sleeping zap_pid_ns_processes(). | ||
78 | */ | ||
79 | if (IS_ENABLED(CONFIG_PID_NS)) { | ||
80 | struct task_struct *parent = p->real_parent; | ||
81 | |||
82 | if ((task_active_pid_ns(parent)->child_reaper == parent) && | ||
83 | list_empty(&parent->children) && | ||
84 | (parent->flags & PF_EXITING)) | ||
85 | wake_up_process(parent); | ||
86 | } | ||
87 | } | 75 | } |
88 | list_del_rcu(&p->thread_group); | 76 | list_del_rcu(&p->thread_group); |
89 | } | 77 | } |
@@ -322,43 +310,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) | |||
322 | } | 310 | } |
323 | } | 311 | } |
324 | 312 | ||
325 | /** | ||
326 | * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd | ||
327 | * | ||
328 | * If a kernel thread is launched as a result of a system call, or if | ||
329 | * it ever exits, it should generally reparent itself to kthreadd so it | ||
330 | * isn't in the way of other processes and is correctly cleaned up on exit. | ||
331 | * | ||
332 | * The various task state such as scheduling policy and priority may have | ||
333 | * been inherited from a user process, so we reset them to sane values here. | ||
334 | * | ||
335 | * NOTE that reparent_to_kthreadd() gives the caller full capabilities. | ||
336 | */ | ||
337 | static void reparent_to_kthreadd(void) | ||
338 | { | ||
339 | write_lock_irq(&tasklist_lock); | ||
340 | |||
341 | ptrace_unlink(current); | ||
342 | /* Reparent to init */ | ||
343 | current->real_parent = current->parent = kthreadd_task; | ||
344 | list_move_tail(¤t->sibling, ¤t->real_parent->children); | ||
345 | |||
346 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | ||
347 | current->exit_signal = SIGCHLD; | ||
348 | |||
349 | if (task_nice(current) < 0) | ||
350 | set_user_nice(current, 0); | ||
351 | /* cpus_allowed? */ | ||
352 | /* rt_priority? */ | ||
353 | /* signals? */ | ||
354 | memcpy(current->signal->rlim, init_task.signal->rlim, | ||
355 | sizeof(current->signal->rlim)); | ||
356 | |||
357 | atomic_inc(&init_cred.usage); | ||
358 | commit_creds(&init_cred); | ||
359 | write_unlock_irq(&tasklist_lock); | ||
360 | } | ||
361 | |||
362 | void __set_special_pids(struct pid *pid) | 313 | void __set_special_pids(struct pid *pid) |
363 | { | 314 | { |
364 | struct task_struct *curr = current->group_leader; | 315 | struct task_struct *curr = current->group_leader; |
@@ -370,13 +321,6 @@ void __set_special_pids(struct pid *pid) | |||
370 | change_pid(curr, PIDTYPE_PGID, pid); | 321 | change_pid(curr, PIDTYPE_PGID, pid); |
371 | } | 322 | } |
372 | 323 | ||
373 | static void set_special_pids(struct pid *pid) | ||
374 | { | ||
375 | write_lock_irq(&tasklist_lock); | ||
376 | __set_special_pids(pid); | ||
377 | write_unlock_irq(&tasklist_lock); | ||
378 | } | ||
379 | |||
380 | /* | 324 | /* |
381 | * Let kernel threads use this to say that they allow a certain signal. | 325 | * Let kernel threads use this to say that they allow a certain signal. |
382 | * Must not be used if kthread was cloned with CLONE_SIGHAND. | 326 | * Must not be used if kthread was cloned with CLONE_SIGHAND. |
@@ -416,149 +360,6 @@ int disallow_signal(int sig) | |||
416 | 360 | ||
417 | EXPORT_SYMBOL(disallow_signal); | 361 | EXPORT_SYMBOL(disallow_signal); |
418 | 362 | ||
419 | /* | ||
420 | * Put all the gunge required to become a kernel thread without | ||
421 | * attached user resources in one place where it belongs. | ||
422 | */ | ||
423 | |||
424 | void daemonize(const char *name, ...) | ||
425 | { | ||
426 | va_list args; | ||
427 | sigset_t blocked; | ||
428 | |||
429 | va_start(args, name); | ||
430 | vsnprintf(current->comm, sizeof(current->comm), name, args); | ||
431 | va_end(args); | ||
432 | |||
433 | /* | ||
434 | * If we were started as result of loading a module, close all of the | ||
435 | * user space pages. We don't need them, and if we didn't close them | ||
436 | * they would be locked into memory. | ||
437 | */ | ||
438 | exit_mm(current); | ||
439 | /* | ||
440 | * We don't want to get frozen, in case system-wide hibernation | ||
441 | * or suspend transition begins right now. | ||
442 | */ | ||
443 | current->flags |= (PF_NOFREEZE | PF_KTHREAD); | ||
444 | |||
445 | if (current->nsproxy != &init_nsproxy) { | ||
446 | get_nsproxy(&init_nsproxy); | ||
447 | switch_task_namespaces(current, &init_nsproxy); | ||
448 | } | ||
449 | set_special_pids(&init_struct_pid); | ||
450 | proc_clear_tty(current); | ||
451 | |||
452 | /* Block and flush all signals */ | ||
453 | sigfillset(&blocked); | ||
454 | sigprocmask(SIG_BLOCK, &blocked, NULL); | ||
455 | flush_signals(current); | ||
456 | |||
457 | /* Become as one with the init task */ | ||
458 | |||
459 | daemonize_fs_struct(); | ||
460 | exit_files(current); | ||
461 | current->files = init_task.files; | ||
462 | atomic_inc(¤t->files->count); | ||
463 | |||
464 | reparent_to_kthreadd(); | ||
465 | } | ||
466 | |||
467 | EXPORT_SYMBOL(daemonize); | ||
468 | |||
469 | static void close_files(struct files_struct * files) | ||
470 | { | ||
471 | int i, j; | ||
472 | struct fdtable *fdt; | ||
473 | |||
474 | j = 0; | ||
475 | |||
476 | /* | ||
477 | * It is safe to dereference the fd table without RCU or | ||
478 | * ->file_lock because this is the last reference to the | ||
479 | * files structure. But use RCU to shut RCU-lockdep up. | ||
480 | */ | ||
481 | rcu_read_lock(); | ||
482 | fdt = files_fdtable(files); | ||
483 | rcu_read_unlock(); | ||
484 | for (;;) { | ||
485 | unsigned long set; | ||
486 | i = j * BITS_PER_LONG; | ||
487 | if (i >= fdt->max_fds) | ||
488 | break; | ||
489 | set = fdt->open_fds[j++]; | ||
490 | while (set) { | ||
491 | if (set & 1) { | ||
492 | struct file * file = xchg(&fdt->fd[i], NULL); | ||
493 | if (file) { | ||
494 | filp_close(file, files); | ||
495 | cond_resched(); | ||
496 | } | ||
497 | } | ||
498 | i++; | ||
499 | set >>= 1; | ||
500 | } | ||
501 | } | ||
502 | } | ||
503 | |||
504 | struct files_struct *get_files_struct(struct task_struct *task) | ||
505 | { | ||
506 | struct files_struct *files; | ||
507 | |||
508 | task_lock(task); | ||
509 | files = task->files; | ||
510 | if (files) | ||
511 | atomic_inc(&files->count); | ||
512 | task_unlock(task); | ||
513 | |||
514 | return files; | ||
515 | } | ||
516 | |||
517 | void put_files_struct(struct files_struct *files) | ||
518 | { | ||
519 | struct fdtable *fdt; | ||
520 | |||
521 | if (atomic_dec_and_test(&files->count)) { | ||
522 | close_files(files); | ||
523 | /* | ||
524 | * Free the fd and fdset arrays if we expanded them. | ||
525 | * If the fdtable was embedded, pass files for freeing | ||
526 | * at the end of the RCU grace period. Otherwise, | ||
527 | * you can free files immediately. | ||
528 | */ | ||
529 | rcu_read_lock(); | ||
530 | fdt = files_fdtable(files); | ||
531 | if (fdt != &files->fdtab) | ||
532 | kmem_cache_free(files_cachep, files); | ||
533 | free_fdtable(fdt); | ||
534 | rcu_read_unlock(); | ||
535 | } | ||
536 | } | ||
537 | |||
538 | void reset_files_struct(struct files_struct *files) | ||
539 | { | ||
540 | struct task_struct *tsk = current; | ||
541 | struct files_struct *old; | ||
542 | |||
543 | old = tsk->files; | ||
544 | task_lock(tsk); | ||
545 | tsk->files = files; | ||
546 | task_unlock(tsk); | ||
547 | put_files_struct(old); | ||
548 | } | ||
549 | |||
550 | void exit_files(struct task_struct *tsk) | ||
551 | { | ||
552 | struct files_struct * files = tsk->files; | ||
553 | |||
554 | if (files) { | ||
555 | task_lock(tsk); | ||
556 | tsk->files = NULL; | ||
557 | task_unlock(tsk); | ||
558 | put_files_struct(files); | ||
559 | } | ||
560 | } | ||
561 | |||
562 | #ifdef CONFIG_MM_OWNER | 363 | #ifdef CONFIG_MM_OWNER |
563 | /* | 364 | /* |
564 | * A task is exiting. If it owned this mm, find a new owner for the mm. | 365 | * A task is exiting. If it owned this mm, find a new owner for the mm. |
@@ -1046,6 +847,9 @@ void do_exit(long code) | |||
1046 | if (tsk->splice_pipe) | 847 | if (tsk->splice_pipe) |
1047 | __free_pipe_info(tsk->splice_pipe); | 848 | __free_pipe_info(tsk->splice_pipe); |
1048 | 849 | ||
850 | if (tsk->task_frag.page) | ||
851 | put_page(tsk->task_frag.page); | ||
852 | |||
1049 | validate_creds_for_do_exit(tsk); | 853 | validate_creds_for_do_exit(tsk); |
1050 | 854 | ||
1051 | preempt_disable(); | 855 | preempt_disable(); |
@@ -1278,11 +1082,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1278 | * as other threads in the parent group can be right | 1082 | * as other threads in the parent group can be right |
1279 | * here reaping other children at the same time. | 1083 | * here reaping other children at the same time. |
1280 | * | 1084 | * |
1281 | * We use thread_group_times() to get times for the thread | 1085 | * We use thread_group_cputime_adjusted() to get times for the thread |
1282 | * group, which consolidates times for all threads in the | 1086 | * group, which consolidates times for all threads in the |
1283 | * group including the group leader. | 1087 | * group including the group leader. |
1284 | */ | 1088 | */ |
1285 | thread_group_times(p, &tgutime, &tgstime); | 1089 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
1286 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1090 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1287 | psig = p->real_parent->signal; | 1091 | psig = p->real_parent->signal; |
1288 | sig = p->signal; | 1092 | sig = p->signal; |
diff --git a/kernel/fork.c b/kernel/fork.c index 2c8857e12855..a31b823b3c2d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti) | |||
146 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 146 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
147 | int node) | 147 | int node) |
148 | { | 148 | { |
149 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, | 149 | struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, |
150 | THREAD_SIZE_ORDER); | 150 | THREAD_SIZE_ORDER); |
151 | 151 | ||
152 | return page ? page_address(page) : NULL; | 152 | return page ? page_address(page) : NULL; |
@@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | |||
154 | 154 | ||
155 | static inline void free_thread_info(struct thread_info *ti) | 155 | static inline void free_thread_info(struct thread_info *ti) |
156 | { | 156 | { |
157 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 157 | free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
158 | } | 158 | } |
159 | # else | 159 | # else |
160 | static struct kmem_cache *thread_info_cache; | 160 | static struct kmem_cache *thread_info_cache; |
@@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
330 | tsk->btrace_seq = 0; | 330 | tsk->btrace_seq = 0; |
331 | #endif | 331 | #endif |
332 | tsk->splice_pipe = NULL; | 332 | tsk->splice_pipe = NULL; |
333 | tsk->task_frag.page = NULL; | ||
333 | 334 | ||
334 | account_kernel_stack(ti, 1); | 335 | account_kernel_stack(ti, 1); |
335 | 336 | ||
@@ -351,8 +352,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
351 | unsigned long charge; | 352 | unsigned long charge; |
352 | struct mempolicy *pol; | 353 | struct mempolicy *pol; |
353 | 354 | ||
355 | uprobe_start_dup_mmap(); | ||
354 | down_write(&oldmm->mmap_sem); | 356 | down_write(&oldmm->mmap_sem); |
355 | flush_cache_dup_mm(oldmm); | 357 | flush_cache_dup_mm(oldmm); |
358 | uprobe_dup_mmap(oldmm, mm); | ||
356 | /* | 359 | /* |
357 | * Not linked in yet - no deadlock potential: | 360 | * Not linked in yet - no deadlock potential: |
358 | */ | 361 | */ |
@@ -421,7 +424,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
421 | mapping->i_mmap_writable++; | 424 | mapping->i_mmap_writable++; |
422 | flush_dcache_mmap_lock(mapping); | 425 | flush_dcache_mmap_lock(mapping); |
423 | /* insert tmp into the share list, just after mpnt */ | 426 | /* insert tmp into the share list, just after mpnt */ |
424 | vma_prio_tree_add(tmp, mpnt); | 427 | if (unlikely(tmp->vm_flags & VM_NONLINEAR)) |
428 | vma_nonlinear_insert(tmp, | ||
429 | &mapping->i_mmap_nonlinear); | ||
430 | else | ||
431 | vma_interval_tree_insert_after(tmp, mpnt, | ||
432 | &mapping->i_mmap); | ||
425 | flush_dcache_mmap_unlock(mapping); | 433 | flush_dcache_mmap_unlock(mapping); |
426 | mutex_unlock(&mapping->i_mmap_mutex); | 434 | mutex_unlock(&mapping->i_mmap_mutex); |
427 | } | 435 | } |
@@ -454,9 +462,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
454 | 462 | ||
455 | if (retval) | 463 | if (retval) |
456 | goto out; | 464 | goto out; |
457 | |||
458 | if (file) | ||
459 | uprobe_mmap(tmp); | ||
460 | } | 465 | } |
461 | /* a new mm has just been created */ | 466 | /* a new mm has just been created */ |
462 | arch_dup_mmap(oldmm, mm); | 467 | arch_dup_mmap(oldmm, mm); |
@@ -465,6 +470,7 @@ out: | |||
465 | up_write(&mm->mmap_sem); | 470 | up_write(&mm->mmap_sem); |
466 | flush_tlb_mm(oldmm); | 471 | flush_tlb_mm(oldmm); |
467 | up_write(&oldmm->mmap_sem); | 472 | up_write(&oldmm->mmap_sem); |
473 | uprobe_end_dup_mmap(); | ||
468 | return retval; | 474 | return retval; |
469 | fail_nomem_anon_vma_fork: | 475 | fail_nomem_anon_vma_fork: |
470 | mpol_put(pol); | 476 | mpol_put(pol); |
@@ -623,26 +629,6 @@ void mmput(struct mm_struct *mm) | |||
623 | } | 629 | } |
624 | EXPORT_SYMBOL_GPL(mmput); | 630 | EXPORT_SYMBOL_GPL(mmput); |
625 | 631 | ||
626 | /* | ||
627 | * We added or removed a vma mapping the executable. The vmas are only mapped | ||
628 | * during exec and are not mapped with the mmap system call. | ||
629 | * Callers must hold down_write() on the mm's mmap_sem for these | ||
630 | */ | ||
631 | void added_exe_file_vma(struct mm_struct *mm) | ||
632 | { | ||
633 | mm->num_exe_file_vmas++; | ||
634 | } | ||
635 | |||
636 | void removed_exe_file_vma(struct mm_struct *mm) | ||
637 | { | ||
638 | mm->num_exe_file_vmas--; | ||
639 | if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { | ||
640 | fput(mm->exe_file); | ||
641 | mm->exe_file = NULL; | ||
642 | } | ||
643 | |||
644 | } | ||
645 | |||
646 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | 632 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) |
647 | { | 633 | { |
648 | if (new_exe_file) | 634 | if (new_exe_file) |
@@ -650,15 +636,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | |||
650 | if (mm->exe_file) | 636 | if (mm->exe_file) |
651 | fput(mm->exe_file); | 637 | fput(mm->exe_file); |
652 | mm->exe_file = new_exe_file; | 638 | mm->exe_file = new_exe_file; |
653 | mm->num_exe_file_vmas = 0; | ||
654 | } | 639 | } |
655 | 640 | ||
656 | struct file *get_mm_exe_file(struct mm_struct *mm) | 641 | struct file *get_mm_exe_file(struct mm_struct *mm) |
657 | { | 642 | { |
658 | struct file *exe_file; | 643 | struct file *exe_file; |
659 | 644 | ||
660 | /* We need mmap_sem to protect against races with removal of | 645 | /* We need mmap_sem to protect against races with removal of exe_file */ |
661 | * VM_EXECUTABLE vmas */ | ||
662 | down_read(&mm->mmap_sem); | 646 | down_read(&mm->mmap_sem); |
663 | exe_file = mm->exe_file; | 647 | exe_file = mm->exe_file; |
664 | if (exe_file) | 648 | if (exe_file) |
@@ -839,8 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
839 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 823 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
840 | mm->pmd_huge_pte = NULL; | 824 | mm->pmd_huge_pte = NULL; |
841 | #endif | 825 | #endif |
842 | uprobe_reset_state(mm); | 826 | #ifdef CONFIG_NUMA_BALANCING |
843 | 827 | mm->first_nid = NUMA_PTE_SCAN_INIT; | |
828 | #endif | ||
844 | if (!mm_init(mm, tsk)) | 829 | if (!mm_init(mm, tsk)) |
845 | goto fail_nomem; | 830 | goto fail_nomem; |
846 | 831 | ||
@@ -1059,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1059 | atomic_set(&sig->live, 1); | 1044 | atomic_set(&sig->live, 1); |
1060 | atomic_set(&sig->sigcnt, 1); | 1045 | atomic_set(&sig->sigcnt, 1); |
1061 | init_waitqueue_head(&sig->wait_chldexit); | 1046 | init_waitqueue_head(&sig->wait_chldexit); |
1062 | if (clone_flags & CLONE_NEWPID) | ||
1063 | sig->flags |= SIGNAL_UNKILLABLE; | ||
1064 | sig->curr_target = tsk; | 1047 | sig->curr_target = tsk; |
1065 | init_sigpending(&sig->shared_pending); | 1048 | init_sigpending(&sig->shared_pending); |
1066 | INIT_LIST_HEAD(&sig->posix_timers); | 1049 | INIT_LIST_HEAD(&sig->posix_timers); |
@@ -1081,7 +1064,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1081 | init_rwsem(&sig->group_rwsem); | 1064 | init_rwsem(&sig->group_rwsem); |
1082 | #endif | 1065 | #endif |
1083 | 1066 | ||
1084 | sig->oom_adj = current->signal->oom_adj; | ||
1085 | sig->oom_score_adj = current->signal->oom_score_adj; | 1067 | sig->oom_score_adj = current->signal->oom_score_adj; |
1086 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; | 1068 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
1087 | 1069 | ||
@@ -1148,7 +1130,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk) | |||
1148 | */ | 1130 | */ |
1149 | static struct task_struct *copy_process(unsigned long clone_flags, | 1131 | static struct task_struct *copy_process(unsigned long clone_flags, |
1150 | unsigned long stack_start, | 1132 | unsigned long stack_start, |
1151 | struct pt_regs *regs, | ||
1152 | unsigned long stack_size, | 1133 | unsigned long stack_size, |
1153 | int __user *child_tidptr, | 1134 | int __user *child_tidptr, |
1154 | struct pid *pid, | 1135 | struct pid *pid, |
@@ -1156,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1156 | { | 1137 | { |
1157 | int retval; | 1138 | int retval; |
1158 | struct task_struct *p; | 1139 | struct task_struct *p; |
1159 | int cgroup_callbacks_done = 0; | ||
1160 | 1140 | ||
1161 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) | 1141 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) |
1162 | return ERR_PTR(-EINVAL); | 1142 | return ERR_PTR(-EINVAL); |
@@ -1243,7 +1223,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1243 | p->utime = p->stime = p->gtime = 0; | 1223 | p->utime = p->stime = p->gtime = 0; |
1244 | p->utimescaled = p->stimescaled = 0; | 1224 | p->utimescaled = p->stimescaled = 0; |
1245 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 1225 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
1246 | p->prev_utime = p->prev_stime = 0; | 1226 | p->prev_cputime.utime = p->prev_cputime.stime = 0; |
1247 | #endif | 1227 | #endif |
1248 | #if defined(SPLIT_RSS_COUNTING) | 1228 | #if defined(SPLIT_RSS_COUNTING) |
1249 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); | 1229 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); |
@@ -1280,11 +1260,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1280 | #endif | 1260 | #endif |
1281 | #ifdef CONFIG_TRACE_IRQFLAGS | 1261 | #ifdef CONFIG_TRACE_IRQFLAGS |
1282 | p->irq_events = 0; | 1262 | p->irq_events = 0; |
1283 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1284 | p->hardirqs_enabled = 1; | ||
1285 | #else | ||
1286 | p->hardirqs_enabled = 0; | 1263 | p->hardirqs_enabled = 0; |
1287 | #endif | ||
1288 | p->hardirq_enable_ip = 0; | 1264 | p->hardirq_enable_ip = 0; |
1289 | p->hardirq_enable_event = 0; | 1265 | p->hardirq_enable_event = 0; |
1290 | p->hardirq_disable_ip = _THIS_IP_; | 1266 | p->hardirq_disable_ip = _THIS_IP_; |
@@ -1345,7 +1321,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1345 | retval = copy_io(clone_flags, p); | 1321 | retval = copy_io(clone_flags, p); |
1346 | if (retval) | 1322 | if (retval) |
1347 | goto bad_fork_cleanup_namespaces; | 1323 | goto bad_fork_cleanup_namespaces; |
1348 | retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); | 1324 | retval = copy_thread(clone_flags, stack_start, stack_size, p); |
1349 | if (retval) | 1325 | if (retval) |
1350 | goto bad_fork_cleanup_io; | 1326 | goto bad_fork_cleanup_io; |
1351 | 1327 | ||
@@ -1418,12 +1394,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1418 | INIT_LIST_HEAD(&p->thread_group); | 1394 | INIT_LIST_HEAD(&p->thread_group); |
1419 | p->task_works = NULL; | 1395 | p->task_works = NULL; |
1420 | 1396 | ||
1421 | /* Now that the task is set up, run cgroup callbacks if | ||
1422 | * necessary. We need to run them before the task is visible | ||
1423 | * on the tasklist. */ | ||
1424 | cgroup_fork_callbacks(p); | ||
1425 | cgroup_callbacks_done = 1; | ||
1426 | |||
1427 | /* Need tasklist lock for parent etc handling! */ | 1397 | /* Need tasklist lock for parent etc handling! */ |
1428 | write_lock_irq(&tasklist_lock); | 1398 | write_lock_irq(&tasklist_lock); |
1429 | 1399 | ||
@@ -1466,8 +1436,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1466 | ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); | 1436 | ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); |
1467 | 1437 | ||
1468 | if (thread_group_leader(p)) { | 1438 | if (thread_group_leader(p)) { |
1469 | if (is_child_reaper(pid)) | 1439 | if (is_child_reaper(pid)) { |
1470 | p->nsproxy->pid_ns->child_reaper = p; | 1440 | ns_of_pid(pid)->child_reaper = p; |
1441 | p->signal->flags |= SIGNAL_UNKILLABLE; | ||
1442 | } | ||
1471 | 1443 | ||
1472 | p->signal->leader_pid = pid; | 1444 | p->signal->leader_pid = pid; |
1473 | p->signal->tty = tty_kref_get(current->signal->tty); | 1445 | p->signal->tty = tty_kref_get(current->signal->tty); |
@@ -1501,8 +1473,6 @@ bad_fork_cleanup_io: | |||
1501 | if (p->io_context) | 1473 | if (p->io_context) |
1502 | exit_io_context(p); | 1474 | exit_io_context(p); |
1503 | bad_fork_cleanup_namespaces: | 1475 | bad_fork_cleanup_namespaces: |
1504 | if (unlikely(clone_flags & CLONE_NEWPID)) | ||
1505 | pid_ns_release_proc(p->nsproxy->pid_ns); | ||
1506 | exit_task_namespaces(p); | 1476 | exit_task_namespaces(p); |
1507 | bad_fork_cleanup_mm: | 1477 | bad_fork_cleanup_mm: |
1508 | if (p->mm) | 1478 | if (p->mm) |
@@ -1528,7 +1498,7 @@ bad_fork_cleanup_cgroup: | |||
1528 | #endif | 1498 | #endif |
1529 | if (clone_flags & CLONE_THREAD) | 1499 | if (clone_flags & CLONE_THREAD) |
1530 | threadgroup_change_end(current); | 1500 | threadgroup_change_end(current); |
1531 | cgroup_exit(p, cgroup_callbacks_done); | 1501 | cgroup_exit(p, 0); |
1532 | delayacct_tsk_free(p); | 1502 | delayacct_tsk_free(p); |
1533 | module_put(task_thread_info(p)->exec_domain->module); | 1503 | module_put(task_thread_info(p)->exec_domain->module); |
1534 | bad_fork_cleanup_count: | 1504 | bad_fork_cleanup_count: |
@@ -1540,12 +1510,6 @@ fork_out: | |||
1540 | return ERR_PTR(retval); | 1510 | return ERR_PTR(retval); |
1541 | } | 1511 | } |
1542 | 1512 | ||
1543 | noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | ||
1544 | { | ||
1545 | memset(regs, 0, sizeof(struct pt_regs)); | ||
1546 | return regs; | ||
1547 | } | ||
1548 | |||
1549 | static inline void init_idle_pids(struct pid_link *links) | 1513 | static inline void init_idle_pids(struct pid_link *links) |
1550 | { | 1514 | { |
1551 | enum pid_type type; | 1515 | enum pid_type type; |
@@ -1559,10 +1523,7 @@ static inline void init_idle_pids(struct pid_link *links) | |||
1559 | struct task_struct * __cpuinit fork_idle(int cpu) | 1523 | struct task_struct * __cpuinit fork_idle(int cpu) |
1560 | { | 1524 | { |
1561 | struct task_struct *task; | 1525 | struct task_struct *task; |
1562 | struct pt_regs regs; | 1526 | task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); |
1563 | |||
1564 | task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, | ||
1565 | &init_struct_pid, 0); | ||
1566 | if (!IS_ERR(task)) { | 1527 | if (!IS_ERR(task)) { |
1567 | init_idle_pids(task->pids); | 1528 | init_idle_pids(task->pids); |
1568 | init_idle(task, cpu); | 1529 | init_idle(task, cpu); |
@@ -1579,7 +1540,6 @@ struct task_struct * __cpuinit fork_idle(int cpu) | |||
1579 | */ | 1540 | */ |
1580 | long do_fork(unsigned long clone_flags, | 1541 | long do_fork(unsigned long clone_flags, |
1581 | unsigned long stack_start, | 1542 | unsigned long stack_start, |
1582 | struct pt_regs *regs, | ||
1583 | unsigned long stack_size, | 1543 | unsigned long stack_size, |
1584 | int __user *parent_tidptr, | 1544 | int __user *parent_tidptr, |
1585 | int __user *child_tidptr) | 1545 | int __user *child_tidptr) |
@@ -1592,15 +1552,9 @@ long do_fork(unsigned long clone_flags, | |||
1592 | * Do some preliminary argument and permissions checking before we | 1552 | * Do some preliminary argument and permissions checking before we |
1593 | * actually start allocating stuff | 1553 | * actually start allocating stuff |
1594 | */ | 1554 | */ |
1595 | if (clone_flags & CLONE_NEWUSER) { | 1555 | if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { |
1596 | if (clone_flags & CLONE_THREAD) | 1556 | if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) |
1597 | return -EINVAL; | 1557 | return -EINVAL; |
1598 | /* hopefully this check will go away when userns support is | ||
1599 | * complete | ||
1600 | */ | ||
1601 | if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || | ||
1602 | !capable(CAP_SETGID)) | ||
1603 | return -EPERM; | ||
1604 | } | 1558 | } |
1605 | 1559 | ||
1606 | /* | 1560 | /* |
@@ -1609,7 +1563,7 @@ long do_fork(unsigned long clone_flags, | |||
1609 | * requested, no event is reported; otherwise, report if the event | 1563 | * requested, no event is reported; otherwise, report if the event |
1610 | * for the type of forking is enabled. | 1564 | * for the type of forking is enabled. |
1611 | */ | 1565 | */ |
1612 | if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { | 1566 | if (!(clone_flags & CLONE_UNTRACED)) { |
1613 | if (clone_flags & CLONE_VFORK) | 1567 | if (clone_flags & CLONE_VFORK) |
1614 | trace = PTRACE_EVENT_VFORK; | 1568 | trace = PTRACE_EVENT_VFORK; |
1615 | else if ((clone_flags & CSIGNAL) != SIGCHLD) | 1569 | else if ((clone_flags & CSIGNAL) != SIGCHLD) |
@@ -1621,7 +1575,7 @@ long do_fork(unsigned long clone_flags, | |||
1621 | trace = 0; | 1575 | trace = 0; |
1622 | } | 1576 | } |
1623 | 1577 | ||
1624 | p = copy_process(clone_flags, stack_start, regs, stack_size, | 1578 | p = copy_process(clone_flags, stack_start, stack_size, |
1625 | child_tidptr, NULL, trace); | 1579 | child_tidptr, NULL, trace); |
1626 | /* | 1580 | /* |
1627 | * Do this prior waking up the new thread - the thread pointer | 1581 | * Do this prior waking up the new thread - the thread pointer |
@@ -1659,6 +1613,58 @@ long do_fork(unsigned long clone_flags, | |||
1659 | return nr; | 1613 | return nr; |
1660 | } | 1614 | } |
1661 | 1615 | ||
1616 | /* | ||
1617 | * Create a kernel thread. | ||
1618 | */ | ||
1619 | pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) | ||
1620 | { | ||
1621 | return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, | ||
1622 | (unsigned long)arg, NULL, NULL); | ||
1623 | } | ||
1624 | |||
1625 | #ifdef __ARCH_WANT_SYS_FORK | ||
1626 | SYSCALL_DEFINE0(fork) | ||
1627 | { | ||
1628 | #ifdef CONFIG_MMU | ||
1629 | return do_fork(SIGCHLD, 0, 0, NULL, NULL); | ||
1630 | #else | ||
1631 | /* can not support in nommu mode */ | ||
1632 | return(-EINVAL); | ||
1633 | #endif | ||
1634 | } | ||
1635 | #endif | ||
1636 | |||
1637 | #ifdef __ARCH_WANT_SYS_VFORK | ||
1638 | SYSCALL_DEFINE0(vfork) | ||
1639 | { | ||
1640 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, | ||
1641 | 0, NULL, NULL); | ||
1642 | } | ||
1643 | #endif | ||
1644 | |||
1645 | #ifdef __ARCH_WANT_SYS_CLONE | ||
1646 | #ifdef CONFIG_CLONE_BACKWARDS | ||
1647 | SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, | ||
1648 | int __user *, parent_tidptr, | ||
1649 | int, tls_val, | ||
1650 | int __user *, child_tidptr) | ||
1651 | #elif defined(CONFIG_CLONE_BACKWARDS2) | ||
1652 | SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, | ||
1653 | int __user *, parent_tidptr, | ||
1654 | int __user *, child_tidptr, | ||
1655 | int, tls_val) | ||
1656 | #else | ||
1657 | SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, | ||
1658 | int __user *, parent_tidptr, | ||
1659 | int __user *, child_tidptr, | ||
1660 | int, tls_val) | ||
1661 | #endif | ||
1662 | { | ||
1663 | return do_fork(clone_flags, newsp, 0, | ||
1664 | parent_tidptr, child_tidptr); | ||
1665 | } | ||
1666 | #endif | ||
1667 | |||
1662 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN | 1668 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN |
1663 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 | 1669 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 |
1664 | #endif | 1670 | #endif |
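The hunk above moves kernel_thread() and the fork/vfork/clone entry points into generic code on top of the pt_regs-free do_fork(): kernel_thread() packs fn and arg into the generic stack_start/stack_size arguments, and the clone() wrapper exists in three argument orders selected by CONFIG_CLONE_BACKWARDS / CONFIG_CLONE_BACKWARDS2. A small sketch of the "several ABI-specific entry points, one generic core" shape, with a plain #ifdef modeling the config choice (editor's illustration, made-up values, not the real syscall ABI):

    #include <stdio.h>

    /* generic core, like do_fork() without pt_regs */
    static long do_fork_sketch(unsigned long flags, unsigned long newsp,
                               int *parent_tid, int *child_tid, int tls)
    {
        (void)parent_tid;
        (void)child_tid;
        printf("flags=%#lx newsp=%#lx tls=%d\n", flags, newsp, tls);
        return 1234;    /* pretend pid */
    }

    /*
     * The clone() entry point: some ABIs pass (..., tls, child_tidptr),
     * others (..., child_tidptr, tls); the kernel picks the order with
     * CONFIG_CLONE_BACKWARDS*, modeled here with a plain #ifdef.
     */
    #ifdef CLONE_BACKWARDS_ORDER
    static long sys_clone(unsigned long flags, unsigned long newsp,
                          int *parent_tid, int tls, int *child_tid)
    #else
    static long sys_clone(unsigned long flags, unsigned long newsp,
                          int *parent_tid, int *child_tid, int tls)
    #endif
    {
        return do_fork_sketch(flags, newsp, parent_tid, child_tid, tls);
    }

    int main(void)
    {
        int ptid, ctid;

    #ifdef CLONE_BACKWARDS_ORDER
        sys_clone(0x11, 0, &ptid, 0, &ctid);
    #else
        sys_clone(0x11, 0, &ptid, &ctid, 0);
    #endif
        return 0;
    }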
@@ -1708,7 +1714,8 @@ static int check_unshare_flags(unsigned long unshare_flags) | |||
1708 | { | 1714 | { |
1709 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | 1715 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| |
1710 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | 1716 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| |
1711 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) | 1717 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| |
1718 | CLONE_NEWUSER|CLONE_NEWPID)) | ||
1712 | return -EINVAL; | 1719 | return -EINVAL; |
1713 | /* | 1720 | /* |
1714 | * Not implemented, but pretend it works if there is nothing to | 1721 | * Not implemented, but pretend it works if there is nothing to |
@@ -1775,19 +1782,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1775 | { | 1782 | { |
1776 | struct fs_struct *fs, *new_fs = NULL; | 1783 | struct fs_struct *fs, *new_fs = NULL; |
1777 | struct files_struct *fd, *new_fd = NULL; | 1784 | struct files_struct *fd, *new_fd = NULL; |
1785 | struct cred *new_cred = NULL; | ||
1778 | struct nsproxy *new_nsproxy = NULL; | 1786 | struct nsproxy *new_nsproxy = NULL; |
1779 | int do_sysvsem = 0; | 1787 | int do_sysvsem = 0; |
1780 | int err; | 1788 | int err; |
1781 | 1789 | ||
1782 | err = check_unshare_flags(unshare_flags); | 1790 | /* |
1783 | if (err) | 1791 | * If unsharing a user namespace, must also unshare the thread. |
1784 | goto bad_unshare_out; | 1792 | */ |
1785 | 1793 | if (unshare_flags & CLONE_NEWUSER) | |
1794 | unshare_flags |= CLONE_THREAD; | ||
1795 | /* | ||
1796 | * If unsharing a pid namespace, must also unshare the thread. | ||
1797 | */ | ||
1798 | if (unshare_flags & CLONE_NEWPID) | ||
1799 | unshare_flags |= CLONE_THREAD; | ||
1800 | /* | ||
1801 | * If unsharing a thread from a thread group, must also unshare vm. | ||
1802 | */ | ||
1803 | if (unshare_flags & CLONE_THREAD) | ||
1804 | unshare_flags |= CLONE_VM; | ||
1805 | /* | ||
1806 | * If unsharing vm, must also unshare signal handlers. | ||
1807 | */ | ||
1808 | if (unshare_flags & CLONE_VM) | ||
1809 | unshare_flags |= CLONE_SIGHAND; | ||
1786 | /* | 1810 | /* |
1787 | * If unsharing namespace, must also unshare filesystem information. | 1811 | * If unsharing namespace, must also unshare filesystem information. |
1788 | */ | 1812 | */ |
1789 | if (unshare_flags & CLONE_NEWNS) | 1813 | if (unshare_flags & CLONE_NEWNS) |
1790 | unshare_flags |= CLONE_FS; | 1814 | unshare_flags |= CLONE_FS; |
1815 | |||
1816 | err = check_unshare_flags(unshare_flags); | ||
1817 | if (err) | ||
1818 | goto bad_unshare_out; | ||
1791 | /* | 1819 | /* |
1792 | * CLONE_NEWIPC must also detach from the undolist: after switching | 1820 | * CLONE_NEWIPC must also detach from the undolist: after switching |
1793 | * to a new ipc namespace, the semaphore arrays from the old | 1821 | * to a new ipc namespace, the semaphore arrays from the old |
@@ -1801,11 +1829,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1801 | err = unshare_fd(unshare_flags, &new_fd); | 1829 | err = unshare_fd(unshare_flags, &new_fd); |
1802 | if (err) | 1830 | if (err) |
1803 | goto bad_unshare_cleanup_fs; | 1831 | goto bad_unshare_cleanup_fs; |
1804 | err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); | 1832 | err = unshare_userns(unshare_flags, &new_cred); |
1805 | if (err) | 1833 | if (err) |
1806 | goto bad_unshare_cleanup_fd; | 1834 | goto bad_unshare_cleanup_fd; |
1835 | err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, | ||
1836 | new_cred, new_fs); | ||
1837 | if (err) | ||
1838 | goto bad_unshare_cleanup_cred; | ||
1807 | 1839 | ||
1808 | if (new_fs || new_fd || do_sysvsem || new_nsproxy) { | 1840 | if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { |
1809 | if (do_sysvsem) { | 1841 | if (do_sysvsem) { |
1810 | /* | 1842 | /* |
1811 | * CLONE_SYSVSEM is equivalent to sys_exit(). | 1843 | * CLONE_SYSVSEM is equivalent to sys_exit(). |
@@ -1838,11 +1870,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1838 | } | 1870 | } |
1839 | 1871 | ||
1840 | task_unlock(current); | 1872 | task_unlock(current); |
1873 | |||
1874 | if (new_cred) { | ||
1875 | /* Install the new user namespace */ | ||
1876 | commit_creds(new_cred); | ||
1877 | new_cred = NULL; | ||
1878 | } | ||
1841 | } | 1879 | } |
1842 | 1880 | ||
1843 | if (new_nsproxy) | 1881 | if (new_nsproxy) |
1844 | put_nsproxy(new_nsproxy); | 1882 | put_nsproxy(new_nsproxy); |
1845 | 1883 | ||
1884 | bad_unshare_cleanup_cred: | ||
1885 | if (new_cred) | ||
1886 | put_cred(new_cred); | ||
1846 | bad_unshare_cleanup_fd: | 1887 | bad_unshare_cleanup_fd: |
1847 | if (new_fd) | 1888 | if (new_fd) |
1848 | put_files_struct(new_fd); | 1889 | put_files_struct(new_fd); |
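Before moving on to freezer.c, a hedged userspace illustration of the reordered unshare() path above: the flag set is widened (CLONE_NEWUSER and CLONE_NEWPID imply CLONE_THREAD, CLONE_THREAD implies CLONE_VM, CLONE_VM implies CLONE_SIGHAND) before check_unshare_flags() runs, and a new user namespace is committed via commit_creds(). The program below only sketches how the flag is exercised; it is not a test from this series.

#define _GNU_SOURCE
#include <sched.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        /* With the implied flags above, a multithreaded caller is expected
         * to get EINVAL back from the CLONE_THREAD/CLONE_VM checks instead
         * of unsharing an inconsistent subset of state. */
        if (unshare(CLONE_NEWUSER) == -1)
                fprintf(stderr, "unshare(CLONE_NEWUSER): %s\n", strerror(errno));
        else
                puts("now running in a new user namespace");
        return 0;
}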
diff --git a/kernel/freezer.c b/kernel/freezer.c index 11f82a4d4eae..c38893b0efba 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p) | |||
116 | return false; | 116 | return false; |
117 | } | 117 | } |
118 | 118 | ||
119 | if (!(p->flags & PF_KTHREAD)) { | 119 | if (!(p->flags & PF_KTHREAD)) |
120 | fake_signal_wake_up(p); | 120 | fake_signal_wake_up(p); |
121 | /* | 121 | else |
122 | * fake_signal_wake_up() goes through p's scheduler | ||
123 | * lock and guarantees that TASK_STOPPED/TRACED -> | ||
124 | * TASK_RUNNING transition can't race with task state | ||
125 | * testing in try_to_freeze_tasks(). | ||
126 | */ | ||
127 | } else { | ||
128 | wake_up_state(p, TASK_INTERRUPTIBLE); | 122 | wake_up_state(p, TASK_INTERRUPTIBLE); |
129 | } | ||
130 | 123 | ||
131 | spin_unlock_irqrestore(&freezer_lock, flags); | 124 | spin_unlock_irqrestore(&freezer_lock, flags); |
132 | return true; | 125 | return true; |
diff --git a/kernel/futex.c b/kernel/futex.c index 3717e7b306e0..19eb089ca003 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -716,7 +716,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | |||
716 | struct futex_pi_state **ps, | 716 | struct futex_pi_state **ps, |
717 | struct task_struct *task, int set_waiters) | 717 | struct task_struct *task, int set_waiters) |
718 | { | 718 | { |
719 | int lock_taken, ret, ownerdied = 0; | 719 | int lock_taken, ret, force_take = 0; |
720 | u32 uval, newval, curval, vpid = task_pid_vnr(task); | 720 | u32 uval, newval, curval, vpid = task_pid_vnr(task); |
721 | 721 | ||
722 | retry: | 722 | retry: |
@@ -755,17 +755,15 @@ retry: | |||
755 | newval = curval | FUTEX_WAITERS; | 755 | newval = curval | FUTEX_WAITERS; |
756 | 756 | ||
757 | /* | 757 | /* |
758 | * There are two cases, where a futex might have no owner (the | 758 | * Should we force take the futex? See below. |
759 | * owner TID is 0): OWNER_DIED. We take over the futex in this | ||
760 | * case. We also do an unconditional take over, when the owner | ||
761 | * of the futex died. | ||
762 | * | ||
763 | * This is safe as we are protected by the hash bucket lock ! | ||
764 | */ | 759 | */ |
765 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | 760 | if (unlikely(force_take)) { |
766 | /* Keep the OWNER_DIED bit */ | 761 | /* |
762 | * Keep the OWNER_DIED and the WAITERS bit and set the | ||
763 | * new TID value. | ||
764 | */ | ||
767 | newval = (curval & ~FUTEX_TID_MASK) | vpid; | 765 | newval = (curval & ~FUTEX_TID_MASK) | vpid; |
768 | ownerdied = 0; | 766 | force_take = 0; |
769 | lock_taken = 1; | 767 | lock_taken = 1; |
770 | } | 768 | } |
771 | 769 | ||
@@ -775,7 +773,7 @@ retry: | |||
775 | goto retry; | 773 | goto retry; |
776 | 774 | ||
777 | /* | 775 | /* |
778 | * We took the lock due to owner died take over. | 776 | * We took the lock due to forced take over. |
779 | */ | 777 | */ |
780 | if (unlikely(lock_taken)) | 778 | if (unlikely(lock_taken)) |
781 | return 1; | 779 | return 1; |
@@ -790,20 +788,25 @@ retry: | |||
790 | switch (ret) { | 788 | switch (ret) { |
791 | case -ESRCH: | 789 | case -ESRCH: |
792 | /* | 790 | /* |
793 | * No owner found for this futex. Check if the | 791 | * We failed to find an owner for this |
794 | * OWNER_DIED bit is set to figure out whether | 792 | * futex. So we have no pi_state to block |
795 | * this is a robust futex or not. | 793 | * on. This can happen in two cases: |
794 | * | ||
795 | * 1) The owner died | ||
796 | * 2) A stale FUTEX_WAITERS bit | ||
797 | * | ||
798 | * Re-read the futex value. | ||
796 | */ | 799 | */ |
797 | if (get_futex_value_locked(&curval, uaddr)) | 800 | if (get_futex_value_locked(&curval, uaddr)) |
798 | return -EFAULT; | 801 | return -EFAULT; |
799 | 802 | ||
800 | /* | 803 | /* |
801 | * We simply start over in case of a robust | 804 | * If the owner died or we have a stale |
802 | * futex. The code above will take the futex | 805 | * WAITERS bit the owner TID in the user space |
803 | * and return happy. | 806 | * futex is 0. |
804 | */ | 807 | */ |
805 | if (curval & FUTEX_OWNER_DIED) { | 808 | if (!(curval & FUTEX_TID_MASK)) { |
806 | ownerdied = 1; | 809 | force_take = 1; |
807 | goto retry; | 810 | goto retry; |
808 | } | 811 | } |
809 | default: | 812 | default: |
@@ -840,6 +843,9 @@ static void wake_futex(struct futex_q *q) | |||
840 | { | 843 | { |
841 | struct task_struct *p = q->task; | 844 | struct task_struct *p = q->task; |
842 | 845 | ||
846 | if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) | ||
847 | return; | ||
848 | |||
843 | /* | 849 | /* |
844 | * We set q->lock_ptr = NULL _before_ we wake up the task. If | 850 | * We set q->lock_ptr = NULL _before_ we wake up the task. If |
845 | * a non-futex wake up happens on another CPU then the task | 851 | * a non-futex wake up happens on another CPU then the task |
@@ -1075,6 +1081,10 @@ retry_private: | |||
1075 | 1081 | ||
1076 | plist_for_each_entry_safe(this, next, head, list) { | 1082 | plist_for_each_entry_safe(this, next, head, list) { |
1077 | if (match_futex (&this->key, &key1)) { | 1083 | if (match_futex (&this->key, &key1)) { |
1084 | if (this->pi_state || this->rt_waiter) { | ||
1085 | ret = -EINVAL; | ||
1086 | goto out_unlock; | ||
1087 | } | ||
1078 | wake_futex(this); | 1088 | wake_futex(this); |
1079 | if (++ret >= nr_wake) | 1089 | if (++ret >= nr_wake) |
1080 | break; | 1090 | break; |
@@ -1087,6 +1097,10 @@ retry_private: | |||
1087 | op_ret = 0; | 1097 | op_ret = 0; |
1088 | plist_for_each_entry_safe(this, next, head, list) { | 1098 | plist_for_each_entry_safe(this, next, head, list) { |
1089 | if (match_futex (&this->key, &key2)) { | 1099 | if (match_futex (&this->key, &key2)) { |
1100 | if (this->pi_state || this->rt_waiter) { | ||
1101 | ret = -EINVAL; | ||
1102 | goto out_unlock; | ||
1103 | } | ||
1090 | wake_futex(this); | 1104 | wake_futex(this); |
1091 | if (++op_ret >= nr_wake2) | 1105 | if (++op_ret >= nr_wake2) |
1092 | break; | 1106 | break; |
@@ -1095,6 +1109,7 @@ retry_private: | |||
1095 | ret += op_ret; | 1109 | ret += op_ret; |
1096 | } | 1110 | } |
1097 | 1111 | ||
1112 | out_unlock: | ||
1098 | double_unlock_hb(hb1, hb2); | 1113 | double_unlock_hb(hb1, hb2); |
1099 | out_put_keys: | 1114 | out_put_keys: |
1100 | put_futex_key(&key2); | 1115 | put_futex_key(&key2); |
@@ -1384,9 +1399,13 @@ retry_private: | |||
1384 | /* | 1399 | /* |
1385 | * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always | 1400 | * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always |
1386 | * be paired with each other and no other futex ops. | 1401 | * be paired with each other and no other futex ops. |
1402 | * | ||
1403 | * We should never be requeueing a futex_q with a pi_state, | ||
1404 | * which is awaiting a futex_unlock_pi(). | ||
1387 | */ | 1405 | */ |
1388 | if ((requeue_pi && !this->rt_waiter) || | 1406 | if ((requeue_pi && !this->rt_waiter) || |
1389 | (!requeue_pi && this->rt_waiter)) { | 1407 | (!requeue_pi && this->rt_waiter) || |
1408 | this->pi_state) { | ||
1390 | ret = -EINVAL; | 1409 | ret = -EINVAL; |
1391 | break; | 1410 | break; |
1392 | } | 1411 | } |
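Loose context for the futex hunks above: the plain wake paths (wake_futex(), FUTEX_WAKE_OP, requeue) now refuse waiters that carry PI state, returning -EINVAL or tripping the one-line WARN instead of corrupting that state. The wrapper below sketches the kind of non-PI wake that, when aimed at a PI futex, is now rejected; it is illustrative, not code from the patch.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Plain, non-PI wake; mixing this with FUTEX_LOCK_PI waiters on the same
 * futex word is the misuse the new checks catch. */
static long futex_wake(unsigned int *uaddr, int nr_wake)
{
        return syscall(SYS_futex, uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0);
}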
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index eebd6d5cfb44..3aca9f29d30e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -272,6 +272,7 @@ void handle_nested_irq(unsigned int irq) | |||
272 | 272 | ||
273 | raw_spin_lock_irq(&desc->lock); | 273 | raw_spin_lock_irq(&desc->lock); |
274 | 274 | ||
275 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
275 | kstat_incr_irqs_this_cpu(irq, desc); | 276 | kstat_incr_irqs_this_cpu(irq, desc); |
276 | 277 | ||
277 | action = desc->action; | 278 | action = desc->action; |
@@ -671,6 +672,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, | |||
671 | irq_set_chip(irq, chip); | 672 | irq_set_chip(irq, chip); |
672 | __irq_set_handler(irq, handle, 0, name); | 673 | __irq_set_handler(irq, handle, 0, name); |
673 | } | 674 | } |
675 | EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); | ||
674 | 676 | ||
675 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) | 677 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) |
676 | { | 678 | { |
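The EXPORT_SYMBOL_GPL added above (and for dummy_irq_chip just below) mainly helps modular irqchip glue. A rough sketch of such a module follows; the chip, the handler choice and the descriptor allocation are placeholders, not code from this series.

#include <linux/irq.h>
#include <linux/module.h>
#include <linux/numa.h>

static struct irq_chip demo_irq_chip = {
        .name = "demo",
        /* a real chip would also fill in .irq_mask/.irq_unmask/.irq_ack */
};

static int __init demo_irqchip_init(void)
{
        int virq = irq_alloc_descs(-1, 0, 1, NUMA_NO_NODE);

        if (virq < 0)
                return virq;
        irq_set_chip_and_handler_name(virq, &demo_irq_chip,
                                      handle_simple_irq, "demo");
        return 0;
}
module_init(demo_irqchip_init);
MODULE_LICENSE("GPL");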
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index b5fcd96c7102..988dc58e8847 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c | |||
@@ -6,6 +6,7 @@ | |||
6 | */ | 6 | */ |
7 | #include <linux/interrupt.h> | 7 | #include <linux/interrupt.h> |
8 | #include <linux/irq.h> | 8 | #include <linux/irq.h> |
9 | #include <linux/export.h> | ||
9 | 10 | ||
10 | #include "internals.h" | 11 | #include "internals.h" |
11 | 12 | ||
@@ -57,3 +58,4 @@ struct irq_chip dummy_irq_chip = { | |||
57 | .irq_mask = noop, | 58 | .irq_mask = noop, |
58 | .irq_unmask = noop, | 59 | .irq_unmask = noop, |
59 | }; | 60 | }; |
61 | EXPORT_SYMBOL_GPL(dummy_irq_chip); | ||
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 49a77727db42..96f3a1d9c379 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, | |||
148 | * @host_data: Controller private data pointer | 148 | * @host_data: Controller private data pointer |
149 | * | 149 | * |
150 | * Allocates a legacy irq_domain if irq_base is positive or a linear | 150 | * Allocates a legacy irq_domain if irq_base is positive or a linear |
151 | * domain otherwise. | 151 | * domain otherwise. For the legacy domain, IRQ descriptors will also |
152 | * be allocated. | ||
152 | * | 153 | * |
153 | * This is intended to implement the expected behaviour for most | 154 | * This is intended to implement the expected behaviour for most |
154 | * interrupt controllers which is that a linear mapping should | 155 | * interrupt controllers which is that a linear mapping should |
@@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, | |||
162 | const struct irq_domain_ops *ops, | 163 | const struct irq_domain_ops *ops, |
163 | void *host_data) | 164 | void *host_data) |
164 | { | 165 | { |
165 | if (first_irq > 0) | 166 | if (first_irq > 0) { |
166 | return irq_domain_add_legacy(of_node, size, first_irq, 0, | 167 | int irq_base; |
168 | |||
169 | if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { | ||
170 | /* | ||
171 | * Set the descriptor allocator to search for a | ||
172 | * 1-to-1 mapping, such as irq_alloc_desc_at(). | ||
173 | * Use of_node_to_nid() which is defined to | ||
174 | * numa_node_id() on platforms that have no custom | ||
175 | * implementation. | ||
176 | */ | ||
177 | irq_base = irq_alloc_descs(first_irq, first_irq, size, | ||
178 | of_node_to_nid(of_node)); | ||
179 | if (irq_base < 0) { | ||
180 | pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", | ||
181 | first_irq); | ||
182 | irq_base = first_irq; | ||
183 | } | ||
184 | } else | ||
185 | irq_base = first_irq; | ||
186 | |||
187 | return irq_domain_add_legacy(of_node, size, irq_base, 0, | ||
167 | ops, host_data); | 188 | ops, host_data); |
168 | else | 189 | } |
169 | return irq_domain_add_linear(of_node, size, ops, host_data); | 190 | |
191 | /* A linear domain is the default */ | ||
192 | return irq_domain_add_linear(of_node, size, ops, host_data); | ||
170 | } | 193 | } |
171 | 194 | ||
172 | /** | 195 | /** |
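To make the irq_domain_add_simple() change above concrete: with a positive first_irq on a SPARSE_IRQ kernel the helper now allocates the descriptors itself, while first_irq == 0 keeps the linear-domain default. A hedged devicetree-driver sketch, with all demo_* names invented:

#include <linux/errno.h>
#include <linux/irqdomain.h>
#include <linux/of.h>
#include <linux/platform_device.h>

static const struct irq_domain_ops demo_domain_ops = {
        .xlate = irq_domain_xlate_onecell,
};

static int demo_intc_probe(struct platform_device *pdev)
{
        /* first_irq == 0: let the core pick a linear domain */
        struct irq_domain *d = irq_domain_add_simple(pdev->dev.of_node,
                                                     32, 0,
                                                     &demo_domain_ops, NULL);

        return d ? 0 : -ENOMEM;
}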
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4c69326aa773..e49a288fa479 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -616,6 +616,22 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
616 | return ret; | 616 | return ret; |
617 | } | 617 | } |
618 | 618 | ||
619 | #ifdef CONFIG_HARDIRQS_SW_RESEND | ||
620 | int irq_set_parent(int irq, int parent_irq) | ||
621 | { | ||
622 | unsigned long flags; | ||
623 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); | ||
624 | |||
625 | if (!desc) | ||
626 | return -EINVAL; | ||
627 | |||
628 | desc->parent_irq = parent_irq; | ||
629 | |||
630 | irq_put_desc_unlock(desc, flags); | ||
631 | return 0; | ||
632 | } | ||
633 | #endif | ||
634 | |||
619 | /* | 635 | /* |
620 | * Default primary interrupt handler for threaded interrupts. Is | 636 | * Default primary interrupt handler for threaded interrupts. Is |
621 | * assigned as primary handler when request_threaded_irq is called | 637 | * assigned as primary handler when request_threaded_irq is called |
@@ -716,6 +732,7 @@ static void | |||
716 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | 732 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) |
717 | { | 733 | { |
718 | cpumask_var_t mask; | 734 | cpumask_var_t mask; |
735 | bool valid = true; | ||
719 | 736 | ||
720 | if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) | 737 | if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) |
721 | return; | 738 | return; |
@@ -730,10 +747,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | |||
730 | } | 747 | } |
731 | 748 | ||
732 | raw_spin_lock_irq(&desc->lock); | 749 | raw_spin_lock_irq(&desc->lock); |
733 | cpumask_copy(mask, desc->irq_data.affinity); | 750 | /* |
751 | * This code is triggered unconditionally. Check the affinity | ||
752 | * mask pointer. For CPUMASK_OFFSTACK=n this is optimized out. | ||
753 | */ | ||
754 | if (desc->irq_data.affinity) | ||
755 | cpumask_copy(mask, desc->irq_data.affinity); | ||
756 | else | ||
757 | valid = false; | ||
734 | raw_spin_unlock_irq(&desc->lock); | 758 | raw_spin_unlock_irq(&desc->lock); |
735 | 759 | ||
736 | set_cpus_allowed_ptr(current, mask); | 760 | if (valid) |
761 | set_cpus_allowed_ptr(current, mask); | ||
737 | free_cpumask_var(mask); | 762 | free_cpumask_var(mask); |
738 | } | 763 | } |
739 | #else | 764 | #else |
@@ -793,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused) | |||
793 | action = kthread_data(tsk); | 818 | action = kthread_data(tsk); |
794 | 819 | ||
795 | pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | 820 | pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", |
796 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); | 821 | tsk->comm, tsk->pid, action->irq); |
797 | 822 | ||
798 | 823 | ||
799 | desc = irq_to_desc(action->irq); | 824 | desc = irq_to_desc(action->irq); |
@@ -833,6 +858,8 @@ static int irq_thread(void *data) | |||
833 | init_task_work(&on_exit_work, irq_thread_dtor); | 858 | init_task_work(&on_exit_work, irq_thread_dtor); |
834 | task_work_add(current, &on_exit_work, false); | 859 | task_work_add(current, &on_exit_work, false); |
835 | 860 | ||
861 | irq_thread_check_affinity(desc, action); | ||
862 | |||
836 | while (!irq_wait_for_interrupt(action)) { | 863 | while (!irq_wait_for_interrupt(action)) { |
837 | irqreturn_t action_ret; | 864 | irqreturn_t action_ret; |
838 | 865 | ||
@@ -936,6 +963,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
936 | */ | 963 | */ |
937 | get_task_struct(t); | 964 | get_task_struct(t); |
938 | new->thread = t; | 965 | new->thread = t; |
966 | /* | ||
967 | * Tell the thread to set its affinity. This is | ||
968 | * important for shared interrupt handlers as we do | ||
969 | * not invoke setup_affinity() for the secondary | ||
970 | * handlers as everything is already set up. Even for | ||
971 | * interrupts marked with IRQF_NOBALANCING this is | ||
972 | * correct as we want the thread to move to the cpu(s) | ||
973 | * on which the requesting code placed the interrupt. | ||
974 | */ | ||
975 | set_bit(IRQTF_AFFINITY, &new->thread_flags); | ||
939 | } | 976 | } |
940 | 977 | ||
941 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { | 978 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 6454db7b6a4d..9065107f083e 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -74,6 +74,14 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
74 | if (!desc->irq_data.chip->irq_retrigger || | 74 | if (!desc->irq_data.chip->irq_retrigger || |
75 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { | 75 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { |
76 | #ifdef CONFIG_HARDIRQS_SW_RESEND | 76 | #ifdef CONFIG_HARDIRQS_SW_RESEND |
77 | /* | ||
78 | * If the interrupt has a parent irq and runs | ||
79 | * in the thread context of the parent irq, | ||
80 | * retrigger the parent. | ||
81 | */ | ||
82 | if (desc->parent_irq && | ||
83 | irq_settings_is_nested_thread(desc)) | ||
84 | irq = desc->parent_irq; | ||
77 | /* Set it pending and activate the softirq: */ | 85 | /* Set it pending and activate the softirq: */ |
78 | set_bit(irq, irqs_resend); | 86 | set_bit(irq, irqs_resend); |
79 | tasklet_schedule(&resend_tasklet); | 87 | tasklet_schedule(&resend_tasklet); |
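The new irq_set_parent() (added in manage.c above) pairs with this resend change: a demultiplexed child interrupt that runs in its parent's thread context registers the relationship so a software resend retriggers the parent instead. A small sketch with invented names; note that irq_set_parent() only exists under CONFIG_HARDIRQS_SW_RESEND here.

#include <linux/irq.h>

/* child_irq is handled from the threaded handler of parent_irq, as in a
 * typical MFD or GPIO-expander demux chip. */
static int demo_wire_child_irq(int parent_irq, int child_irq)
{
        irq_set_nested_thread(child_irq, true);
        return irq_set_parent(child_irq, parent_irq);
}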
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 43049192b5ec..60f48fa0fd0d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key, | |||
118 | key->timeout = rl; | 118 | key->timeout = rl; |
119 | INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); | 119 | INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); |
120 | } | 120 | } |
121 | EXPORT_SYMBOL_GPL(jump_label_rate_limit); | ||
121 | 122 | ||
122 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) | 123 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) |
123 | { | 124 | { |
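With jump_label_rate_limit() exported above, modules can use deferred static keys. A minimal sketch, assuming the <linux/jump_label.h> declarations of this era; demo_key is invented.

#include <linux/jiffies.h>
#include <linux/jump_label.h>
#include <linux/module.h>

static struct static_key_deferred demo_key;

static int __init demo_key_init(void)
{
        /* batch slow-path disables: at most one text patch per second */
        jump_label_rate_limit(&demo_key, HZ);
        return 0;
}
module_init(demo_key_init);
MODULE_LICENSE("GPL");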
diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 30b7b225306c..e30ac0fe61c3 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/string.h> | 4 | #include <linux/string.h> |
5 | #include <linux/random.h> | 5 | #include <linux/random.h> |
6 | #include <linux/module.h> | 6 | #include <linux/module.h> |
7 | #include <linux/ptrace.h> | ||
7 | #include <linux/init.h> | 8 | #include <linux/init.h> |
8 | #include <linux/errno.h> | 9 | #include <linux/errno.h> |
9 | #include <linux/cache.h> | 10 | #include <linux/cache.h> |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 0668d58d6413..5e4bd7864c5d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -21,7 +21,6 @@ | |||
21 | #include <linux/hardirq.h> | 21 | #include <linux/hardirq.h> |
22 | #include <linux/elf.h> | 22 | #include <linux/elf.h> |
23 | #include <linux/elfcore.h> | 23 | #include <linux/elfcore.h> |
24 | #include <generated/utsrelease.h> | ||
25 | #include <linux/utsname.h> | 24 | #include <linux/utsname.h> |
26 | #include <linux/numa.h> | 25 | #include <linux/numa.h> |
27 | #include <linux/suspend.h> | 26 | #include <linux/suspend.h> |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 6f99aead66c6..0023a87e8de6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/notifier.h> | 37 | #include <linux/notifier.h> |
38 | #include <linux/suspend.h> | 38 | #include <linux/suspend.h> |
39 | #include <linux/rwsem.h> | 39 | #include <linux/rwsem.h> |
40 | #include <linux/ptrace.h> | ||
40 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
41 | 42 | ||
42 | #include <trace/events/module.h> | 43 | #include <trace/events/module.h> |
@@ -218,14 +219,16 @@ static int ____call_usermodehelper(void *data) | |||
218 | 219 | ||
219 | commit_creds(new); | 220 | commit_creds(new); |
220 | 221 | ||
221 | retval = kernel_execve(sub_info->path, | 222 | retval = do_execve(sub_info->path, |
222 | (const char *const *)sub_info->argv, | 223 | (const char __user *const __user *)sub_info->argv, |
223 | (const char *const *)sub_info->envp); | 224 | (const char __user *const __user *)sub_info->envp); |
225 | if (!retval) | ||
226 | return 0; | ||
224 | 227 | ||
225 | /* Exec failed? */ | 228 | /* Exec failed? */ |
226 | fail: | 229 | fail: |
227 | sub_info->retval = retval; | 230 | sub_info->retval = retval; |
228 | return 0; | 231 | do_exit(0); |
229 | } | 232 | } |
230 | 233 | ||
231 | static int call_helper(void *data) | 234 | static int call_helper(void *data) |
@@ -292,7 +295,7 @@ static int wait_for_helper(void *data) | |||
292 | } | 295 | } |
293 | 296 | ||
294 | umh_complete(sub_info); | 297 | umh_complete(sub_info); |
295 | return 0; | 298 | do_exit(0); |
296 | } | 299 | } |
297 | 300 | ||
298 | /* This is run by khelper thread */ | 301 | /* This is run by khelper thread */ |
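The khelper rework above (direct do_execve(), do_exit() on the error paths) does not change the caller-facing interface. For orientation, a hedged sketch of a typical call_usermodehelper() user; the helper path and arguments are made up.

#include <linux/kmod.h>

static int demo_run_helper(void)
{
        char *argv[] = { "/sbin/demo-helper", "--ping", NULL };
        static char *envp[] = { "HOME=/",
                "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

        /* UMH_WAIT_PROC: wait for the helper and return its exit status */
        return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
}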
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c62b8546cc90..098f396aa409 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -561,9 +561,9 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
561 | { | 561 | { |
562 | LIST_HEAD(free_list); | 562 | LIST_HEAD(free_list); |
563 | 563 | ||
564 | mutex_lock(&kprobe_mutex); | ||
564 | /* Lock modules while optimizing kprobes */ | 565 | /* Lock modules while optimizing kprobes */ |
565 | mutex_lock(&module_mutex); | 566 | mutex_lock(&module_mutex); |
566 | mutex_lock(&kprobe_mutex); | ||
567 | 567 | ||
568 | /* | 568 | /* |
569 | * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) | 569 | * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) |
@@ -586,8 +586,8 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
586 | /* Step 4: Free cleaned kprobes after quiescence period */ | 586 | /* Step 4: Free cleaned kprobes after quiescence period */ |
587 | do_free_cleaned_kprobes(&free_list); | 587 | do_free_cleaned_kprobes(&free_list); |
588 | 588 | ||
589 | mutex_unlock(&kprobe_mutex); | ||
590 | mutex_unlock(&module_mutex); | 589 | mutex_unlock(&module_mutex); |
590 | mutex_unlock(&kprobe_mutex); | ||
591 | 591 | ||
592 | /* Step 5: Kick optimizer again if needed */ | 592 | /* Step 5: Kick optimizer again if needed */ |
593 | if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) | 593 | if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) |
@@ -759,20 +759,32 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | |||
759 | struct kprobe *ap; | 759 | struct kprobe *ap; |
760 | struct optimized_kprobe *op; | 760 | struct optimized_kprobe *op; |
761 | 761 | ||
762 | /* Impossible to optimize ftrace-based kprobe */ | ||
763 | if (kprobe_ftrace(p)) | ||
764 | return; | ||
765 | |||
766 | /* For preparing optimization, jump_label_text_reserved() is called */ | ||
767 | jump_label_lock(); | ||
768 | mutex_lock(&text_mutex); | ||
769 | |||
762 | ap = alloc_aggr_kprobe(p); | 770 | ap = alloc_aggr_kprobe(p); |
763 | if (!ap) | 771 | if (!ap) |
764 | return; | 772 | goto out; |
765 | 773 | ||
766 | op = container_of(ap, struct optimized_kprobe, kp); | 774 | op = container_of(ap, struct optimized_kprobe, kp); |
767 | if (!arch_prepared_optinsn(&op->optinsn)) { | 775 | if (!arch_prepared_optinsn(&op->optinsn)) { |
768 | /* If failed to setup optimizing, fallback to kprobe */ | 776 | /* If failed to setup optimizing, fallback to kprobe */ |
769 | arch_remove_optimized_kprobe(op); | 777 | arch_remove_optimized_kprobe(op); |
770 | kfree(op); | 778 | kfree(op); |
771 | return; | 779 | goto out; |
772 | } | 780 | } |
773 | 781 | ||
774 | init_aggr_kprobe(ap, p); | 782 | init_aggr_kprobe(ap, p); |
775 | optimize_kprobe(ap); | 783 | optimize_kprobe(ap); /* This just kicks optimizer thread */ |
784 | |||
785 | out: | ||
786 | mutex_unlock(&text_mutex); | ||
787 | jump_label_unlock(); | ||
776 | } | 788 | } |
777 | 789 | ||
778 | #ifdef CONFIG_SYSCTL | 790 | #ifdef CONFIG_SYSCTL |
@@ -907,9 +919,64 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | |||
907 | } | 919 | } |
908 | #endif /* CONFIG_OPTPROBES */ | 920 | #endif /* CONFIG_OPTPROBES */ |
909 | 921 | ||
922 | #ifdef KPROBES_CAN_USE_FTRACE | ||
923 | static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { | ||
924 | .func = kprobe_ftrace_handler, | ||
925 | .flags = FTRACE_OPS_FL_SAVE_REGS, | ||
926 | }; | ||
927 | static int kprobe_ftrace_enabled; | ||
928 | |||
929 | /* Must ensure p->addr is really on ftrace */ | ||
930 | static int __kprobes prepare_kprobe(struct kprobe *p) | ||
931 | { | ||
932 | if (!kprobe_ftrace(p)) | ||
933 | return arch_prepare_kprobe(p); | ||
934 | |||
935 | return arch_prepare_kprobe_ftrace(p); | ||
936 | } | ||
937 | |||
938 | /* Caller must lock kprobe_mutex */ | ||
939 | static void __kprobes arm_kprobe_ftrace(struct kprobe *p) | ||
940 | { | ||
941 | int ret; | ||
942 | |||
943 | ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, | ||
944 | (unsigned long)p->addr, 0, 0); | ||
945 | WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); | ||
946 | kprobe_ftrace_enabled++; | ||
947 | if (kprobe_ftrace_enabled == 1) { | ||
948 | ret = register_ftrace_function(&kprobe_ftrace_ops); | ||
949 | WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); | ||
950 | } | ||
951 | } | ||
952 | |||
953 | /* Caller must lock kprobe_mutex */ | ||
954 | static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) | ||
955 | { | ||
956 | int ret; | ||
957 | |||
958 | kprobe_ftrace_enabled--; | ||
959 | if (kprobe_ftrace_enabled == 0) { | ||
960 | ret = unregister_ftrace_function(&kprobe_ftrace_ops); | ||
961 | WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret); | ||
962 | } | ||
963 | ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, | ||
964 | (unsigned long)p->addr, 1, 0); | ||
965 | WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); | ||
966 | } | ||
967 | #else /* !KPROBES_CAN_USE_FTRACE */ | ||
968 | #define prepare_kprobe(p) arch_prepare_kprobe(p) | ||
969 | #define arm_kprobe_ftrace(p) do {} while (0) | ||
970 | #define disarm_kprobe_ftrace(p) do {} while (0) | ||
971 | #endif | ||
972 | |||
910 | /* Arm a kprobe with text_mutex */ | 973 | /* Arm a kprobe with text_mutex */ |
911 | static void __kprobes arm_kprobe(struct kprobe *kp) | 974 | static void __kprobes arm_kprobe(struct kprobe *kp) |
912 | { | 975 | { |
976 | if (unlikely(kprobe_ftrace(kp))) { | ||
977 | arm_kprobe_ftrace(kp); | ||
978 | return; | ||
979 | } | ||
913 | /* | 980 | /* |
914 | * Here, since __arm_kprobe() doesn't use stop_machine(), | 981 | * Here, since __arm_kprobe() doesn't use stop_machine(), |
915 | * this doesn't cause deadlock on text_mutex. So, we don't | 982 | * this doesn't cause deadlock on text_mutex. So, we don't |
@@ -921,11 +988,15 @@ static void __kprobes arm_kprobe(struct kprobe *kp) | |||
921 | } | 988 | } |
922 | 989 | ||
923 | /* Disarm a kprobe with text_mutex */ | 990 | /* Disarm a kprobe with text_mutex */ |
924 | static void __kprobes disarm_kprobe(struct kprobe *kp) | 991 | static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) |
925 | { | 992 | { |
993 | if (unlikely(kprobe_ftrace(kp))) { | ||
994 | disarm_kprobe_ftrace(kp); | ||
995 | return; | ||
996 | } | ||
926 | /* Ditto */ | 997 | /* Ditto */ |
927 | mutex_lock(&text_mutex); | 998 | mutex_lock(&text_mutex); |
928 | __disarm_kprobe(kp, true); | 999 | __disarm_kprobe(kp, reopt); |
929 | mutex_unlock(&text_mutex); | 1000 | mutex_unlock(&text_mutex); |
930 | } | 1001 | } |
931 | 1002 | ||
@@ -1144,12 +1215,6 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
1144 | if (p->post_handler && !ap->post_handler) | 1215 | if (p->post_handler && !ap->post_handler) |
1145 | ap->post_handler = aggr_post_handler; | 1216 | ap->post_handler = aggr_post_handler; |
1146 | 1217 | ||
1147 | if (kprobe_disabled(ap) && !kprobe_disabled(p)) { | ||
1148 | ap->flags &= ~KPROBE_FLAG_DISABLED; | ||
1149 | if (!kprobes_all_disarmed) | ||
1150 | /* Arm the breakpoint again. */ | ||
1151 | __arm_kprobe(ap); | ||
1152 | } | ||
1153 | return 0; | 1218 | return 0; |
1154 | } | 1219 | } |
1155 | 1220 | ||
@@ -1189,11 +1254,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, | |||
1189 | int ret = 0; | 1254 | int ret = 0; |
1190 | struct kprobe *ap = orig_p; | 1255 | struct kprobe *ap = orig_p; |
1191 | 1256 | ||
1257 | /* For preparing optimization, jump_label_text_reserved() is called */ | ||
1258 | jump_label_lock(); | ||
1259 | /* | ||
1260 | * Get online CPUs to avoid text_mutex deadlock with stop machine, | ||
1261 | * which is invoked by unoptimize_kprobe() in add_new_kprobe() | ||
1262 | */ | ||
1263 | get_online_cpus(); | ||
1264 | mutex_lock(&text_mutex); | ||
1265 | |||
1192 | if (!kprobe_aggrprobe(orig_p)) { | 1266 | if (!kprobe_aggrprobe(orig_p)) { |
1193 | /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ | 1267 | /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ |
1194 | ap = alloc_aggr_kprobe(orig_p); | 1268 | ap = alloc_aggr_kprobe(orig_p); |
1195 | if (!ap) | 1269 | if (!ap) { |
1196 | return -ENOMEM; | 1270 | ret = -ENOMEM; |
1271 | goto out; | ||
1272 | } | ||
1197 | init_aggr_kprobe(ap, orig_p); | 1273 | init_aggr_kprobe(ap, orig_p); |
1198 | } else if (kprobe_unused(ap)) | 1274 | } else if (kprobe_unused(ap)) |
1199 | /* This probe is going to die. Rescue it */ | 1275 | /* This probe is going to die. Rescue it */ |
@@ -1213,7 +1289,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, | |||
1213 | * free aggr_probe. It will be used next time, or | 1289 | * free aggr_probe. It will be used next time, or |
1214 | * freed by unregister_kprobe. | 1290 | * freed by unregister_kprobe. |
1215 | */ | 1291 | */ |
1216 | return ret; | 1292 | goto out; |
1217 | 1293 | ||
1218 | /* Prepare optimized instructions if possible. */ | 1294 | /* Prepare optimized instructions if possible. */ |
1219 | prepare_optimized_kprobe(ap); | 1295 | prepare_optimized_kprobe(ap); |
@@ -1228,7 +1304,20 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, | |||
1228 | 1304 | ||
1229 | /* Copy ap's insn slot to p */ | 1305 | /* Copy ap's insn slot to p */ |
1230 | copy_kprobe(ap, p); | 1306 | copy_kprobe(ap, p); |
1231 | return add_new_kprobe(ap, p); | 1307 | ret = add_new_kprobe(ap, p); |
1308 | |||
1309 | out: | ||
1310 | mutex_unlock(&text_mutex); | ||
1311 | put_online_cpus(); | ||
1312 | jump_label_unlock(); | ||
1313 | |||
1314 | if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { | ||
1315 | ap->flags &= ~KPROBE_FLAG_DISABLED; | ||
1316 | if (!kprobes_all_disarmed) | ||
1317 | /* Arm the breakpoint again. */ | ||
1318 | arm_kprobe(ap); | ||
1319 | } | ||
1320 | return ret; | ||
1232 | } | 1321 | } |
1233 | 1322 | ||
1234 | static int __kprobes in_kprobes_functions(unsigned long addr) | 1323 | static int __kprobes in_kprobes_functions(unsigned long addr) |
@@ -1313,71 +1402,96 @@ static inline int check_kprobe_rereg(struct kprobe *p) | |||
1313 | return ret; | 1402 | return ret; |
1314 | } | 1403 | } |
1315 | 1404 | ||
1316 | int __kprobes register_kprobe(struct kprobe *p) | 1405 | static __kprobes int check_kprobe_address_safe(struct kprobe *p, |
1406 | struct module **probed_mod) | ||
1317 | { | 1407 | { |
1318 | int ret = 0; | 1408 | int ret = 0; |
1319 | struct kprobe *old_p; | 1409 | unsigned long ftrace_addr; |
1320 | struct module *probed_mod; | ||
1321 | kprobe_opcode_t *addr; | ||
1322 | |||
1323 | addr = kprobe_addr(p); | ||
1324 | if (IS_ERR(addr)) | ||
1325 | return PTR_ERR(addr); | ||
1326 | p->addr = addr; | ||
1327 | 1410 | ||
1328 | ret = check_kprobe_rereg(p); | 1411 | /* |
1329 | if (ret) | 1412 | * If the address is located on a ftrace nop, set the |
1330 | return ret; | 1413 | * breakpoint to the following instruction. |
1414 | */ | ||
1415 | ftrace_addr = ftrace_location((unsigned long)p->addr); | ||
1416 | if (ftrace_addr) { | ||
1417 | #ifdef KPROBES_CAN_USE_FTRACE | ||
1418 | /* Given address is not on the instruction boundary */ | ||
1419 | if ((unsigned long)p->addr != ftrace_addr) | ||
1420 | return -EILSEQ; | ||
1421 | p->flags |= KPROBE_FLAG_FTRACE; | ||
1422 | #else /* !KPROBES_CAN_USE_FTRACE */ | ||
1423 | return -EINVAL; | ||
1424 | #endif | ||
1425 | } | ||
1331 | 1426 | ||
1332 | jump_label_lock(); | 1427 | jump_label_lock(); |
1333 | preempt_disable(); | 1428 | preempt_disable(); |
1429 | |||
1430 | /* Ensure it is not in reserved area nor out of text */ | ||
1334 | if (!kernel_text_address((unsigned long) p->addr) || | 1431 | if (!kernel_text_address((unsigned long) p->addr) || |
1335 | in_kprobes_functions((unsigned long) p->addr) || | 1432 | in_kprobes_functions((unsigned long) p->addr) || |
1336 | ftrace_text_reserved(p->addr, p->addr) || | ||
1337 | jump_label_text_reserved(p->addr, p->addr)) { | 1433 | jump_label_text_reserved(p->addr, p->addr)) { |
1338 | ret = -EINVAL; | 1434 | ret = -EINVAL; |
1339 | goto cannot_probe; | 1435 | goto out; |
1340 | } | 1436 | } |
1341 | 1437 | ||
1342 | /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ | 1438 | /* Check if we are probing a module */ |
1343 | p->flags &= KPROBE_FLAG_DISABLED; | 1439 | *probed_mod = __module_text_address((unsigned long) p->addr); |
1344 | 1440 | if (*probed_mod) { | |
1345 | /* | ||
1346 | * Check if are we probing a module. | ||
1347 | */ | ||
1348 | probed_mod = __module_text_address((unsigned long) p->addr); | ||
1349 | if (probed_mod) { | ||
1350 | /* Return -ENOENT if fail. */ | ||
1351 | ret = -ENOENT; | ||
1352 | /* | 1441 | /* |
1353 | * We must hold a refcount of the probed module while updating | 1442 | * We must hold a refcount of the probed module while updating |
1354 | * its code to prohibit unexpected unloading. | 1443 | * its code to prohibit unexpected unloading. |
1355 | */ | 1444 | */ |
1356 | if (unlikely(!try_module_get(probed_mod))) | 1445 | if (unlikely(!try_module_get(*probed_mod))) { |
1357 | goto cannot_probe; | 1446 | ret = -ENOENT; |
1447 | goto out; | ||
1448 | } | ||
1358 | 1449 | ||
1359 | /* | 1450 | /* |
1360 | * If the module freed .init.text, we couldn't insert | 1451 | * If the module freed .init.text, we couldn't insert |
1361 | * kprobes in there. | 1452 | * kprobes in there. |
1362 | */ | 1453 | */ |
1363 | if (within_module_init((unsigned long)p->addr, probed_mod) && | 1454 | if (within_module_init((unsigned long)p->addr, *probed_mod) && |
1364 | probed_mod->state != MODULE_STATE_COMING) { | 1455 | (*probed_mod)->state != MODULE_STATE_COMING) { |
1365 | module_put(probed_mod); | 1456 | module_put(*probed_mod); |
1366 | goto cannot_probe; | 1457 | *probed_mod = NULL; |
1458 | ret = -ENOENT; | ||
1367 | } | 1459 | } |
1368 | /* ret will be updated by following code */ | ||
1369 | } | 1460 | } |
1461 | out: | ||
1370 | preempt_enable(); | 1462 | preempt_enable(); |
1371 | jump_label_unlock(); | 1463 | jump_label_unlock(); |
1372 | 1464 | ||
1465 | return ret; | ||
1466 | } | ||
1467 | |||
1468 | int __kprobes register_kprobe(struct kprobe *p) | ||
1469 | { | ||
1470 | int ret; | ||
1471 | struct kprobe *old_p; | ||
1472 | struct module *probed_mod; | ||
1473 | kprobe_opcode_t *addr; | ||
1474 | |||
1475 | /* Adjust probe address from symbol */ | ||
1476 | addr = kprobe_addr(p); | ||
1477 | if (IS_ERR(addr)) | ||
1478 | return PTR_ERR(addr); | ||
1479 | p->addr = addr; | ||
1480 | |||
1481 | ret = check_kprobe_rereg(p); | ||
1482 | if (ret) | ||
1483 | return ret; | ||
1484 | |||
1485 | /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ | ||
1486 | p->flags &= KPROBE_FLAG_DISABLED; | ||
1373 | p->nmissed = 0; | 1487 | p->nmissed = 0; |
1374 | INIT_LIST_HEAD(&p->list); | 1488 | INIT_LIST_HEAD(&p->list); |
1375 | mutex_lock(&kprobe_mutex); | ||
1376 | 1489 | ||
1377 | jump_label_lock(); /* needed to call jump_label_text_reserved() */ | 1490 | ret = check_kprobe_address_safe(p, &probed_mod); |
1491 | if (ret) | ||
1492 | return ret; | ||
1378 | 1493 | ||
1379 | get_online_cpus(); /* For avoiding text_mutex deadlock. */ | 1494 | mutex_lock(&kprobe_mutex); |
1380 | mutex_lock(&text_mutex); | ||
1381 | 1495 | ||
1382 | old_p = get_kprobe(p->addr); | 1496 | old_p = get_kprobe(p->addr); |
1383 | if (old_p) { | 1497 | if (old_p) { |
@@ -1386,7 +1500,9 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1386 | goto out; | 1500 | goto out; |
1387 | } | 1501 | } |
1388 | 1502 | ||
1389 | ret = arch_prepare_kprobe(p); | 1503 | mutex_lock(&text_mutex); /* Avoiding text modification */ |
1504 | ret = prepare_kprobe(p); | ||
1505 | mutex_unlock(&text_mutex); | ||
1390 | if (ret) | 1506 | if (ret) |
1391 | goto out; | 1507 | goto out; |
1392 | 1508 | ||
@@ -1395,26 +1511,18 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1395 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 1511 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
1396 | 1512 | ||
1397 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) | 1513 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) |
1398 | __arm_kprobe(p); | 1514 | arm_kprobe(p); |
1399 | 1515 | ||
1400 | /* Try to optimize kprobe */ | 1516 | /* Try to optimize kprobe */ |
1401 | try_to_optimize_kprobe(p); | 1517 | try_to_optimize_kprobe(p); |
1402 | 1518 | ||
1403 | out: | 1519 | out: |
1404 | mutex_unlock(&text_mutex); | ||
1405 | put_online_cpus(); | ||
1406 | jump_label_unlock(); | ||
1407 | mutex_unlock(&kprobe_mutex); | 1520 | mutex_unlock(&kprobe_mutex); |
1408 | 1521 | ||
1409 | if (probed_mod) | 1522 | if (probed_mod) |
1410 | module_put(probed_mod); | 1523 | module_put(probed_mod); |
1411 | 1524 | ||
1412 | return ret; | 1525 | return ret; |
1413 | |||
1414 | cannot_probe: | ||
1415 | preempt_enable(); | ||
1416 | jump_label_unlock(); | ||
1417 | return ret; | ||
1418 | } | 1526 | } |
1419 | EXPORT_SYMBOL_GPL(register_kprobe); | 1527 | EXPORT_SYMBOL_GPL(register_kprobe); |
1420 | 1528 | ||
@@ -1451,7 +1559,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) | |||
1451 | 1559 | ||
1452 | /* Try to disarm and disable this/parent probe */ | 1560 | /* Try to disarm and disable this/parent probe */ |
1453 | if (p == orig_p || aggr_kprobe_disabled(orig_p)) { | 1561 | if (p == orig_p || aggr_kprobe_disabled(orig_p)) { |
1454 | disarm_kprobe(orig_p); | 1562 | disarm_kprobe(orig_p, true); |
1455 | orig_p->flags |= KPROBE_FLAG_DISABLED; | 1563 | orig_p->flags |= KPROBE_FLAG_DISABLED; |
1456 | } | 1564 | } |
1457 | } | 1565 | } |
@@ -2049,10 +2157,11 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | |||
2049 | 2157 | ||
2050 | if (!pp) | 2158 | if (!pp) |
2051 | pp = p; | 2159 | pp = p; |
2052 | seq_printf(pi, "%s%s%s\n", | 2160 | seq_printf(pi, "%s%s%s%s\n", |
2053 | (kprobe_gone(p) ? "[GONE]" : ""), | 2161 | (kprobe_gone(p) ? "[GONE]" : ""), |
2054 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), | 2162 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), |
2055 | (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); | 2163 | (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""), |
2164 | (kprobe_ftrace(pp) ? "[FTRACE]" : "")); | ||
2056 | } | 2165 | } |
2057 | 2166 | ||
2058 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) | 2167 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) |
@@ -2131,14 +2240,12 @@ static void __kprobes arm_all_kprobes(void) | |||
2131 | goto already_enabled; | 2240 | goto already_enabled; |
2132 | 2241 | ||
2133 | /* Arming kprobes doesn't optimize kprobe itself */ | 2242 | /* Arming kprobes doesn't optimize kprobe itself */ |
2134 | mutex_lock(&text_mutex); | ||
2135 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2243 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
2136 | head = &kprobe_table[i]; | 2244 | head = &kprobe_table[i]; |
2137 | hlist_for_each_entry_rcu(p, node, head, hlist) | 2245 | hlist_for_each_entry_rcu(p, node, head, hlist) |
2138 | if (!kprobe_disabled(p)) | 2246 | if (!kprobe_disabled(p)) |
2139 | __arm_kprobe(p); | 2247 | arm_kprobe(p); |
2140 | } | 2248 | } |
2141 | mutex_unlock(&text_mutex); | ||
2142 | 2249 | ||
2143 | kprobes_all_disarmed = false; | 2250 | kprobes_all_disarmed = false; |
2144 | printk(KERN_INFO "Kprobes globally enabled\n"); | 2251 | printk(KERN_INFO "Kprobes globally enabled\n"); |
@@ -2166,15 +2273,13 @@ static void __kprobes disarm_all_kprobes(void) | |||
2166 | kprobes_all_disarmed = true; | 2273 | kprobes_all_disarmed = true; |
2167 | printk(KERN_INFO "Kprobes globally disabled\n"); | 2274 | printk(KERN_INFO "Kprobes globally disabled\n"); |
2168 | 2275 | ||
2169 | mutex_lock(&text_mutex); | ||
2170 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2276 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
2171 | head = &kprobe_table[i]; | 2277 | head = &kprobe_table[i]; |
2172 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 2278 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
2173 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 2279 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
2174 | __disarm_kprobe(p, false); | 2280 | disarm_kprobe(p, false); |
2175 | } | 2281 | } |
2176 | } | 2282 | } |
2177 | mutex_unlock(&text_mutex); | ||
2178 | mutex_unlock(&kprobe_mutex); | 2283 | mutex_unlock(&kprobe_mutex); |
2179 | 2284 | ||
2180 | /* Wait for disarming all kprobes by optimizer */ | 2285 | /* Wait for disarming all kprobes by optimizer */ |
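Summing up the kprobes changes above from a user's point of view: register_kprobe() is unchanged, but a probe whose address sits on the ftrace nop is now armed through ftrace (KPROBE_FLAG_FTRACE, reported as [FTRACE] in debugfs) instead of a breakpoint. A conventional module sketch; the probed symbol and demo_* names are just examples.

#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("hit %s\n", p->symbol_name);
        return 0;
}

static struct kprobe demo_kp = {
        .symbol_name = "do_fork",       /* example target only */
        .pre_handler = demo_pre,
};

static int __init demo_kp_init(void)
{
        return register_kprobe(&demo_kp);
}

static void __exit demo_kp_exit(void)
{
        unregister_kprobe(&demo_kp);
}
module_init(demo_kp_init);
module_exit(demo_kp_exit);
MODULE_LICENSE("GPL");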
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 4e316e1acf58..6ada93c23a9a 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -26,7 +26,6 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | |||
26 | static struct kobj_attribute _name##_attr = \ | 26 | static struct kobj_attribute _name##_attr = \ |
27 | __ATTR(_name, 0644, _name##_show, _name##_store) | 27 | __ATTR(_name, 0644, _name##_show, _name##_store) |
28 | 28 | ||
29 | #if defined(CONFIG_HOTPLUG) | ||
30 | /* current uevent sequence number */ | 29 | /* current uevent sequence number */ |
31 | static ssize_t uevent_seqnum_show(struct kobject *kobj, | 30 | static ssize_t uevent_seqnum_show(struct kobject *kobj, |
32 | struct kobj_attribute *attr, char *buf) | 31 | struct kobj_attribute *attr, char *buf) |
@@ -54,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj, | |||
54 | return count; | 53 | return count; |
55 | } | 54 | } |
56 | KERNEL_ATTR_RW(uevent_helper); | 55 | KERNEL_ATTR_RW(uevent_helper); |
57 | #endif | 56 | |
58 | 57 | ||
59 | #ifdef CONFIG_PROFILING | 58 | #ifdef CONFIG_PROFILING |
60 | static ssize_t profiling_show(struct kobject *kobj, | 59 | static ssize_t profiling_show(struct kobject *kobj, |
@@ -141,6 +140,23 @@ static ssize_t fscaps_show(struct kobject *kobj, | |||
141 | } | 140 | } |
142 | KERNEL_ATTR_RO(fscaps); | 141 | KERNEL_ATTR_RO(fscaps); |
143 | 142 | ||
143 | int rcu_expedited; | ||
144 | static ssize_t rcu_expedited_show(struct kobject *kobj, | ||
145 | struct kobj_attribute *attr, char *buf) | ||
146 | { | ||
147 | return sprintf(buf, "%d\n", rcu_expedited); | ||
148 | } | ||
149 | static ssize_t rcu_expedited_store(struct kobject *kobj, | ||
150 | struct kobj_attribute *attr, | ||
151 | const char *buf, size_t count) | ||
152 | { | ||
153 | if (kstrtoint(buf, 0, &rcu_expedited)) | ||
154 | return -EINVAL; | ||
155 | |||
156 | return count; | ||
157 | } | ||
158 | KERNEL_ATTR_RW(rcu_expedited); | ||
159 | |||
144 | /* | 160 | /* |
145 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. | 161 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. |
146 | */ | 162 | */ |
@@ -169,10 +185,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj); | |||
169 | 185 | ||
170 | static struct attribute * kernel_attrs[] = { | 186 | static struct attribute * kernel_attrs[] = { |
171 | &fscaps_attr.attr, | 187 | &fscaps_attr.attr, |
172 | #if defined(CONFIG_HOTPLUG) | ||
173 | &uevent_seqnum_attr.attr, | 188 | &uevent_seqnum_attr.attr, |
174 | &uevent_helper_attr.attr, | 189 | &uevent_helper_attr.attr, |
175 | #endif | ||
176 | #ifdef CONFIG_PROFILING | 190 | #ifdef CONFIG_PROFILING |
177 | &profiling_attr.attr, | 191 | &profiling_attr.attr, |
178 | #endif | 192 | #endif |
@@ -182,6 +196,7 @@ static struct attribute * kernel_attrs[] = { | |||
182 | &kexec_crash_size_attr.attr, | 196 | &kexec_crash_size_attr.attr, |
183 | &vmcoreinfo_attr.attr, | 197 | &vmcoreinfo_attr.attr, |
184 | #endif | 198 | #endif |
199 | &rcu_expedited_attr.attr, | ||
185 | NULL | 200 | NULL |
186 | }; | 201 | }; |
187 | 202 | ||
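The new attribute above surfaces as /sys/kernel/rcu_expedited. A tiny userspace helper, equivalent to writing the file from a shell, shown only to illustrate the knob:

#include <stdio.h>

static int set_rcu_expedited(int on)
{
        FILE *f = fopen("/sys/kernel/rcu_expedited", "w");

        if (!f)
                return -1;
        fprintf(f, "%d\n", on);
        return fclose(f);
}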
diff --git a/kernel/kthread.c b/kernel/kthread.c index b579af57ea10..691dc2ef9baf 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <linux/freezer.h> | 18 | #include <linux/freezer.h> |
19 | #include <linux/ptrace.h> | ||
19 | #include <trace/events/sched.h> | 20 | #include <trace/events/sched.h> |
20 | 21 | ||
21 | static DEFINE_SPINLOCK(kthread_create_lock); | 22 | static DEFINE_SPINLOCK(kthread_create_lock); |
@@ -37,11 +38,20 @@ struct kthread_create_info | |||
37 | }; | 38 | }; |
38 | 39 | ||
39 | struct kthread { | 40 | struct kthread { |
40 | int should_stop; | 41 | unsigned long flags; |
42 | unsigned int cpu; | ||
41 | void *data; | 43 | void *data; |
44 | struct completion parked; | ||
42 | struct completion exited; | 45 | struct completion exited; |
43 | }; | 46 | }; |
44 | 47 | ||
48 | enum KTHREAD_BITS { | ||
49 | KTHREAD_IS_PER_CPU = 0, | ||
50 | KTHREAD_SHOULD_STOP, | ||
51 | KTHREAD_SHOULD_PARK, | ||
52 | KTHREAD_IS_PARKED, | ||
53 | }; | ||
54 | |||
45 | #define to_kthread(tsk) \ | 55 | #define to_kthread(tsk) \ |
46 | container_of((tsk)->vfork_done, struct kthread, exited) | 56 | container_of((tsk)->vfork_done, struct kthread, exited) |
47 | 57 | ||
@@ -52,13 +62,29 @@ struct kthread { | |||
52 | * and this will return true. You should then return, and your return | 62 | * and this will return true. You should then return, and your return |
53 | * value will be passed through to kthread_stop(). | 63 | * value will be passed through to kthread_stop(). |
54 | */ | 64 | */ |
55 | int kthread_should_stop(void) | 65 | bool kthread_should_stop(void) |
56 | { | 66 | { |
57 | return to_kthread(current)->should_stop; | 67 | return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); |
58 | } | 68 | } |
59 | EXPORT_SYMBOL(kthread_should_stop); | 69 | EXPORT_SYMBOL(kthread_should_stop); |
60 | 70 | ||
61 | /** | 71 | /** |
72 | * kthread_should_park - should this kthread park now? | ||
73 | * | ||
74 | * When someone calls kthread_park() on your kthread, it will be woken | ||
75 | * and this will return true. You should then do the necessary | ||
76 | * cleanup and call kthread_parkme(). | ||
77 | * | ||
78 | * Similar to kthread_should_stop(), but this keeps the thread alive | ||
79 | * and in a park position. kthread_unpark() "restarts" the thread and | ||
80 | * calls the thread function again. | ||
81 | */ | ||
82 | bool kthread_should_park(void) | ||
83 | { | ||
84 | return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); | ||
85 | } | ||
86 | |||
87 | /** | ||
62 | * kthread_freezable_should_stop - should this freezable kthread return now? | 88 | * kthread_freezable_should_stop - should this freezable kthread return now? |
63 | * @was_frozen: optional out parameter, indicates whether %current was frozen | 89 | * @was_frozen: optional out parameter, indicates whether %current was frozen |
64 | * | 90 | * |
@@ -96,6 +122,24 @@ void *kthread_data(struct task_struct *task) | |||
96 | return to_kthread(task)->data; | 122 | return to_kthread(task)->data; |
97 | } | 123 | } |
98 | 124 | ||
125 | static void __kthread_parkme(struct kthread *self) | ||
126 | { | ||
127 | __set_current_state(TASK_INTERRUPTIBLE); | ||
128 | while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { | ||
129 | if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) | ||
130 | complete(&self->parked); | ||
131 | schedule(); | ||
132 | __set_current_state(TASK_INTERRUPTIBLE); | ||
133 | } | ||
134 | clear_bit(KTHREAD_IS_PARKED, &self->flags); | ||
135 | __set_current_state(TASK_RUNNING); | ||
136 | } | ||
137 | |||
138 | void kthread_parkme(void) | ||
139 | { | ||
140 | __kthread_parkme(to_kthread(current)); | ||
141 | } | ||
142 | |||
99 | static int kthread(void *_create) | 143 | static int kthread(void *_create) |
100 | { | 144 | { |
101 | /* Copy data: it's on kthread's stack */ | 145 | /* Copy data: it's on kthread's stack */ |
@@ -105,9 +149,10 @@ static int kthread(void *_create) | |||
105 | struct kthread self; | 149 | struct kthread self; |
106 | int ret; | 150 | int ret; |
107 | 151 | ||
108 | self.should_stop = 0; | 152 | self.flags = 0; |
109 | self.data = data; | 153 | self.data = data; |
110 | init_completion(&self.exited); | 154 | init_completion(&self.exited); |
155 | init_completion(&self.parked); | ||
111 | current->vfork_done = &self.exited; | 156 | current->vfork_done = &self.exited; |
112 | 157 | ||
113 | /* OK, tell user we're spawned, wait for stop or wakeup */ | 158 | /* OK, tell user we're spawned, wait for stop or wakeup */ |
@@ -117,9 +162,11 @@ static int kthread(void *_create) | |||
117 | schedule(); | 162 | schedule(); |
118 | 163 | ||
119 | ret = -EINTR; | 164 | ret = -EINTR; |
120 | if (!self.should_stop) | ||
121 | ret = threadfn(data); | ||
122 | 165 | ||
166 | if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { | ||
167 | __kthread_parkme(&self); | ||
168 | ret = threadfn(data); | ||
169 | } | ||
123 | /* we can't just return, we must preserve "self" on stack */ | 170 | /* we can't just return, we must preserve "self" on stack */ |
124 | do_exit(ret); | 171 | do_exit(ret); |
125 | } | 172 | } |
@@ -172,8 +219,7 @@ static void create_kthread(struct kthread_create_info *create) | |||
172 | * Returns a task_struct or ERR_PTR(-ENOMEM). | 219 | * Returns a task_struct or ERR_PTR(-ENOMEM). |
173 | */ | 220 | */ |
174 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | 221 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), |
175 | void *data, | 222 | void *data, int node, |
176 | int node, | ||
177 | const char namefmt[], | 223 | const char namefmt[], |
178 | ...) | 224 | ...) |
179 | { | 225 | { |
@@ -210,6 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
210 | } | 256 | } |
211 | EXPORT_SYMBOL(kthread_create_on_node); | 257 | EXPORT_SYMBOL(kthread_create_on_node); |
212 | 258 | ||
259 | static void __kthread_bind(struct task_struct *p, unsigned int cpu) | ||
260 | { | ||
261 | /* It's safe because the task is inactive. */ | ||
262 | do_set_cpus_allowed(p, cpumask_of(cpu)); | ||
263 | p->flags |= PF_THREAD_BOUND; | ||
264 | } | ||
265 | |||
213 | /** | 266 | /** |
214 | * kthread_bind - bind a just-created kthread to a cpu. | 267 | * kthread_bind - bind a just-created kthread to a cpu. |
215 | * @p: thread created by kthread_create(). | 268 | * @p: thread created by kthread_create(). |
@@ -226,14 +279,112 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) | |||
226 | WARN_ON(1); | 279 | WARN_ON(1); |
227 | return; | 280 | return; |
228 | } | 281 | } |
229 | 282 | __kthread_bind(p, cpu); | |
230 | /* It's safe because the task is inactive. */ | ||
231 | do_set_cpus_allowed(p, cpumask_of(cpu)); | ||
232 | p->flags |= PF_THREAD_BOUND; | ||
233 | } | 283 | } |
234 | EXPORT_SYMBOL(kthread_bind); | 284 | EXPORT_SYMBOL(kthread_bind); |
235 | 285 | ||
236 | /** | 286 | /** |
287 | * kthread_create_on_cpu - Create a cpu bound kthread | ||
288 | * @threadfn: the function to run until signal_pending(current). | ||
289 | * @data: data ptr for @threadfn. | ||
290 | * @cpu: The cpu on which the thread should be bound, | ||
291 | * @namefmt: printf-style name for the thread. Format is restricted | ||
292 | * to "name.*%u". Code fills in cpu number. | ||
293 | * | ||
294 | * Description: This helper function creates and names a kernel thread. | ||
295 | * The thread will be woken and put into park mode. | ||
296 | */ | ||
297 | struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), | ||
298 | void *data, unsigned int cpu, | ||
299 | const char *namefmt) | ||
300 | { | ||
301 | struct task_struct *p; | ||
302 | |||
303 | p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, | ||
304 | cpu); | ||
305 | if (IS_ERR(p)) | ||
306 | return p; | ||
307 | set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); | ||
308 | to_kthread(p)->cpu = cpu; | ||
309 | /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ | ||
310 | kthread_park(p); | ||
311 | return p; | ||
312 | } | ||
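
Editorial note, not part of the patch: the sketch below illustrates, in a hedged way, the thread-side contract that kthread_create_on_cpu() expects — the thread body must check for park requests as well as stop requests. It assumes the kthread_should_park()/kthread_parkme() accessors introduced alongside this parking code; the function name my_percpu_thread and the work it does are hypothetical.

    #include <linux/kernel.h>
    #include <linux/kthread.h>
    #include <linux/sched.h>
    #include <linux/smp.h>

    /* Hedged sketch: a per-cpu thread body that honours park and stop. */
    static int my_percpu_thread(void *data)
    {
    	while (!kthread_should_stop()) {
    		if (kthread_should_park()) {
    			/* Sleeps in the parked state until kthread_unpark(). */
    			kthread_parkme();
    			continue;
    		}
    		pr_info("working on cpu %d\n", smp_processor_id());
    		schedule_timeout_interruptible(HZ);
    	}
    	return 0;
    }

A real user would replace the pr_info()/sleep pair with its own work and wakeup scheme; the essential part is checking the park request on every loop iteration.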
313 | |||
314 | static struct kthread *task_get_live_kthread(struct task_struct *k) | ||
315 | { | ||
316 | struct kthread *kthread; | ||
317 | |||
318 | get_task_struct(k); | ||
319 | kthread = to_kthread(k); | ||
320 | /* It might have exited */ | ||
321 | barrier(); | ||
322 | if (k->vfork_done != NULL) | ||
323 | return kthread; | ||
324 | return NULL; | ||
325 | } | ||
326 | |||
327 | /** | ||
328 | * kthread_unpark - unpark a thread created by kthread_create(). | ||
329 | * @k: thread created by kthread_create(). | ||
330 | * | ||
331 | * Sets kthread_should_park() for @k to return false and wakes it; | ||
332 | * unlike kthread_park() it does not wait. If the thread is marked | ||
333 | * percpu then it is bound to the cpu again before being woken. | ||
334 | */ | ||
335 | void kthread_unpark(struct task_struct *k) | ||
336 | { | ||
337 | struct kthread *kthread = task_get_live_kthread(k); | ||
338 | |||
339 | if (kthread) { | ||
340 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | ||
341 | /* | ||
342 | * We clear the IS_PARKED bit here as we don't wait | ||
343 | * until the task has left the park code. So if we'd | ||
344 | * park before that happens we'd see the IS_PARKED bit | ||
345 | * which might be about to be cleared. | ||
346 | */ | ||
347 | if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { | ||
348 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) | ||
349 | __kthread_bind(k, kthread->cpu); | ||
350 | wake_up_process(k); | ||
351 | } | ||
352 | } | ||
353 | put_task_struct(k); | ||
354 | } | ||
355 | |||
356 | /** | ||
357 | * kthread_park - park a thread created by kthread_create(). | ||
358 | * @k: thread created by kthread_create(). | ||
359 | * | ||
360 | * Sets kthread_should_park() for @k to return true, wakes it, and | ||
361 | * waits for it to return. This can also be called after kthread_create() | ||
362 | * instead of calling wake_up_process(): the thread will park without | ||
363 | * calling threadfn(). | ||
364 | * | ||
365 | * Returns 0 if the thread is parked, -ENOSYS if the thread exited. | ||
366 | * If called by the kthread itself just the park bit is set. | ||
367 | */ | ||
368 | int kthread_park(struct task_struct *k) | ||
369 | { | ||
370 | struct kthread *kthread = task_get_live_kthread(k); | ||
371 | int ret = -ENOSYS; | ||
372 | |||
373 | if (kthread) { | ||
374 | if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { | ||
375 | set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | ||
376 | if (k != current) { | ||
377 | wake_up_process(k); | ||
378 | wait_for_completion(&kthread->parked); | ||
379 | } | ||
380 | } | ||
381 | ret = 0; | ||
382 | } | ||
383 | put_task_struct(k); | ||
384 | return ret; | ||
385 | } | ||
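
Editorial note, not part of the patch: for the controlling side, a minimal hedged sketch of how these helpers combine is shown below. It reuses the hypothetical my_percpu_thread from the earlier sketch, and the start/park/unpark/stop helpers are placeholders for whatever cpu-hotplug wiring a real caller (such as the later smpboot infrastructure) provides.

    #include <linux/err.h>
    #include <linux/kthread.h>

    static struct task_struct *worker;

    static int start_worker(unsigned int cpu, void *data)
    {
    	/* Created parked; the name format must end in %u. */
    	worker = kthread_create_on_cpu(my_percpu_thread, data, cpu,
    				       "mywork/%u");
    	if (IS_ERR(worker))
    		return PTR_ERR(worker);
    	kthread_unpark(worker);		/* rebinds to @cpu and runs it */
    	return 0;
    }

    static void cpu_going_down(void)
    {
    	kthread_park(worker);		/* returns once it is parked */
    }

    static void cpu_coming_up(void)
    {
    	kthread_unpark(worker);		/* rebind and resume */
    }

    static void stop_worker(void)
    {
    	kthread_stop(worker);		/* also clears a pending park */
    }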
386 | |||
387 | /** | ||
237 | * kthread_stop - stop a thread created by kthread_create(). | 388 | * kthread_stop - stop a thread created by kthread_create(). |
238 | * @k: thread created by kthread_create(). | 389 | * @k: thread created by kthread_create(). |
239 | * | 390 | * |
@@ -250,16 +401,13 @@ EXPORT_SYMBOL(kthread_bind); | |||
250 | */ | 401 | */ |
251 | int kthread_stop(struct task_struct *k) | 402 | int kthread_stop(struct task_struct *k) |
252 | { | 403 | { |
253 | struct kthread *kthread; | 404 | struct kthread *kthread = task_get_live_kthread(k); |
254 | int ret; | 405 | int ret; |
255 | 406 | ||
256 | trace_sched_kthread_stop(k); | 407 | trace_sched_kthread_stop(k); |
257 | get_task_struct(k); | 408 | if (kthread) { |
258 | 409 | set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); | |
259 | kthread = to_kthread(k); | 410 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); |
260 | barrier(); /* it might have exited */ | ||
261 | if (k->vfork_done != NULL) { | ||
262 | kthread->should_stop = 1; | ||
263 | wake_up_process(k); | 411 | wake_up_process(k); |
264 | wait_for_completion(&kthread->exited); | 412 | wait_for_completion(&kthread->exited); |
265 | } | 413 | } |
@@ -280,7 +428,7 @@ int kthreadd(void *unused) | |||
280 | set_task_comm(tsk, "kthreadd"); | 428 | set_task_comm(tsk, "kthreadd"); |
281 | ignore_signals(tsk); | 429 | ignore_signals(tsk); |
282 | set_cpus_allowed_ptr(tsk, cpu_all_mask); | 430 | set_cpus_allowed_ptr(tsk, cpu_all_mask); |
283 | set_mems_allowed(node_states[N_HIGH_MEMORY]); | 431 | set_mems_allowed(node_states[N_MEMORY]); |
284 | 432 | ||
285 | current->flags |= PF_NOFREEZE; | 433 | current->flags |= PF_NOFREEZE; |
286 | 434 | ||
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ea9ee4518c35..7981e5b2350d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -2998,6 +2998,42 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); | |||
2998 | 2998 | ||
2999 | struct lock_class_key __lockdep_no_validate__; | 2999 | struct lock_class_key __lockdep_no_validate__; |
3000 | 3000 | ||
3001 | static int | ||
3002 | print_lock_nested_lock_not_held(struct task_struct *curr, | ||
3003 | struct held_lock *hlock, | ||
3004 | unsigned long ip) | ||
3005 | { | ||
3006 | if (!debug_locks_off()) | ||
3007 | return 0; | ||
3008 | if (debug_locks_silent) | ||
3009 | return 0; | ||
3010 | |||
3011 | printk("\n"); | ||
3012 | printk("==================================\n"); | ||
3013 | printk("[ BUG: Nested lock was not taken ]\n"); | ||
3014 | print_kernel_ident(); | ||
3015 | printk("----------------------------------\n"); | ||
3016 | |||
3017 | printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); | ||
3018 | print_lock(hlock); | ||
3019 | |||
3020 | printk("\nbut this task is not holding:\n"); | ||
3021 | printk("%s\n", hlock->nest_lock->name); | ||
3022 | |||
3023 | printk("\nstack backtrace:\n"); | ||
3024 | dump_stack(); | ||
3025 | |||
3026 | printk("\nother info that might help us debug this:\n"); | ||
3027 | lockdep_print_held_locks(curr); | ||
3028 | |||
3029 | printk("\nstack backtrace:\n"); | ||
3030 | dump_stack(); | ||
3031 | |||
3032 | return 0; | ||
3033 | } | ||
3034 | |||
3035 | static int __lock_is_held(struct lockdep_map *lock); | ||
3036 | |||
3001 | /* | 3037 | /* |
3002 | * This gets called for every mutex_lock*()/spin_lock*() operation. | 3038 | * This gets called for every mutex_lock*()/spin_lock*() operation. |
3003 | * We maintain the dependency maps and validate the locking attempt: | 3039 | * We maintain the dependency maps and validate the locking attempt: |
@@ -3139,6 +3175,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3139 | } | 3175 | } |
3140 | chain_key = iterate_chain_key(chain_key, id); | 3176 | chain_key = iterate_chain_key(chain_key, id); |
3141 | 3177 | ||
3178 | if (nest_lock && !__lock_is_held(nest_lock)) | ||
3179 | return print_lock_nested_lock_not_held(curr, hlock, ip); | ||
3180 | |||
3142 | if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) | 3181 | if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) |
3143 | return 0; | 3182 | return 0; |
3144 | 3183 | ||
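
Editorial note, not part of the patch: the new __lock_is_held() test gives teeth to the existing nest_lock annotation — acquiring a lock with a nest_lock argument now produces the "Nested lock was not taken" report if the named outer lock is not actually held. A hedged sketch of the annotation being policed follows; the structures and field names are hypothetical, and it assumes the mutex_lock_nest_lock() helper from <linux/mutex.h>.

    #include <linux/mutex.h>

    struct my_group {
    	struct mutex group_lock;	/* the outer "nest" lock */
    };

    struct my_object {
    	struct mutex obj_lock;		/* serialised by group_lock */
    	struct my_group *group;
    };

    /* Assumes both mutexes were initialised elsewhere (mutex_init()). */
    static void touch_object(struct my_object *obj)
    {
    	mutex_lock(&obj->group->group_lock);
    	/*
    	 * Tell lockdep the per-object locks are protected by group_lock.
    	 * With this check in place, taking obj_lock this way without
    	 * actually holding group_lock is reported instead of silently
    	 * accepted.
    	 */
    	mutex_lock_nest_lock(&obj->obj_lock, &obj->group->group_lock);
    	/* ... */
    	mutex_unlock(&obj->obj_lock);
    	mutex_unlock(&obj->group->group_lock);
    }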
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 91c32a0b612c..b2c71c5873e4 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v) | |||
39 | 39 | ||
40 | static void print_name(struct seq_file *m, struct lock_class *class) | 40 | static void print_name(struct seq_file *m, struct lock_class *class) |
41 | { | 41 | { |
42 | char str[128]; | 42 | char str[KSYM_NAME_LEN]; |
43 | const char *name = class->name; | 43 | const char *name = class->name; |
44 | 44 | ||
45 | if (!name) { | 45 | if (!name) { |
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S new file mode 100644 index 000000000000..246b4c6e6135 --- /dev/null +++ b/kernel/modsign_certificate.S | |||
@@ -0,0 +1,19 @@ | |||
1 | /* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ | ||
2 | #ifndef SYMBOL_PREFIX | ||
3 | #define ASM_SYMBOL(sym) sym | ||
4 | #else | ||
5 | #define PASTE2(x,y) x##y | ||
6 | #define PASTE(x,y) PASTE2(x,y) | ||
7 | #define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) | ||
8 | #endif | ||
9 | |||
10 | #define GLOBAL(name) \ | ||
11 | .globl ASM_SYMBOL(name); \ | ||
12 | ASM_SYMBOL(name): | ||
13 | |||
14 | .section ".init.data","aw" | ||
15 | |||
16 | GLOBAL(modsign_certificate_list) | ||
17 | .incbin "signing_key.x509" | ||
18 | .incbin "extra_certificates" | ||
19 | GLOBAL(modsign_certificate_list_end) | ||
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c new file mode 100644 index 000000000000..2b6e69909c39 --- /dev/null +++ b/kernel/modsign_pubkey.c | |||
@@ -0,0 +1,104 @@ | |||
1 | /* Public keys for module signature verification | ||
2 | * | ||
3 | * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/cred.h> | ||
15 | #include <linux/err.h> | ||
16 | #include <keys/asymmetric-type.h> | ||
17 | #include "module-internal.h" | ||
18 | |||
19 | struct key *modsign_keyring; | ||
20 | |||
21 | extern __initdata const u8 modsign_certificate_list[]; | ||
22 | extern __initdata const u8 modsign_certificate_list_end[]; | ||
23 | |||
24 | /* | ||
25 | * We need to make sure ccache doesn't cache the .o file as it doesn't notice | ||
26 | * if modsign.pub changes. | ||
27 | */ | ||
28 | static __initdata const char annoy_ccache[] = __TIME__ "foo"; | ||
29 | |||
30 | /* | ||
31 | * Create the keyring that the compiled-in keys are loaded into | ||
32 | */ | ||
33 | static __init int module_verify_init(void) | ||
34 | { | ||
35 | pr_notice("Initialise module verification\n"); | ||
36 | |||
37 | modsign_keyring = keyring_alloc(".module_sign", | ||
38 | KUIDT_INIT(0), KGIDT_INIT(0), | ||
39 | current_cred(), | ||
40 | ((KEY_POS_ALL & ~KEY_POS_SETATTR) | | ||
41 | KEY_USR_VIEW | KEY_USR_READ), | ||
42 | KEY_ALLOC_NOT_IN_QUOTA, NULL); | ||
43 | if (IS_ERR(modsign_keyring)) | ||
44 | panic("Can't allocate module signing keyring\n"); | ||
45 | |||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | /* | ||
50 | * Must be initialised before we try and load the keys into the keyring. | ||
51 | */ | ||
52 | device_initcall(module_verify_init); | ||
53 | |||
54 | /* | ||
55 | * Load the compiled-in keys | ||
56 | */ | ||
57 | static __init int load_module_signing_keys(void) | ||
58 | { | ||
59 | key_ref_t key; | ||
60 | const u8 *p, *end; | ||
61 | size_t plen; | ||
62 | |||
63 | pr_notice("Loading module verification certificates\n"); | ||
64 | |||
65 | end = modsign_certificate_list_end; | ||
66 | p = modsign_certificate_list; | ||
67 | while (p < end) { | ||
68 | /* Each cert begins with an ASN.1 SEQUENCE tag and must be more | ||
69 | * than 256 bytes in size. | ||
70 | */ | ||
71 | if (end - p < 4) | ||
72 | goto dodgy_cert; | ||
73 | if (p[0] != 0x30 || | ||
74 | p[1] != 0x82) | ||
75 | goto dodgy_cert; | ||
76 | plen = (p[2] << 8) | p[3]; | ||
77 | plen += 4; | ||
78 | if (plen > end - p) | ||
79 | goto dodgy_cert; | ||
80 | |||
81 | key = key_create_or_update(make_key_ref(modsign_keyring, 1), | ||
82 | "asymmetric", | ||
83 | NULL, | ||
84 | p, | ||
85 | plen, | ||
86 | (KEY_POS_ALL & ~KEY_POS_SETATTR) | | ||
87 | KEY_USR_VIEW, | ||
88 | KEY_ALLOC_NOT_IN_QUOTA); | ||
89 | if (IS_ERR(key)) | ||
90 | pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", | ||
91 | PTR_ERR(key)); | ||
92 | else | ||
93 | pr_notice("MODSIGN: Loaded cert '%s'\n", | ||
94 | key_ref_to_ptr(key)->description); | ||
95 | p += plen; | ||
96 | } | ||
97 | |||
98 | return 0; | ||
99 | |||
100 | dodgy_cert: | ||
101 | pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); | ||
102 | return 0; | ||
103 | } | ||
104 | late_initcall(load_module_signing_keys); | ||
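
Editorial note, not part of the patch: to make the length arithmetic in load_module_signing_keys() concrete, here is a hedged userspace-style sketch of the same walk over a concatenated DER blob. A certificate starting 0x30 0x82 0x04 0xa6 carries a body of 0x04a6 = 1190 bytes, so the cursor advances by 1190 + 4 header bytes to the next certificate. Both the SEQUENCE tag (0x30) and the two-byte long-form length marker (0x82) must be present, matching the intent of the check above.

    #include <stddef.h>
    #include <stdint.h>

    /* Returns the total size (header + body) of one DER certificate that
     * uses the long, two-byte length form the kernel loop expects, or 0
     * if the blob does not look like such a certificate. */
    static size_t der_cert_size(const uint8_t *p, size_t remaining)
    {
    	size_t body;

    	if (remaining < 4)
    		return 0;
    	if (p[0] != 0x30 || p[1] != 0x82)	/* SEQUENCE, 2-byte length */
    		return 0;
    	body = ((size_t)p[2] << 8) | p[3];
    	if (body + 4 > remaining)
    		return 0;
    	return body + 4;			/* e.g. 0x04a6 + 4 = 1194 */
    }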
diff --git a/kernel/module-internal.h b/kernel/module-internal.h new file mode 100644 index 000000000000..24f9247b7d02 --- /dev/null +++ b/kernel/module-internal.h | |||
@@ -0,0 +1,14 @@ | |||
1 | /* Module internals | ||
2 | * | ||
3 | * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | extern struct key *modsign_keyring; | ||
13 | |||
14 | extern int mod_verify_sig(const void *mod, unsigned long *_modlen); | ||
diff --git a/kernel/module.c b/kernel/module.c index 4edbd9c11aca..250092c1d57d 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/ftrace_event.h> | 21 | #include <linux/ftrace_event.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
23 | #include <linux/kallsyms.h> | 23 | #include <linux/kallsyms.h> |
24 | #include <linux/file.h> | ||
24 | #include <linux/fs.h> | 25 | #include <linux/fs.h> |
25 | #include <linux/sysfs.h> | 26 | #include <linux/sysfs.h> |
26 | #include <linux/kernel.h> | 27 | #include <linux/kernel.h> |
@@ -28,6 +29,7 @@ | |||
28 | #include <linux/vmalloc.h> | 29 | #include <linux/vmalloc.h> |
29 | #include <linux/elf.h> | 30 | #include <linux/elf.h> |
30 | #include <linux/proc_fs.h> | 31 | #include <linux/proc_fs.h> |
32 | #include <linux/security.h> | ||
31 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
32 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
33 | #include <linux/fcntl.h> | 35 | #include <linux/fcntl.h> |
@@ -58,6 +60,9 @@ | |||
58 | #include <linux/jump_label.h> | 60 | #include <linux/jump_label.h> |
59 | #include <linux/pfn.h> | 61 | #include <linux/pfn.h> |
60 | #include <linux/bsearch.h> | 62 | #include <linux/bsearch.h> |
63 | #include <linux/fips.h> | ||
64 | #include <uapi/linux/module.h> | ||
65 | #include "module-internal.h" | ||
61 | 66 | ||
62 | #define CREATE_TRACE_POINTS | 67 | #define CREATE_TRACE_POINTS |
63 | #include <trace/events/module.h> | 68 | #include <trace/events/module.h> |
@@ -102,6 +107,43 @@ static LIST_HEAD(modules); | |||
102 | struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ | 107 | struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ |
103 | #endif /* CONFIG_KGDB_KDB */ | 108 | #endif /* CONFIG_KGDB_KDB */ |
104 | 109 | ||
110 | #ifdef CONFIG_MODULE_SIG | ||
111 | #ifdef CONFIG_MODULE_SIG_FORCE | ||
112 | static bool sig_enforce = true; | ||
113 | #else | ||
114 | static bool sig_enforce = false; | ||
115 | |||
116 | static int param_set_bool_enable_only(const char *val, | ||
117 | const struct kernel_param *kp) | ||
118 | { | ||
119 | int err; | ||
120 | bool test; | ||
121 | struct kernel_param dummy_kp = *kp; | ||
122 | |||
123 | dummy_kp.arg = &test; | ||
124 | |||
125 | err = param_set_bool(val, &dummy_kp); | ||
126 | if (err) | ||
127 | return err; | ||
128 | |||
129 | /* Don't let them unset it once it's set! */ | ||
130 | if (!test && sig_enforce) | ||
131 | return -EROFS; | ||
132 | |||
133 | if (test) | ||
134 | sig_enforce = true; | ||
135 | return 0; | ||
136 | } | ||
137 | |||
138 | static const struct kernel_param_ops param_ops_bool_enable_only = { | ||
139 | .set = param_set_bool_enable_only, | ||
140 | .get = param_get_bool, | ||
141 | }; | ||
142 | #define param_check_bool_enable_only param_check_bool | ||
143 | |||
144 | module_param(sig_enforce, bool_enable_only, 0644); | ||
145 | #endif /* !CONFIG_MODULE_SIG_FORCE */ | ||
146 | #endif /* CONFIG_MODULE_SIG */ | ||
105 | 147 | ||
106 | /* Block module loading/unloading? */ | 148 | /* Block module loading/unloading? */ |
107 | int modules_disabled = 0; | 149 | int modules_disabled = 0; |
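
Editorial note, not part of the patch: the bool_enable_only parameter type added above is deliberately one-way — once signature enforcement is switched on it cannot be switched off again from userspace. A hedged sketch of what that looks like through sysfs follows; it assumes the parameter appears at the usual /sys/module/module/parameters/ path for built-in module.c parameters (which only exists when CONFIG_MODULE_SIG_FORCE is off), and it needs root.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
    	int fd = open("/sys/module/module/parameters/sig_enforce", O_WRONLY);

    	if (fd < 0)
    		return 1;
    	if (write(fd, "1", 1) < 0)	/* enabling succeeds */
    		perror("enable");
    	if (write(fd, "0", 1) < 0)	/* disabling fails with EROFS */
    		perror("disable");
    	close(fd);
    	return 0;
    }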
@@ -136,6 +178,7 @@ struct load_info { | |||
136 | unsigned long symoffs, stroffs; | 178 | unsigned long symoffs, stroffs; |
137 | struct _ddebug *debug; | 179 | struct _ddebug *debug; |
138 | unsigned int num_debug; | 180 | unsigned int num_debug; |
181 | bool sig_ok; | ||
139 | struct { | 182 | struct { |
140 | unsigned int sym, str, mod, vers, info, pcpu; | 183 | unsigned int sym, str, mod, vers, info, pcpu; |
141 | } index; | 184 | } index; |
@@ -332,9 +375,6 @@ static bool check_symbol(const struct symsearch *syms, | |||
332 | printk(KERN_WARNING "Symbol %s is being used " | 375 | printk(KERN_WARNING "Symbol %s is being used " |
333 | "by a non-GPL module, which will not " | 376 | "by a non-GPL module, which will not " |
334 | "be allowed in the future\n", fsa->name); | 377 | "be allowed in the future\n", fsa->name); |
335 | printk(KERN_WARNING "Please see the file " | ||
336 | "Documentation/feature-removal-schedule.txt " | ||
337 | "in the kernel source tree for more details.\n"); | ||
338 | } | 378 | } |
339 | } | 379 | } |
340 | 380 | ||
@@ -1949,26 +1989,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) | |||
1949 | return ret; | 1989 | return ret; |
1950 | } | 1990 | } |
1951 | 1991 | ||
1952 | int __weak apply_relocate(Elf_Shdr *sechdrs, | ||
1953 | const char *strtab, | ||
1954 | unsigned int symindex, | ||
1955 | unsigned int relsec, | ||
1956 | struct module *me) | ||
1957 | { | ||
1958 | pr_err("module %s: REL relocation unsupported\n", me->name); | ||
1959 | return -ENOEXEC; | ||
1960 | } | ||
1961 | |||
1962 | int __weak apply_relocate_add(Elf_Shdr *sechdrs, | ||
1963 | const char *strtab, | ||
1964 | unsigned int symindex, | ||
1965 | unsigned int relsec, | ||
1966 | struct module *me) | ||
1967 | { | ||
1968 | pr_err("module %s: RELA relocation unsupported\n", me->name); | ||
1969 | return -ENOEXEC; | ||
1970 | } | ||
1971 | |||
1972 | static int apply_relocations(struct module *mod, const struct load_info *info) | 1992 | static int apply_relocations(struct module *mod, const struct load_info *info) |
1973 | { | 1993 | { |
1974 | unsigned int i; | 1994 | unsigned int i; |
@@ -2262,7 +2282,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) | |||
2262 | Elf_Shdr *symsect = info->sechdrs + info->index.sym; | 2282 | Elf_Shdr *symsect = info->sechdrs + info->index.sym; |
2263 | Elf_Shdr *strsect = info->sechdrs + info->index.str; | 2283 | Elf_Shdr *strsect = info->sechdrs + info->index.str; |
2264 | const Elf_Sym *src; | 2284 | const Elf_Sym *src; |
2265 | unsigned int i, nsrc, ndst, strtab_size; | 2285 | unsigned int i, nsrc, ndst, strtab_size = 0; |
2266 | 2286 | ||
2267 | /* Put symbol section at end of init part of module. */ | 2287 | /* Put symbol section at end of init part of module. */ |
2268 | symsect->sh_flags |= SHF_ALLOC; | 2288 | symsect->sh_flags |= SHF_ALLOC; |
@@ -2274,11 +2294,13 @@ static void layout_symtab(struct module *mod, struct load_info *info) | |||
2274 | nsrc = symsect->sh_size / sizeof(*src); | 2294 | nsrc = symsect->sh_size / sizeof(*src); |
2275 | 2295 | ||
2276 | /* Compute total space required for the core symbols' strtab. */ | 2296 | /* Compute total space required for the core symbols' strtab. */ |
2277 | for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) | 2297 | for (ndst = i = 0; i < nsrc; i++) { |
2278 | if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { | 2298 | if (i == 0 || |
2279 | strtab_size += strlen(&info->strtab[src->st_name]) + 1; | 2299 | is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { |
2300 | strtab_size += strlen(&info->strtab[src[i].st_name])+1; | ||
2280 | ndst++; | 2301 | ndst++; |
2281 | } | 2302 | } |
2303 | } | ||
2282 | 2304 | ||
2283 | /* Append room for core symbols at end of core part. */ | 2305 | /* Append room for core symbols at end of core part. */ |
2284 | info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); | 2306 | info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); |
@@ -2312,15 +2334,14 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) | |||
2312 | mod->core_symtab = dst = mod->module_core + info->symoffs; | 2334 | mod->core_symtab = dst = mod->module_core + info->symoffs; |
2313 | mod->core_strtab = s = mod->module_core + info->stroffs; | 2335 | mod->core_strtab = s = mod->module_core + info->stroffs; |
2314 | src = mod->symtab; | 2336 | src = mod->symtab; |
2315 | *dst = *src; | 2337 | for (ndst = i = 0; i < mod->num_symtab; i++) { |
2316 | *s++ = 0; | 2338 | if (i == 0 || |
2317 | for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { | 2339 | is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { |
2318 | if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) | 2340 | dst[ndst] = src[i]; |
2319 | continue; | 2341 | dst[ndst++].st_name = s - mod->core_strtab; |
2320 | 2342 | s += strlcpy(s, &mod->strtab[src[i].st_name], | |
2321 | dst[ndst] = *src; | 2343 | KSYM_NAME_LEN) + 1; |
2322 | dst[ndst++].st_name = s - mod->core_strtab; | 2344 | } |
2323 | s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; | ||
2324 | } | 2345 | } |
2325 | mod->core_num_syms = ndst; | 2346 | mod->core_num_syms = ndst; |
2326 | } | 2347 | } |
@@ -2353,7 +2374,7 @@ static void dynamic_debug_remove(struct _ddebug *debug) | |||
2353 | 2374 | ||
2354 | void * __weak module_alloc(unsigned long size) | 2375 | void * __weak module_alloc(unsigned long size) |
2355 | { | 2376 | { |
2356 | return size == 0 ? NULL : vmalloc_exec(size); | 2377 | return vmalloc_exec(size); |
2357 | } | 2378 | } |
2358 | 2379 | ||
2359 | static void *module_alloc_update_bounds(unsigned long size) | 2380 | static void *module_alloc_update_bounds(unsigned long size) |
@@ -2399,48 +2420,136 @@ static inline void kmemleak_load_module(const struct module *mod, | |||
2399 | } | 2420 | } |
2400 | #endif | 2421 | #endif |
2401 | 2422 | ||
2423 | #ifdef CONFIG_MODULE_SIG | ||
2424 | static int module_sig_check(struct load_info *info) | ||
2425 | { | ||
2426 | int err = -ENOKEY; | ||
2427 | const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; | ||
2428 | const void *mod = info->hdr; | ||
2429 | |||
2430 | if (info->len > markerlen && | ||
2431 | memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { | ||
2432 | /* We truncate the module to discard the signature */ | ||
2433 | info->len -= markerlen; | ||
2434 | err = mod_verify_sig(mod, &info->len); | ||
2435 | } | ||
2436 | |||
2437 | if (!err) { | ||
2438 | info->sig_ok = true; | ||
2439 | return 0; | ||
2440 | } | ||
2441 | |||
2442 | /* Not having a signature is only an error if we're strict. */ | ||
2443 | if (err < 0 && fips_enabled) | ||
2444 | panic("Module verification failed with error %d in FIPS mode\n", | ||
2445 | err); | ||
2446 | if (err == -ENOKEY && !sig_enforce) | ||
2447 | err = 0; | ||
2448 | |||
2449 | return err; | ||
2450 | } | ||
2451 | #else /* !CONFIG_MODULE_SIG */ | ||
2452 | static int module_sig_check(struct load_info *info) | ||
2453 | { | ||
2454 | return 0; | ||
2455 | } | ||
2456 | #endif /* !CONFIG_MODULE_SIG */ | ||
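
Editorial note, not part of the patch: module_sig_check() relies on the signature data being appended after the ELF image, terminated by a fixed marker string. A hedged userspace-style mirror of the marker test is sketched below; it assumes the MODULE_SIG_STRING value "~Module signature appended~\n" from the kernel headers.

    #include <stdbool.h>
    #include <stddef.h>
    #include <string.h>

    #define MODULE_SIG_STRING "~Module signature appended~\n"

    /* Hedged sketch: on success *len is reduced so it covers only the
     * ELF image plus the signature trailer, which is the region that
     * mod_verify_sig() is then handed. */
    static bool module_has_sig_marker(const void *mod, size_t *len)
    {
    	const size_t markerlen = sizeof(MODULE_SIG_STRING) - 1;

    	if (*len <= markerlen)
    		return false;
    	if (memcmp((const char *)mod + *len - markerlen,
    		   MODULE_SIG_STRING, markerlen) != 0)
    		return false;
    	*len -= markerlen;
    	return true;
    }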
2457 | |||
2458 | /* Sanity checks against invalid binaries, wrong arch, weird elf version. */ | ||
2459 | static int elf_header_check(struct load_info *info) | ||
2460 | { | ||
2461 | if (info->len < sizeof(*(info->hdr))) | ||
2462 | return -ENOEXEC; | ||
2463 | |||
2464 | if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0 | ||
2465 | || info->hdr->e_type != ET_REL | ||
2466 | || !elf_check_arch(info->hdr) | ||
2467 | || info->hdr->e_shentsize != sizeof(Elf_Shdr)) | ||
2468 | return -ENOEXEC; | ||
2469 | |||
2470 | if (info->hdr->e_shoff >= info->len | ||
2471 | || (info->hdr->e_shnum * sizeof(Elf_Shdr) > | ||
2472 | info->len - info->hdr->e_shoff)) | ||
2473 | return -ENOEXEC; | ||
2474 | |||
2475 | return 0; | ||
2476 | } | ||
2477 | |||
2402 | /* Sets info->hdr and info->len. */ | 2478 | /* Sets info->hdr and info->len. */ |
2403 | static int copy_and_check(struct load_info *info, | 2479 | static int copy_module_from_user(const void __user *umod, unsigned long len, |
2404 | const void __user *umod, unsigned long len, | 2480 | struct load_info *info) |
2405 | const char __user *uargs) | ||
2406 | { | 2481 | { |
2407 | int err; | 2482 | int err; |
2408 | Elf_Ehdr *hdr; | ||
2409 | 2483 | ||
2410 | if (len < sizeof(*hdr)) | 2484 | info->len = len; |
2485 | if (info->len < sizeof(*(info->hdr))) | ||
2411 | return -ENOEXEC; | 2486 | return -ENOEXEC; |
2412 | 2487 | ||
2488 | err = security_kernel_module_from_file(NULL); | ||
2489 | if (err) | ||
2490 | return err; | ||
2491 | |||
2413 | /* Suck in entire file: we'll want most of it. */ | 2492 | /* Suck in entire file: we'll want most of it. */ |
2414 | if ((hdr = vmalloc(len)) == NULL) | 2493 | info->hdr = vmalloc(info->len); |
2494 | if (!info->hdr) | ||
2415 | return -ENOMEM; | 2495 | return -ENOMEM; |
2416 | 2496 | ||
2417 | if (copy_from_user(hdr, umod, len) != 0) { | 2497 | if (copy_from_user(info->hdr, umod, info->len) != 0) { |
2418 | err = -EFAULT; | 2498 | vfree(info->hdr); |
2419 | goto free_hdr; | 2499 | return -EFAULT; |
2420 | } | 2500 | } |
2421 | 2501 | ||
2422 | /* Sanity checks against insmoding binaries or wrong arch, | 2502 | return 0; |
2423 | weird elf version */ | 2503 | } |
2424 | if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 | 2504 | |
2425 | || hdr->e_type != ET_REL | 2505 | /* Sets info->hdr and info->len. */ |
2426 | || !elf_check_arch(hdr) | 2506 | static int copy_module_from_fd(int fd, struct load_info *info) |
2427 | || hdr->e_shentsize != sizeof(Elf_Shdr)) { | 2507 | { |
2428 | err = -ENOEXEC; | 2508 | struct file *file; |
2429 | goto free_hdr; | 2509 | int err; |
2430 | } | 2510 | struct kstat stat; |
2511 | loff_t pos; | ||
2512 | ssize_t bytes = 0; | ||
2513 | |||
2514 | file = fget(fd); | ||
2515 | if (!file) | ||
2516 | return -ENOEXEC; | ||
2431 | 2517 | ||
2432 | if (hdr->e_shoff >= len || | 2518 | err = security_kernel_module_from_file(file); |
2433 | hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { | 2519 | if (err) |
2434 | err = -ENOEXEC; | 2520 | goto out; |
2435 | goto free_hdr; | 2521 | |
2522 | err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); | ||
2523 | if (err) | ||
2524 | goto out; | ||
2525 | |||
2526 | if (stat.size > INT_MAX) { | ||
2527 | err = -EFBIG; | ||
2528 | goto out; | ||
2529 | } | ||
2530 | info->hdr = vmalloc(stat.size); | ||
2531 | if (!info->hdr) { | ||
2532 | err = -ENOMEM; | ||
2533 | goto out; | ||
2436 | } | 2534 | } |
2437 | 2535 | ||
2438 | info->hdr = hdr; | 2536 | pos = 0; |
2439 | info->len = len; | 2537 | while (pos < stat.size) { |
2440 | return 0; | 2538 | bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, |
2539 | stat.size - pos); | ||
2540 | if (bytes < 0) { | ||
2541 | vfree(info->hdr); | ||
2542 | err = bytes; | ||
2543 | goto out; | ||
2544 | } | ||
2545 | if (bytes == 0) | ||
2546 | break; | ||
2547 | pos += bytes; | ||
2548 | } | ||
2549 | info->len = pos; | ||
2441 | 2550 | ||
2442 | free_hdr: | 2551 | out: |
2443 | vfree(hdr); | 2552 | fput(file); |
2444 | return err; | 2553 | return err; |
2445 | } | 2554 | } |
2446 | 2555 | ||
@@ -2449,7 +2558,7 @@ static void free_copy(struct load_info *info) | |||
2449 | vfree(info->hdr); | 2558 | vfree(info->hdr); |
2450 | } | 2559 | } |
2451 | 2560 | ||
2452 | static int rewrite_section_headers(struct load_info *info) | 2561 | static int rewrite_section_headers(struct load_info *info, int flags) |
2453 | { | 2562 | { |
2454 | unsigned int i; | 2563 | unsigned int i; |
2455 | 2564 | ||
@@ -2477,7 +2586,10 @@ static int rewrite_section_headers(struct load_info *info) | |||
2477 | } | 2586 | } |
2478 | 2587 | ||
2479 | /* Track but don't keep modinfo and version sections. */ | 2588 | /* Track but don't keep modinfo and version sections. */ |
2480 | info->index.vers = find_sec(info, "__versions"); | 2589 | if (flags & MODULE_INIT_IGNORE_MODVERSIONS) |
2590 | info->index.vers = 0; /* Pretend no __versions section! */ | ||
2591 | else | ||
2592 | info->index.vers = find_sec(info, "__versions"); | ||
2481 | info->index.info = find_sec(info, ".modinfo"); | 2593 | info->index.info = find_sec(info, ".modinfo"); |
2482 | info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2594 | info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; |
2483 | info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2595 | info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; |
@@ -2492,7 +2604,7 @@ static int rewrite_section_headers(struct load_info *info) | |||
2492 | * Return the temporary module pointer (we'll replace it with the final | 2604 | * Return the temporary module pointer (we'll replace it with the final |
2493 | * one when we move the module sections around). | 2605 | * one when we move the module sections around). |
2494 | */ | 2606 | */ |
2495 | static struct module *setup_load_info(struct load_info *info) | 2607 | static struct module *setup_load_info(struct load_info *info, int flags) |
2496 | { | 2608 | { |
2497 | unsigned int i; | 2609 | unsigned int i; |
2498 | int err; | 2610 | int err; |
@@ -2503,7 +2615,7 @@ static struct module *setup_load_info(struct load_info *info) | |||
2503 | info->secstrings = (void *)info->hdr | 2615 | info->secstrings = (void *)info->hdr |
2504 | + info->sechdrs[info->hdr->e_shstrndx].sh_offset; | 2616 | + info->sechdrs[info->hdr->e_shstrndx].sh_offset; |
2505 | 2617 | ||
2506 | err = rewrite_section_headers(info); | 2618 | err = rewrite_section_headers(info, flags); |
2507 | if (err) | 2619 | if (err) |
2508 | return ERR_PTR(err); | 2620 | return ERR_PTR(err); |
2509 | 2621 | ||
@@ -2541,11 +2653,14 @@ static struct module *setup_load_info(struct load_info *info) | |||
2541 | return mod; | 2653 | return mod; |
2542 | } | 2654 | } |
2543 | 2655 | ||
2544 | static int check_modinfo(struct module *mod, struct load_info *info) | 2656 | static int check_modinfo(struct module *mod, struct load_info *info, int flags) |
2545 | { | 2657 | { |
2546 | const char *modmagic = get_modinfo(info, "vermagic"); | 2658 | const char *modmagic = get_modinfo(info, "vermagic"); |
2547 | int err; | 2659 | int err; |
2548 | 2660 | ||
2661 | if (flags & MODULE_INIT_IGNORE_VERMAGIC) | ||
2662 | modmagic = NULL; | ||
2663 | |||
2549 | /* This is allowed: modprobe --force will invalidate it. */ | 2664 | /* This is allowed: modprobe --force will invalidate it. */ |
2550 | if (!modmagic) { | 2665 | if (!modmagic) { |
2551 | err = try_to_force_load(mod, "bad vermagic"); | 2666 | err = try_to_force_load(mod, "bad vermagic"); |
@@ -2675,20 +2790,23 @@ static int move_module(struct module *mod, struct load_info *info) | |||
2675 | memset(ptr, 0, mod->core_size); | 2790 | memset(ptr, 0, mod->core_size); |
2676 | mod->module_core = ptr; | 2791 | mod->module_core = ptr; |
2677 | 2792 | ||
2678 | ptr = module_alloc_update_bounds(mod->init_size); | 2793 | if (mod->init_size) { |
2679 | /* | 2794 | ptr = module_alloc_update_bounds(mod->init_size); |
2680 | * The pointer to this block is stored in the module structure | 2795 | /* |
2681 | * which is inside the block. This block doesn't need to be | 2796 | * The pointer to this block is stored in the module structure |
2682 | * scanned as it contains data and code that will be freed | 2797 | * which is inside the block. This block doesn't need to be |
2683 | * after the module is initialized. | 2798 | * scanned as it contains data and code that will be freed |
2684 | */ | 2799 | * after the module is initialized. |
2685 | kmemleak_ignore(ptr); | 2800 | */ |
2686 | if (!ptr && mod->init_size) { | 2801 | kmemleak_ignore(ptr); |
2687 | module_free(mod, mod->module_core); | 2802 | if (!ptr) { |
2688 | return -ENOMEM; | 2803 | module_free(mod, mod->module_core); |
2689 | } | 2804 | return -ENOMEM; |
2690 | memset(ptr, 0, mod->init_size); | 2805 | } |
2691 | mod->module_init = ptr; | 2806 | memset(ptr, 0, mod->init_size); |
2807 | mod->module_init = ptr; | ||
2808 | } else | ||
2809 | mod->module_init = NULL; | ||
2692 | 2810 | ||
2693 | /* Transfer each section which specifies SHF_ALLOC */ | 2811 | /* Transfer each section which specifies SHF_ALLOC */ |
2694 | pr_debug("final section addresses:\n"); | 2812 | pr_debug("final section addresses:\n"); |
@@ -2730,6 +2848,10 @@ static int check_module_license_and_versions(struct module *mod) | |||
2730 | if (strcmp(mod->name, "driverloader") == 0) | 2848 | if (strcmp(mod->name, "driverloader") == 0) |
2731 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 2849 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
2732 | 2850 | ||
2851 | /* lve claims to be GPL but upstream won't provide source */ | ||
2852 | if (strcmp(mod->name, "lve") == 0) | ||
2853 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | ||
2854 | |||
2733 | #ifdef CONFIG_MODVERSIONS | 2855 | #ifdef CONFIG_MODVERSIONS |
2734 | if ((mod->num_syms && !mod->crcs) | 2856 | if ((mod->num_syms && !mod->crcs) |
2735 | || (mod->num_gpl_syms && !mod->gpl_crcs) | 2857 | || (mod->num_gpl_syms && !mod->gpl_crcs) |
@@ -2777,18 +2899,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, | |||
2777 | return 0; | 2899 | return 0; |
2778 | } | 2900 | } |
2779 | 2901 | ||
2780 | static struct module *layout_and_allocate(struct load_info *info) | 2902 | static struct module *layout_and_allocate(struct load_info *info, int flags) |
2781 | { | 2903 | { |
2782 | /* Module within temporary copy. */ | 2904 | /* Module within temporary copy. */ |
2783 | struct module *mod; | 2905 | struct module *mod; |
2784 | Elf_Shdr *pcpusec; | 2906 | Elf_Shdr *pcpusec; |
2785 | int err; | 2907 | int err; |
2786 | 2908 | ||
2787 | mod = setup_load_info(info); | 2909 | mod = setup_load_info(info, flags); |
2788 | if (IS_ERR(mod)) | 2910 | if (IS_ERR(mod)) |
2789 | return mod; | 2911 | return mod; |
2790 | 2912 | ||
2791 | err = check_modinfo(mod, info); | 2913 | err = check_modinfo(mod, info, flags); |
2792 | if (err) | 2914 | if (err) |
2793 | return ERR_PTR(err); | 2915 | return ERR_PTR(err); |
2794 | 2916 | ||
@@ -2861,31 +2983,142 @@ static int post_relocation(struct module *mod, const struct load_info *info) | |||
2861 | return module_finalize(info->hdr, info->sechdrs, mod); | 2983 | return module_finalize(info->hdr, info->sechdrs, mod); |
2862 | } | 2984 | } |
2863 | 2985 | ||
2986 | /* Is this module of this name done loading? No locks held. */ | ||
2987 | static bool finished_loading(const char *name) | ||
2988 | { | ||
2989 | struct module *mod; | ||
2990 | bool ret; | ||
2991 | |||
2992 | mutex_lock(&module_mutex); | ||
2993 | mod = find_module(name); | ||
2994 | ret = !mod || mod->state != MODULE_STATE_COMING; | ||
2995 | mutex_unlock(&module_mutex); | ||
2996 | |||
2997 | return ret; | ||
2998 | } | ||
2999 | |||
3000 | /* Call module constructors. */ | ||
3001 | static void do_mod_ctors(struct module *mod) | ||
3002 | { | ||
3003 | #ifdef CONFIG_CONSTRUCTORS | ||
3004 | unsigned long i; | ||
3005 | |||
3006 | for (i = 0; i < mod->num_ctors; i++) | ||
3007 | mod->ctors[i](); | ||
3008 | #endif | ||
3009 | } | ||
3010 | |||
3011 | /* This is where the real work happens */ | ||
3012 | static int do_init_module(struct module *mod) | ||
3013 | { | ||
3014 | int ret = 0; | ||
3015 | |||
3016 | blocking_notifier_call_chain(&module_notify_list, | ||
3017 | MODULE_STATE_COMING, mod); | ||
3018 | |||
3019 | /* Set RO and NX regions for core */ | ||
3020 | set_section_ro_nx(mod->module_core, | ||
3021 | mod->core_text_size, | ||
3022 | mod->core_ro_size, | ||
3023 | mod->core_size); | ||
3024 | |||
3025 | /* Set RO and NX regions for init */ | ||
3026 | set_section_ro_nx(mod->module_init, | ||
3027 | mod->init_text_size, | ||
3028 | mod->init_ro_size, | ||
3029 | mod->init_size); | ||
3030 | |||
3031 | do_mod_ctors(mod); | ||
3032 | /* Start the module */ | ||
3033 | if (mod->init != NULL) | ||
3034 | ret = do_one_initcall(mod->init); | ||
3035 | if (ret < 0) { | ||
3036 | /* Init routine failed: abort. Try to protect us from | ||
3037 | buggy refcounters. */ | ||
3038 | mod->state = MODULE_STATE_GOING; | ||
3039 | synchronize_sched(); | ||
3040 | module_put(mod); | ||
3041 | blocking_notifier_call_chain(&module_notify_list, | ||
3042 | MODULE_STATE_GOING, mod); | ||
3043 | free_module(mod); | ||
3044 | wake_up_all(&module_wq); | ||
3045 | return ret; | ||
3046 | } | ||
3047 | if (ret > 0) { | ||
3048 | printk(KERN_WARNING | ||
3049 | "%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" | ||
3050 | "%s: loading module anyway...\n", | ||
3051 | __func__, mod->name, ret, | ||
3052 | __func__); | ||
3053 | dump_stack(); | ||
3054 | } | ||
3055 | |||
3056 | /* Now it's a first class citizen! */ | ||
3057 | mod->state = MODULE_STATE_LIVE; | ||
3058 | blocking_notifier_call_chain(&module_notify_list, | ||
3059 | MODULE_STATE_LIVE, mod); | ||
3060 | |||
3061 | /* We need to finish all async code before the module init sequence is done */ | ||
3062 | async_synchronize_full(); | ||
3063 | |||
3064 | mutex_lock(&module_mutex); | ||
3065 | /* Drop initial reference. */ | ||
3066 | module_put(mod); | ||
3067 | trim_init_extable(mod); | ||
3068 | #ifdef CONFIG_KALLSYMS | ||
3069 | mod->num_symtab = mod->core_num_syms; | ||
3070 | mod->symtab = mod->core_symtab; | ||
3071 | mod->strtab = mod->core_strtab; | ||
3072 | #endif | ||
3073 | unset_module_init_ro_nx(mod); | ||
3074 | module_free(mod, mod->module_init); | ||
3075 | mod->module_init = NULL; | ||
3076 | mod->init_size = 0; | ||
3077 | mod->init_ro_size = 0; | ||
3078 | mod->init_text_size = 0; | ||
3079 | mutex_unlock(&module_mutex); | ||
3080 | wake_up_all(&module_wq); | ||
3081 | |||
3082 | return 0; | ||
3083 | } | ||
3084 | |||
3085 | static int may_init_module(void) | ||
3086 | { | ||
3087 | if (!capable(CAP_SYS_MODULE) || modules_disabled) | ||
3088 | return -EPERM; | ||
3089 | |||
3090 | return 0; | ||
3091 | } | ||
3092 | |||
2864 | /* Allocate and load the module: note that size of section 0 is always | 3093 | /* Allocate and load the module: note that size of section 0 is always |
2865 | zero, and we rely on this for optional sections. */ | 3094 | zero, and we rely on this for optional sections. */ |
2866 | static struct module *load_module(void __user *umod, | 3095 | static int load_module(struct load_info *info, const char __user *uargs, |
2867 | unsigned long len, | 3096 | int flags) |
2868 | const char __user *uargs) | ||
2869 | { | 3097 | { |
2870 | struct load_info info = { NULL, }; | 3098 | struct module *mod, *old; |
2871 | struct module *mod; | ||
2872 | long err; | 3099 | long err; |
2873 | 3100 | ||
2874 | pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", | 3101 | err = module_sig_check(info); |
2875 | umod, len, uargs); | 3102 | if (err) |
3103 | goto free_copy; | ||
2876 | 3104 | ||
2877 | /* Copy in the blobs from userspace, check they are vaguely sane. */ | 3105 | err = elf_header_check(info); |
2878 | err = copy_and_check(&info, umod, len, uargs); | ||
2879 | if (err) | 3106 | if (err) |
2880 | return ERR_PTR(err); | 3107 | goto free_copy; |
2881 | 3108 | ||
2882 | /* Figure out module layout, and allocate all the memory. */ | 3109 | /* Figure out module layout, and allocate all the memory. */ |
2883 | mod = layout_and_allocate(&info); | 3110 | mod = layout_and_allocate(info, flags); |
2884 | if (IS_ERR(mod)) { | 3111 | if (IS_ERR(mod)) { |
2885 | err = PTR_ERR(mod); | 3112 | err = PTR_ERR(mod); |
2886 | goto free_copy; | 3113 | goto free_copy; |
2887 | } | 3114 | } |
2888 | 3115 | ||
3116 | #ifdef CONFIG_MODULE_SIG | ||
3117 | mod->sig_ok = info->sig_ok; | ||
3118 | if (!mod->sig_ok) | ||
3119 | add_taint_module(mod, TAINT_FORCED_MODULE); | ||
3120 | #endif | ||
3121 | |||
2889 | /* Now module is in final location, initialize linked lists, etc. */ | 3122 | /* Now module is in final location, initialize linked lists, etc. */ |
2890 | err = module_unload_init(mod); | 3123 | err = module_unload_init(mod); |
2891 | if (err) | 3124 | if (err) |
@@ -2893,25 +3126,25 @@ static struct module *load_module(void __user *umod, | |||
2893 | 3126 | ||
2894 | /* Now we've got everything in the final locations, we can | 3127 | /* Now we've got everything in the final locations, we can |
2895 | * find optional sections. */ | 3128 | * find optional sections. */ |
2896 | find_module_sections(mod, &info); | 3129 | find_module_sections(mod, info); |
2897 | 3130 | ||
2898 | err = check_module_license_and_versions(mod); | 3131 | err = check_module_license_and_versions(mod); |
2899 | if (err) | 3132 | if (err) |
2900 | goto free_unload; | 3133 | goto free_unload; |
2901 | 3134 | ||
2902 | /* Set up MODINFO_ATTR fields */ | 3135 | /* Set up MODINFO_ATTR fields */ |
2903 | setup_modinfo(mod, &info); | 3136 | setup_modinfo(mod, info); |
2904 | 3137 | ||
2905 | /* Fix up syms, so that st_value is a pointer to location. */ | 3138 | /* Fix up syms, so that st_value is a pointer to location. */ |
2906 | err = simplify_symbols(mod, &info); | 3139 | err = simplify_symbols(mod, info); |
2907 | if (err < 0) | 3140 | if (err < 0) |
2908 | goto free_modinfo; | 3141 | goto free_modinfo; |
2909 | 3142 | ||
2910 | err = apply_relocations(mod, &info); | 3143 | err = apply_relocations(mod, info); |
2911 | if (err < 0) | 3144 | if (err < 0) |
2912 | goto free_modinfo; | 3145 | goto free_modinfo; |
2913 | 3146 | ||
2914 | err = post_relocation(mod, &info); | 3147 | err = post_relocation(mod, info); |
2915 | if (err < 0) | 3148 | if (err < 0) |
2916 | goto free_modinfo; | 3149 | goto free_modinfo; |
2917 | 3150 | ||
@@ -2934,21 +3167,31 @@ static struct module *load_module(void __user *umod, | |||
2934 | * function to insert in a way safe to concurrent readers. | 3167 | * function to insert in a way safe to concurrent readers. |
2935 | * The mutex protects against concurrent writers. | 3168 | * The mutex protects against concurrent writers. |
2936 | */ | 3169 | */ |
3170 | again: | ||
2937 | mutex_lock(&module_mutex); | 3171 | mutex_lock(&module_mutex); |
2938 | if (find_module(mod->name)) { | 3172 | if ((old = find_module(mod->name)) != NULL) { |
3173 | if (old->state == MODULE_STATE_COMING) { | ||
3174 | /* Wait in case it fails to load. */ | ||
3175 | mutex_unlock(&module_mutex); | ||
3176 | err = wait_event_interruptible(module_wq, | ||
3177 | finished_loading(mod->name)); | ||
3178 | if (err) | ||
3179 | goto free_arch_cleanup; | ||
3180 | goto again; | ||
3181 | } | ||
2939 | err = -EEXIST; | 3182 | err = -EEXIST; |
2940 | goto unlock; | 3183 | goto unlock; |
2941 | } | 3184 | } |
2942 | 3185 | ||
2943 | /* This has to be done once we're sure module name is unique. */ | 3186 | /* This has to be done once we're sure module name is unique. */ |
2944 | dynamic_debug_setup(info.debug, info.num_debug); | 3187 | dynamic_debug_setup(info->debug, info->num_debug); |
2945 | 3188 | ||
2946 | /* Find duplicate symbols */ | 3189 | /* Find duplicate symbols */ |
2947 | err = verify_export_symbols(mod); | 3190 | err = verify_export_symbols(mod); |
2948 | if (err < 0) | 3191 | if (err < 0) |
2949 | goto ddebug; | 3192 | goto ddebug; |
2950 | 3193 | ||
2951 | module_bug_finalize(info.hdr, info.sechdrs, mod); | 3194 | module_bug_finalize(info->hdr, info->sechdrs, mod); |
2952 | list_add_rcu(&mod->list, &modules); | 3195 | list_add_rcu(&mod->list, &modules); |
2953 | mutex_unlock(&module_mutex); | 3196 | mutex_unlock(&module_mutex); |
2954 | 3197 | ||
@@ -2959,25 +3202,26 @@ static struct module *load_module(void __user *umod, | |||
2959 | goto unlink; | 3202 | goto unlink; |
2960 | 3203 | ||
2961 | /* Link in to sysfs. */ | 3204 | /* Link in to sysfs. */ |
2962 | err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); | 3205 | err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); |
2963 | if (err < 0) | 3206 | if (err < 0) |
2964 | goto unlink; | 3207 | goto unlink; |
2965 | 3208 | ||
2966 | /* Get rid of temporary copy. */ | 3209 | /* Get rid of temporary copy. */ |
2967 | free_copy(&info); | 3210 | free_copy(info); |
2968 | 3211 | ||
2969 | /* Done! */ | 3212 | /* Done! */ |
2970 | trace_module_load(mod); | 3213 | trace_module_load(mod); |
2971 | return mod; | 3214 | |
3215 | return do_init_module(mod); | ||
2972 | 3216 | ||
2973 | unlink: | 3217 | unlink: |
2974 | mutex_lock(&module_mutex); | 3218 | mutex_lock(&module_mutex); |
2975 | /* Unlink carefully: kallsyms could be walking list. */ | 3219 | /* Unlink carefully: kallsyms could be walking list. */ |
2976 | list_del_rcu(&mod->list); | 3220 | list_del_rcu(&mod->list); |
2977 | module_bug_cleanup(mod); | 3221 | module_bug_cleanup(mod); |
2978 | 3222 | wake_up_all(&module_wq); | |
2979 | ddebug: | 3223 | ddebug: |
2980 | dynamic_debug_remove(info.debug); | 3224 | dynamic_debug_remove(info->debug); |
2981 | unlock: | 3225 | unlock: |
2982 | mutex_unlock(&module_mutex); | 3226 | mutex_unlock(&module_mutex); |
2983 | synchronize_sched(); | 3227 | synchronize_sched(); |
@@ -2989,106 +3233,52 @@ static struct module *load_module(void __user *umod, | |||
2989 | free_unload: | 3233 | free_unload: |
2990 | module_unload_free(mod); | 3234 | module_unload_free(mod); |
2991 | free_module: | 3235 | free_module: |
2992 | module_deallocate(mod, &info); | 3236 | module_deallocate(mod, info); |
2993 | free_copy: | 3237 | free_copy: |
2994 | free_copy(&info); | 3238 | free_copy(info); |
2995 | return ERR_PTR(err); | 3239 | return err; |
2996 | } | ||
2997 | |||
2998 | /* Call module constructors. */ | ||
2999 | static void do_mod_ctors(struct module *mod) | ||
3000 | { | ||
3001 | #ifdef CONFIG_CONSTRUCTORS | ||
3002 | unsigned long i; | ||
3003 | |||
3004 | for (i = 0; i < mod->num_ctors; i++) | ||
3005 | mod->ctors[i](); | ||
3006 | #endif | ||
3007 | } | 3240 | } |
3008 | 3241 | ||
3009 | /* This is where the real work happens */ | ||
3010 | SYSCALL_DEFINE3(init_module, void __user *, umod, | 3242 | SYSCALL_DEFINE3(init_module, void __user *, umod, |
3011 | unsigned long, len, const char __user *, uargs) | 3243 | unsigned long, len, const char __user *, uargs) |
3012 | { | 3244 | { |
3013 | struct module *mod; | 3245 | int err; |
3014 | int ret = 0; | 3246 | struct load_info info = { }; |
3015 | 3247 | ||
3016 | /* Must have permission */ | 3248 | err = may_init_module(); |
3017 | if (!capable(CAP_SYS_MODULE) || modules_disabled) | 3249 | if (err) |
3018 | return -EPERM; | 3250 | return err; |
3019 | 3251 | ||
3020 | /* Do all the hard work */ | 3252 | pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n", |
3021 | mod = load_module(umod, len, uargs); | 3253 | umod, len, uargs); |
3022 | if (IS_ERR(mod)) | ||
3023 | return PTR_ERR(mod); | ||
3024 | 3254 | ||
3025 | blocking_notifier_call_chain(&module_notify_list, | 3255 | err = copy_module_from_user(umod, len, &info); |
3026 | MODULE_STATE_COMING, mod); | 3256 | if (err) |
3257 | return err; | ||
3027 | 3258 | ||
3028 | /* Set RO and NX regions for core */ | 3259 | return load_module(&info, uargs, 0); |
3029 | set_section_ro_nx(mod->module_core, | 3260 | } |
3030 | mod->core_text_size, | ||
3031 | mod->core_ro_size, | ||
3032 | mod->core_size); | ||
3033 | 3261 | ||
3034 | /* Set RO and NX regions for init */ | 3262 | SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |
3035 | set_section_ro_nx(mod->module_init, | 3263 | { |
3036 | mod->init_text_size, | 3264 | int err; |
3037 | mod->init_ro_size, | 3265 | struct load_info info = { }; |
3038 | mod->init_size); | ||
3039 | 3266 | ||
3040 | do_mod_ctors(mod); | 3267 | err = may_init_module(); |
3041 | /* Start the module */ | 3268 | if (err) |
3042 | if (mod->init != NULL) | 3269 | return err; |
3043 | ret = do_one_initcall(mod->init); | ||
3044 | if (ret < 0) { | ||
3045 | /* Init routine failed: abort. Try to protect us from | ||
3046 | buggy refcounters. */ | ||
3047 | mod->state = MODULE_STATE_GOING; | ||
3048 | synchronize_sched(); | ||
3049 | module_put(mod); | ||
3050 | blocking_notifier_call_chain(&module_notify_list, | ||
3051 | MODULE_STATE_GOING, mod); | ||
3052 | free_module(mod); | ||
3053 | wake_up(&module_wq); | ||
3054 | return ret; | ||
3055 | } | ||
3056 | if (ret > 0) { | ||
3057 | printk(KERN_WARNING | ||
3058 | "%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" | ||
3059 | "%s: loading module anyway...\n", | ||
3060 | __func__, mod->name, ret, | ||
3061 | __func__); | ||
3062 | dump_stack(); | ||
3063 | } | ||
3064 | 3270 | ||
3065 | /* Now it's a first class citizen! Wake up anyone waiting for it. */ | 3271 | pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); |
3066 | mod->state = MODULE_STATE_LIVE; | ||
3067 | wake_up(&module_wq); | ||
3068 | blocking_notifier_call_chain(&module_notify_list, | ||
3069 | MODULE_STATE_LIVE, mod); | ||
3070 | 3272 | ||
3071 | /* We need to finish all async code before the module init sequence is done */ | 3273 | if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS |
3072 | async_synchronize_full(); | 3274 | |MODULE_INIT_IGNORE_VERMAGIC)) |
3275 | return -EINVAL; | ||
3073 | 3276 | ||
3074 | mutex_lock(&module_mutex); | 3277 | err = copy_module_from_fd(fd, &info); |
3075 | /* Drop initial reference. */ | 3278 | if (err) |
3076 | module_put(mod); | 3279 | return err; |
3077 | trim_init_extable(mod); | ||
3078 | #ifdef CONFIG_KALLSYMS | ||
3079 | mod->num_symtab = mod->core_num_syms; | ||
3080 | mod->symtab = mod->core_symtab; | ||
3081 | mod->strtab = mod->core_strtab; | ||
3082 | #endif | ||
3083 | unset_module_init_ro_nx(mod); | ||
3084 | module_free(mod, mod->module_init); | ||
3085 | mod->module_init = NULL; | ||
3086 | mod->init_size = 0; | ||
3087 | mod->init_ro_size = 0; | ||
3088 | mod->init_text_size = 0; | ||
3089 | mutex_unlock(&module_mutex); | ||
3090 | 3280 | ||
3091 | return 0; | 3281 | return load_module(&info, uargs, flags); |
3092 | } | 3282 | } |
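
Editorial note, not part of the patch: a hedged sketch of how userspace is expected to drive the new syscall — open the .ko, hand the descriptor to finit_module(), and optionally pass the MODULE_INIT_IGNORE_VERMAGIC / MODULE_INIT_IGNORE_MODVERSIONS flags from <linux/module.h>. It assumes a libc that exposes SYS_finit_module; older toolchains have to supply the syscall number and flag values themselves.

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
    	int fd;
    	long ret;

    	if (argc < 2) {
    		fprintf(stderr, "usage: %s module.ko\n", argv[0]);
    		return 1;
    	}
    	fd = open(argv[1], O_RDONLY);
    	if (fd < 0) {
    		perror("open");
    		return 1;
    	}
    	/* Flags of 0 keeps the normal vermagic/modversion checks. */
    	ret = syscall(SYS_finit_module, fd, "", 0);
    	if (ret != 0)
    		perror("finit_module");
    	close(fd);
    	return ret ? 1 : 0;
    }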
3093 | 3283 | ||
3094 | static inline int within(unsigned long addr, void *start, unsigned long size) | 3284 | static inline int within(unsigned long addr, void *start, unsigned long size) |
diff --git a/kernel/module_signing.c b/kernel/module_signing.c new file mode 100644 index 000000000000..f2970bddc5ea --- /dev/null +++ b/kernel/module_signing.c | |||
@@ -0,0 +1,249 @@ | |||
1 | /* Module signature checker | ||
2 | * | ||
3 | * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/err.h> | ||
14 | #include <crypto/public_key.h> | ||
15 | #include <crypto/hash.h> | ||
16 | #include <keys/asymmetric-type.h> | ||
17 | #include "module-internal.h" | ||
18 | |||
19 | /* | ||
20 | * Module signature information block. | ||
21 | * | ||
22 | * The constituents of the signature section are, in order: | ||
23 | * | ||
24 | * - Signer's name | ||
25 | * - Key identifier | ||
26 | * - Signature data | ||
27 | * - Information block | ||
28 | */ | ||
29 | struct module_signature { | ||
30 | u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ | ||
31 | u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ | ||
32 | u8 id_type; /* Key identifier type [enum pkey_id_type] */ | ||
33 | u8 signer_len; /* Length of signer's name */ | ||
34 | u8 key_id_len; /* Length of key identifier */ | ||
35 | u8 __pad[3]; | ||
36 | __be32 sig_len; /* Length of signature data */ | ||
37 | }; | ||
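
Editorial note, not part of the patch: this 12-byte descriptor sits at the very end of the (already marker-stripped) module, and mod_verify_sig() peels the trailer off back to front. As a worked example with hypothetical sizes, signer_len = 10, key_id_len = 20 and sig_len = 256 on a 2000-byte buffer leave 2000 - 12 - 256 - 30 = 1702 bytes of actual module to digest.

    /* Hedged layout sketch using the hypothetical numbers above:
     *
     *   [    0 .. 1701 ]  ELF module image         (modlen = 1702)
     *   [ 1702 .. 1711 ]  signer's name            (signer_len = 10)
     *   [ 1712 .. 1731 ]  key identifier           (key_id_len = 20)
     *   [ 1732 .. 1987 ]  signature data           (sig_len = 256)
     *   [ 1988 .. 1999 ]  struct module_signature  (12 bytes)
     *
     * followed, in the file on disk, by the MODULE_SIG_STRING marker.
     */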
38 | |||
39 | /* | ||
40 | * Digest the module contents. | ||
41 | */ | ||
42 | static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, | ||
43 | const void *mod, | ||
44 | unsigned long modlen) | ||
45 | { | ||
46 | struct public_key_signature *pks; | ||
47 | struct crypto_shash *tfm; | ||
48 | struct shash_desc *desc; | ||
49 | size_t digest_size, desc_size; | ||
50 | int ret; | ||
51 | |||
52 | pr_devel("==>%s()\n", __func__); | ||
53 | |||
54 | /* Allocate the hashing algorithm we're going to need and find out how | ||
55 | * big the hash operational data will be. | ||
56 | */ | ||
57 | tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); | ||
58 | if (IS_ERR(tfm)) | ||
59 | return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); | ||
60 | |||
61 | desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); | ||
62 | digest_size = crypto_shash_digestsize(tfm); | ||
63 | |||
64 | /* We allocate the hash operational data storage on the end of our | ||
65 | * context data and the digest output buffer on the end of that. | ||
66 | */ | ||
67 | ret = -ENOMEM; | ||
68 | pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); | ||
69 | if (!pks) | ||
70 | goto error_no_pks; | ||
71 | |||
72 | pks->pkey_hash_algo = hash; | ||
73 | pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; | ||
74 | pks->digest_size = digest_size; | ||
75 | |||
76 | desc = (void *)pks + sizeof(*pks); | ||
77 | desc->tfm = tfm; | ||
78 | desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
79 | |||
80 | ret = crypto_shash_init(desc); | ||
81 | if (ret < 0) | ||
82 | goto error; | ||
83 | |||
84 | ret = crypto_shash_finup(desc, mod, modlen, pks->digest); | ||
85 | if (ret < 0) | ||
86 | goto error; | ||
87 | |||
88 | crypto_free_shash(tfm); | ||
89 | pr_devel("<==%s() = ok\n", __func__); | ||
90 | return pks; | ||
91 | |||
92 | error: | ||
93 | kfree(pks); | ||
94 | error_no_pks: | ||
95 | crypto_free_shash(tfm); | ||
96 | pr_devel("<==%s() = %d\n", __func__, ret); | ||
97 | return ERR_PTR(ret); | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * Extract an MPI array from the signature data. This represents the actual | ||
102 | * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the | ||
103 | * size of the MPI in bytes. | ||
104 | * | ||
105 | * RSA signatures only have one MPI, so currently we only read one. | ||
106 | */ | ||
107 | static int mod_extract_mpi_array(struct public_key_signature *pks, | ||
108 | const void *data, size_t len) | ||
109 | { | ||
110 | size_t nbytes; | ||
111 | MPI mpi; | ||
112 | |||
113 | if (len < 3) | ||
114 | return -EBADMSG; | ||
115 | nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; | ||
116 | data += 2; | ||
117 | len -= 2; | ||
118 | if (len != nbytes) | ||
119 | return -EBADMSG; | ||
120 | |||
121 | mpi = mpi_read_raw_data(data, nbytes); | ||
122 | if (!mpi) | ||
123 | return -ENOMEM; | ||
124 | pks->mpi[0] = mpi; | ||
125 | pks->nr_mpi = 1; | ||
126 | return 0; | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * Request an asymmetric key. | ||
131 | */ | ||
132 | static struct key *request_asymmetric_key(const char *signer, size_t signer_len, | ||
133 | const u8 *key_id, size_t key_id_len) | ||
134 | { | ||
135 | key_ref_t key; | ||
136 | size_t i; | ||
137 | char *id, *q; | ||
138 | |||
139 | pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); | ||
140 | |||
141 | /* Construct an identifier. */ | ||
142 | id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); | ||
143 | if (!id) | ||
144 | return ERR_PTR(-ENOKEY); | ||
145 | |||
146 | memcpy(id, signer, signer_len); | ||
147 | |||
148 | q = id + signer_len; | ||
149 | *q++ = ':'; | ||
150 | *q++ = ' '; | ||
151 | for (i = 0; i < key_id_len; i++) { | ||
152 | *q++ = hex_asc[*key_id >> 4]; | ||
153 | *q++ = hex_asc[*key_id++ & 0x0f]; | ||
154 | } | ||
155 | |||
156 | *q = 0; | ||
157 | |||
158 | pr_debug("Look up: \"%s\"\n", id); | ||
159 | |||
160 | key = keyring_search(make_key_ref(modsign_keyring, 1), | ||
161 | &key_type_asymmetric, id); | ||
162 | if (IS_ERR(key)) | ||
163 | pr_warn("Request for unknown module key '%s' err %ld\n", | ||
164 | id, PTR_ERR(key)); | ||
165 | kfree(id); | ||
166 | |||
167 | if (IS_ERR(key)) { | ||
168 | switch (PTR_ERR(key)) { | ||
169 | /* Hide some search errors */ | ||
170 | case -EACCES: | ||
171 | case -ENOTDIR: | ||
172 | case -EAGAIN: | ||
173 | return ERR_PTR(-ENOKEY); | ||
174 | default: | ||
175 | return ERR_CAST(key); | ||
176 | } | ||
177 | } | ||
178 | |||
179 | pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); | ||
180 | return key_ref_to_ptr(key); | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * Verify the signature on a module. | ||
185 | */ | ||
186 | int mod_verify_sig(const void *mod, unsigned long *_modlen) | ||
187 | { | ||
188 | struct public_key_signature *pks; | ||
189 | struct module_signature ms; | ||
190 | struct key *key; | ||
191 | const void *sig; | ||
192 | size_t modlen = *_modlen, sig_len; | ||
193 | int ret; | ||
194 | |||
195 | pr_devel("==>%s(,%zu)\n", __func__, modlen); | ||
196 | |||
197 | if (modlen <= sizeof(ms)) | ||
198 | return -EBADMSG; | ||
199 | |||
200 | memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); | ||
201 | modlen -= sizeof(ms); | ||
202 | |||
203 | sig_len = be32_to_cpu(ms.sig_len); | ||
204 | if (sig_len >= modlen) | ||
205 | return -EBADMSG; | ||
206 | modlen -= sig_len; | ||
207 | if ((size_t)ms.signer_len + ms.key_id_len >= modlen) | ||
208 | return -EBADMSG; | ||
209 | modlen -= (size_t)ms.signer_len + ms.key_id_len; | ||
210 | |||
211 | *_modlen = modlen; | ||
212 | sig = mod + modlen; | ||
213 | |||
214 | /* For the moment, only support RSA and X.509 identifiers */ | ||
215 | if (ms.algo != PKEY_ALGO_RSA || | ||
216 | ms.id_type != PKEY_ID_X509) | ||
217 | return -ENOPKG; | ||
218 | |||
219 | if (ms.hash >= PKEY_HASH__LAST || | ||
220 | !pkey_hash_algo[ms.hash]) | ||
221 | return -ENOPKG; | ||
222 | |||
223 | key = request_asymmetric_key(sig, ms.signer_len, | ||
224 | sig + ms.signer_len, ms.key_id_len); | ||
225 | if (IS_ERR(key)) | ||
226 | return PTR_ERR(key); | ||
227 | |||
228 | pks = mod_make_digest(ms.hash, mod, modlen); | ||
229 | if (IS_ERR(pks)) { | ||
230 | ret = PTR_ERR(pks); | ||
231 | goto error_put_key; | ||
232 | } | ||
233 | |||
234 | ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, | ||
235 | sig_len); | ||
236 | if (ret < 0) | ||
237 | goto error_free_pks; | ||
238 | |||
239 | ret = verify_signature(key, pks); | ||
240 | pr_devel("verify_signature() = %d\n", ret); | ||
241 | |||
242 | error_free_pks: | ||
243 | mpi_free(pks->rsa.s); | ||
244 | kfree(pks); | ||
245 | error_put_key: | ||
246 | key_put(key); | ||
247 | pr_devel("<==%s() = %d\n", __func__, ret); | ||
248 | return ret; | ||
249 | } | ||
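
The mod_verify_sig() hunk above peels a fixed-size signature trailer off the end of the module image and validates the embedded lengths before looking up the signing key. The userspace sketch below models only that length arithmetic; struct sig_trailer and strip_signature() are hypothetical names for illustration, not the kernel's authoritative struct module_signature definition.

#include <arpa/inet.h>   /* ntohl()/htonl() for the big-endian sig_len */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical 12-byte trailer layout, appended to the module image. */
struct sig_trailer {
	uint8_t  algo;        /* public-key algorithm                */
	uint8_t  hash;        /* digest algorithm                    */
	uint8_t  id_type;     /* key-identifier type                 */
	uint8_t  signer_len;  /* length of the signer's name         */
	uint8_t  key_id_len;  /* length of the key identifier        */
	uint8_t  pad[3];
	uint32_t sig_len;     /* signature length, stored big-endian */
};

/* Shrink *modlen to the unsigned payload, mirroring the checks in mod_verify_sig(). */
static int strip_signature(const void *mod, size_t *modlen)
{
	struct sig_trailer ms;
	size_t len = *modlen, sig_len;

	if (len <= sizeof(ms))
		return -1;
	memcpy(&ms, (const char *)mod + len - sizeof(ms), sizeof(ms));
	len -= sizeof(ms);

	sig_len = ntohl(ms.sig_len);
	if (sig_len >= len)
		return -1;
	len -= sig_len;
	if ((size_t)ms.signer_len + ms.key_id_len >= len)
		return -1;
	len -= (size_t)ms.signer_len + ms.key_id_len;

	*modlen = len;   /* signer name, key id and signature follow the payload */
	return 0;
}

int main(void)
{
	unsigned char image[64] = { 0 };
	struct sig_trailer ms = { .signer_len = 4, .key_id_len = 2, .sig_len = htonl(8) };
	size_t modlen = sizeof(image);

	memcpy(image + sizeof(image) - sizeof(ms), &ms, sizeof(ms));
	if (strip_signature(image, &modlen) == 0)
		printf("unsigned payload: %zu bytes\n", modlen);   /* prints 38 */
	return 0;
}
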
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b576f7f14bc6..78e2ecb20165 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void) | |||
57 | * leave it to the caller to do proper locking and attach it to task. | 57 | * leave it to the caller to do proper locking and attach it to task. |
58 | */ | 58 | */ |
59 | static struct nsproxy *create_new_namespaces(unsigned long flags, | 59 | static struct nsproxy *create_new_namespaces(unsigned long flags, |
60 | struct task_struct *tsk, struct fs_struct *new_fs) | 60 | struct task_struct *tsk, struct user_namespace *user_ns, |
61 | struct fs_struct *new_fs) | ||
61 | { | 62 | { |
62 | struct nsproxy *new_nsp; | 63 | struct nsproxy *new_nsp; |
63 | int err; | 64 | int err; |
@@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
66 | if (!new_nsp) | 67 | if (!new_nsp) |
67 | return ERR_PTR(-ENOMEM); | 68 | return ERR_PTR(-ENOMEM); |
68 | 69 | ||
69 | new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); | 70 | new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); |
70 | if (IS_ERR(new_nsp->mnt_ns)) { | 71 | if (IS_ERR(new_nsp->mnt_ns)) { |
71 | err = PTR_ERR(new_nsp->mnt_ns); | 72 | err = PTR_ERR(new_nsp->mnt_ns); |
72 | goto out_ns; | 73 | goto out_ns; |
73 | } | 74 | } |
74 | 75 | ||
75 | new_nsp->uts_ns = copy_utsname(flags, tsk); | 76 | new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); |
76 | if (IS_ERR(new_nsp->uts_ns)) { | 77 | if (IS_ERR(new_nsp->uts_ns)) { |
77 | err = PTR_ERR(new_nsp->uts_ns); | 78 | err = PTR_ERR(new_nsp->uts_ns); |
78 | goto out_uts; | 79 | goto out_uts; |
79 | } | 80 | } |
80 | 81 | ||
81 | new_nsp->ipc_ns = copy_ipcs(flags, tsk); | 82 | new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); |
82 | if (IS_ERR(new_nsp->ipc_ns)) { | 83 | if (IS_ERR(new_nsp->ipc_ns)) { |
83 | err = PTR_ERR(new_nsp->ipc_ns); | 84 | err = PTR_ERR(new_nsp->ipc_ns); |
84 | goto out_ipc; | 85 | goto out_ipc; |
85 | } | 86 | } |
86 | 87 | ||
87 | new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); | 88 | new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); |
88 | if (IS_ERR(new_nsp->pid_ns)) { | 89 | if (IS_ERR(new_nsp->pid_ns)) { |
89 | err = PTR_ERR(new_nsp->pid_ns); | 90 | err = PTR_ERR(new_nsp->pid_ns); |
90 | goto out_pid; | 91 | goto out_pid; |
91 | } | 92 | } |
92 | 93 | ||
93 | new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); | 94 | new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); |
94 | if (IS_ERR(new_nsp->net_ns)) { | 95 | if (IS_ERR(new_nsp->net_ns)) { |
95 | err = PTR_ERR(new_nsp->net_ns); | 96 | err = PTR_ERR(new_nsp->net_ns); |
96 | goto out_net; | 97 | goto out_net; |
@@ -122,6 +123,7 @@ out_ns: | |||
122 | int copy_namespaces(unsigned long flags, struct task_struct *tsk) | 123 | int copy_namespaces(unsigned long flags, struct task_struct *tsk) |
123 | { | 124 | { |
124 | struct nsproxy *old_ns = tsk->nsproxy; | 125 | struct nsproxy *old_ns = tsk->nsproxy; |
126 | struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); | ||
125 | struct nsproxy *new_ns; | 127 | struct nsproxy *new_ns; |
126 | int err = 0; | 128 | int err = 0; |
127 | 129 | ||
@@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
134 | CLONE_NEWPID | CLONE_NEWNET))) | 136 | CLONE_NEWPID | CLONE_NEWNET))) |
135 | return 0; | 137 | return 0; |
136 | 138 | ||
137 | if (!capable(CAP_SYS_ADMIN)) { | 139 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { |
138 | err = -EPERM; | 140 | err = -EPERM; |
139 | goto out; | 141 | goto out; |
140 | } | 142 | } |
@@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
151 | goto out; | 153 | goto out; |
152 | } | 154 | } |
153 | 155 | ||
154 | new_ns = create_new_namespaces(flags, tsk, tsk->fs); | 156 | new_ns = create_new_namespaces(flags, tsk, |
157 | task_cred_xxx(tsk, user_ns), tsk->fs); | ||
155 | if (IS_ERR(new_ns)) { | 158 | if (IS_ERR(new_ns)) { |
156 | err = PTR_ERR(new_ns); | 159 | err = PTR_ERR(new_ns); |
157 | goto out; | 160 | goto out; |
@@ -183,19 +186,21 @@ void free_nsproxy(struct nsproxy *ns) | |||
183 | * On success, returns the new nsproxy. | 186 | * On success, returns the new nsproxy. |
184 | */ | 187 | */ |
185 | int unshare_nsproxy_namespaces(unsigned long unshare_flags, | 188 | int unshare_nsproxy_namespaces(unsigned long unshare_flags, |
186 | struct nsproxy **new_nsp, struct fs_struct *new_fs) | 189 | struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) |
187 | { | 190 | { |
191 | struct user_namespace *user_ns; | ||
188 | int err = 0; | 192 | int err = 0; |
189 | 193 | ||
190 | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | | 194 | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | |
191 | CLONE_NEWNET))) | 195 | CLONE_NEWNET | CLONE_NEWPID))) |
192 | return 0; | 196 | return 0; |
193 | 197 | ||
194 | if (!capable(CAP_SYS_ADMIN)) | 198 | user_ns = new_cred ? new_cred->user_ns : current_user_ns(); |
199 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | ||
195 | return -EPERM; | 200 | return -EPERM; |
196 | 201 | ||
197 | *new_nsp = create_new_namespaces(unshare_flags, current, | 202 | *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, |
198 | new_fs ? new_fs : current->fs); | 203 | new_fs ? new_fs : current->fs); |
199 | if (IS_ERR(*new_nsp)) { | 204 | if (IS_ERR(*new_nsp)) { |
200 | err = PTR_ERR(*new_nsp); | 205 | err = PTR_ERR(*new_nsp); |
201 | goto out; | 206 | goto out; |
@@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
241 | struct file *file; | 246 | struct file *file; |
242 | int err; | 247 | int err; |
243 | 248 | ||
244 | if (!capable(CAP_SYS_ADMIN)) | ||
245 | return -EPERM; | ||
246 | |||
247 | file = proc_ns_fget(fd); | 249 | file = proc_ns_fget(fd); |
248 | if (IS_ERR(file)) | 250 | if (IS_ERR(file)) |
249 | return PTR_ERR(file); | 251 | return PTR_ERR(file); |
@@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
254 | if (nstype && (ops->type != nstype)) | 256 | if (nstype && (ops->type != nstype)) |
255 | goto out; | 257 | goto out; |
256 | 258 | ||
257 | new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); | 259 | new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); |
258 | if (IS_ERR(new_nsproxy)) { | 260 | if (IS_ERR(new_nsproxy)) { |
259 | err = PTR_ERR(new_nsproxy); | 261 | err = PTR_ERR(new_nsproxy); |
260 | goto out; | 262 | goto out; |
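
The nsproxy.c changes above stop requiring a global CAP_SYS_ADMIN and instead call ns_capable() against the user namespace that will own the new namespaces. One practical consequence, sketched from userspace below, is that an otherwise unprivileged task which unshares a user namespace first then holds CAP_SYS_ADMIN over it and can create, say, a UTS namespace it fully controls. This is a minimal sketch assuming a kernel with this series applied and user namespaces enabled.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Create a user namespace plus a UTS namespace owned by it. */
	if (unshare(CLONE_NEWUSER | CLONE_NEWUTS) == -1) {
		perror("unshare");
		return 1;
	}
	/* We now have CAP_SYS_ADMIN in the new user namespace. */
	if (sethostname("sandbox", 7) == -1)
		perror("sethostname");
	else
		printf("hostname changed inside the new UTS namespace\n");
	return 0;
}
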
diff --git a/kernel/padata.c b/kernel/padata.c index 89fe3d1b9efb..072f4ee4eb89 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) | |||
171 | { | 171 | { |
172 | int cpu, num_cpus; | 172 | int cpu, num_cpus; |
173 | unsigned int next_nr, next_index; | 173 | unsigned int next_nr, next_index; |
174 | struct padata_parallel_queue *queue, *next_queue; | 174 | struct padata_parallel_queue *next_queue; |
175 | struct padata_priv *padata; | 175 | struct padata_priv *padata; |
176 | struct padata_list *reorder; | 176 | struct padata_list *reorder; |
177 | 177 | ||
@@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) | |||
204 | goto out; | 204 | goto out; |
205 | } | 205 | } |
206 | 206 | ||
207 | queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); | 207 | if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { |
208 | if (queue->cpu_index == next_queue->cpu_index) { | ||
209 | padata = ERR_PTR(-ENODATA); | 208 | padata = ERR_PTR(-ENODATA); |
210 | goto out; | 209 | goto out; |
211 | } | 210 | } |
diff --git a/kernel/pid.c b/kernel/pid.c index e86b291ad834..36aa02ff17d6 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -1,8 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * Generic pidhash and scalable, time-bounded PID allocator | 2 | * Generic pidhash and scalable, time-bounded PID allocator |
3 | * | 3 | * |
4 | * (C) 2002-2003 William Irwin, IBM | 4 | * (C) 2002-2003 Nadia Yvette Chambers, IBM |
5 | * (C) 2004 William Irwin, Oracle | 5 | * (C) 2004 Nadia Yvette Chambers, Oracle |
6 | * (C) 2002-2004 Ingo Molnar, Red Hat | 6 | * (C) 2002-2004 Ingo Molnar, Red Hat |
7 | * | 7 | * |
8 | * pid-structures are backing objects for tasks sharing a given ID to chain | 8 | * pid-structures are backing objects for tasks sharing a given ID to chain |
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/pid_namespace.h> | 36 | #include <linux/pid_namespace.h> |
37 | #include <linux/init_task.h> | 37 | #include <linux/init_task.h> |
38 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
39 | #include <linux/proc_fs.h> | ||
39 | 40 | ||
40 | #define pid_hashfn(nr, ns) \ | 41 | #define pid_hashfn(nr, ns) \ |
41 | hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) | 42 | hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) |
@@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = { | |||
78 | .last_pid = 0, | 79 | .last_pid = 0, |
79 | .level = 0, | 80 | .level = 0, |
80 | .child_reaper = &init_task, | 81 | .child_reaper = &init_task, |
82 | .user_ns = &init_user_ns, | ||
83 | .proc_inum = PROC_PID_INIT_INO, | ||
81 | }; | 84 | }; |
82 | EXPORT_SYMBOL_GPL(init_pid_ns); | 85 | EXPORT_SYMBOL_GPL(init_pid_ns); |
83 | 86 | ||
84 | int is_container_init(struct task_struct *tsk) | ||
85 | { | ||
86 | int ret = 0; | ||
87 | struct pid *pid; | ||
88 | |||
89 | rcu_read_lock(); | ||
90 | pid = task_pid(tsk); | ||
91 | if (pid != NULL && pid->numbers[pid->level].nr == 1) | ||
92 | ret = 1; | ||
93 | rcu_read_unlock(); | ||
94 | |||
95 | return ret; | ||
96 | } | ||
97 | EXPORT_SYMBOL(is_container_init); | ||
98 | |||
99 | /* | 87 | /* |
100 | * Note: disable interrupts while the pidmap_lock is held as an | 88 | * Note: disable interrupts while the pidmap_lock is held as an |
101 | * interrupt might come in and do read_lock(&tasklist_lock). | 89 | * interrupt might come in and do read_lock(&tasklist_lock). |
@@ -269,8 +257,24 @@ void free_pid(struct pid *pid) | |||
269 | unsigned long flags; | 257 | unsigned long flags; |
270 | 258 | ||
271 | spin_lock_irqsave(&pidmap_lock, flags); | 259 | spin_lock_irqsave(&pidmap_lock, flags); |
272 | for (i = 0; i <= pid->level; i++) | 260 | for (i = 0; i <= pid->level; i++) { |
273 | hlist_del_rcu(&pid->numbers[i].pid_chain); | 261 | struct upid *upid = pid->numbers + i; |
262 | struct pid_namespace *ns = upid->ns; | ||
263 | hlist_del_rcu(&upid->pid_chain); | ||
264 | switch(--ns->nr_hashed) { | ||
265 | case 1: | ||
266 | /* When all that is left in the pid namespace | ||
267 | * is the reaper wake up the reaper. The reaper | ||
268 | * may be sleeping in zap_pid_ns_processes(). | ||
269 | */ | ||
270 | wake_up_process(ns->child_reaper); | ||
271 | break; | ||
272 | case 0: | ||
273 | ns->nr_hashed = -1; | ||
274 | schedule_work(&ns->proc_work); | ||
275 | break; | ||
276 | } | ||
277 | } | ||
274 | spin_unlock_irqrestore(&pidmap_lock, flags); | 278 | spin_unlock_irqrestore(&pidmap_lock, flags); |
275 | 279 | ||
276 | for (i = 0; i <= pid->level; i++) | 280 | for (i = 0; i <= pid->level; i++) |
@@ -292,6 +296,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
292 | goto out; | 296 | goto out; |
293 | 297 | ||
294 | tmp = ns; | 298 | tmp = ns; |
299 | pid->level = ns->level; | ||
295 | for (i = ns->level; i >= 0; i--) { | 300 | for (i = ns->level; i >= 0; i--) { |
296 | nr = alloc_pidmap(tmp); | 301 | nr = alloc_pidmap(tmp); |
297 | if (nr < 0) | 302 | if (nr < 0) |
@@ -302,22 +307,32 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
302 | tmp = tmp->parent; | 307 | tmp = tmp->parent; |
303 | } | 308 | } |
304 | 309 | ||
310 | if (unlikely(is_child_reaper(pid))) { | ||
311 | if (pid_ns_prepare_proc(ns)) | ||
312 | goto out_free; | ||
313 | } | ||
314 | |||
305 | get_pid_ns(ns); | 315 | get_pid_ns(ns); |
306 | pid->level = ns->level; | ||
307 | atomic_set(&pid->count, 1); | 316 | atomic_set(&pid->count, 1); |
308 | for (type = 0; type < PIDTYPE_MAX; ++type) | 317 | for (type = 0; type < PIDTYPE_MAX; ++type) |
309 | INIT_HLIST_HEAD(&pid->tasks[type]); | 318 | INIT_HLIST_HEAD(&pid->tasks[type]); |
310 | 319 | ||
311 | upid = pid->numbers + ns->level; | 320 | upid = pid->numbers + ns->level; |
312 | spin_lock_irq(&pidmap_lock); | 321 | spin_lock_irq(&pidmap_lock); |
313 | for ( ; upid >= pid->numbers; --upid) | 322 | if (ns->nr_hashed < 0) |
323 | goto out_unlock; | ||
324 | for ( ; upid >= pid->numbers; --upid) { | ||
314 | hlist_add_head_rcu(&upid->pid_chain, | 325 | hlist_add_head_rcu(&upid->pid_chain, |
315 | &pid_hash[pid_hashfn(upid->nr, upid->ns)]); | 326 | &pid_hash[pid_hashfn(upid->nr, upid->ns)]); |
327 | upid->ns->nr_hashed++; | ||
328 | } | ||
316 | spin_unlock_irq(&pidmap_lock); | 329 | spin_unlock_irq(&pidmap_lock); |
317 | 330 | ||
318 | out: | 331 | out: |
319 | return pid; | 332 | return pid; |
320 | 333 | ||
334 | out_unlock: | ||
335 | spin_unlock(&pidmap_lock); | ||
321 | out_free: | 336 | out_free: |
322 | while (++i <= ns->level) | 337 | while (++i <= ns->level) |
323 | free_pidmap(pid->numbers + i); | 338 | free_pidmap(pid->numbers + i); |
@@ -344,7 +359,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); | |||
344 | 359 | ||
345 | struct pid *find_vpid(int nr) | 360 | struct pid *find_vpid(int nr) |
346 | { | 361 | { |
347 | return find_pid_ns(nr, current->nsproxy->pid_ns); | 362 | return find_pid_ns(nr, task_active_pid_ns(current)); |
348 | } | 363 | } |
349 | EXPORT_SYMBOL_GPL(find_vpid); | 364 | EXPORT_SYMBOL_GPL(find_vpid); |
350 | 365 | ||
@@ -428,7 +443,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | |||
428 | 443 | ||
429 | struct task_struct *find_task_by_vpid(pid_t vnr) | 444 | struct task_struct *find_task_by_vpid(pid_t vnr) |
430 | { | 445 | { |
431 | return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); | 446 | return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); |
432 | } | 447 | } |
433 | 448 | ||
434 | struct pid *get_task_pid(struct task_struct *task, enum pid_type type) | 449 | struct pid *get_task_pid(struct task_struct *task, enum pid_type type) |
@@ -479,10 +494,11 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) | |||
479 | } | 494 | } |
480 | return nr; | 495 | return nr; |
481 | } | 496 | } |
497 | EXPORT_SYMBOL_GPL(pid_nr_ns); | ||
482 | 498 | ||
483 | pid_t pid_vnr(struct pid *pid) | 499 | pid_t pid_vnr(struct pid *pid) |
484 | { | 500 | { |
485 | return pid_nr_ns(pid, current->nsproxy->pid_ns); | 501 | return pid_nr_ns(pid, task_active_pid_ns(current)); |
486 | } | 502 | } |
487 | EXPORT_SYMBOL_GPL(pid_vnr); | 503 | EXPORT_SYMBOL_GPL(pid_vnr); |
488 | 504 | ||
@@ -493,7 +509,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, | |||
493 | 509 | ||
494 | rcu_read_lock(); | 510 | rcu_read_lock(); |
495 | if (!ns) | 511 | if (!ns) |
496 | ns = current->nsproxy->pid_ns; | 512 | ns = task_active_pid_ns(current); |
497 | if (likely(pid_alive(task))) { | 513 | if (likely(pid_alive(task))) { |
498 | if (type != PIDTYPE_PID) | 514 | if (type != PIDTYPE_PID) |
499 | task = task->group_leader; | 515 | task = task->group_leader; |
@@ -568,6 +584,7 @@ void __init pidmap_init(void) | |||
568 | /* Reserve PID 0. We never call free_pidmap(0) */ | 584 | /* Reserve PID 0. We never call free_pidmap(0) */ |
569 | set_bit(0, init_pid_ns.pidmap[0].page); | 585 | set_bit(0, init_pid_ns.pidmap[0].page); |
570 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); | 586 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); |
587 | init_pid_ns.nr_hashed = 1; | ||
571 | 588 | ||
572 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, | 589 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, |
573 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); | 590 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); |
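
free_pid() above now keeps a per-namespace nr_hashed count and wakes the namespace's reaper once only the reaper's own pid remains hashed, which is what the reworked wait loop in zap_pid_ns_processes() (further down, in pid_namespace.c) sleeps on. The pthreads model below is only a stand-in for that handshake; the names and the plain counter are assumptions for illustration, not the kernel's data structures or locking. Build with -pthread.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  reaper_wake = PTHREAD_COND_INITIALIZER;
static int nr_hashed = 4;              /* reaper + three other tasks */

/* Models free_pid(): drop the count, wake the reaper when it is alone. */
static void *task_exit(void *arg)
{
	pthread_mutex_lock(&lock);
	if (--nr_hashed == 1)
		pthread_cond_signal(&reaper_wake);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t[3];

	for (int i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, task_exit, NULL);

	/* Models zap_pid_ns_processes(): sleep until nr_hashed drops to 1. */
	pthread_mutex_lock(&lock);
	while (nr_hashed != 1)
		pthread_cond_wait(&reaper_wake, &lock);
	pthread_mutex_unlock(&lock);

	printf("reaper: namespace is empty, tearing down\n");
	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}
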
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6144bab8fd8e..fdbd0cdf271a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -10,12 +10,14 @@ | |||
10 | 10 | ||
11 | #include <linux/pid.h> | 11 | #include <linux/pid.h> |
12 | #include <linux/pid_namespace.h> | 12 | #include <linux/pid_namespace.h> |
13 | #include <linux/user_namespace.h> | ||
13 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
14 | #include <linux/err.h> | 15 | #include <linux/err.h> |
15 | #include <linux/acct.h> | 16 | #include <linux/acct.h> |
16 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
17 | #include <linux/proc_fs.h> | 18 | #include <linux/proc_fs.h> |
18 | #include <linux/reboot.h> | 19 | #include <linux/reboot.h> |
20 | #include <linux/export.h> | ||
19 | 21 | ||
20 | #define BITS_PER_PAGE (PAGE_SIZE*8) | 22 | #define BITS_PER_PAGE (PAGE_SIZE*8) |
21 | 23 | ||
@@ -70,12 +72,29 @@ err_alloc: | |||
70 | return NULL; | 72 | return NULL; |
71 | } | 73 | } |
72 | 74 | ||
73 | static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) | 75 | static void proc_cleanup_work(struct work_struct *work) |
76 | { | ||
77 | struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); | ||
78 | pid_ns_release_proc(ns); | ||
79 | } | ||
80 | |||
81 | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ | ||
82 | #define MAX_PID_NS_LEVEL 32 | ||
83 | |||
84 | static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, | ||
85 | struct pid_namespace *parent_pid_ns) | ||
74 | { | 86 | { |
75 | struct pid_namespace *ns; | 87 | struct pid_namespace *ns; |
76 | unsigned int level = parent_pid_ns->level + 1; | 88 | unsigned int level = parent_pid_ns->level + 1; |
77 | int i, err = -ENOMEM; | 89 | int i; |
90 | int err; | ||
78 | 91 | ||
92 | if (level > MAX_PID_NS_LEVEL) { | ||
93 | err = -EINVAL; | ||
94 | goto out; | ||
95 | } | ||
96 | |||
97 | err = -ENOMEM; | ||
79 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); | 98 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); |
80 | if (ns == NULL) | 99 | if (ns == NULL) |
81 | goto out; | 100 | goto out; |
@@ -88,9 +107,15 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p | |||
88 | if (ns->pid_cachep == NULL) | 107 | if (ns->pid_cachep == NULL) |
89 | goto out_free_map; | 108 | goto out_free_map; |
90 | 109 | ||
110 | err = proc_alloc_inum(&ns->proc_inum); | ||
111 | if (err) | ||
112 | goto out_free_map; | ||
113 | |||
91 | kref_init(&ns->kref); | 114 | kref_init(&ns->kref); |
92 | ns->level = level; | 115 | ns->level = level; |
93 | ns->parent = get_pid_ns(parent_pid_ns); | 116 | ns->parent = get_pid_ns(parent_pid_ns); |
117 | ns->user_ns = get_user_ns(user_ns); | ||
118 | INIT_WORK(&ns->proc_work, proc_cleanup_work); | ||
94 | 119 | ||
95 | set_bit(0, ns->pidmap[0].page); | 120 | set_bit(0, ns->pidmap[0].page); |
96 | atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); | 121 | atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); |
@@ -98,14 +123,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p | |||
98 | for (i = 1; i < PIDMAP_ENTRIES; i++) | 123 | for (i = 1; i < PIDMAP_ENTRIES; i++) |
99 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); | 124 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); |
100 | 125 | ||
101 | err = pid_ns_prepare_proc(ns); | ||
102 | if (err) | ||
103 | goto out_put_parent_pid_ns; | ||
104 | |||
105 | return ns; | 126 | return ns; |
106 | 127 | ||
107 | out_put_parent_pid_ns: | ||
108 | put_pid_ns(parent_pid_ns); | ||
109 | out_free_map: | 128 | out_free_map: |
110 | kfree(ns->pidmap[0].page); | 129 | kfree(ns->pidmap[0].page); |
111 | out_free: | 130 | out_free: |
@@ -118,32 +137,43 @@ static void destroy_pid_namespace(struct pid_namespace *ns) | |||
118 | { | 137 | { |
119 | int i; | 138 | int i; |
120 | 139 | ||
140 | proc_free_inum(ns->proc_inum); | ||
121 | for (i = 0; i < PIDMAP_ENTRIES; i++) | 141 | for (i = 0; i < PIDMAP_ENTRIES; i++) |
122 | kfree(ns->pidmap[i].page); | 142 | kfree(ns->pidmap[i].page); |
143 | put_user_ns(ns->user_ns); | ||
123 | kmem_cache_free(pid_ns_cachep, ns); | 144 | kmem_cache_free(pid_ns_cachep, ns); |
124 | } | 145 | } |
125 | 146 | ||
126 | struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) | 147 | struct pid_namespace *copy_pid_ns(unsigned long flags, |
148 | struct user_namespace *user_ns, struct pid_namespace *old_ns) | ||
127 | { | 149 | { |
128 | if (!(flags & CLONE_NEWPID)) | 150 | if (!(flags & CLONE_NEWPID)) |
129 | return get_pid_ns(old_ns); | 151 | return get_pid_ns(old_ns); |
130 | if (flags & (CLONE_THREAD|CLONE_PARENT)) | 152 | if (task_active_pid_ns(current) != old_ns) |
131 | return ERR_PTR(-EINVAL); | 153 | return ERR_PTR(-EINVAL); |
132 | return create_pid_namespace(old_ns); | 154 | return create_pid_namespace(user_ns, old_ns); |
133 | } | 155 | } |
134 | 156 | ||
135 | void free_pid_ns(struct kref *kref) | 157 | static void free_pid_ns(struct kref *kref) |
136 | { | 158 | { |
137 | struct pid_namespace *ns, *parent; | 159 | struct pid_namespace *ns; |
138 | 160 | ||
139 | ns = container_of(kref, struct pid_namespace, kref); | 161 | ns = container_of(kref, struct pid_namespace, kref); |
140 | |||
141 | parent = ns->parent; | ||
142 | destroy_pid_namespace(ns); | 162 | destroy_pid_namespace(ns); |
163 | } | ||
143 | 164 | ||
144 | if (parent != NULL) | 165 | void put_pid_ns(struct pid_namespace *ns) |
145 | put_pid_ns(parent); | 166 | { |
167 | struct pid_namespace *parent; | ||
168 | |||
169 | while (ns != &init_pid_ns) { | ||
170 | parent = ns->parent; | ||
171 | if (!kref_put(&ns->kref, free_pid_ns)) | ||
172 | break; | ||
173 | ns = parent; | ||
174 | } | ||
146 | } | 175 | } |
176 | EXPORT_SYMBOL_GPL(put_pid_ns); | ||
147 | 177 | ||
148 | void zap_pid_ns_processes(struct pid_namespace *pid_ns) | 178 | void zap_pid_ns_processes(struct pid_namespace *pid_ns) |
149 | { | 179 | { |
@@ -192,22 +222,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
192 | 222 | ||
193 | /* | 223 | /* |
194 | * sys_wait4() above can't reap the TASK_DEAD children. | 224 | * sys_wait4() above can't reap the TASK_DEAD children. |
195 | * Make sure they all go away, see __unhash_process(). | 225 | * Make sure they all go away, see free_pid(). |
196 | */ | 226 | */ |
197 | for (;;) { | 227 | for (;;) { |
198 | bool need_wait = false; | 228 | set_current_state(TASK_UNINTERRUPTIBLE); |
199 | 229 | if (pid_ns->nr_hashed == 1) | |
200 | read_lock(&tasklist_lock); | ||
201 | if (!list_empty(¤t->children)) { | ||
202 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
203 | need_wait = true; | ||
204 | } | ||
205 | read_unlock(&tasklist_lock); | ||
206 | |||
207 | if (!need_wait) | ||
208 | break; | 230 | break; |
209 | schedule(); | 231 | schedule(); |
210 | } | 232 | } |
233 | __set_current_state(TASK_RUNNING); | ||
211 | 234 | ||
212 | if (pid_ns->reboot) | 235 | if (pid_ns->reboot) |
213 | current->signal->group_exit_code = pid_ns->reboot; | 236 | current->signal->group_exit_code = pid_ns->reboot; |
@@ -220,9 +243,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
220 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, | 243 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, |
221 | void __user *buffer, size_t *lenp, loff_t *ppos) | 244 | void __user *buffer, size_t *lenp, loff_t *ppos) |
222 | { | 245 | { |
246 | struct pid_namespace *pid_ns = task_active_pid_ns(current); | ||
223 | struct ctl_table tmp = *table; | 247 | struct ctl_table tmp = *table; |
224 | 248 | ||
225 | if (write && !capable(CAP_SYS_ADMIN)) | 249 | if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) |
226 | return -EPERM; | 250 | return -EPERM; |
227 | 251 | ||
228 | /* | 252 | /* |
@@ -231,7 +255,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, | |||
231 | * it should synchronize its usage with external means. | 255 | * it should synchronize its usage with external means. |
232 | */ | 256 | */ |
233 | 257 | ||
234 | tmp.data = ¤t->nsproxy->pid_ns->last_pid; | 258 | tmp.data = &pid_ns->last_pid; |
235 | return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); | 259 | return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); |
236 | } | 260 | } |
237 | 261 | ||
@@ -280,6 +304,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | |||
280 | return 0; | 304 | return 0; |
281 | } | 305 | } |
282 | 306 | ||
307 | static void *pidns_get(struct task_struct *task) | ||
308 | { | ||
309 | struct pid_namespace *ns; | ||
310 | |||
311 | rcu_read_lock(); | ||
312 | ns = get_pid_ns(task_active_pid_ns(task)); | ||
313 | rcu_read_unlock(); | ||
314 | |||
315 | return ns; | ||
316 | } | ||
317 | |||
318 | static void pidns_put(void *ns) | ||
319 | { | ||
320 | put_pid_ns(ns); | ||
321 | } | ||
322 | |||
323 | static int pidns_install(struct nsproxy *nsproxy, void *ns) | ||
324 | { | ||
325 | struct pid_namespace *active = task_active_pid_ns(current); | ||
326 | struct pid_namespace *ancestor, *new = ns; | ||
327 | |||
328 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || | ||
329 | !nsown_capable(CAP_SYS_ADMIN)) | ||
330 | return -EPERM; | ||
331 | |||
332 | /* | ||
333 | * Only allow entering the current active pid namespace | ||
334 | * or a child of the current active pid namespace. | ||
335 | * | ||
336 | * This is required for fork to return a usable pid value and | ||
337 | * this maintains the property that processes and their | ||
338 | * children can not escape their current pid namespace. | ||
339 | */ | ||
340 | if (new->level < active->level) | ||
341 | return -EINVAL; | ||
342 | |||
343 | ancestor = new; | ||
344 | while (ancestor->level > active->level) | ||
345 | ancestor = ancestor->parent; | ||
346 | if (ancestor != active) | ||
347 | return -EINVAL; | ||
348 | |||
349 | put_pid_ns(nsproxy->pid_ns); | ||
350 | nsproxy->pid_ns = get_pid_ns(new); | ||
351 | return 0; | ||
352 | } | ||
353 | |||
354 | static unsigned int pidns_inum(void *ns) | ||
355 | { | ||
356 | struct pid_namespace *pid_ns = ns; | ||
357 | return pid_ns->proc_inum; | ||
358 | } | ||
359 | |||
360 | const struct proc_ns_operations pidns_operations = { | ||
361 | .name = "pid", | ||
362 | .type = CLONE_NEWPID, | ||
363 | .get = pidns_get, | ||
364 | .put = pidns_put, | ||
365 | .install = pidns_install, | ||
366 | .inum = pidns_inum, | ||
367 | }; | ||
368 | |||
283 | static __init int pid_namespaces_init(void) | 369 | static __init int pid_namespaces_init(void) |
284 | { | 370 | { |
285 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); | 371 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); |
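
The new pidns_operations above make pid namespaces joinable through setns(), with pidns_install() permitting only the caller's current namespace or one of its descendants, and affecting only children forked afterwards. A minimal userspace sketch, assuming a kernel with this series (so /proc/<pid>/ns/pid is attachable) and CAP_SYS_ADMIN over the target namespace's owner; error handling is trimmed.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid inside target namespace>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/ns/pid", argv[1]);

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Fails with EINVAL if the target is not a descendant namespace. */
	if (setns(fd, CLONE_NEWPID) < 0) {
		perror("setns");
		return 1;
	}
	/* Only children forked from now on get pids in the target namespace. */
	if (fork() == 0) {
		printf("child sees itself as pid %d\n", getpid());
		return 0;
	}
	wait(NULL);
	return 0;
}
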
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 125cb67daa21..a278cad1d5d6 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <asm/uaccess.h> | 9 | #include <asm/uaccess.h> |
10 | #include <linux/kernel_stat.h> | 10 | #include <linux/kernel_stat.h> |
11 | #include <trace/events/timer.h> | 11 | #include <trace/events/timer.h> |
12 | #include <linux/random.h> | ||
12 | 13 | ||
13 | /* | 14 | /* |
14 | * Called after updating RLIMIT_CPU to run cpu timer and update | 15 | * Called after updating RLIMIT_CPU to run cpu timer and update |
@@ -217,30 +218,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, | |||
217 | return 0; | 218 | return 0; |
218 | } | 219 | } |
219 | 220 | ||
220 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | ||
221 | { | ||
222 | struct signal_struct *sig = tsk->signal; | ||
223 | struct task_struct *t; | ||
224 | |||
225 | times->utime = sig->utime; | ||
226 | times->stime = sig->stime; | ||
227 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
228 | |||
229 | rcu_read_lock(); | ||
230 | /* make sure we can trust tsk->thread_group list */ | ||
231 | if (!likely(pid_alive(tsk))) | ||
232 | goto out; | ||
233 | |||
234 | t = tsk; | ||
235 | do { | ||
236 | times->utime += t->utime; | ||
237 | times->stime += t->stime; | ||
238 | times->sum_exec_runtime += task_sched_runtime(t); | ||
239 | } while_each_thread(tsk, t); | ||
240 | out: | ||
241 | rcu_read_unlock(); | ||
242 | } | ||
243 | |||
244 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) | 221 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) |
245 | { | 222 | { |
246 | if (b->utime > a->utime) | 223 | if (b->utime > a->utime) |
@@ -494,6 +471,8 @@ static void cleanup_timers(struct list_head *head, | |||
494 | */ | 471 | */ |
495 | void posix_cpu_timers_exit(struct task_struct *tsk) | 472 | void posix_cpu_timers_exit(struct task_struct *tsk) |
496 | { | 473 | { |
474 | add_device_randomness((const void*) &tsk->se.sum_exec_runtime, | ||
475 | sizeof(unsigned long long)); | ||
497 | cleanup_timers(tsk->cpu_timers, | 476 | cleanup_timers(tsk->cpu_timers, |
498 | tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); | 477 | tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); |
499 | 478 | ||
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a70518c9d82f..5dfdc9ea180b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -263,6 +263,10 @@ config PM_GENERIC_DOMAINS | |||
263 | bool | 263 | bool |
264 | depends on PM | 264 | depends on PM |
265 | 265 | ||
266 | config PM_GENERIC_DOMAINS_SLEEP | ||
267 | def_bool y | ||
268 | depends on PM_SLEEP && PM_GENERIC_DOMAINS | ||
269 | |||
266 | config PM_GENERIC_DOMAINS_RUNTIME | 270 | config PM_GENERIC_DOMAINS_RUNTIME |
267 | def_bool y | 271 | def_bool y |
268 | depends on PM_RUNTIME && PM_GENERIC_DOMAINS | 272 | depends on PM_RUNTIME && PM_GENERIC_DOMAINS |
diff --git a/kernel/power/main.c b/kernel/power/main.c index f458238109cc..1c16f9167de1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -59,7 +59,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
59 | { | 59 | { |
60 | unsigned long val; | 60 | unsigned long val; |
61 | 61 | ||
62 | if (strict_strtoul(buf, 10, &val)) | 62 | if (kstrtoul(buf, 10, &val)) |
63 | return -EINVAL; | 63 | return -EINVAL; |
64 | 64 | ||
65 | if (val > 1) | 65 | if (val > 1) |
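
strict_strtoul() is replaced by its successor kstrtoul() here (and again in kernel/power/qos.c further down); both require that the whole string, bar an optional trailing newline, parse as a number, unlike a bare strtoul(). The userspace helper below, parse_ulong(), is a rough illustration of that contract only, not the kernel implementation.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Accept "<digits>" or "<digits>\n", nothing else -- roughly kstrtoul()'s rule. */
static int parse_ulong(const char *s, unsigned int base, unsigned long *res)
{
	char *end;

	errno = 0;
	*res = strtoul(s, &end, base);
	if (errno == ERANGE)
		return -ERANGE;
	if (end == s)
		return -EINVAL;
	if (*end == '\n')
		end++;
	if (*end != '\0')
		return -EINVAL;
	return 0;
}

int main(void)
{
	unsigned long val;

	printf("\"1\\n\" -> %d\n", parse_ulong("1\n", 10, &val));  /* 0 (ok)   */
	printf("\"1x\"  -> %d\n", parse_ulong("1x", 10, &val));    /* -EINVAL  */
	return 0;
}
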
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index d52359374e85..68197a4e8fc9 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c | |||
@@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_poweroff_op = { | |||
37 | .enable_mask = SYSRQ_ENABLE_BOOT, | 37 | .enable_mask = SYSRQ_ENABLE_BOOT, |
38 | }; | 38 | }; |
39 | 39 | ||
40 | static int pm_sysrq_init(void) | 40 | static int __init pm_sysrq_init(void) |
41 | { | 41 | { |
42 | register_sysrq_key('o', &sysrq_poweroff_op); | 42 | register_sysrq_key('o', &sysrq_poweroff_op); |
43 | return 0; | 43 | return 0; |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 19db29f67558..d5a258b60c6f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only) | |||
48 | if (p == current || !freeze_task(p)) | 48 | if (p == current || !freeze_task(p)) |
49 | continue; | 49 | continue; |
50 | 50 | ||
51 | /* | 51 | if (!freezer_should_skip(p)) |
52 | * Now that we've done set_freeze_flag, don't | ||
53 | * perturb a task in TASK_STOPPED or TASK_TRACED. | ||
54 | * It is "frozen enough". If the task does wake | ||
55 | * up, it will immediately call try_to_freeze. | ||
56 | * | ||
57 | * Because freeze_task() goes through p's scheduler lock, it's | ||
58 | * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING | ||
59 | * transition can't race with task state testing here. | ||
60 | */ | ||
61 | if (!task_is_stopped_or_traced(p) && | ||
62 | !freezer_should_skip(p)) | ||
63 | todo++; | 52 | todo++; |
64 | } while_each_thread(g, p); | 53 | } while_each_thread(g, p); |
65 | read_unlock(&tasklist_lock); | 54 | read_unlock(&tasklist_lock); |
@@ -79,7 +68,7 @@ static int try_to_freeze_tasks(bool user_only) | |||
79 | 68 | ||
80 | /* | 69 | /* |
81 | * We need to retry, but first give the freezing tasks some | 70 | * We need to retry, but first give the freezing tasks some |
82 | * time to enter the regrigerator. | 71 | * time to enter the refrigerator. |
83 | */ | 72 | */ |
84 | msleep(10); | 73 | msleep(10); |
85 | } | 74 | } |
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 6a031e684026..9322ff7eaad6 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
@@ -139,6 +139,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) | |||
139 | default: | 139 | default: |
140 | /* runtime check for not using enum */ | 140 | /* runtime check for not using enum */ |
141 | BUG(); | 141 | BUG(); |
142 | return PM_QOS_DEFAULT_VALUE; | ||
142 | } | 143 | } |
143 | } | 144 | } |
144 | 145 | ||
@@ -212,6 +213,69 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, | |||
212 | } | 213 | } |
213 | 214 | ||
214 | /** | 215 | /** |
216 | * pm_qos_flags_remove_req - Remove device PM QoS flags request. | ||
217 | * @pqf: Device PM QoS flags set to remove the request from. | ||
218 | * @req: Request to remove from the set. | ||
219 | */ | ||
220 | static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, | ||
221 | struct pm_qos_flags_request *req) | ||
222 | { | ||
223 | s32 val = 0; | ||
224 | |||
225 | list_del(&req->node); | ||
226 | list_for_each_entry(req, &pqf->list, node) | ||
227 | val |= req->flags; | ||
228 | |||
229 | pqf->effective_flags = val; | ||
230 | } | ||
231 | |||
232 | /** | ||
233 | * pm_qos_update_flags - Update a set of PM QoS flags. | ||
234 | * @pqf: Set of flags to update. | ||
235 | * @req: Request to add to the set, to modify, or to remove from the set. | ||
236 | * @action: Action to take on the set. | ||
237 | * @val: Value of the request to add or modify. | ||
238 | * | ||
239 | * Update the given set of PM QoS flags and call notifiers if the aggregate | ||
240 | * value has changed. Returns 1 if the aggregate constraint value has changed, | ||
241 | * 0 otherwise. | ||
242 | */ | ||
243 | bool pm_qos_update_flags(struct pm_qos_flags *pqf, | ||
244 | struct pm_qos_flags_request *req, | ||
245 | enum pm_qos_req_action action, s32 val) | ||
246 | { | ||
247 | unsigned long irqflags; | ||
248 | s32 prev_value, curr_value; | ||
249 | |||
250 | spin_lock_irqsave(&pm_qos_lock, irqflags); | ||
251 | |||
252 | prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; | ||
253 | |||
254 | switch (action) { | ||
255 | case PM_QOS_REMOVE_REQ: | ||
256 | pm_qos_flags_remove_req(pqf, req); | ||
257 | break; | ||
258 | case PM_QOS_UPDATE_REQ: | ||
259 | pm_qos_flags_remove_req(pqf, req); | ||
260 | case PM_QOS_ADD_REQ: | ||
261 | req->flags = val; | ||
262 | INIT_LIST_HEAD(&req->node); | ||
263 | list_add_tail(&req->node, &pqf->list); | ||
264 | pqf->effective_flags |= val; | ||
265 | break; | ||
266 | default: | ||
267 | /* no action */ | ||
268 | ; | ||
269 | } | ||
270 | |||
271 | curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; | ||
272 | |||
273 | spin_unlock_irqrestore(&pm_qos_lock, irqflags); | ||
274 | |||
275 | return prev_value != curr_value; | ||
276 | } | ||
277 | |||
278 | /** | ||
215 | * pm_qos_request - returns current system wide qos expectation | 279 | * pm_qos_request - returns current system wide qos expectation |
216 | * @pm_qos_class: identification of which qos value is requested | 280 | * @pm_qos_class: identification of which qos value is requested |
217 | * | 281 | * |
@@ -499,7 +563,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
499 | } else { | 563 | } else { |
500 | ascii_value[count] = '\0'; | 564 | ascii_value[count] = '\0'; |
501 | } | 565 | } |
502 | ret = strict_strtoul(ascii_value, 16, &ulval); | 566 | ret = kstrtoul(ascii_value, 16, &ulval); |
503 | if (ret) { | 567 | if (ret) { |
504 | pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); | 568 | pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); |
505 | return -EINVAL; | 569 | return -EINVAL; |
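
pm_qos_update_flags() introduced above aggregates per-request flags by OR-ing every outstanding request and reports whether the effective value changed; note the deliberate fall-through from PM_QOS_UPDATE_REQ into PM_QOS_ADD_REQ, which re-adds the request after removing its old value. The toy model below shows only that aggregation rule; the names and array storage are stand-ins, not the kernel's list-based machinery.

#include <stdbool.h>
#include <stdio.h>

#define MAX_REQS 8

static unsigned int reqs[MAX_REQS];
static bool in_use[MAX_REQS];

/* Effective value is the OR of every live request (0 when none remain). */
static unsigned int effective_flags(void)
{
	unsigned int val = 0;

	for (int i = 0; i < MAX_REQS; i++)
		if (in_use[i])
			val |= reqs[i];
	return val;
}

int main(void)
{
	reqs[0] = 0x1; in_use[0] = true;                 /* ADD_REQ    */
	reqs[1] = 0x4; in_use[1] = true;                 /* ADD_REQ    */
	printf("effective: %#x\n", effective_flags());   /* 0x5        */

	in_use[0] = false;                               /* REMOVE_REQ */
	printf("effective: %#x\n", effective_flags());   /* 0x4        */
	return 0;
}
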
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3c9d764eb0d8..7c33ed200410 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -126,7 +126,7 @@ static int swsusp_extents_insert(unsigned long swap_offset) | |||
126 | 126 | ||
127 | /* Figure out where to put the new node */ | 127 | /* Figure out where to put the new node */ |
128 | while (*new) { | 128 | while (*new) { |
129 | ext = container_of(*new, struct swsusp_extent, node); | 129 | ext = rb_entry(*new, struct swsusp_extent, node); |
130 | parent = *new; | 130 | parent = *new; |
131 | if (swap_offset < ext->start) { | 131 | if (swap_offset < ext->start) { |
132 | /* Try to merge */ | 132 | /* Try to merge */ |
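
The swap.c hunk swaps an open-coded container_of() for rb_entry(), the rbtree-specific spelling of the same pointer arithmetic. The standalone sketch below reproduces that arithmetic in userspace; the simplified struct names are placeholders, not swsusp internals, and the macro is a trimmed-down container_of without the kernel's type checking.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node { int key; };          /* stand-in for struct rb_node          */
struct extent {
	unsigned long start, end;
	struct node node;          /* embedded node, like swsusp_extent    */
};

int main(void)
{
	struct extent e = { .start = 1, .end = 2 };
	struct node *n = &e.node;

	/* rb_entry(n, struct extent, node) expands to the same computation. */
	struct extent *back = container_of(n, struct extent, node);
	printf("recovered start=%lu end=%lu\n", back->start, back->end);
	return 0;
}
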
diff --git a/kernel/printk.c b/kernel/printk.c index 66a2ea37b576..19c0d7bcf24a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -87,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem); | |||
87 | struct console *console_drivers; | 87 | struct console *console_drivers; |
88 | EXPORT_SYMBOL_GPL(console_drivers); | 88 | EXPORT_SYMBOL_GPL(console_drivers); |
89 | 89 | ||
90 | #ifdef CONFIG_LOCKDEP | ||
91 | static struct lockdep_map console_lock_dep_map = { | ||
92 | .name = "console_lock" | ||
93 | }; | ||
94 | #endif | ||
95 | |||
90 | /* | 96 | /* |
91 | * This is used for debugging the mess that is the VT code by | 97 | * This is used for debugging the mess that is the VT code by |
92 | * keeping track if we have the console semaphore held. It's | 98 | * keeping track if we have the console semaphore held. It's |
@@ -741,6 +747,21 @@ void __init setup_log_buf(int early) | |||
741 | free, (free * 100) / __LOG_BUF_LEN); | 747 | free, (free * 100) / __LOG_BUF_LEN); |
742 | } | 748 | } |
743 | 749 | ||
750 | static bool __read_mostly ignore_loglevel; | ||
751 | |||
752 | static int __init ignore_loglevel_setup(char *str) | ||
753 | { | ||
754 | ignore_loglevel = 1; | ||
755 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); | ||
756 | |||
757 | return 0; | ||
758 | } | ||
759 | |||
760 | early_param("ignore_loglevel", ignore_loglevel_setup); | ||
761 | module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); | ||
762 | MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | ||
763 | "print all kernel messages to the console."); | ||
764 | |||
744 | #ifdef CONFIG_BOOT_PRINTK_DELAY | 765 | #ifdef CONFIG_BOOT_PRINTK_DELAY |
745 | 766 | ||
746 | static int boot_delay; /* msecs delay after each printk during bootup */ | 767 | static int boot_delay; /* msecs delay after each printk during bootup */ |
@@ -764,13 +785,15 @@ static int __init boot_delay_setup(char *str) | |||
764 | } | 785 | } |
765 | __setup("boot_delay=", boot_delay_setup); | 786 | __setup("boot_delay=", boot_delay_setup); |
766 | 787 | ||
767 | static void boot_delay_msec(void) | 788 | static void boot_delay_msec(int level) |
768 | { | 789 | { |
769 | unsigned long long k; | 790 | unsigned long long k; |
770 | unsigned long timeout; | 791 | unsigned long timeout; |
771 | 792 | ||
772 | if (boot_delay == 0 || system_state != SYSTEM_BOOTING) | 793 | if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) |
794 | || (level >= console_loglevel && !ignore_loglevel)) { | ||
773 | return; | 795 | return; |
796 | } | ||
774 | 797 | ||
775 | k = (unsigned long long)loops_per_msec * boot_delay; | 798 | k = (unsigned long long)loops_per_msec * boot_delay; |
776 | 799 | ||
@@ -789,7 +812,7 @@ static void boot_delay_msec(void) | |||
789 | } | 812 | } |
790 | } | 813 | } |
791 | #else | 814 | #else |
792 | static inline void boot_delay_msec(void) | 815 | static inline void boot_delay_msec(int level) |
793 | { | 816 | { |
794 | } | 817 | } |
795 | #endif | 818 | #endif |
@@ -1232,21 +1255,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | |||
1232 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); | 1255 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
1233 | } | 1256 | } |
1234 | 1257 | ||
1235 | static bool __read_mostly ignore_loglevel; | ||
1236 | |||
1237 | static int __init ignore_loglevel_setup(char *str) | ||
1238 | { | ||
1239 | ignore_loglevel = 1; | ||
1240 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); | ||
1241 | |||
1242 | return 0; | ||
1243 | } | ||
1244 | |||
1245 | early_param("ignore_loglevel", ignore_loglevel_setup); | ||
1246 | module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); | ||
1247 | MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | ||
1248 | "print all kernel messages to the console."); | ||
1249 | |||
1250 | /* | 1258 | /* |
1251 | * Call the console drivers, asking them to write out | 1259 | * Call the console drivers, asking them to write out |
1252 | * log_buf[start] to log_buf[end - 1]. | 1260 | * log_buf[start] to log_buf[end - 1]. |
@@ -1492,7 +1500,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1492 | int this_cpu; | 1500 | int this_cpu; |
1493 | int printed_len = 0; | 1501 | int printed_len = 0; |
1494 | 1502 | ||
1495 | boot_delay_msec(); | 1503 | boot_delay_msec(level); |
1496 | printk_delay(); | 1504 | printk_delay(); |
1497 | 1505 | ||
1498 | /* This stops the holder of console_sem just where we want him */ | 1506 | /* This stops the holder of console_sem just where we want him */ |
@@ -1890,7 +1898,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, | |||
1890 | switch (action) { | 1898 | switch (action) { |
1891 | case CPU_ONLINE: | 1899 | case CPU_ONLINE: |
1892 | case CPU_DEAD: | 1900 | case CPU_DEAD: |
1893 | case CPU_DYING: | ||
1894 | case CPU_DOWN_FAILED: | 1901 | case CPU_DOWN_FAILED: |
1895 | case CPU_UP_CANCELED: | 1902 | case CPU_UP_CANCELED: |
1896 | console_lock(); | 1903 | console_lock(); |
@@ -1909,12 +1916,14 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, | |||
1909 | */ | 1916 | */ |
1910 | void console_lock(void) | 1917 | void console_lock(void) |
1911 | { | 1918 | { |
1912 | BUG_ON(in_interrupt()); | 1919 | might_sleep(); |
1920 | |||
1913 | down(&console_sem); | 1921 | down(&console_sem); |
1914 | if (console_suspended) | 1922 | if (console_suspended) |
1915 | return; | 1923 | return; |
1916 | console_locked = 1; | 1924 | console_locked = 1; |
1917 | console_may_schedule = 1; | 1925 | console_may_schedule = 1; |
1926 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); | ||
1918 | } | 1927 | } |
1919 | EXPORT_SYMBOL(console_lock); | 1928 | EXPORT_SYMBOL(console_lock); |
1920 | 1929 | ||
@@ -1936,6 +1945,7 @@ int console_trylock(void) | |||
1936 | } | 1945 | } |
1937 | console_locked = 1; | 1946 | console_locked = 1; |
1938 | console_may_schedule = 0; | 1947 | console_may_schedule = 0; |
1948 | mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); | ||
1939 | return 1; | 1949 | return 1; |
1940 | } | 1950 | } |
1941 | EXPORT_SYMBOL(console_trylock); | 1951 | EXPORT_SYMBOL(console_trylock); |
@@ -2096,6 +2106,7 @@ skip: | |||
2096 | local_irq_restore(flags); | 2106 | local_irq_restore(flags); |
2097 | } | 2107 | } |
2098 | console_locked = 0; | 2108 | console_locked = 0; |
2109 | mutex_release(&console_lock_dep_map, 1, _RET_IP_); | ||
2099 | 2110 | ||
2100 | /* Release the exclusive_console once it is used */ | 2111 | /* Release the exclusive_console once it is used */ |
2101 | if (unlikely(exclusive_console)) | 2112 | if (unlikely(exclusive_console)) |
diff --git a/kernel/profile.c b/kernel/profile.c index 76b8e77773ee..1f391819c42f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -8,9 +8,10 @@ | |||
8 | * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, | 8 | * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, |
9 | * Red Hat, July 2004 | 9 | * Red Hat, July 2004 |
10 | * Consolidation of architecture support code for profiling, | 10 | * Consolidation of architecture support code for profiling, |
11 | * William Irwin, Oracle, July 2004 | 11 | * Nadia Yvette Chambers, Oracle, July 2004 |
12 | * Amortized hit count accounting via per-cpu open-addressed hashtables | 12 | * Amortized hit count accounting via per-cpu open-addressed hashtables |
13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 | 13 | * to resolve timer interrupt livelocks, Nadia Yvette Chambers, |
14 | * Oracle, 2004 | ||
14 | */ | 15 | */ |
15 | 16 | ||
16 | #include <linux/export.h> | 17 | #include <linux/export.h> |
@@ -256,7 +257,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook); | |||
256 | * pagetable hash functions, but uses a full hashtable full of finite | 257 | * pagetable hash functions, but uses a full hashtable full of finite |
257 | * collision chains, not just pairs of them. | 258 | * collision chains, not just pairs of them. |
258 | * | 259 | * |
259 | * -- wli | 260 | * -- nyc |
260 | */ | 261 | */ |
261 | static void __profile_flip_buffers(void *unused) | 262 | static void __profile_flip_buffers(void *unused) |
262 | { | 263 | { |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a232bb59d93f..1599157336a6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -180,7 +180,8 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) | |||
180 | return has_ns_capability(current, ns, CAP_SYS_PTRACE); | 180 | return has_ns_capability(current, ns, CAP_SYS_PTRACE); |
181 | } | 181 | } |
182 | 182 | ||
183 | int __ptrace_may_access(struct task_struct *task, unsigned int mode) | 183 | /* Returns 0 on success, -errno on denial. */ |
184 | static int __ptrace_may_access(struct task_struct *task, unsigned int mode) | ||
184 | { | 185 | { |
185 | const struct cred *cred = current_cred(), *tcred; | 186 | const struct cred *cred = current_cred(), *tcred; |
186 | 187 | ||
@@ -214,8 +215,12 @@ ok: | |||
214 | smp_rmb(); | 215 | smp_rmb(); |
215 | if (task->mm) | 216 | if (task->mm) |
216 | dumpable = get_dumpable(task->mm); | 217 | dumpable = get_dumpable(task->mm); |
217 | if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) | 218 | rcu_read_lock(); |
219 | if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { | ||
220 | rcu_read_unlock(); | ||
218 | return -EPERM; | 221 | return -EPERM; |
222 | } | ||
223 | rcu_read_unlock(); | ||
219 | 224 | ||
220 | return security_ptrace_access_check(task, mode); | 225 | return security_ptrace_access_check(task, mode); |
221 | } | 226 | } |
@@ -279,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request, | |||
279 | 284 | ||
280 | if (seize) | 285 | if (seize) |
281 | flags |= PT_SEIZED; | 286 | flags |= PT_SEIZED; |
282 | if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) | 287 | rcu_read_lock(); |
288 | if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) | ||
283 | flags |= PT_PTRACE_CAP; | 289 | flags |= PT_PTRACE_CAP; |
290 | rcu_read_unlock(); | ||
284 | task->ptrace = flags; | 291 | task->ptrace = flags; |
285 | 292 | ||
286 | __ptrace_link(task, current); | 293 | __ptrace_link(task, current); |
@@ -456,6 +463,9 @@ void exit_ptrace(struct task_struct *tracer) | |||
456 | return; | 463 | return; |
457 | 464 | ||
458 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { | 465 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { |
466 | if (unlikely(p->ptrace & PT_EXITKILL)) | ||
467 | send_sig_info(SIGKILL, SEND_SIG_FORCED, p); | ||
468 | |||
459 | if (__ptrace_detach(tracer, p)) | 469 | if (__ptrace_detach(tracer, p)) |
460 | list_add(&p->ptrace_entry, &ptrace_dead); | 470 | list_add(&p->ptrace_entry, &ptrace_dead); |
461 | } | 471 | } |
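
The ptrace.c hunk adds PT_EXITKILL handling to exit_ptrace(): a tracee whose tracer requested the flag is sent SIGKILL when the tracer goes away. From userspace the flag is requested as a ptrace option; the sketch below assumes the PTRACE_O_EXITKILL value exported by this series and a kernel that implements it, and trims error handling.

#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <unistd.h>

#ifndef PTRACE_SEIZE
#define PTRACE_SEIZE 0x4206
#endif
#ifndef PTRACE_O_EXITKILL
#define PTRACE_O_EXITKILL 0x00100000   /* option value introduced by this series */
#endif

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		pause();               /* tracee: just wait around             */
		_exit(0);
	}
	if (ptrace(PTRACE_SEIZE, child, NULL, (void *)PTRACE_O_EXITKILL) == -1) {
		perror("PTRACE_SEIZE");
		exit(1);
	}
	printf("tracee %d will be SIGKILLed when this tracer exits\n", child);
	return 0;                      /* exiting here kills the tracee        */
}
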
diff --git a/kernel/rcu.h b/kernel/rcu.h index 8ba99cdc6515..20dfba576c2b 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h | |||
@@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) | |||
109 | } | 109 | } |
110 | } | 110 | } |
111 | 111 | ||
112 | extern int rcu_expedited; | ||
113 | |||
112 | #endif /* __LINUX_RCU_H */ | 114 | #endif /* __LINUX_RCU_H */ |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4e6a61b15e86..a2cf76177b44 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -45,12 +45,16 @@ | |||
45 | #include <linux/mutex.h> | 45 | #include <linux/mutex.h> |
46 | #include <linux/export.h> | 46 | #include <linux/export.h> |
47 | #include <linux/hardirq.h> | 47 | #include <linux/hardirq.h> |
48 | #include <linux/delay.h> | ||
49 | #include <linux/module.h> | ||
48 | 50 | ||
49 | #define CREATE_TRACE_POINTS | 51 | #define CREATE_TRACE_POINTS |
50 | #include <trace/events/rcu.h> | 52 | #include <trace/events/rcu.h> |
51 | 53 | ||
52 | #include "rcu.h" | 54 | #include "rcu.h" |
53 | 55 | ||
56 | module_param(rcu_expedited, int, 0); | ||
57 | |||
54 | #ifdef CONFIG_PREEMPT_RCU | 58 | #ifdef CONFIG_PREEMPT_RCU |
55 | 59 | ||
56 | /* | 60 | /* |
@@ -81,6 +85,9 @@ void __rcu_read_unlock(void) | |||
81 | } else { | 85 | } else { |
82 | barrier(); /* critical section before exit code. */ | 86 | barrier(); /* critical section before exit code. */ |
83 | t->rcu_read_lock_nesting = INT_MIN; | 87 | t->rcu_read_lock_nesting = INT_MIN; |
88 | #ifdef CONFIG_PROVE_RCU_DELAY | ||
89 | udelay(10); /* Make preemption more probable. */ | ||
90 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ | ||
84 | barrier(); /* assign before ->rcu_read_unlock_special load */ | 91 | barrier(); /* assign before ->rcu_read_unlock_special load */ |
85 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 92 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) |
86 | rcu_read_unlock_special(t); | 93 | rcu_read_unlock_special(t); |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 547b1fe5b052..e7dce58f9c2a 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -56,25 +56,28 @@ static void __call_rcu(struct rcu_head *head, | |||
56 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 56 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
57 | 57 | ||
58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ | 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ |
59 | static void rcu_idle_enter_common(long long oldval) | 59 | static void rcu_idle_enter_common(long long newval) |
60 | { | 60 | { |
61 | if (rcu_dynticks_nesting) { | 61 | if (newval) { |
62 | RCU_TRACE(trace_rcu_dyntick("--=", | 62 | RCU_TRACE(trace_rcu_dyntick("--=", |
63 | oldval, rcu_dynticks_nesting)); | 63 | rcu_dynticks_nesting, newval)); |
64 | rcu_dynticks_nesting = newval; | ||
64 | return; | 65 | return; |
65 | } | 66 | } |
66 | RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); | 67 | RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); |
67 | if (!is_idle_task(current)) { | 68 | if (!is_idle_task(current)) { |
68 | struct task_struct *idle = idle_task(smp_processor_id()); | 69 | struct task_struct *idle = idle_task(smp_processor_id()); |
69 | 70 | ||
70 | RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", | 71 | RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", |
71 | oldval, rcu_dynticks_nesting)); | 72 | rcu_dynticks_nesting, newval)); |
72 | ftrace_dump(DUMP_ALL); | 73 | ftrace_dump(DUMP_ALL); |
73 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 74 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
74 | current->pid, current->comm, | 75 | current->pid, current->comm, |
75 | idle->pid, idle->comm); /* must be idle task! */ | 76 | idle->pid, idle->comm); /* must be idle task! */ |
76 | } | 77 | } |
77 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ | 78 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ |
79 | barrier(); | ||
80 | rcu_dynticks_nesting = newval; | ||
78 | } | 81 | } |
79 | 82 | ||
80 | /* | 83 | /* |
@@ -84,17 +87,16 @@ static void rcu_idle_enter_common(long long oldval) | |||
84 | void rcu_idle_enter(void) | 87 | void rcu_idle_enter(void) |
85 | { | 88 | { |
86 | unsigned long flags; | 89 | unsigned long flags; |
87 | long long oldval; | 90 | long long newval; |
88 | 91 | ||
89 | local_irq_save(flags); | 92 | local_irq_save(flags); |
90 | oldval = rcu_dynticks_nesting; | ||
91 | WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); | 93 | WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); |
92 | if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == | 94 | if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == |
93 | DYNTICK_TASK_NEST_VALUE) | 95 | DYNTICK_TASK_NEST_VALUE) |
94 | rcu_dynticks_nesting = 0; | 96 | newval = 0; |
95 | else | 97 | else |
96 | rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | 98 | newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; |
97 | rcu_idle_enter_common(oldval); | 99 | rcu_idle_enter_common(newval); |
98 | local_irq_restore(flags); | 100 | local_irq_restore(flags); |
99 | } | 101 | } |
100 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 102 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
@@ -105,15 +107,15 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); | |||
105 | void rcu_irq_exit(void) | 107 | void rcu_irq_exit(void) |
106 | { | 108 | { |
107 | unsigned long flags; | 109 | unsigned long flags; |
108 | long long oldval; | 110 | long long newval; |
109 | 111 | ||
110 | local_irq_save(flags); | 112 | local_irq_save(flags); |
111 | oldval = rcu_dynticks_nesting; | 113 | newval = rcu_dynticks_nesting - 1; |
112 | rcu_dynticks_nesting--; | 114 | WARN_ON_ONCE(newval < 0); |
113 | WARN_ON_ONCE(rcu_dynticks_nesting < 0); | 115 | rcu_idle_enter_common(newval); |
114 | rcu_idle_enter_common(oldval); | ||
115 | local_irq_restore(flags); | 116 | local_irq_restore(flags); |
116 | } | 117 | } |
118 | EXPORT_SYMBOL_GPL(rcu_irq_exit); | ||
117 | 119 | ||
118 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ | 120 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ |
119 | static void rcu_idle_exit_common(long long oldval) | 121 | static void rcu_idle_exit_common(long long oldval) |
@@ -171,6 +173,7 @@ void rcu_irq_enter(void) | |||
171 | rcu_idle_exit_common(oldval); | 173 | rcu_idle_exit_common(oldval); |
172 | local_irq_restore(flags); | 174 | local_irq_restore(flags); |
173 | } | 175 | } |
176 | EXPORT_SYMBOL_GPL(rcu_irq_enter); | ||
174 | 177 | ||
175 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 178 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
176 | 179 | ||
@@ -192,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); | |||
192 | */ | 195 | */ |
193 | int rcu_is_cpu_rrupt_from_idle(void) | 196 | int rcu_is_cpu_rrupt_from_idle(void) |
194 | { | 197 | { |
195 | return rcu_dynticks_nesting <= 0; | 198 | return rcu_dynticks_nesting <= 1; |
196 | } | 199 | } |
197 | 200 | ||
198 | /* | 201 | /* |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 918fd1e8509c..f85016a2309b 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -278,7 +278,7 @@ static int rcu_boost(void) | |||
278 | rcu_preempt_ctrlblk.exp_tasks == NULL) | 278 | rcu_preempt_ctrlblk.exp_tasks == NULL) |
279 | return 0; /* Nothing to boost. */ | 279 | return 0; /* Nothing to boost. */ |
280 | 280 | ||
281 | raw_local_irq_save(flags); | 281 | local_irq_save(flags); |
282 | 282 | ||
283 | /* | 283 | /* |
284 | * Recheck with irqs disabled: all tasks in need of boosting | 284 | * Recheck with irqs disabled: all tasks in need of boosting |
@@ -287,7 +287,7 @@ static int rcu_boost(void) | |||
287 | */ | 287 | */ |
288 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && | 288 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && |
289 | rcu_preempt_ctrlblk.exp_tasks == NULL) { | 289 | rcu_preempt_ctrlblk.exp_tasks == NULL) { |
290 | raw_local_irq_restore(flags); | 290 | local_irq_restore(flags); |
291 | return 0; | 291 | return 0; |
292 | } | 292 | } |
293 | 293 | ||
@@ -317,7 +317,7 @@ static int rcu_boost(void) | |||
317 | t = container_of(tb, struct task_struct, rcu_node_entry); | 317 | t = container_of(tb, struct task_struct, rcu_node_entry); |
318 | rt_mutex_init_proxy_locked(&mtx, t); | 318 | rt_mutex_init_proxy_locked(&mtx, t); |
319 | t->rcu_boost_mutex = &mtx; | 319 | t->rcu_boost_mutex = &mtx; |
320 | raw_local_irq_restore(flags); | 320 | local_irq_restore(flags); |
321 | rt_mutex_lock(&mtx); | 321 | rt_mutex_lock(&mtx); |
322 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | 322 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
323 | 323 | ||
@@ -706,7 +706,10 @@ void synchronize_rcu(void) | |||
706 | return; | 706 | return; |
707 | 707 | ||
708 | /* Once we get past the fastpath checks, same code as rcu_barrier(). */ | 708 | /* Once we get past the fastpath checks, same code as rcu_barrier(). */ |
709 | rcu_barrier(); | 709 | if (rcu_expedited) |
710 | synchronize_rcu_expedited(); | ||
711 | else | ||
712 | rcu_barrier(); | ||
710 | } | 713 | } |
711 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 714 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
712 | 715 | ||
@@ -991,9 +994,9 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) | |||
991 | { | 994 | { |
992 | unsigned long flags; | 995 | unsigned long flags; |
993 | 996 | ||
994 | raw_local_irq_save(flags); | 997 | local_irq_save(flags); |
995 | rcp->qlen -= n; | 998 | rcp->qlen -= n; |
996 | raw_local_irq_restore(flags); | 999 | local_irq_restore(flags); |
997 | } | 1000 | } |
998 | 1001 | ||
999 | /* | 1002 | /* |
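Two things change in the rcutiny_plugin.h hunks above: the rcu_boost() and rcu_trace_sub_qlen() paths switch from the raw_ irq primitives to the traced local_irq_save()/local_irq_restore() variants, and synchronize_rcu() now honors the rcu_expedited knob. A small userspace sketch of the latter dispatch, with the grace-period calls stubbed out so only the control flow remains:

/*
 * Minimal sketch of the rcu_expedited dispatch added to
 * synchronize_rcu() above, modeled in userspace.  The real code calls
 * synchronize_rcu_expedited() or rcu_barrier(); here both are stubs.
 */
#include <stdbool.h>
#include <stdio.h>

static bool rcu_expedited;      /* in the kernel this is a tunable knob */

static void synchronize_rcu_expedited_stub(void)
{
        puts("expedited grace period");
}

static void rcu_barrier_stub(void)
{
        puts("normal grace period via rcu_barrier()");
}

static void synchronize_rcu_model(void)
{
        /* Once past the fastpath checks, pick the requested flavor. */
        if (rcu_expedited)
                synchronize_rcu_expedited_stub();
        else
                rcu_barrier_stub();
}

int main(void)
{
        synchronize_rcu_model();        /* normal path */
        rcu_expedited = true;
        synchronize_rcu_model();        /* expedited path */
        return 0;
}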
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 25b15033c61f..31dea01c85fd 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -53,10 +53,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@fre | |||
53 | 53 | ||
54 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | 54 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ |
55 | static int nfakewriters = 4; /* # fake writer threads */ | 55 | static int nfakewriters = 4; /* # fake writer threads */ |
56 | static int stat_interval; /* Interval between stats, in seconds. */ | 56 | static int stat_interval = 60; /* Interval between stats, in seconds. */ |
57 | /* Defaults to "only at end of test". */ | 57 | /* Zero means "only at end of test". */ |
58 | static bool verbose; /* Print more debug info. */ | 58 | static bool verbose; /* Print more debug info. */ |
59 | static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | 59 | static bool test_no_idle_hz = true; |
60 | /* Test RCU support for tickless idle CPUs. */ | ||
60 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | 61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ |
61 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | 62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ |
62 | static int irqreader = 1; /* RCU readers from irq (timers). */ | 63 | static int irqreader = 1; /* RCU readers from irq (timers). */ |
@@ -119,11 +120,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | |||
119 | 120 | ||
120 | #define TORTURE_FLAG "-torture:" | 121 | #define TORTURE_FLAG "-torture:" |
121 | #define PRINTK_STRING(s) \ | 122 | #define PRINTK_STRING(s) \ |
122 | do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) | 123 | do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
123 | #define VERBOSE_PRINTK_STRING(s) \ | 124 | #define VERBOSE_PRINTK_STRING(s) \ |
124 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) | 125 | do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
125 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | 126 | #define VERBOSE_PRINTK_ERRSTRING(s) \ |
126 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) | 127 | do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) |
127 | 128 | ||
128 | static char printk_buf[4096]; | 129 | static char printk_buf[4096]; |
129 | 130 | ||
@@ -176,8 +177,14 @@ static long n_rcu_torture_boosts; | |||
176 | static long n_rcu_torture_timers; | 177 | static long n_rcu_torture_timers; |
177 | static long n_offline_attempts; | 178 | static long n_offline_attempts; |
178 | static long n_offline_successes; | 179 | static long n_offline_successes; |
180 | static unsigned long sum_offline; | ||
181 | static int min_offline = -1; | ||
182 | static int max_offline; | ||
179 | static long n_online_attempts; | 183 | static long n_online_attempts; |
180 | static long n_online_successes; | 184 | static long n_online_successes; |
185 | static unsigned long sum_online; | ||
186 | static int min_online = -1; | ||
187 | static int max_online; | ||
181 | static long n_barrier_attempts; | 188 | static long n_barrier_attempts; |
182 | static long n_barrier_successes; | 189 | static long n_barrier_successes; |
183 | static struct list_head rcu_torture_removed; | 190 | static struct list_head rcu_torture_removed; |
@@ -235,7 +242,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, | |||
235 | if (fullstop == FULLSTOP_DONTSTOP) | 242 | if (fullstop == FULLSTOP_DONTSTOP) |
236 | fullstop = FULLSTOP_SHUTDOWN; | 243 | fullstop = FULLSTOP_SHUTDOWN; |
237 | else | 244 | else |
238 | printk(KERN_WARNING /* but going down anyway, so... */ | 245 | pr_warn(/* but going down anyway, so... */ |
239 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | 246 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); |
240 | mutex_unlock(&fullstop_mutex); | 247 | mutex_unlock(&fullstop_mutex); |
241 | return NOTIFY_DONE; | 248 | return NOTIFY_DONE; |
@@ -248,7 +255,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, | |||
248 | static void rcutorture_shutdown_absorb(char *title) | 255 | static void rcutorture_shutdown_absorb(char *title) |
249 | { | 256 | { |
250 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { | 257 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { |
251 | printk(KERN_NOTICE | 258 | pr_notice( |
252 | "rcutorture thread %s parking due to system shutdown\n", | 259 | "rcutorture thread %s parking due to system shutdown\n", |
253 | title); | 260 | title); |
254 | schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); | 261 | schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); |
@@ -332,7 +339,6 @@ rcu_stutter_wait(char *title) | |||
332 | 339 | ||
333 | struct rcu_torture_ops { | 340 | struct rcu_torture_ops { |
334 | void (*init)(void); | 341 | void (*init)(void); |
335 | void (*cleanup)(void); | ||
336 | int (*readlock)(void); | 342 | int (*readlock)(void); |
337 | void (*read_delay)(struct rcu_random_state *rrsp); | 343 | void (*read_delay)(struct rcu_random_state *rrsp); |
338 | void (*readunlock)(int idx); | 344 | void (*readunlock)(int idx); |
@@ -424,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) | |||
424 | 430 | ||
425 | static struct rcu_torture_ops rcu_ops = { | 431 | static struct rcu_torture_ops rcu_ops = { |
426 | .init = NULL, | 432 | .init = NULL, |
427 | .cleanup = NULL, | ||
428 | .readlock = rcu_torture_read_lock, | 433 | .readlock = rcu_torture_read_lock, |
429 | .read_delay = rcu_read_delay, | 434 | .read_delay = rcu_read_delay, |
430 | .readunlock = rcu_torture_read_unlock, | 435 | .readunlock = rcu_torture_read_unlock, |
@@ -468,7 +473,6 @@ static void rcu_sync_torture_init(void) | |||
468 | 473 | ||
469 | static struct rcu_torture_ops rcu_sync_ops = { | 474 | static struct rcu_torture_ops rcu_sync_ops = { |
470 | .init = rcu_sync_torture_init, | 475 | .init = rcu_sync_torture_init, |
471 | .cleanup = NULL, | ||
472 | .readlock = rcu_torture_read_lock, | 476 | .readlock = rcu_torture_read_lock, |
473 | .read_delay = rcu_read_delay, | 477 | .read_delay = rcu_read_delay, |
474 | .readunlock = rcu_torture_read_unlock, | 478 | .readunlock = rcu_torture_read_unlock, |
@@ -486,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
486 | 490 | ||
487 | static struct rcu_torture_ops rcu_expedited_ops = { | 491 | static struct rcu_torture_ops rcu_expedited_ops = { |
488 | .init = rcu_sync_torture_init, | 492 | .init = rcu_sync_torture_init, |
489 | .cleanup = NULL, | ||
490 | .readlock = rcu_torture_read_lock, | 493 | .readlock = rcu_torture_read_lock, |
491 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 494 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
492 | .readunlock = rcu_torture_read_unlock, | 495 | .readunlock = rcu_torture_read_unlock, |
@@ -529,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | |||
529 | 532 | ||
530 | static struct rcu_torture_ops rcu_bh_ops = { | 533 | static struct rcu_torture_ops rcu_bh_ops = { |
531 | .init = NULL, | 534 | .init = NULL, |
532 | .cleanup = NULL, | ||
533 | .readlock = rcu_bh_torture_read_lock, | 535 | .readlock = rcu_bh_torture_read_lock, |
534 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 536 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
535 | .readunlock = rcu_bh_torture_read_unlock, | 537 | .readunlock = rcu_bh_torture_read_unlock, |
@@ -546,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
546 | 548 | ||
547 | static struct rcu_torture_ops rcu_bh_sync_ops = { | 549 | static struct rcu_torture_ops rcu_bh_sync_ops = { |
548 | .init = rcu_sync_torture_init, | 550 | .init = rcu_sync_torture_init, |
549 | .cleanup = NULL, | ||
550 | .readlock = rcu_bh_torture_read_lock, | 551 | .readlock = rcu_bh_torture_read_lock, |
551 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 552 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
552 | .readunlock = rcu_bh_torture_read_unlock, | 553 | .readunlock = rcu_bh_torture_read_unlock, |
@@ -563,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
563 | 564 | ||
564 | static struct rcu_torture_ops rcu_bh_expedited_ops = { | 565 | static struct rcu_torture_ops rcu_bh_expedited_ops = { |
565 | .init = rcu_sync_torture_init, | 566 | .init = rcu_sync_torture_init, |
566 | .cleanup = NULL, | ||
567 | .readlock = rcu_bh_torture_read_lock, | 567 | .readlock = rcu_bh_torture_read_lock, |
568 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 568 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
569 | .readunlock = rcu_bh_torture_read_unlock, | 569 | .readunlock = rcu_bh_torture_read_unlock, |
@@ -582,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { | |||
582 | * Definitions for srcu torture testing. | 582 | * Definitions for srcu torture testing. |
583 | */ | 583 | */ |
584 | 584 | ||
585 | static struct srcu_struct srcu_ctl; | 585 | DEFINE_STATIC_SRCU(srcu_ctl); |
586 | |||
587 | static void srcu_torture_init(void) | ||
588 | { | ||
589 | init_srcu_struct(&srcu_ctl); | ||
590 | rcu_sync_torture_init(); | ||
591 | } | ||
592 | |||
593 | static void srcu_torture_cleanup(void) | ||
594 | { | ||
595 | synchronize_srcu(&srcu_ctl); | ||
596 | cleanup_srcu_struct(&srcu_ctl); | ||
597 | } | ||
598 | 586 | ||
599 | static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) | 587 | static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) |
600 | { | 588 | { |
@@ -665,8 +653,7 @@ static int srcu_torture_stats(char *page) | |||
665 | } | 653 | } |
666 | 654 | ||
667 | static struct rcu_torture_ops srcu_ops = { | 655 | static struct rcu_torture_ops srcu_ops = { |
668 | .init = srcu_torture_init, | 656 | .init = rcu_sync_torture_init, |
669 | .cleanup = srcu_torture_cleanup, | ||
670 | .readlock = srcu_torture_read_lock, | 657 | .readlock = srcu_torture_read_lock, |
671 | .read_delay = srcu_read_delay, | 658 | .read_delay = srcu_read_delay, |
672 | .readunlock = srcu_torture_read_unlock, | 659 | .readunlock = srcu_torture_read_unlock, |
@@ -680,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = { | |||
680 | }; | 667 | }; |
681 | 668 | ||
682 | static struct rcu_torture_ops srcu_sync_ops = { | 669 | static struct rcu_torture_ops srcu_sync_ops = { |
683 | .init = srcu_torture_init, | 670 | .init = rcu_sync_torture_init, |
684 | .cleanup = srcu_torture_cleanup, | ||
685 | .readlock = srcu_torture_read_lock, | 671 | .readlock = srcu_torture_read_lock, |
686 | .read_delay = srcu_read_delay, | 672 | .read_delay = srcu_read_delay, |
687 | .readunlock = srcu_torture_read_unlock, | 673 | .readunlock = srcu_torture_read_unlock, |
@@ -705,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) | |||
705 | } | 691 | } |
706 | 692 | ||
707 | static struct rcu_torture_ops srcu_raw_ops = { | 693 | static struct rcu_torture_ops srcu_raw_ops = { |
708 | .init = srcu_torture_init, | 694 | .init = rcu_sync_torture_init, |
709 | .cleanup = srcu_torture_cleanup, | ||
710 | .readlock = srcu_torture_read_lock_raw, | 695 | .readlock = srcu_torture_read_lock_raw, |
711 | .read_delay = srcu_read_delay, | 696 | .read_delay = srcu_read_delay, |
712 | .readunlock = srcu_torture_read_unlock_raw, | 697 | .readunlock = srcu_torture_read_unlock_raw, |
@@ -720,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = { | |||
720 | }; | 705 | }; |
721 | 706 | ||
722 | static struct rcu_torture_ops srcu_raw_sync_ops = { | 707 | static struct rcu_torture_ops srcu_raw_sync_ops = { |
723 | .init = srcu_torture_init, | 708 | .init = rcu_sync_torture_init, |
724 | .cleanup = srcu_torture_cleanup, | ||
725 | .readlock = srcu_torture_read_lock_raw, | 709 | .readlock = srcu_torture_read_lock_raw, |
726 | .read_delay = srcu_read_delay, | 710 | .read_delay = srcu_read_delay, |
727 | .readunlock = srcu_torture_read_unlock_raw, | 711 | .readunlock = srcu_torture_read_unlock_raw, |
@@ -740,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void) | |||
740 | } | 724 | } |
741 | 725 | ||
742 | static struct rcu_torture_ops srcu_expedited_ops = { | 726 | static struct rcu_torture_ops srcu_expedited_ops = { |
743 | .init = srcu_torture_init, | 727 | .init = rcu_sync_torture_init, |
744 | .cleanup = srcu_torture_cleanup, | ||
745 | .readlock = srcu_torture_read_lock, | 728 | .readlock = srcu_torture_read_lock, |
746 | .read_delay = srcu_read_delay, | 729 | .read_delay = srcu_read_delay, |
747 | .readunlock = srcu_torture_read_unlock, | 730 | .readunlock = srcu_torture_read_unlock, |
@@ -776,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) | |||
776 | 759 | ||
777 | static struct rcu_torture_ops sched_ops = { | 760 | static struct rcu_torture_ops sched_ops = { |
778 | .init = rcu_sync_torture_init, | 761 | .init = rcu_sync_torture_init, |
779 | .cleanup = NULL, | ||
780 | .readlock = sched_torture_read_lock, | 762 | .readlock = sched_torture_read_lock, |
781 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 763 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
782 | .readunlock = sched_torture_read_unlock, | 764 | .readunlock = sched_torture_read_unlock, |
@@ -792,7 +774,6 @@ static struct rcu_torture_ops sched_ops = { | |||
792 | 774 | ||
793 | static struct rcu_torture_ops sched_sync_ops = { | 775 | static struct rcu_torture_ops sched_sync_ops = { |
794 | .init = rcu_sync_torture_init, | 776 | .init = rcu_sync_torture_init, |
795 | .cleanup = NULL, | ||
796 | .readlock = sched_torture_read_lock, | 777 | .readlock = sched_torture_read_lock, |
797 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 778 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
798 | .readunlock = sched_torture_read_unlock, | 779 | .readunlock = sched_torture_read_unlock, |
@@ -807,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = { | |||
807 | 788 | ||
808 | static struct rcu_torture_ops sched_expedited_ops = { | 789 | static struct rcu_torture_ops sched_expedited_ops = { |
809 | .init = rcu_sync_torture_init, | 790 | .init = rcu_sync_torture_init, |
810 | .cleanup = NULL, | ||
811 | .readlock = sched_torture_read_lock, | 791 | .readlock = sched_torture_read_lock, |
812 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 792 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
813 | .readunlock = sched_torture_read_unlock, | 793 | .readunlock = sched_torture_read_unlock, |
@@ -1214,11 +1194,13 @@ rcu_torture_printk(char *page) | |||
1214 | n_rcu_torture_boost_failure, | 1194 | n_rcu_torture_boost_failure, |
1215 | n_rcu_torture_boosts, | 1195 | n_rcu_torture_boosts, |
1216 | n_rcu_torture_timers); | 1196 | n_rcu_torture_timers); |
1217 | cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", | 1197 | cnt += sprintf(&page[cnt], |
1218 | n_online_successes, | 1198 | "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", |
1219 | n_online_attempts, | 1199 | n_online_successes, n_online_attempts, |
1220 | n_offline_successes, | 1200 | n_offline_successes, n_offline_attempts, |
1221 | n_offline_attempts); | 1201 | min_online, max_online, |
1202 | min_offline, max_offline, | ||
1203 | sum_online, sum_offline, HZ); | ||
1222 | cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", | 1204 | cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", |
1223 | n_barrier_successes, | 1205 | n_barrier_successes, |
1224 | n_barrier_attempts, | 1206 | n_barrier_attempts, |
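For orientation, the expanded onoff line in rcu_torture_printk() renders roughly as shown below; every number here is invented purely to illustrate how the new min/max/sum fields read.

/* Illustrative only: the expanded onoff stats format with made-up values. */
#include <stdio.h>

int main(void)
{
        long n_online_successes = 10, n_online_attempts = 10;
        long n_offline_successes = 9, n_offline_attempts = 10;
        int min_online = 12, max_online = 87;
        int min_offline = 30, max_offline = 241;
        unsigned long sum_online = 345, sum_offline = 1023;
        int hz = 250;                   /* placeholder for HZ */

        printf("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d)\n",
               n_online_successes, n_online_attempts,
               n_offline_successes, n_offline_attempts,
               min_online, max_online,
               min_offline, max_offline,
               sum_online, sum_offline, hz);
        return 0;
}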
@@ -1267,7 +1249,7 @@ rcu_torture_stats_print(void) | |||
1267 | int cnt; | 1249 | int cnt; |
1268 | 1250 | ||
1269 | cnt = rcu_torture_printk(printk_buf); | 1251 | cnt = rcu_torture_printk(printk_buf); |
1270 | printk(KERN_ALERT "%s", printk_buf); | 1252 | pr_alert("%s", printk_buf); |
1271 | } | 1253 | } |
1272 | 1254 | ||
1273 | /* | 1255 | /* |
@@ -1380,20 +1362,24 @@ rcu_torture_stutter(void *arg) | |||
1380 | static inline void | 1362 | static inline void |
1381 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | 1363 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) |
1382 | { | 1364 | { |
1383 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1365 | pr_alert("%s" TORTURE_FLAG |
1384 | "--- %s: nreaders=%d nfakewriters=%d " | 1366 | "--- %s: nreaders=%d nfakewriters=%d " |
1385 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 1367 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
1386 | "shuffle_interval=%d stutter=%d irqreader=%d " | 1368 | "shuffle_interval=%d stutter=%d irqreader=%d " |
1387 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " | 1369 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
1388 | "test_boost=%d/%d test_boost_interval=%d " | 1370 | "test_boost=%d/%d test_boost_interval=%d " |
1389 | "test_boost_duration=%d shutdown_secs=%d " | 1371 | "test_boost_duration=%d shutdown_secs=%d " |
1390 | "onoff_interval=%d onoff_holdoff=%d\n", | 1372 | "stall_cpu=%d stall_cpu_holdoff=%d " |
1391 | torture_type, tag, nrealreaders, nfakewriters, | 1373 | "n_barrier_cbs=%d " |
1392 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1374 | "onoff_interval=%d onoff_holdoff=%d\n", |
1393 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, | 1375 | torture_type, tag, nrealreaders, nfakewriters, |
1394 | test_boost, cur_ops->can_boost, | 1376 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1395 | test_boost_interval, test_boost_duration, shutdown_secs, | 1377 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
1396 | onoff_interval, onoff_holdoff); | 1378 | test_boost, cur_ops->can_boost, |
1379 | test_boost_interval, test_boost_duration, shutdown_secs, | ||
1380 | stall_cpu, stall_cpu_holdoff, | ||
1381 | n_barrier_cbs, | ||
1382 | onoff_interval, onoff_holdoff); | ||
1397 | } | 1383 | } |
1398 | 1384 | ||
1399 | static struct notifier_block rcutorture_shutdown_nb = { | 1385 | static struct notifier_block rcutorture_shutdown_nb = { |
@@ -1460,9 +1446,9 @@ rcu_torture_shutdown(void *arg) | |||
1460 | !kthread_should_stop()) { | 1446 | !kthread_should_stop()) { |
1461 | delta = shutdown_time - jiffies_snap; | 1447 | delta = shutdown_time - jiffies_snap; |
1462 | if (verbose) | 1448 | if (verbose) |
1463 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1449 | pr_alert("%s" TORTURE_FLAG |
1464 | "rcu_torture_shutdown task: %lu jiffies remaining\n", | 1450 | "rcu_torture_shutdown task: %lu jiffies remaining\n", |
1465 | torture_type, delta); | 1451 | torture_type, delta); |
1466 | schedule_timeout_interruptible(delta); | 1452 | schedule_timeout_interruptible(delta); |
1467 | jiffies_snap = ACCESS_ONCE(jiffies); | 1453 | jiffies_snap = ACCESS_ONCE(jiffies); |
1468 | } | 1454 | } |
@@ -1490,8 +1476,11 @@ static int __cpuinit | |||
1490 | rcu_torture_onoff(void *arg) | 1476 | rcu_torture_onoff(void *arg) |
1491 | { | 1477 | { |
1492 | int cpu; | 1478 | int cpu; |
1479 | unsigned long delta; | ||
1493 | int maxcpu = -1; | 1480 | int maxcpu = -1; |
1494 | DEFINE_RCU_RANDOM(rand); | 1481 | DEFINE_RCU_RANDOM(rand); |
1482 | int ret; | ||
1483 | unsigned long starttime; | ||
1495 | 1484 | ||
1496 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); | 1485 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); |
1497 | for_each_online_cpu(cpu) | 1486 | for_each_online_cpu(cpu) |
@@ -1506,29 +1495,57 @@ rcu_torture_onoff(void *arg) | |||
1506 | cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); | 1495 | cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); |
1507 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { | 1496 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { |
1508 | if (verbose) | 1497 | if (verbose) |
1509 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1498 | pr_alert("%s" TORTURE_FLAG |
1510 | "rcu_torture_onoff task: offlining %d\n", | 1499 | "rcu_torture_onoff task: offlining %d\n", |
1511 | torture_type, cpu); | 1500 | torture_type, cpu); |
1501 | starttime = jiffies; | ||
1512 | n_offline_attempts++; | 1502 | n_offline_attempts++; |
1513 | if (cpu_down(cpu) == 0) { | 1503 | ret = cpu_down(cpu); |
1504 | if (ret) { | ||
1505 | if (verbose) | ||
1506 | pr_alert("%s" TORTURE_FLAG | ||
1507 | "rcu_torture_onoff task: offline %d failed: errno %d\n", | ||
1508 | torture_type, cpu, ret); | ||
1509 | } else { | ||
1514 | if (verbose) | 1510 | if (verbose) |
1515 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1511 | pr_alert("%s" TORTURE_FLAG |
1516 | "rcu_torture_onoff task: offlined %d\n", | 1512 | "rcu_torture_onoff task: offlined %d\n", |
1517 | torture_type, cpu); | 1513 | torture_type, cpu); |
1518 | n_offline_successes++; | 1514 | n_offline_successes++; |
1515 | delta = jiffies - starttime; | ||
1516 | sum_offline += delta; | ||
1517 | if (min_offline < 0) { | ||
1518 | min_offline = delta; | ||
1519 | max_offline = delta; | ||
1520 | } | ||
1521 | if (min_offline > delta) | ||
1522 | min_offline = delta; | ||
1523 | if (max_offline < delta) | ||
1524 | max_offline = delta; | ||
1519 | } | 1525 | } |
1520 | } else if (cpu_is_hotpluggable(cpu)) { | 1526 | } else if (cpu_is_hotpluggable(cpu)) { |
1521 | if (verbose) | 1527 | if (verbose) |
1522 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1528 | pr_alert("%s" TORTURE_FLAG |
1523 | "rcu_torture_onoff task: onlining %d\n", | 1529 | "rcu_torture_onoff task: onlining %d\n", |
1524 | torture_type, cpu); | 1530 | torture_type, cpu); |
1531 | starttime = jiffies; | ||
1525 | n_online_attempts++; | 1532 | n_online_attempts++; |
1526 | if (cpu_up(cpu) == 0) { | 1533 | if (cpu_up(cpu) == 0) { |
1527 | if (verbose) | 1534 | if (verbose) |
1528 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1535 | pr_alert("%s" TORTURE_FLAG |
1529 | "rcu_torture_onoff task: onlined %d\n", | 1536 | "rcu_torture_onoff task: onlined %d\n", |
1530 | torture_type, cpu); | 1537 | torture_type, cpu); |
1531 | n_online_successes++; | 1538 | n_online_successes++; |
1539 | delta = jiffies - starttime; | ||
1540 | sum_online += delta; | ||
1541 | if (min_online < 0) { | ||
1542 | min_online = delta; | ||
1543 | max_online = delta; | ||
1544 | } | ||
1545 | if (min_online > delta) | ||
1546 | min_online = delta; | ||
1547 | if (max_online < delta) | ||
1548 | max_online = delta; | ||
1532 | } | 1549 | } |
1533 | } | 1550 | } |
1534 | schedule_timeout_interruptible(onoff_interval * HZ); | 1551 | schedule_timeout_interruptible(onoff_interval * HZ); |
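The offline/online bookkeeping added in the hunk above follows a simple sum/min/max pattern, with -1 marking "no sample yet". A standalone userspace sketch of the offline side (the online side is symmetric); the record_offline_delta() helper is invented for illustration:

/*
 * Userspace sketch of the hotplug-latency bookkeeping added around
 * cpu_down()/cpu_up(): accumulate the sum of elapsed jiffies and track
 * min/max, seeding the bounds from the first sample via the -1 sentinel.
 */
#include <stdio.h>

static unsigned long sum_offline;
static int min_offline = -1;            /* -1 means "no sample yet" */
static int max_offline;

static void record_offline_delta(unsigned long delta)
{
        sum_offline += delta;
        if (min_offline < 0) {          /* first sample seeds both bounds */
                min_offline = delta;
                max_offline = delta;
        }
        if (min_offline > (int)delta)
                min_offline = delta;
        if (max_offline < (int)delta)
                max_offline = delta;
}

int main(void)
{
        record_offline_delta(42);
        record_offline_delta(17);
        record_offline_delta(95);
        printf("sum=%lu min=%d max=%d\n", sum_offline, min_offline, max_offline);
        return 0;
}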
@@ -1593,14 +1610,14 @@ static int __cpuinit rcu_torture_stall(void *args) | |||
1593 | if (!kthread_should_stop()) { | 1610 | if (!kthread_should_stop()) { |
1594 | stop_at = get_seconds() + stall_cpu; | 1611 | stop_at = get_seconds() + stall_cpu; |
1595 | /* RCU CPU stall is expected behavior in following code. */ | 1612 | /* RCU CPU stall is expected behavior in following code. */ |
1596 | printk(KERN_ALERT "rcu_torture_stall start.\n"); | 1613 | pr_alert("rcu_torture_stall start.\n"); |
1597 | rcu_read_lock(); | 1614 | rcu_read_lock(); |
1598 | preempt_disable(); | 1615 | preempt_disable(); |
1599 | while (ULONG_CMP_LT(get_seconds(), stop_at)) | 1616 | while (ULONG_CMP_LT(get_seconds(), stop_at)) |
1600 | continue; /* Induce RCU CPU stall warning. */ | 1617 | continue; /* Induce RCU CPU stall warning. */ |
1601 | preempt_enable(); | 1618 | preempt_enable(); |
1602 | rcu_read_unlock(); | 1619 | rcu_read_unlock(); |
1603 | printk(KERN_ALERT "rcu_torture_stall end.\n"); | 1620 | pr_alert("rcu_torture_stall end.\n"); |
1604 | } | 1621 | } |
1605 | rcutorture_shutdown_absorb("rcu_torture_stall"); | 1622 | rcutorture_shutdown_absorb("rcu_torture_stall"); |
1606 | while (!kthread_should_stop()) | 1623 | while (!kthread_should_stop()) |
@@ -1716,12 +1733,12 @@ static int rcu_torture_barrier_init(void) | |||
1716 | if (n_barrier_cbs == 0) | 1733 | if (n_barrier_cbs == 0) |
1717 | return 0; | 1734 | return 0; |
1718 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { | 1735 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { |
1719 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1736 | pr_alert("%s" TORTURE_FLAG |
1720 | " Call or barrier ops missing for %s,\n", | 1737 | " Call or barrier ops missing for %s,\n", |
1721 | torture_type, cur_ops->name); | 1738 | torture_type, cur_ops->name); |
1722 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1739 | pr_alert("%s" TORTURE_FLAG |
1723 | " RCU barrier testing omitted from run.\n", | 1740 | " RCU barrier testing omitted from run.\n", |
1724 | torture_type); | 1741 | torture_type); |
1725 | return 0; | 1742 | return 0; |
1726 | } | 1743 | } |
1727 | atomic_set(&barrier_cbs_count, 0); | 1744 | atomic_set(&barrier_cbs_count, 0); |
@@ -1814,7 +1831,7 @@ rcu_torture_cleanup(void) | |||
1814 | mutex_lock(&fullstop_mutex); | 1831 | mutex_lock(&fullstop_mutex); |
1815 | rcutorture_record_test_transition(); | 1832 | rcutorture_record_test_transition(); |
1816 | if (fullstop == FULLSTOP_SHUTDOWN) { | 1833 | if (fullstop == FULLSTOP_SHUTDOWN) { |
1817 | printk(KERN_WARNING /* but going down anyway, so... */ | 1834 | pr_warn(/* but going down anyway, so... */ |
1818 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | 1835 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); |
1819 | mutex_unlock(&fullstop_mutex); | 1836 | mutex_unlock(&fullstop_mutex); |
1820 | schedule_timeout_uninterruptible(10); | 1837 | schedule_timeout_uninterruptible(10); |
@@ -1903,8 +1920,6 @@ rcu_torture_cleanup(void) | |||
1903 | 1920 | ||
1904 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 1921 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
1905 | 1922 | ||
1906 | if (cur_ops->cleanup) | ||
1907 | cur_ops->cleanup(); | ||
1908 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) | 1923 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) |
1909 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | 1924 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1910 | else if (n_online_successes != n_online_attempts || | 1925 | else if (n_online_successes != n_online_attempts || |
@@ -1938,17 +1953,17 @@ rcu_torture_init(void) | |||
1938 | break; | 1953 | break; |
1939 | } | 1954 | } |
1940 | if (i == ARRAY_SIZE(torture_ops)) { | 1955 | if (i == ARRAY_SIZE(torture_ops)) { |
1941 | printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", | 1956 | pr_alert("rcu-torture: invalid torture type: \"%s\"\n", |
1942 | torture_type); | 1957 | torture_type); |
1943 | printk(KERN_ALERT "rcu-torture types:"); | 1958 | pr_alert("rcu-torture types:"); |
1944 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) | 1959 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) |
1945 | printk(KERN_ALERT " %s", torture_ops[i]->name); | 1960 | pr_alert(" %s", torture_ops[i]->name); |
1946 | printk(KERN_ALERT "\n"); | 1961 | pr_alert("\n"); |
1947 | mutex_unlock(&fullstop_mutex); | 1962 | mutex_unlock(&fullstop_mutex); |
1948 | return -EINVAL; | 1963 | return -EINVAL; |
1949 | } | 1964 | } |
1950 | if (cur_ops->fqs == NULL && fqs_duration != 0) { | 1965 | if (cur_ops->fqs == NULL && fqs_duration != 0) { |
1951 | printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); | 1966 | pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); |
1952 | fqs_duration = 0; | 1967 | fqs_duration = 0; |
1953 | } | 1968 | } |
1954 | if (cur_ops->init) | 1969 | if (cur_ops->init) |
@@ -1996,14 +2011,15 @@ rcu_torture_init(void) | |||
1996 | /* Start up the kthreads. */ | 2011 | /* Start up the kthreads. */ |
1997 | 2012 | ||
1998 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); | 2013 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); |
1999 | writer_task = kthread_run(rcu_torture_writer, NULL, | 2014 | writer_task = kthread_create(rcu_torture_writer, NULL, |
2000 | "rcu_torture_writer"); | 2015 | "rcu_torture_writer"); |
2001 | if (IS_ERR(writer_task)) { | 2016 | if (IS_ERR(writer_task)) { |
2002 | firsterr = PTR_ERR(writer_task); | 2017 | firsterr = PTR_ERR(writer_task); |
2003 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); | 2018 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); |
2004 | writer_task = NULL; | 2019 | writer_task = NULL; |
2005 | goto unwind; | 2020 | goto unwind; |
2006 | } | 2021 | } |
2022 | wake_up_process(writer_task); | ||
2007 | fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), | 2023 | fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), |
2008 | GFP_KERNEL); | 2024 | GFP_KERNEL); |
2009 | if (fakewriter_tasks == NULL) { | 2025 | if (fakewriter_tasks == NULL) { |
@@ -2118,14 +2134,15 @@ rcu_torture_init(void) | |||
2118 | } | 2134 | } |
2119 | if (shutdown_secs > 0) { | 2135 | if (shutdown_secs > 0) { |
2120 | shutdown_time = jiffies + shutdown_secs * HZ; | 2136 | shutdown_time = jiffies + shutdown_secs * HZ; |
2121 | shutdown_task = kthread_run(rcu_torture_shutdown, NULL, | 2137 | shutdown_task = kthread_create(rcu_torture_shutdown, NULL, |
2122 | "rcu_torture_shutdown"); | 2138 | "rcu_torture_shutdown"); |
2123 | if (IS_ERR(shutdown_task)) { | 2139 | if (IS_ERR(shutdown_task)) { |
2124 | firsterr = PTR_ERR(shutdown_task); | 2140 | firsterr = PTR_ERR(shutdown_task); |
2125 | VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); | 2141 | VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); |
2126 | shutdown_task = NULL; | 2142 | shutdown_task = NULL; |
2127 | goto unwind; | 2143 | goto unwind; |
2128 | } | 2144 | } |
2145 | wake_up_process(shutdown_task); | ||
2129 | } | 2146 | } |
2130 | i = rcu_torture_onoff_init(); | 2147 | i = rcu_torture_onoff_init(); |
2131 | if (i != 0) { | 2148 | if (i != 0) { |
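The writer and shutdown threads above move from kthread_run() to kthread_create() followed by an explicit wake_up_process(); kthread_create() leaves the new thread sleeping until it is woken. A sketch of that pattern as a trivial module; it only builds inside a kernel tree, and the thread body here is a placeholder rather than anything rcutorture does.

/*
 * Sketch of the kthread_create() + wake_up_process() pattern the diff
 * switches to, packaged as a minimal module.  The work loop is a stub.
 */
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *demo_task;

static int demo_threadfn(void *unused)
{
        while (!kthread_should_stop())
                msleep(100);            /* placeholder work loop */
        return 0;
}

static int __init demo_init(void)
{
        /* Create the thread sleeping so nothing runs before we are ready... */
        demo_task = kthread_create(demo_threadfn, NULL, "demo_kthread");
        if (IS_ERR(demo_task))
                return PTR_ERR(demo_task);
        /* ...then explicitly let it run. */
        wake_up_process(demo_task);
        return 0;
}

static void __exit demo_exit(void)
{
        kthread_stop(demo_task);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");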
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f280e542e3e9..e441b77b614e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -52,6 +52,7 @@ | |||
52 | #include <linux/prefetch.h> | 52 | #include <linux/prefetch.h> |
53 | #include <linux/delay.h> | 53 | #include <linux/delay.h> |
54 | #include <linux/stop_machine.h> | 54 | #include <linux/stop_machine.h> |
55 | #include <linux/random.h> | ||
55 | 56 | ||
56 | #include "rcutree.h" | 57 | #include "rcutree.h" |
57 | #include <trace/events/rcu.h> | 58 | #include <trace/events/rcu.h> |
@@ -61,18 +62,19 @@ | |||
61 | /* Data structures. */ | 62 | /* Data structures. */ |
62 | 63 | ||
63 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 64 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
65 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | ||
64 | 66 | ||
65 | #define RCU_STATE_INITIALIZER(sname, cr) { \ | 67 | #define RCU_STATE_INITIALIZER(sname, cr) { \ |
66 | .level = { &sname##_state.node[0] }, \ | 68 | .level = { &sname##_state.node[0] }, \ |
67 | .call = cr, \ | 69 | .call = cr, \ |
68 | .fqs_state = RCU_GP_IDLE, \ | 70 | .fqs_state = RCU_GP_IDLE, \ |
69 | .gpnum = -300, \ | 71 | .gpnum = 0UL - 300UL, \ |
70 | .completed = -300, \ | 72 | .completed = 0UL - 300UL, \ |
71 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ | 73 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ |
72 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 74 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ |
73 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 75 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
74 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
75 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ | 77 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ |
76 | .name = #sname, \ | 78 | .name = #sname, \ |
77 | } | 79 | } |
78 | 80 | ||
@@ -88,7 +90,7 @@ LIST_HEAD(rcu_struct_flavors); | |||
88 | 90 | ||
89 | /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ | 91 | /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ |
90 | static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; | 92 | static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; |
91 | module_param(rcu_fanout_leaf, int, 0); | 93 | module_param(rcu_fanout_leaf, int, 0444); |
92 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | 94 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; |
93 | static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ | 95 | static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ |
94 | NUM_RCU_LVL_0, | 96 | NUM_RCU_LVL_0, |
@@ -133,13 +135,12 @@ static int rcu_scheduler_fully_active __read_mostly; | |||
133 | */ | 135 | */ |
134 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | 136 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); |
135 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | 137 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); |
136 | DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); | ||
137 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | 138 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); |
138 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | 139 | DEFINE_PER_CPU(char, rcu_cpu_has_work); |
139 | 140 | ||
140 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 141 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
141 | 142 | ||
142 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | 143 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); |
143 | static void invoke_rcu_core(void); | 144 | static void invoke_rcu_core(void); |
144 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 145 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
145 | 146 | ||
@@ -175,8 +176,6 @@ void rcu_sched_qs(int cpu) | |||
175 | { | 176 | { |
176 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); | 177 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); |
177 | 178 | ||
178 | rdp->passed_quiesce_gpnum = rdp->gpnum; | ||
179 | barrier(); | ||
180 | if (rdp->passed_quiesce == 0) | 179 | if (rdp->passed_quiesce == 0) |
181 | trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); | 180 | trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); |
182 | rdp->passed_quiesce = 1; | 181 | rdp->passed_quiesce = 1; |
@@ -186,8 +185,6 @@ void rcu_bh_qs(int cpu) | |||
186 | { | 185 | { |
187 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | 186 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); |
188 | 187 | ||
189 | rdp->passed_quiesce_gpnum = rdp->gpnum; | ||
190 | barrier(); | ||
191 | if (rdp->passed_quiesce == 0) | 188 | if (rdp->passed_quiesce == 0) |
192 | trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); | 189 | trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); |
193 | rdp->passed_quiesce = 1; | 190 | rdp->passed_quiesce = 1; |
@@ -212,13 +209,13 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | |||
212 | .dynticks = ATOMIC_INIT(1), | 209 | .dynticks = ATOMIC_INIT(1), |
213 | }; | 210 | }; |
214 | 211 | ||
215 | static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 212 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
216 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ | 213 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ |
217 | static int qlowmark = 100; /* Once only this many pending, use blimit. */ | 214 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ |
218 | 215 | ||
219 | module_param(blimit, int, 0); | 216 | module_param(blimit, long, 0444); |
220 | module_param(qhimark, int, 0); | 217 | module_param(qhimark, long, 0444); |
221 | module_param(qlowmark, int, 0); | 218 | module_param(qlowmark, long, 0444); |
222 | 219 | ||
223 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | 220 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ |
224 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | 221 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; |
@@ -226,7 +223,14 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | |||
226 | module_param(rcu_cpu_stall_suppress, int, 0644); | 223 | module_param(rcu_cpu_stall_suppress, int, 0644); |
227 | module_param(rcu_cpu_stall_timeout, int, 0644); | 224 | module_param(rcu_cpu_stall_timeout, int, 0644); |
228 | 225 | ||
229 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); | 226 | static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; |
227 | static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; | ||
228 | |||
229 | module_param(jiffies_till_first_fqs, ulong, 0644); | ||
230 | module_param(jiffies_till_next_fqs, ulong, 0644); | ||
231 | |||
232 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); | ||
233 | static void force_quiescent_state(struct rcu_state *rsp); | ||
230 | static int rcu_pending(int cpu); | 234 | static int rcu_pending(int cpu); |
231 | 235 | ||
232 | /* | 236 | /* |
@@ -252,7 +256,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | |||
252 | */ | 256 | */ |
253 | void rcu_bh_force_quiescent_state(void) | 257 | void rcu_bh_force_quiescent_state(void) |
254 | { | 258 | { |
255 | force_quiescent_state(&rcu_bh_state, 0); | 259 | force_quiescent_state(&rcu_bh_state); |
256 | } | 260 | } |
257 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 261 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); |
258 | 262 | ||
@@ -286,7 +290,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress); | |||
286 | */ | 290 | */ |
287 | void rcu_sched_force_quiescent_state(void) | 291 | void rcu_sched_force_quiescent_state(void) |
288 | { | 292 | { |
289 | force_quiescent_state(&rcu_sched_state, 0); | 293 | force_quiescent_state(&rcu_sched_state); |
290 | } | 294 | } |
291 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | 295 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); |
292 | 296 | ||
@@ -296,7 +300,8 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | |||
296 | static int | 300 | static int |
297 | cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | 301 | cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) |
298 | { | 302 | { |
299 | return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; | 303 | return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && |
304 | rdp->nxttail[RCU_DONE_TAIL] != NULL; | ||
300 | } | 305 | } |
301 | 306 | ||
302 | /* | 307 | /* |
@@ -305,7 +310,12 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | |||
305 | static int | 310 | static int |
306 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | 311 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) |
307 | { | 312 | { |
308 | return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); | 313 | struct rcu_head **ntp; |
314 | |||
315 | ntp = rdp->nxttail[RCU_DONE_TAIL + | ||
316 | (ACCESS_ONCE(rsp->completed) != rdp->completed)]; | ||
317 | return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && | ||
318 | !rcu_gp_in_progress(rsp); | ||
309 | } | 319 | } |
310 | 320 | ||
311 | /* | 321 | /* |
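The rewritten cpu_needs_another_gp() above indexes ->nxttail with a boolean: when this CPU has not yet caught up with ->completed, the comparison evaluates to 1 and the check moves from the RCU_DONE_TAIL segment to the one after it. A userspace illustration of that indexing trick, with the segmented callback list faked by plain pointers:

/*
 * Illustration of boolean-as-index segment selection.  The segment
 * names are copied from the diff; the list itself is faked.
 */
#include <stdio.h>

enum { RCU_DONE_TAIL = 0, RCU_WAIT_TAIL = 1, RCU_NEXT_SIZE = 4 };

int main(void)
{
        int dummy_cb = 1;                       /* stands in for a callback */
        int *nxttail[RCU_NEXT_SIZE] = { NULL, &dummy_cb, NULL, NULL };
        unsigned long rsp_completed = 5, rdp_completed = 4;

        /* false -> index 0 (DONE segment), true -> index 1 (the next one). */
        int idx = RCU_DONE_TAIL + (rsp_completed != rdp_completed);

        printf("selected segment %d, has callback: %s\n",
               idx, nxttail[idx] && *nxttail[idx] ? "yes" : "no");
        return 0;
}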
@@ -317,45 +327,17 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | |||
317 | } | 327 | } |
318 | 328 | ||
319 | /* | 329 | /* |
320 | * If the specified CPU is offline, tell the caller that it is in | 330 | * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state |
321 | * a quiescent state. Otherwise, whack it with a reschedule IPI. | ||
322 | * Grace periods can end up waiting on an offline CPU when that | ||
323 | * CPU is in the process of coming online -- it will be added to the | ||
324 | * rcu_node bitmasks before it actually makes it online. The same thing | ||
325 | * can happen while a CPU is in the process of coming online. Because this | ||
326 | * race is quite rare, we check for it after detecting that the grace | ||
327 | * period has been delayed rather than checking each and every CPU | ||
328 | * each and every time we start a new grace period. | ||
329 | */ | ||
330 | static int rcu_implicit_offline_qs(struct rcu_data *rdp) | ||
331 | { | ||
332 | /* | ||
333 | * If the CPU is offline for more than a jiffy, it is in a quiescent | ||
334 | * state. We can trust its state not to change because interrupts | ||
335 | * are disabled. The reason for the jiffy's worth of slack is to | ||
336 | * handle CPUs initializing on the way up and finding their way | ||
337 | * to the idle loop on the way down. | ||
338 | */ | ||
339 | if (cpu_is_offline(rdp->cpu) && | ||
340 | ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) { | ||
341 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); | ||
342 | rdp->offline_fqs++; | ||
343 | return 1; | ||
344 | } | ||
345 | return 0; | ||
346 | } | ||
347 | |||
348 | /* | ||
349 | * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle | ||
350 | * | 331 | * |
351 | * If the new value of the ->dynticks_nesting counter now is zero, | 332 | * If the new value of the ->dynticks_nesting counter now is zero, |
352 | * we really have entered idle, and must do the appropriate accounting. | 333 | * we really have entered idle, and must do the appropriate accounting. |
353 | * The caller must have disabled interrupts. | 334 | * The caller must have disabled interrupts. |
354 | */ | 335 | */ |
355 | static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | 336 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, |
337 | bool user) | ||
356 | { | 338 | { |
357 | trace_rcu_dyntick("Start", oldval, 0); | 339 | trace_rcu_dyntick("Start", oldval, 0); |
358 | if (!is_idle_task(current)) { | 340 | if (!user && !is_idle_task(current)) { |
359 | struct task_struct *idle = idle_task(smp_processor_id()); | 341 | struct task_struct *idle = idle_task(smp_processor_id()); |
360 | 342 | ||
361 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); | 343 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); |
@@ -372,7 +354,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | |||
372 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 354 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
373 | 355 | ||
374 | /* | 356 | /* |
375 | * The idle task is not permitted to enter the idle loop while | 357 | * It is illegal to enter an extended quiescent state while |
376 | * in an RCU read-side critical section. | 358 | * in an RCU read-side critical section. |
377 | */ | 359 | */ |
378 | rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), | 360 | rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), |
@@ -383,6 +365,25 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | |||
383 | "Illegal idle entry in RCU-sched read-side critical section."); | 365 | "Illegal idle entry in RCU-sched read-side critical section."); |
384 | } | 366 | } |
385 | 367 | ||
368 | /* | ||
369 | * Enter an RCU extended quiescent state, which can be either the | ||
370 | * idle loop or adaptive-tickless usermode execution. | ||
371 | */ | ||
372 | static void rcu_eqs_enter(bool user) | ||
373 | { | ||
374 | long long oldval; | ||
375 | struct rcu_dynticks *rdtp; | ||
376 | |||
377 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
378 | oldval = rdtp->dynticks_nesting; | ||
379 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); | ||
380 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) | ||
381 | rdtp->dynticks_nesting = 0; | ||
382 | else | ||
383 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
384 | rcu_eqs_enter_common(rdtp, oldval, user); | ||
385 | } | ||
386 | |||
386 | /** | 387 | /** |
387 | * rcu_idle_enter - inform RCU that current CPU is entering idle | 388 | * rcu_idle_enter - inform RCU that current CPU is entering idle |
388 | * | 389 | * |
@@ -398,21 +399,48 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | |||
398 | void rcu_idle_enter(void) | 399 | void rcu_idle_enter(void) |
399 | { | 400 | { |
400 | unsigned long flags; | 401 | unsigned long flags; |
401 | long long oldval; | 402 | |
403 | local_irq_save(flags); | ||
404 | rcu_eqs_enter(false); | ||
405 | local_irq_restore(flags); | ||
406 | } | ||
407 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | ||
408 | |||
409 | #ifdef CONFIG_RCU_USER_QS | ||
410 | /** | ||
411 | * rcu_user_enter - inform RCU that we are resuming userspace. | ||
412 | * | ||
413 | * Enter RCU idle mode right before resuming userspace. No use of RCU | ||
414 | * is permitted between this call and rcu_user_exit(). This way the | ||
415 | * CPU doesn't need to maintain the tick for RCU maintenance purposes | ||
416 | * when the CPU runs in userspace. | ||
417 | */ | ||
418 | void rcu_user_enter(void) | ||
419 | { | ||
420 | rcu_eqs_enter(1); | ||
421 | } | ||
422 | |||
423 | /** | ||
424 | * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace | ||
425 | * after the current irq returns. | ||
426 | * | ||
427 | * This is similar to rcu_user_enter() but in the context of a non-nesting | ||
428 | * irq. After this call, RCU enters into idle mode when the interrupt | ||
429 | * returns. | ||
430 | */ | ||
431 | void rcu_user_enter_after_irq(void) | ||
432 | { | ||
433 | unsigned long flags; | ||
402 | struct rcu_dynticks *rdtp; | 434 | struct rcu_dynticks *rdtp; |
403 | 435 | ||
404 | local_irq_save(flags); | 436 | local_irq_save(flags); |
405 | rdtp = &__get_cpu_var(rcu_dynticks); | 437 | rdtp = &__get_cpu_var(rcu_dynticks); |
406 | oldval = rdtp->dynticks_nesting; | 438 | /* Ensure this irq is interrupting a non-idle RCU state. */ |
407 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); | 439 | WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); |
408 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) | 440 | rdtp->dynticks_nesting = 1; |
409 | rdtp->dynticks_nesting = 0; | ||
410 | else | ||
411 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
412 | rcu_idle_enter_common(rdtp, oldval); | ||
413 | local_irq_restore(flags); | 441 | local_irq_restore(flags); |
414 | } | 442 | } |
415 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 443 | #endif /* CONFIG_RCU_USER_QS */ |
416 | 444 | ||
417 | /** | 445 | /** |
418 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle | 446 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle |
@@ -444,18 +472,19 @@ void rcu_irq_exit(void) | |||
444 | if (rdtp->dynticks_nesting) | 472 | if (rdtp->dynticks_nesting) |
445 | trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); | 473 | trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); |
446 | else | 474 | else |
447 | rcu_idle_enter_common(rdtp, oldval); | 475 | rcu_eqs_enter_common(rdtp, oldval, true); |
448 | local_irq_restore(flags); | 476 | local_irq_restore(flags); |
449 | } | 477 | } |
450 | 478 | ||
451 | /* | 479 | /* |
452 | * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle | 480 | * rcu_eqs_exit_common - current CPU moving away from extended quiescent state |
453 | * | 481 | * |
454 | * If the new value of the ->dynticks_nesting counter was previously zero, | 482 | * If the new value of the ->dynticks_nesting counter was previously zero, |
455 | * we really have exited idle, and must do the appropriate accounting. | 483 | * we really have exited idle, and must do the appropriate accounting. |
456 | * The caller must have disabled interrupts. | 484 | * The caller must have disabled interrupts. |
457 | */ | 485 | */ |
458 | static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | 486 | static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, |
487 | int user) | ||
459 | { | 488 | { |
460 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ | 489 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ |
461 | atomic_inc(&rdtp->dynticks); | 490 | atomic_inc(&rdtp->dynticks); |
@@ -464,7 +493,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | |||
464 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 493 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
465 | rcu_cleanup_after_idle(smp_processor_id()); | 494 | rcu_cleanup_after_idle(smp_processor_id()); |
466 | trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); | 495 | trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); |
467 | if (!is_idle_task(current)) { | 496 | if (!user && !is_idle_task(current)) { |
468 | struct task_struct *idle = idle_task(smp_processor_id()); | 497 | struct task_struct *idle = idle_task(smp_processor_id()); |
469 | 498 | ||
470 | trace_rcu_dyntick("Error on exit: not idle task", | 499 | trace_rcu_dyntick("Error on exit: not idle task", |
@@ -476,6 +505,25 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | |||
476 | } | 505 | } |
477 | } | 506 | } |
478 | 507 | ||
508 | /* | ||
509 | * Exit an RCU extended quiescent state, which can be either the | ||
510 | * idle loop or adaptive-tickless usermode execution. | ||
511 | */ | ||
512 | static void rcu_eqs_exit(bool user) | ||
513 | { | ||
514 | struct rcu_dynticks *rdtp; | ||
515 | long long oldval; | ||
516 | |||
517 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
518 | oldval = rdtp->dynticks_nesting; | ||
519 | WARN_ON_ONCE(oldval < 0); | ||
520 | if (oldval & DYNTICK_TASK_NEST_MASK) | ||
521 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | ||
522 | else | ||
523 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
524 | rcu_eqs_exit_common(rdtp, oldval, user); | ||
525 | } | ||
526 | |||
479 | /** | 527 | /** |
480 | * rcu_idle_exit - inform RCU that current CPU is leaving idle | 528 | * rcu_idle_exit - inform RCU that current CPU is leaving idle |
481 | * | 529 | * |
@@ -490,21 +538,47 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | |||
490 | void rcu_idle_exit(void) | 538 | void rcu_idle_exit(void) |
491 | { | 539 | { |
492 | unsigned long flags; | 540 | unsigned long flags; |
541 | |||
542 | local_irq_save(flags); | ||
543 | rcu_eqs_exit(false); | ||
544 | local_irq_restore(flags); | ||
545 | } | ||
546 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | ||
547 | |||
548 | #ifdef CONFIG_RCU_USER_QS | ||
549 | /** | ||
550 | * rcu_user_exit - inform RCU that we are exiting userspace. | ||
551 | * | ||
552 | * Exit RCU idle mode while entering the kernel because it can | ||
553 | * run a RCU read side critical section anytime. | ||
554 | */ | ||
555 | void rcu_user_exit(void) | ||
556 | { | ||
557 | rcu_eqs_exit(1); | ||
558 | } | ||
559 | |||
560 | /** | ||
561 | * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace | ||
562 | * idle mode after the current non-nesting irq returns. | ||
563 | * | ||
564 | * This is similar to rcu_user_exit() but in the context of an irq. | ||
565 | * This is called when the irq has interrupted a userspace RCU idle mode | ||
566 | * context. When the current non-nesting interrupt returns after this call, | ||
567 | * the CPU won't restore the RCU idle mode. | ||
568 | */ | ||
569 | void rcu_user_exit_after_irq(void) | ||
570 | { | ||
571 | unsigned long flags; | ||
493 | struct rcu_dynticks *rdtp; | 572 | struct rcu_dynticks *rdtp; |
494 | long long oldval; | ||
495 | 573 | ||
496 | local_irq_save(flags); | 574 | local_irq_save(flags); |
497 | rdtp = &__get_cpu_var(rcu_dynticks); | 575 | rdtp = &__get_cpu_var(rcu_dynticks); |
498 | oldval = rdtp->dynticks_nesting; | 576 | /* Ensure we are interrupting an RCU idle mode. */ |
499 | WARN_ON_ONCE(oldval < 0); | 577 | WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); |
500 | if (oldval & DYNTICK_TASK_NEST_MASK) | 578 | rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; |
501 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | ||
502 | else | ||
503 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
504 | rcu_idle_exit_common(rdtp, oldval); | ||
505 | local_irq_restore(flags); | 579 | local_irq_restore(flags); |
506 | } | 580 | } |
507 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 581 | #endif /* CONFIG_RCU_USER_QS */ |
508 | 582 | ||
509 | /** | 583 | /** |
510 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle | 584 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle |
@@ -539,7 +613,7 @@ void rcu_irq_enter(void) | |||
539 | if (oldval) | 613 | if (oldval) |
540 | trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); | 614 | trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); |
541 | else | 615 | else |
542 | rcu_idle_exit_common(rdtp, oldval); | 616 | rcu_eqs_exit_common(rdtp, oldval, true); |
543 | local_irq_restore(flags); | 617 | local_irq_restore(flags); |
544 | } | 618 | } |
545 | 619 | ||
@@ -673,7 +747,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
673 | * Return true if the specified CPU has passed through a quiescent | 747 | * Return true if the specified CPU has passed through a quiescent |
674 | * state by virtue of being in or having passed through an dynticks | 748 | * state by virtue of being in or having passed through an dynticks |
675 | * idle state since the last call to dyntick_save_progress_counter() | 749 | * idle state since the last call to dyntick_save_progress_counter() |
676 | * for this same CPU. | 750 | * for this same CPU, or by virtue of having been offline. |
677 | */ | 751 | */ |
678 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | 752 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) |
679 | { | 753 | { |
@@ -697,8 +771,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
697 | return 1; | 771 | return 1; |
698 | } | 772 | } |
699 | 773 | ||
700 | /* Go check for the CPU being offline. */ | 774 | /* |
701 | return rcu_implicit_offline_qs(rdp); | 775 | * Check for the CPU being offline, but only if the grace period |
776 | * is old enough. We don't need to worry about the CPU changing | ||
777 | * state: If we see it offline even once, it has been through a | ||
778 | * quiescent state. | ||
779 | * | ||
780 | * The reason for insisting that the grace period be at least | ||
781 | * one jiffy old is that CPUs that are not quite online and that | ||
782 | * have just gone offline can still execute RCU read-side critical | ||
783 | * sections. | ||
784 | */ | ||
785 | if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) | ||
786 | return 0; /* Grace period is not old enough. */ | ||
787 | barrier(); | ||
788 | if (cpu_is_offline(rdp->cpu)) { | ||
789 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); | ||
790 | rdp->offline_fqs++; | ||
791 | return 1; | ||
792 | } | ||
793 | return 0; | ||
702 | } | 794 | } |
703 | 795 | ||
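
The ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies) test added above is a wrap-safe unsigned comparison: it stays correct even when the jiffies counter rolls over. A minimal userspace sketch of the idiom (the macro mirrors the kernel's definition; everything else is a toy):

#include <limits.h>
#include <stdio.h>

/* Wrap-safe "a >= b" for free-running unsigned counters such as jiffies. */
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
    unsigned long gp_start = ULONG_MAX - 1; /* grace period began just before wrap */
    unsigned long now = 1;                  /* the counter has since wrapped */

    /* Mirrors the check above: "not old enough" only while now <= gp_start + 2. */
    if (ULONG_CMP_GE(gp_start + 2, now))
        printf("grace period is not old enough\n");
    else
        printf("grace period is old enough to trust cpu_is_offline()\n");
    return 0;
}
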
704 | static int jiffies_till_stall_check(void) | 796 | static int jiffies_till_stall_check(void) |
@@ -725,6 +817,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) | |||
725 | rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); | 817 | rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); |
726 | } | 818 | } |
727 | 819 | ||
820 | /* | ||
821 | * Dump stacks of all tasks running on stalled CPUs. This is a fallback | ||
822 | * for architectures that do not implement trigger_all_cpu_backtrace(). | ||
823 | * The NMI-triggered stack traces are more accurate because they are | ||
824 | * printed by the target CPU. | ||
825 | */ | ||
826 | static void rcu_dump_cpu_stacks(struct rcu_state *rsp) | ||
827 | { | ||
828 | int cpu; | ||
829 | unsigned long flags; | ||
830 | struct rcu_node *rnp; | ||
831 | |||
832 | rcu_for_each_leaf_node(rsp, rnp) { | ||
833 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
834 | if (rnp->qsmask != 0) { | ||
835 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | ||
836 | if (rnp->qsmask & (1UL << cpu)) | ||
837 | dump_cpu_task(rnp->grplo + cpu); | ||
838 | } | ||
839 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
840 | } | ||
841 | } | ||
842 | |||
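
Both rcu_dump_cpu_stacks() above and the stall printout below recover CPU numbers from a leaf's ->qsmask by adding the bit index to ->grplo. A standalone sketch of that bit-to-CPU mapping, with simplified structures that only stand in for the real rcu_node fields:

#include <stdio.h>

struct leaf {                   /* stand-in for one leaf rcu_node */
    unsigned long qsmask;       /* bit i set => CPU (grplo + i) still owes a QS */
    int grplo, grphi;           /* range of CPUs covered by this leaf */
};

static void dump_holdouts(const struct leaf *rnp)
{
    int cpu;

    for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
        if (rnp->qsmask & (1UL << cpu))
            printf("CPU %d has not yet reported a quiescent state\n",
                   rnp->grplo + cpu);
}

int main(void)
{
    struct leaf rnp = { .qsmask = 0x5, .grplo = 8, .grphi = 15 };

    dump_holdouts(&rnp);        /* reports CPUs 8 and 10 */
    return 0;
}
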
728 | static void print_other_cpu_stall(struct rcu_state *rsp) | 843 | static void print_other_cpu_stall(struct rcu_state *rsp) |
729 | { | 844 | { |
730 | int cpu; | 845 | int cpu; |
@@ -732,6 +847,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
732 | unsigned long flags; | 847 | unsigned long flags; |
733 | int ndetected = 0; | 848 | int ndetected = 0; |
734 | struct rcu_node *rnp = rcu_get_root(rsp); | 849 | struct rcu_node *rnp = rcu_get_root(rsp); |
850 | long totqlen = 0; | ||
735 | 851 | ||
736 | /* Only let one CPU complain about others per time interval. */ | 852 | /* Only let one CPU complain about others per time interval. */ |
737 | 853 | ||
@@ -755,14 +871,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
755 | rcu_for_each_leaf_node(rsp, rnp) { | 871 | rcu_for_each_leaf_node(rsp, rnp) { |
756 | raw_spin_lock_irqsave(&rnp->lock, flags); | 872 | raw_spin_lock_irqsave(&rnp->lock, flags); |
757 | ndetected += rcu_print_task_stall(rnp); | 873 | ndetected += rcu_print_task_stall(rnp); |
874 | if (rnp->qsmask != 0) { | ||
875 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | ||
876 | if (rnp->qsmask & (1UL << cpu)) { | ||
877 | print_cpu_stall_info(rsp, | ||
878 | rnp->grplo + cpu); | ||
879 | ndetected++; | ||
880 | } | ||
881 | } | ||
758 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 882 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
759 | if (rnp->qsmask == 0) | ||
760 | continue; | ||
761 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | ||
762 | if (rnp->qsmask & (1UL << cpu)) { | ||
763 | print_cpu_stall_info(rsp, rnp->grplo + cpu); | ||
764 | ndetected++; | ||
765 | } | ||
766 | } | 883 | } |
767 | 884 | ||
768 | /* | 885 | /* |
@@ -775,24 +892,29 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
775 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 892 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
776 | 893 | ||
777 | print_cpu_stall_info_end(); | 894 | print_cpu_stall_info_end(); |
778 | printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", | 895 | for_each_possible_cpu(cpu) |
779 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); | 896 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; |
897 | pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", | ||
898 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | ||
899 | rsp->gpnum, rsp->completed, totqlen); | ||
780 | if (ndetected == 0) | 900 | if (ndetected == 0) |
781 | printk(KERN_ERR "INFO: Stall ended before state dump start\n"); | 901 | printk(KERN_ERR "INFO: Stall ended before state dump start\n"); |
782 | else if (!trigger_all_cpu_backtrace()) | 902 | else if (!trigger_all_cpu_backtrace()) |
783 | dump_stack(); | 903 | rcu_dump_cpu_stacks(rsp); |
784 | 904 | ||
785 | /* If so configured, complain about tasks blocking the grace period. */ | 905 | /* Complain about tasks blocking the grace period. */ |
786 | 906 | ||
787 | rcu_print_detail_task_stall(rsp); | 907 | rcu_print_detail_task_stall(rsp); |
788 | 908 | ||
789 | force_quiescent_state(rsp, 0); /* Kick them all. */ | 909 | force_quiescent_state(rsp); /* Kick them all. */ |
790 | } | 910 | } |
791 | 911 | ||
792 | static void print_cpu_stall(struct rcu_state *rsp) | 912 | static void print_cpu_stall(struct rcu_state *rsp) |
793 | { | 913 | { |
914 | int cpu; | ||
794 | unsigned long flags; | 915 | unsigned long flags; |
795 | struct rcu_node *rnp = rcu_get_root(rsp); | 916 | struct rcu_node *rnp = rcu_get_root(rsp); |
917 | long totqlen = 0; | ||
796 | 918 | ||
797 | /* | 919 | /* |
798 | * OK, time to rat on ourselves... | 920 | * OK, time to rat on ourselves... |
@@ -803,7 +925,10 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
803 | print_cpu_stall_info_begin(); | 925 | print_cpu_stall_info_begin(); |
804 | print_cpu_stall_info(rsp, smp_processor_id()); | 926 | print_cpu_stall_info(rsp, smp_processor_id()); |
805 | print_cpu_stall_info_end(); | 927 | print_cpu_stall_info_end(); |
806 | printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); | 928 | for_each_possible_cpu(cpu) |
929 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | ||
930 | pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", | ||
931 | jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); | ||
807 | if (!trigger_all_cpu_backtrace()) | 932 | if (!trigger_all_cpu_backtrace()) |
808 | dump_stack(); | 933 | dump_stack(); |
809 | 934 | ||
@@ -827,7 +952,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
827 | j = ACCESS_ONCE(jiffies); | 952 | j = ACCESS_ONCE(jiffies); |
828 | js = ACCESS_ONCE(rsp->jiffies_stall); | 953 | js = ACCESS_ONCE(rsp->jiffies_stall); |
829 | rnp = rdp->mynode; | 954 | rnp = rdp->mynode; |
830 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { | 955 | if (rcu_gp_in_progress(rsp) && |
956 | (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { | ||
831 | 957 | ||
832 | /* We haven't checked in, so go dump stack. */ | 958 | /* We haven't checked in, so go dump stack. */ |
833 | print_cpu_stall(rsp); | 959 | print_cpu_stall(rsp); |
@@ -889,12 +1015,8 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct | |||
889 | */ | 1015 | */ |
890 | rdp->gpnum = rnp->gpnum; | 1016 | rdp->gpnum = rnp->gpnum; |
891 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); | 1017 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); |
892 | if (rnp->qsmask & rdp->grpmask) { | 1018 | rdp->passed_quiesce = 0; |
893 | rdp->qs_pending = 1; | 1019 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); |
894 | rdp->passed_quiesce = 0; | ||
895 | } else { | ||
896 | rdp->qs_pending = 0; | ||
897 | } | ||
898 | zero_cpu_stall_ticks(rdp); | 1020 | zero_cpu_stall_ticks(rdp); |
899 | } | 1021 | } |
900 | } | 1022 | } |
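
The hunk above folds the old if/else into a single assignment: the !! idiom normalizes any nonzero result of the mask test to exactly 1, so ->qs_pending keeps its 0/1 meaning. A tiny illustration:

#include <stdio.h>

int main(void)
{
    unsigned long qsmask = 0x40;        /* leaf mask with this CPU's bit set */
    unsigned long grpmask = 0x40;       /* this CPU's bit within the leaf */

    /* !! maps "some bit set" (here 0x40) to 1, and 0 stays 0. */
    int qs_pending = !!(qsmask & grpmask);

    printf("qs_pending = %d\n", qs_pending);    /* prints 1 */
    return 0;
}
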
@@ -945,6 +1067,7 @@ static void init_callback_list(struct rcu_data *rdp) | |||
945 | rdp->nxtlist = NULL; | 1067 | rdp->nxtlist = NULL; |
946 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1068 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
947 | rdp->nxttail[i] = &rdp->nxtlist; | 1069 | rdp->nxttail[i] = &rdp->nxtlist; |
1070 | init_nocb_callback_list(rdp); | ||
948 | } | 1071 | } |
949 | 1072 | ||
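
init_callback_list() leaves behind an empty segmented list: one singly linked list plus an array of tail pointers, all initially aimed at the head. A simplified userspace model of that layout (toy types and a plain append helper, not the kernel's rcu_head handling):

#include <stddef.h>
#include <stdio.h>

struct cb {
    struct cb *next;
    int id;
};

#define NSEG 4                  /* stand-in for RCU_NEXT_SIZE */

struct cblist {
    struct cb *head;
    struct cb **tail[NSEG];     /* tail[i] points at the ->next slot ending segment i */
};

static void cblist_init(struct cblist *l)
{
    int i;

    l->head = NULL;
    for (i = 0; i < NSEG; i++)
        l->tail[i] = &l->head;  /* empty list: every segment ends at the head */
}

static void cblist_enqueue(struct cblist *l, struct cb *p)
{
    p->next = NULL;
    *l->tail[NSEG - 1] = p;     /* append to the last ("next") segment */
    l->tail[NSEG - 1] = &p->next;
}

int main(void)
{
    struct cblist l;
    struct cb a = { .id = 1 }, b = { .id = 2 };
    struct cb *p;

    cblist_init(&l);
    cblist_enqueue(&l, &a);
    cblist_enqueue(&l, &b);
    for (p = l.head; p; p = p->next)
        printf("callback %d\n", p->id);
    return 0;
}
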
950 | /* | 1073 | /* |
@@ -974,10 +1097,13 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
974 | * our behalf. Catch up with this state to avoid noting | 1097 | * our behalf. Catch up with this state to avoid noting |
975 | * spurious new grace periods. If another grace period | 1098 | * spurious new grace periods. If another grace period |
976 | * has started, then rnp->gpnum will have advanced, so | 1099 | * has started, then rnp->gpnum will have advanced, so |
977 | * we will detect this later on. | 1100 | * we will detect this later on. Of course, any quiescent |
1101 | * states we found for the old GP are now invalid. | ||
978 | */ | 1102 | */ |
979 | if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) | 1103 | if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { |
980 | rdp->gpnum = rdp->completed; | 1104 | rdp->gpnum = rdp->completed; |
1105 | rdp->passed_quiesce = 0; | ||
1106 | } | ||
981 | 1107 | ||
982 | /* | 1108 | /* |
983 | * If RCU does not need a quiescent state from this CPU, | 1109 | * If RCU does not need a quiescent state from this CPU, |
@@ -1021,97 +1147,56 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
1021 | /* Prior grace period ended, so advance callbacks for current CPU. */ | 1147 | /* Prior grace period ended, so advance callbacks for current CPU. */ |
1022 | __rcu_process_gp_end(rsp, rnp, rdp); | 1148 | __rcu_process_gp_end(rsp, rnp, rdp); |
1023 | 1149 | ||
1024 | /* | ||
1025 | * Because this CPU just now started the new grace period, we know | ||
1026 | * that all of its callbacks will be covered by this upcoming grace | ||
1027 | * period, even the ones that were registered arbitrarily recently. | ||
1028 | * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. | ||
1029 | * | ||
1030 | * Other CPUs cannot be sure exactly when the grace period started. | ||
1031 | * Therefore, their recently registered callbacks must pass through | ||
1032 | * an additional RCU_NEXT_READY stage, so that they will be handled | ||
1033 | * by the next RCU grace period. | ||
1034 | */ | ||
1035 | rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
1036 | rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
1037 | |||
1038 | /* Set state so that this CPU will detect the next quiescent state. */ | 1150 | /* Set state so that this CPU will detect the next quiescent state. */ |
1039 | __note_new_gpnum(rsp, rnp, rdp); | 1151 | __note_new_gpnum(rsp, rnp, rdp); |
1040 | } | 1152 | } |
1041 | 1153 | ||
1042 | /* | 1154 | /* |
1043 | * Start a new RCU grace period if warranted, re-initializing the hierarchy | 1155 | * Initialize a new grace period. |
1044 | * in preparation for detecting the next grace period. The caller must hold | ||
1045 | * the root node's ->lock, which is released before return. Hard irqs must | ||
1046 | * be disabled. | ||
1047 | * | ||
1048 | * Note that it is legal for a dying CPU (which is marked as offline) to | ||
1049 | * invoke this function. This can happen when the dying CPU reports its | ||
1050 | * quiescent state. | ||
1051 | */ | 1156 | */ |
1052 | static void | 1157 | static int rcu_gp_init(struct rcu_state *rsp) |
1053 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | ||
1054 | __releases(rcu_get_root(rsp)->lock) | ||
1055 | { | 1158 | { |
1056 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1159 | struct rcu_data *rdp; |
1057 | struct rcu_node *rnp = rcu_get_root(rsp); | 1160 | struct rcu_node *rnp = rcu_get_root(rsp); |
1058 | 1161 | ||
1059 | if (!rcu_scheduler_fully_active || | 1162 | raw_spin_lock_irq(&rnp->lock); |
1060 | !cpu_needs_another_gp(rsp, rdp)) { | 1163 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ |
1061 | /* | ||
1062 | * Either the scheduler hasn't yet spawned the first | ||
1063 | * non-idle task or this CPU does not need another | ||
1064 | * grace period. Either way, don't start a new grace | ||
1065 | * period. | ||
1066 | */ | ||
1067 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1068 | return; | ||
1069 | } | ||
1070 | 1164 | ||
1071 | if (rsp->fqs_active) { | 1165 | if (rcu_gp_in_progress(rsp)) { |
1072 | /* | 1166 | /* Grace period already in progress, don't start another. */ |
1073 | * This CPU needs a grace period, but force_quiescent_state() | 1167 | raw_spin_unlock_irq(&rnp->lock); |
1074 | * is running. Tell it to start one on this CPU's behalf. | 1168 | return 0; |
1075 | */ | ||
1076 | rsp->fqs_need_gp = 1; | ||
1077 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1078 | return; | ||
1079 | } | 1169 | } |
1080 | 1170 | ||
1081 | /* Advance to a new grace period and initialize state. */ | 1171 | /* Advance to a new grace period and initialize state. */ |
1082 | rsp->gpnum++; | 1172 | rsp->gpnum++; |
1083 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); | 1173 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); |
1084 | WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); | ||
1085 | rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ | ||
1086 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | ||
1087 | record_gp_stall_check_time(rsp); | 1174 | record_gp_stall_check_time(rsp); |
1088 | raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ | 1175 | raw_spin_unlock_irq(&rnp->lock); |
1089 | 1176 | ||
1090 | /* Exclude any concurrent CPU-hotplug operations. */ | 1177 | /* Exclude any concurrent CPU-hotplug operations. */ |
1091 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 1178 | mutex_lock(&rsp->onoff_mutex); |
1092 | 1179 | ||
1093 | /* | 1180 | /* |
1094 | * Set the quiescent-state-needed bits in all the rcu_node | 1181 | * Set the quiescent-state-needed bits in all the rcu_node |
1095 | * structures for all currently online CPUs in breadth-first | 1182 | * structures for all currently online CPUs in breadth-first order, |
1096 | * order, starting from the root rcu_node structure. This | 1183 | * starting from the root rcu_node structure, relying on the layout |
1097 | * operation relies on the layout of the hierarchy within the | 1184 | * of the tree within the rsp->node[] array. Note that other CPUs |
1098 | * rsp->node[] array. Note that other CPUs will access only | 1185 | * will access only the leaves of the hierarchy, thus seeing that no |
1099 | * the leaves of the hierarchy, which still indicate that no | ||
1100 | * grace period is in progress, at least until the corresponding | 1186 | * grace period is in progress, at least until the corresponding |
1101 | * leaf node has been initialized. In addition, we have excluded | 1187 | * leaf node has been initialized. In addition, we have excluded |
1102 | * CPU-hotplug operations. | 1188 | * CPU-hotplug operations. |
1103 | * | 1189 | * |
1104 | * Note that the grace period cannot complete until we finish | 1190 | * The grace period cannot complete until the initialization |
1105 | * the initialization process, as there will be at least one | 1191 | * process finishes, because this kthread handles both. |
1106 | * qsmask bit set in the root node until that time, namely the | ||
1107 | * one corresponding to this CPU, due to the fact that we have | ||
1108 | * irqs disabled. | ||
1109 | */ | 1192 | */ |
1110 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1193 | rcu_for_each_node_breadth_first(rsp, rnp) { |
1111 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1194 | raw_spin_lock_irq(&rnp->lock); |
1195 | rdp = this_cpu_ptr(rsp->rda); | ||
1112 | rcu_preempt_check_blocked_tasks(rnp); | 1196 | rcu_preempt_check_blocked_tasks(rnp); |
1113 | rnp->qsmask = rnp->qsmaskinit; | 1197 | rnp->qsmask = rnp->qsmaskinit; |
1114 | rnp->gpnum = rsp->gpnum; | 1198 | rnp->gpnum = rsp->gpnum; |
1199 | WARN_ON_ONCE(rnp->completed != rsp->completed); | ||
1115 | rnp->completed = rsp->completed; | 1200 | rnp->completed = rsp->completed; |
1116 | if (rnp == rdp->mynode) | 1201 | if (rnp == rdp->mynode) |
1117 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 1202 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
@@ -1119,37 +1204,54 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
1119 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | 1204 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, |
1120 | rnp->level, rnp->grplo, | 1205 | rnp->level, rnp->grplo, |
1121 | rnp->grphi, rnp->qsmask); | 1206 | rnp->grphi, rnp->qsmask); |
1122 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1207 | raw_spin_unlock_irq(&rnp->lock); |
1208 | #ifdef CONFIG_PROVE_RCU_DELAY | ||
1209 | if ((random32() % (rcu_num_nodes * 8)) == 0) | ||
1210 | schedule_timeout_uninterruptible(2); | ||
1211 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ | ||
1212 | cond_resched(); | ||
1123 | } | 1213 | } |
1124 | 1214 | ||
1125 | rnp = rcu_get_root(rsp); | 1215 | mutex_unlock(&rsp->onoff_mutex); |
1126 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1216 | return 1; |
1127 | rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ | ||
1128 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
1129 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
1130 | } | 1217 | } |
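
rcu_gp_init() walks the tree with rcu_for_each_node_breadth_first(), which works because the rcu_node array is laid out level by level, so breadth-first order is simply linear array order. A standalone sketch of that layout property (fixed toy geometry rather than the kernel's computed one):

#include <stdio.h>

struct tnode {
    int level;
    int grplo, grphi;
};

#define NNODES 3        /* one root plus two leaves, laid out level by level */

int main(void)
{
    /* node[0] is the root; deeper levels follow it in the same array. */
    struct tnode node[NNODES] = {
        { .level = 0, .grplo = 0, .grphi = 7 },
        { .level = 1, .grplo = 0, .grphi = 3 },
        { .level = 1, .grplo = 4, .grphi = 7 },
    };
    struct tnode *rnp;

    /* Breadth-first traversal == linear scan, thanks to the layout. */
    for (rnp = &node[0]; rnp < &node[NNODES]; rnp++)
        printf("level %d covers CPUs %d-%d\n",
               rnp->level, rnp->grplo, rnp->grphi);
    return 0;
}
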
1131 | 1218 | ||
1132 | /* | 1219 | /* |
1133 | * Report a full set of quiescent states to the specified rcu_state | 1220 | * Do one round of quiescent-state forcing. |
1134 | * data structure. This involves cleaning up after the prior grace | ||
1135 | * period and letting rcu_start_gp() start up the next grace period | ||
1136 | * if one is needed. Note that the caller must hold rnp->lock, as | ||
1137 | * required by rcu_start_gp(), which will release it. | ||
1138 | */ | 1221 | */ |
1139 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | 1222 | int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) |
1140 | __releases(rcu_get_root(rsp)->lock) | ||
1141 | { | 1223 | { |
1142 | unsigned long gp_duration; | 1224 | int fqs_state = fqs_state_in; |
1143 | struct rcu_node *rnp = rcu_get_root(rsp); | 1225 | struct rcu_node *rnp = rcu_get_root(rsp); |
1144 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1145 | 1226 | ||
1146 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 1227 | rsp->n_force_qs++; |
1228 | if (fqs_state == RCU_SAVE_DYNTICK) { | ||
1229 | /* Collect dyntick-idle snapshots. */ | ||
1230 | force_qs_rnp(rsp, dyntick_save_progress_counter); | ||
1231 | fqs_state = RCU_FORCE_QS; | ||
1232 | } else { | ||
1233 | /* Handle dyntick-idle and offline CPUs. */ | ||
1234 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); | ||
1235 | } | ||
1236 | /* Clear flag to prevent immediate re-entry. */ | ||
1237 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | ||
1238 | raw_spin_lock_irq(&rnp->lock); | ||
1239 | rsp->gp_flags &= ~RCU_GP_FLAG_FQS; | ||
1240 | raw_spin_unlock_irq(&rnp->lock); | ||
1241 | } | ||
1242 | return fqs_state; | ||
1243 | } | ||
1147 | 1244 | ||
1148 | /* | 1245 | /* |
1149 | * Ensure that all grace-period and pre-grace-period activity | 1246 | * Clean up after the old grace period. |
1150 | * is seen before the assignment to rsp->completed. | 1247 | */ |
1151 | */ | 1248 | static void rcu_gp_cleanup(struct rcu_state *rsp) |
1152 | smp_mb(); /* See above block comment. */ | 1249 | { |
1250 | unsigned long gp_duration; | ||
1251 | struct rcu_data *rdp; | ||
1252 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
1253 | |||
1254 | raw_spin_lock_irq(&rnp->lock); | ||
1153 | gp_duration = jiffies - rsp->gp_start; | 1255 | gp_duration = jiffies - rsp->gp_start; |
1154 | if (gp_duration > rsp->gp_max) | 1256 | if (gp_duration > rsp->gp_max) |
1155 | rsp->gp_max = gp_duration; | 1257 | rsp->gp_max = gp_duration; |
@@ -1161,35 +1263,171 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
1161 | * they can do to advance the grace period. It is therefore | 1263 | * they can do to advance the grace period. It is therefore |
1162 | * safe for us to drop the lock in order to mark the grace | 1264 | * safe for us to drop the lock in order to mark the grace |
1163 | * period as completed in all of the rcu_node structures. | 1265 | * period as completed in all of the rcu_node structures. |
1164 | * | ||
1165 | * But if this CPU needs another grace period, it will take | ||
1166 | * care of this while initializing the next grace period. | ||
1167 | * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL | ||
1168 | * because the callbacks have not yet been advanced: Those | ||
1169 | * callbacks are waiting on the grace period that just now | ||
1170 | * completed. | ||
1171 | */ | 1266 | */ |
1172 | if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { | 1267 | raw_spin_unlock_irq(&rnp->lock); |
1173 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
1174 | 1268 | ||
1175 | /* | 1269 | /* |
1176 | * Propagate new ->completed value to rcu_node structures | 1270 | * Propagate new ->completed value to rcu_node structures so |
1177 | * so that other CPUs don't have to wait until the start | 1271 | * that other CPUs don't have to wait until the start of the next |
1178 | * of the next grace period to process their callbacks. | 1272 | * grace period to process their callbacks. This also avoids |
1179 | */ | 1273 | * some nasty RCU grace-period initialization races by forcing |
1180 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1274 | * the end of the current grace period to be completely recorded in |
1181 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1275 | * all of the rcu_node structures before the beginning of the next |
1182 | rnp->completed = rsp->gpnum; | 1276 | * grace period is recorded in any of the rcu_node structures. |
1183 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1277 | */ |
1184 | } | 1278 | rcu_for_each_node_breadth_first(rsp, rnp) { |
1185 | rnp = rcu_get_root(rsp); | 1279 | raw_spin_lock_irq(&rnp->lock); |
1186 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1280 | rnp->completed = rsp->gpnum; |
1281 | raw_spin_unlock_irq(&rnp->lock); | ||
1282 | cond_resched(); | ||
1187 | } | 1283 | } |
1284 | rnp = rcu_get_root(rsp); | ||
1285 | raw_spin_lock_irq(&rnp->lock); | ||
1188 | 1286 | ||
1189 | rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ | 1287 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ |
1190 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | 1288 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); |
1191 | rsp->fqs_state = RCU_GP_IDLE; | 1289 | rsp->fqs_state = RCU_GP_IDLE; |
1192 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ | 1290 | rdp = this_cpu_ptr(rsp->rda); |
1291 | if (cpu_needs_another_gp(rsp, rdp)) | ||
1292 | rsp->gp_flags = 1; | ||
1293 | raw_spin_unlock_irq(&rnp->lock); | ||
1294 | } | ||
1295 | |||
1296 | /* | ||
1297 | * Body of kthread that handles grace periods. | ||
1298 | */ | ||
1299 | static int __noreturn rcu_gp_kthread(void *arg) | ||
1300 | { | ||
1301 | int fqs_state; | ||
1302 | unsigned long j; | ||
1303 | int ret; | ||
1304 | struct rcu_state *rsp = arg; | ||
1305 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
1306 | |||
1307 | for (;;) { | ||
1308 | |||
1309 | /* Handle grace-period start. */ | ||
1310 | for (;;) { | ||
1311 | wait_event_interruptible(rsp->gp_wq, | ||
1312 | rsp->gp_flags & | ||
1313 | RCU_GP_FLAG_INIT); | ||
1314 | if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && | ||
1315 | rcu_gp_init(rsp)) | ||
1316 | break; | ||
1317 | cond_resched(); | ||
1318 | flush_signals(current); | ||
1319 | } | ||
1320 | |||
1321 | /* Handle quiescent-state forcing. */ | ||
1322 | fqs_state = RCU_SAVE_DYNTICK; | ||
1323 | j = jiffies_till_first_fqs; | ||
1324 | if (j > HZ) { | ||
1325 | j = HZ; | ||
1326 | jiffies_till_first_fqs = HZ; | ||
1327 | } | ||
1328 | for (;;) { | ||
1329 | rsp->jiffies_force_qs = jiffies + j; | ||
1330 | ret = wait_event_interruptible_timeout(rsp->gp_wq, | ||
1331 | (rsp->gp_flags & RCU_GP_FLAG_FQS) || | ||
1332 | (!ACCESS_ONCE(rnp->qsmask) && | ||
1333 | !rcu_preempt_blocked_readers_cgp(rnp)), | ||
1334 | j); | ||
1335 | /* If grace period done, leave loop. */ | ||
1336 | if (!ACCESS_ONCE(rnp->qsmask) && | ||
1337 | !rcu_preempt_blocked_readers_cgp(rnp)) | ||
1338 | break; | ||
1339 | /* If time for quiescent-state forcing, do it. */ | ||
1340 | if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { | ||
1341 | fqs_state = rcu_gp_fqs(rsp, fqs_state); | ||
1342 | cond_resched(); | ||
1343 | } else { | ||
1344 | /* Deal with stray signal. */ | ||
1345 | cond_resched(); | ||
1346 | flush_signals(current); | ||
1347 | } | ||
1348 | j = jiffies_till_next_fqs; | ||
1349 | if (j > HZ) { | ||
1350 | j = HZ; | ||
1351 | jiffies_till_next_fqs = HZ; | ||
1352 | } else if (j < 1) { | ||
1353 | j = 1; | ||
1354 | jiffies_till_next_fqs = 1; | ||
1355 | } | ||
1356 | } | ||
1357 | |||
1358 | /* Handle grace-period end. */ | ||
1359 | rcu_gp_cleanup(rsp); | ||
1360 | } | ||
1361 | } | ||
1362 | |||
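
The kthread body above and rcu_start_gp() further down form a simple handshake: the requester sets RCU_GP_FLAG_INIT and wakes the kthread, and the kthread re-checks the flag after every wakeup. A minimal userspace analogue using pthreads instead of the kernel's waitqueues (the names and the single-flag protocol are simplifications):

#include <pthread.h>
#include <stdio.h>

#define GP_FLAG_INIT 0x1

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static unsigned int gp_flags;

static void *gp_worker(void *arg)
{
    pthread_mutex_lock(&lock);
    /* Re-check the flag after every wakeup; wakeups may be spurious. */
    while (!(gp_flags & GP_FLAG_INIT))
        pthread_cond_wait(&cond, &lock);
    gp_flags = 0;               /* claim the request */
    pthread_mutex_unlock(&lock);

    printf("worker: initializing a new grace period\n");
    return NULL;
}

int main(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, gp_worker, NULL);

    pthread_mutex_lock(&lock);
    gp_flags |= GP_FLAG_INIT;   /* request a new grace period... */
    pthread_mutex_unlock(&lock);
    pthread_cond_signal(&cond); /* ...and wake the worker */

    pthread_join(tid, NULL);
    return 0;
}
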
1363 | /* | ||
1364 | * Start a new RCU grace period if warranted, re-initializing the hierarchy | ||
1365 | * in preparation for detecting the next grace period. The caller must hold | ||
1366 | * the root node's ->lock, which is released before return. Hard irqs must | ||
1367 | * be disabled. | ||
1368 | * | ||
1369 | * Note that it is legal for a dying CPU (which is marked as offline) to | ||
1370 | * invoke this function. This can happen when the dying CPU reports its | ||
1371 | * quiescent state. | ||
1372 | */ | ||
1373 | static void | ||
1374 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | ||
1375 | __releases(rcu_get_root(rsp)->lock) | ||
1376 | { | ||
1377 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1378 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
1379 | |||
1380 | if (!rsp->gp_kthread || | ||
1381 | !cpu_needs_another_gp(rsp, rdp)) { | ||
1382 | /* | ||
1383 | * Either we have not yet spawned the grace-period | ||
1384 | * task, this CPU does not need another grace period, | ||
1385 | * or a grace period is already in progress. | ||
1386 | * Either way, don't start a new grace period. | ||
1387 | */ | ||
1388 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1389 | return; | ||
1390 | } | ||
1391 | |||
1392 | /* | ||
1393 | * Because there is no grace period in progress right now, | ||
1394 | * any callbacks we have up to this point will be satisfied | ||
1395 | * by the next grace period. So promote all callbacks to be | ||
1396 | * handled after the end of the next grace period. If the | ||
1397 | * CPU is not yet aware of the end of the previous grace period, | ||
1398 | * we need to allow for the callback advancement that will | ||
1399 | * occur when it does become aware. Deadlock prevents us from | ||
1400 | * making it aware at this point: We cannot acquire a leaf | ||
1401 | * rcu_node ->lock while holding the root rcu_node ->lock. | ||
1402 | */ | ||
1403 | rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
1404 | if (rdp->completed == rsp->completed) | ||
1405 | rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
1406 | |||
1407 | rsp->gp_flags = RCU_GP_FLAG_INIT; | ||
1408 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ | ||
1409 | |||
1410 | /* Ensure that CPU is aware of completion of last grace period. */ | ||
1411 | rcu_process_gp_end(rsp, rdp); | ||
1412 | local_irq_restore(flags); | ||
1413 | |||
1414 | /* Wake up rcu_gp_kthread() to start the grace period. */ | ||
1415 | wake_up(&rsp->gp_wq); | ||
1416 | } | ||
1417 | |||
1418 | /* | ||
1419 | * Report a full set of quiescent states to the specified rcu_state | ||
1420 | * data structure. This involves cleaning up after the prior grace | ||
1421 | * period and letting rcu_start_gp() start up the next grace period | ||
1422 | * if one is needed. Note that the caller must hold rnp->lock, as | ||
1423 | * required by rcu_start_gp(), which will release it. | ||
1424 | */ | ||
1425 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | ||
1426 | __releases(rcu_get_root(rsp)->lock) | ||
1427 | { | ||
1428 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | ||
1429 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | ||
1430 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ | ||
1193 | } | 1431 | } |
1194 | 1432 | ||
1195 | /* | 1433 | /* |
@@ -1258,7 +1496,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
1258 | * based on quiescent states detected in an earlier grace period! | 1496 | * based on quiescent states detected in an earlier grace period! |
1259 | */ | 1497 | */ |
1260 | static void | 1498 | static void |
1261 | rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) | 1499 | rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) |
1262 | { | 1500 | { |
1263 | unsigned long flags; | 1501 | unsigned long flags; |
1264 | unsigned long mask; | 1502 | unsigned long mask; |
@@ -1266,7 +1504,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las | |||
1266 | 1504 | ||
1267 | rnp = rdp->mynode; | 1505 | rnp = rdp->mynode; |
1268 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1506 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1269 | if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { | 1507 | if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || |
1508 | rnp->completed == rnp->gpnum) { | ||
1270 | 1509 | ||
1271 | /* | 1510 | /* |
1272 | * The grace period in which this quiescent state was | 1511 | * The grace period in which this quiescent state was |
@@ -1325,7 +1564,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1325 | * Tell RCU we are done (but rcu_report_qs_rdp() will be the | 1564 | * Tell RCU we are done (but rcu_report_qs_rdp() will be the |
1326 | * judge of that). | 1565 | * judge of that). |
1327 | */ | 1566 | */ |
1328 | rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); | 1567 | rcu_report_qs_rdp(rdp->cpu, rsp, rdp); |
1329 | } | 1568 | } |
1330 | 1569 | ||
1331 | #ifdef CONFIG_HOTPLUG_CPU | 1570 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -1333,16 +1572,20 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1333 | /* | 1572 | /* |
1334 | * Send the specified CPU's RCU callbacks to the orphanage. The | 1573 | * Send the specified CPU's RCU callbacks to the orphanage. The |
1335 | * specified CPU must be offline, and the caller must hold the | 1574 | * specified CPU must be offline, and the caller must hold the |
1336 | * ->onofflock. | 1575 | * ->orphan_lock. |
1337 | */ | 1576 | */ |
1338 | static void | 1577 | static void |
1339 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | 1578 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, |
1340 | struct rcu_node *rnp, struct rcu_data *rdp) | 1579 | struct rcu_node *rnp, struct rcu_data *rdp) |
1341 | { | 1580 | { |
1581 | /* No-CBs CPUs do not have orphanable callbacks. */ | ||
1582 | if (is_nocb_cpu(rdp->cpu)) | ||
1583 | return; | ||
1584 | |||
1342 | /* | 1585 | /* |
1343 | * Orphan the callbacks. First adjust the counts. This is safe | 1586 | * Orphan the callbacks. First adjust the counts. This is safe |
1344 | * because ->onofflock excludes _rcu_barrier()'s adoption of | 1587 | * because _rcu_barrier() excludes CPU-hotplug operations, so it |
1345 | * the callbacks, thus no memory barrier is required. | 1588 | * cannot be running now. Thus no memory barrier is required. |
1346 | */ | 1589 | */ |
1347 | if (rdp->nxtlist != NULL) { | 1590 | if (rdp->nxtlist != NULL) { |
1348 | rsp->qlen_lazy += rdp->qlen_lazy; | 1591 | rsp->qlen_lazy += rdp->qlen_lazy; |
@@ -1383,22 +1626,15 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
1383 | 1626 | ||
1384 | /* | 1627 | /* |
1385 | * Adopt the RCU callbacks from the specified rcu_state structure's | 1628 | * Adopt the RCU callbacks from the specified rcu_state structure's |
1386 | * orphanage. The caller must hold the ->onofflock. | 1629 | * orphanage. The caller must hold the ->orphan_lock. |
1387 | */ | 1630 | */ |
1388 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | 1631 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) |
1389 | { | 1632 | { |
1390 | int i; | 1633 | int i; |
1391 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | 1634 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); |
1392 | 1635 | ||
1393 | /* | 1636 | /* No-CBs CPUs are handled specially. */ |
1394 | * If there is an rcu_barrier() operation in progress, then | 1637 | if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) |
1395 | * only the task doing that operation is permitted to adopt | ||
1396 | * callbacks. To do otherwise breaks rcu_barrier() and friends | ||
1397 | * by causing them to fail to wait for the callbacks in the | ||
1398 | * orphanage. | ||
1399 | */ | ||
1400 | if (rsp->rcu_barrier_in_progress && | ||
1401 | rsp->rcu_barrier_in_progress != current) | ||
1402 | return; | 1638 | return; |
1403 | 1639 | ||
1404 | /* Do the accounting first. */ | 1640 | /* Do the accounting first. */ |
@@ -1455,9 +1691,8 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
1455 | * The CPU has been completely removed, and some other CPU is reporting | 1691 | * The CPU has been completely removed, and some other CPU is reporting |
1456 | * this fact from process context. Do the remainder of the cleanup, | 1692 | * this fact from process context. Do the remainder of the cleanup, |
1457 | * including orphaning the outgoing CPU's RCU callbacks, and also | 1693 | * including orphaning the outgoing CPU's RCU callbacks, and also |
1458 | * adopting them, if there is no _rcu_barrier() instance running. | 1694 | * adopting them. There can only be one CPU hotplug operation at a time, |
1459 | * There can only be one CPU hotplug operation at a time, so no other | 1695 | * so no other CPU can be attempting to update rcu_cpu_kthread_task. |
1460 | * CPU can be attempting to update rcu_cpu_kthread_task. | ||
1461 | */ | 1696 | */ |
1462 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | 1697 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) |
1463 | { | 1698 | { |
@@ -1468,13 +1703,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1468 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ | 1703 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
1469 | 1704 | ||
1470 | /* Adjust any no-longer-needed kthreads. */ | 1705 | /* Adjust any no-longer-needed kthreads. */ |
1471 | rcu_stop_cpu_kthread(cpu); | 1706 | rcu_boost_kthread_setaffinity(rnp, -1); |
1472 | rcu_node_kthread_setaffinity(rnp, -1); | ||
1473 | 1707 | ||
1474 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ | 1708 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ |
1475 | 1709 | ||
1476 | /* Exclude any attempts to start a new grace period. */ | 1710 | /* Exclude any attempts to start a new grace period. */ |
1477 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1711 | mutex_lock(&rsp->onoff_mutex); |
1712 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); | ||
1478 | 1713 | ||
1479 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 1714 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ |
1480 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | 1715 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); |
@@ -1501,10 +1736,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1501 | /* | 1736 | /* |
1502 | * We still hold the leaf rcu_node structure lock here, and | 1737 | * We still hold the leaf rcu_node structure lock here, and |
1503 | * irqs are still disabled. The reason for this subterfuge is | 1738 | * irqs are still disabled. The reason for this subterfuge is |
1504 | * because invoking rcu_report_unblock_qs_rnp() with ->onofflock | 1739 | * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock |
1505 | * held leads to deadlock. | 1740 | * held leads to deadlock. |
1506 | */ | 1741 | */ |
1507 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 1742 | raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ |
1508 | rnp = rdp->mynode; | 1743 | rnp = rdp->mynode; |
1509 | if (need_report & RCU_OFL_TASKS_NORM_GP) | 1744 | if (need_report & RCU_OFL_TASKS_NORM_GP) |
1510 | rcu_report_unblock_qs_rnp(rnp, flags); | 1745 | rcu_report_unblock_qs_rnp(rnp, flags); |
@@ -1515,14 +1750,14 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1515 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 1750 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, |
1516 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 1751 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", |
1517 | cpu, rdp->qlen, rdp->nxtlist); | 1752 | cpu, rdp->qlen, rdp->nxtlist); |
1753 | init_callback_list(rdp); | ||
1754 | /* Disallow further callbacks on this CPU. */ | ||
1755 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
1756 | mutex_unlock(&rsp->onoff_mutex); | ||
1518 | } | 1757 | } |
1519 | 1758 | ||
1520 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1759 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1521 | 1760 | ||
1522 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1523 | { | ||
1524 | } | ||
1525 | |||
1526 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1761 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
1527 | { | 1762 | { |
1528 | } | 1763 | } |
@@ -1541,7 +1776,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1541 | { | 1776 | { |
1542 | unsigned long flags; | 1777 | unsigned long flags; |
1543 | struct rcu_head *next, *list, **tail; | 1778 | struct rcu_head *next, *list, **tail; |
1544 | int bl, count, count_lazy, i; | 1779 | long bl, count, count_lazy; |
1780 | int i; | ||
1545 | 1781 | ||
1546 | /* If no callbacks are ready, just return.*/ | 1782 | /* If no callbacks are ready, just return.*/ |
1547 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1783 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
@@ -1687,6 +1923,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
1687 | struct rcu_node *rnp; | 1923 | struct rcu_node *rnp; |
1688 | 1924 | ||
1689 | rcu_for_each_leaf_node(rsp, rnp) { | 1925 | rcu_for_each_leaf_node(rsp, rnp) { |
1926 | cond_resched(); | ||
1690 | mask = 0; | 1927 | mask = 0; |
1691 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1928 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1692 | if (!rcu_gp_in_progress(rsp)) { | 1929 | if (!rcu_gp_in_progress(rsp)) { |
@@ -1723,72 +1960,39 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
1723 | * Force quiescent states on reluctant CPUs, and also detect which | 1960 | * Force quiescent states on reluctant CPUs, and also detect which |
1724 | * CPUs are in dyntick-idle mode. | 1961 | * CPUs are in dyntick-idle mode. |
1725 | */ | 1962 | */ |
1726 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | 1963 | static void force_quiescent_state(struct rcu_state *rsp) |
1727 | { | 1964 | { |
1728 | unsigned long flags; | 1965 | unsigned long flags; |
1729 | struct rcu_node *rnp = rcu_get_root(rsp); | 1966 | bool ret; |
1730 | 1967 | struct rcu_node *rnp; | |
1731 | trace_rcu_utilization("Start fqs"); | 1968 | struct rcu_node *rnp_old = NULL; |
1732 | if (!rcu_gp_in_progress(rsp)) { | 1969 | |
1733 | trace_rcu_utilization("End fqs"); | 1970 | /* Funnel through hierarchy to reduce memory contention. */ |
1734 | return; /* No grace period in progress, nothing to force. */ | 1971 | rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; |
1735 | } | 1972 | for (; rnp != NULL; rnp = rnp->parent) { |
1736 | if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { | 1973 | ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || |
1737 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ | 1974 | !raw_spin_trylock(&rnp->fqslock); |
1738 | trace_rcu_utilization("End fqs"); | 1975 | if (rnp_old != NULL) |
1739 | return; /* Someone else is already on the job. */ | 1976 | raw_spin_unlock(&rnp_old->fqslock); |
1740 | } | 1977 | if (ret) { |
1741 | if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) | 1978 | rsp->n_force_qs_lh++; |
1742 | goto unlock_fqs_ret; /* no emergency and done recently. */ | 1979 | return; |
1743 | rsp->n_force_qs++; | 1980 | } |
1744 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 1981 | rnp_old = rnp; |
1745 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | ||
1746 | if(!rcu_gp_in_progress(rsp)) { | ||
1747 | rsp->n_force_qs_ngp++; | ||
1748 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
1749 | goto unlock_fqs_ret; /* no GP in progress, time updated. */ | ||
1750 | } | ||
1751 | rsp->fqs_active = 1; | ||
1752 | switch (rsp->fqs_state) { | ||
1753 | case RCU_GP_IDLE: | ||
1754 | case RCU_GP_INIT: | ||
1755 | |||
1756 | break; /* grace period idle or initializing, ignore. */ | ||
1757 | |||
1758 | case RCU_SAVE_DYNTICK: | ||
1759 | |||
1760 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
1761 | |||
1762 | /* Record dyntick-idle state. */ | ||
1763 | force_qs_rnp(rsp, dyntick_save_progress_counter); | ||
1764 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | ||
1765 | if (rcu_gp_in_progress(rsp)) | ||
1766 | rsp->fqs_state = RCU_FORCE_QS; | ||
1767 | break; | ||
1768 | |||
1769 | case RCU_FORCE_QS: | ||
1770 | |||
1771 | /* Check dyntick-idle state, send IPI to laggarts. */ | ||
1772 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
1773 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); | ||
1774 | |||
1775 | /* Leave state in case more forcing is required. */ | ||
1776 | |||
1777 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | ||
1778 | break; | ||
1779 | } | 1982 | } |
1780 | rsp->fqs_active = 0; | 1983 | /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ |
1781 | if (rsp->fqs_need_gp) { | 1984 | |
1782 | raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ | 1985 | /* Reached the root of the rcu_node tree, acquire lock. */ |
1783 | rsp->fqs_need_gp = 0; | 1986 | raw_spin_lock_irqsave(&rnp_old->lock, flags); |
1784 | rcu_start_gp(rsp, flags); /* releases rnp->lock */ | 1987 | raw_spin_unlock(&rnp_old->fqslock); |
1785 | trace_rcu_utilization("End fqs"); | 1988 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
1786 | return; | 1989 | rsp->n_force_qs_lh++; |
1990 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | ||
1991 | return; /* Someone beat us to it. */ | ||
1787 | } | 1992 | } |
1788 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 1993 | rsp->gp_flags |= RCU_GP_FLAG_FQS; |
1789 | unlock_fqs_ret: | 1994 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
1790 | raw_spin_unlock_irqrestore(&rsp->fqslock, flags); | 1995 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ |
1791 | trace_rcu_utilization("End fqs"); | ||
1792 | } | 1996 | } |
1793 | 1997 | ||
1794 | /* | 1998 | /* |
@@ -1805,13 +2009,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
1805 | WARN_ON_ONCE(rdp->beenonline == 0); | 2009 | WARN_ON_ONCE(rdp->beenonline == 0); |
1806 | 2010 | ||
1807 | /* | 2011 | /* |
1808 | * If an RCU GP has gone long enough, go check for dyntick | ||
1809 | * idle CPUs and, if needed, send resched IPIs. | ||
1810 | */ | ||
1811 | if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | ||
1812 | force_quiescent_state(rsp, 1); | ||
1813 | |||
1814 | /* | ||
1815 | * Advance callbacks in response to end of earlier grace | 2012 | * Advance callbacks in response to end of earlier grace |
1816 | * period that some other CPU ended. | 2013 | * period that some other CPU ended. |
1817 | */ | 2014 | */ |
@@ -1838,6 +2035,8 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
1838 | { | 2035 | { |
1839 | struct rcu_state *rsp; | 2036 | struct rcu_state *rsp; |
1840 | 2037 | ||
2038 | if (cpu_is_offline(smp_processor_id())) | ||
2039 | return; | ||
1841 | trace_rcu_utilization("Start RCU core"); | 2040 | trace_rcu_utilization("Start RCU core"); |
1842 | for_each_rcu_flavor(rsp) | 2041 | for_each_rcu_flavor(rsp) |
1843 | __rcu_process_callbacks(rsp); | 2042 | __rcu_process_callbacks(rsp); |
@@ -1909,17 +2108,22 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
1909 | rdp->blimit = LONG_MAX; | 2108 | rdp->blimit = LONG_MAX; |
1910 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | 2109 | if (rsp->n_force_qs == rdp->n_force_qs_snap && |
1911 | *rdp->nxttail[RCU_DONE_TAIL] != head) | 2110 | *rdp->nxttail[RCU_DONE_TAIL] != head) |
1912 | force_quiescent_state(rsp, 0); | 2111 | force_quiescent_state(rsp); |
1913 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2112 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1914 | rdp->qlen_last_fqs_check = rdp->qlen; | 2113 | rdp->qlen_last_fqs_check = rdp->qlen; |
1915 | } | 2114 | } |
1916 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | 2115 | } |
1917 | force_quiescent_state(rsp, 1); | ||
1918 | } | 2116 | } |
1919 | 2117 | ||
2118 | /* | ||
2119 | * Helper function for call_rcu() and friends. The cpu argument will | ||
2120 | * normally be -1, indicating "currently running CPU". It may specify | ||
2121 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() | ||
2122 | * is expected to specify a CPU. | ||
2123 | */ | ||
1920 | static void | 2124 | static void |
1921 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | 2125 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), |
1922 | struct rcu_state *rsp, bool lazy) | 2126 | struct rcu_state *rsp, int cpu, bool lazy) |
1923 | { | 2127 | { |
1924 | unsigned long flags; | 2128 | unsigned long flags; |
1925 | struct rcu_data *rdp; | 2129 | struct rcu_data *rdp; |
@@ -1929,8 +2133,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1929 | head->func = func; | 2133 | head->func = func; |
1930 | head->next = NULL; | 2134 | head->next = NULL; |
1931 | 2135 | ||
1932 | smp_mb(); /* Ensure RCU update seen before callback registry. */ | ||
1933 | |||
1934 | /* | 2136 | /* |
1935 | * Opportunistically note grace-period endings and beginnings. | 2137 | * Opportunistically note grace-period endings and beginnings. |
1936 | * Note that we might see a beginning right after we see an | 2138 | * Note that we might see a beginning right after we see an |
@@ -1941,6 +2143,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1941 | rdp = this_cpu_ptr(rsp->rda); | 2143 | rdp = this_cpu_ptr(rsp->rda); |
1942 | 2144 | ||
1943 | /* Add the callback to our list. */ | 2145 | /* Add the callback to our list. */ |
2146 | if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { | ||
2147 | int offline; | ||
2148 | |||
2149 | if (cpu != -1) | ||
2150 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2151 | offline = !__call_rcu_nocb(rdp, head, lazy); | ||
2152 | WARN_ON_ONCE(offline); | ||
2153 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ | ||
2154 | local_irq_restore(flags); | ||
2155 | return; | ||
2156 | } | ||
1944 | ACCESS_ONCE(rdp->qlen)++; | 2157 | ACCESS_ONCE(rdp->qlen)++; |
1945 | if (lazy) | 2158 | if (lazy) |
1946 | rdp->qlen_lazy++; | 2159 | rdp->qlen_lazy++; |
@@ -1966,7 +2179,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1966 | */ | 2179 | */ |
1967 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 2180 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
1968 | { | 2181 | { |
1969 | __call_rcu(head, func, &rcu_sched_state, 0); | 2182 | __call_rcu(head, func, &rcu_sched_state, -1, 0); |
1970 | } | 2183 | } |
1971 | EXPORT_SYMBOL_GPL(call_rcu_sched); | 2184 | EXPORT_SYMBOL_GPL(call_rcu_sched); |
1972 | 2185 | ||
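
For context, the usual caller-side pattern that feeds __call_rcu() through call_rcu_sched(): embed an rcu_head in the structure being protected and recover the enclosing object with container_of() in the callback. This is a schematic kernel-style fragment; struct foo and both function names are made up for illustration:

struct foo {
    int data;
    struct rcu_head rcu;        /* storage used by the callback machinery */
};

static void foo_reclaim(struct rcu_head *head)
{
    struct foo *fp = container_of(head, struct foo, rcu);

    kfree(fp);                  /* runs only after a full grace period */
}

static void foo_retire(struct foo *fp)
{
    /* Queue fp for freeing once all pre-existing readers have finished. */
    call_rcu_sched(&fp->rcu, foo_reclaim);
}
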
@@ -1975,7 +2188,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); | |||
1975 | */ | 2188 | */ |
1976 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 2189 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
1977 | { | 2190 | { |
1978 | __call_rcu(head, func, &rcu_bh_state, 0); | 2191 | __call_rcu(head, func, &rcu_bh_state, -1, 0); |
1979 | } | 2192 | } |
1980 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 2193 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
1981 | 2194 | ||
@@ -2011,10 +2224,28 @@ static inline int rcu_blocking_is_gp(void) | |||
2011 | * rcu_read_lock_sched(). | 2224 | * rcu_read_lock_sched(). |
2012 | * | 2225 | * |
2013 | * This means that all preempt_disable code sequences, including NMI and | 2226 | * This means that all preempt_disable code sequences, including NMI and |
2014 | * hardware-interrupt handlers, in progress on entry will have completed | 2227 | * non-threaded hardware-interrupt handlers, in progress on entry will |
2015 | * before this primitive returns. However, this does not guarantee that | 2228 | * have completed before this primitive returns. However, this does not |
2016 | * softirq handlers will have completed, since in some kernels, these | 2229 | * guarantee that softirq handlers will have completed, since in some |
2017 | * handlers can run in process context, and can block. | 2230 | * kernels, these handlers can run in process context, and can block. |
2231 | * | ||
2232 | * Note that this guarantee implies further memory-ordering guarantees. | ||
2233 | * On systems with more than one CPU, when synchronize_sched() returns, | ||
2234 | * each CPU is guaranteed to have executed a full memory barrier since the | ||
2235 | * end of its last RCU-sched read-side critical section whose beginning | ||
2236 | * preceded the call to synchronize_sched(). In addition, each CPU having | ||
2237 | * an RCU read-side critical section that extends beyond the return from | ||
2238 | * synchronize_sched() is guaranteed to have executed a full memory barrier | ||
2239 | * after the beginning of synchronize_sched() and before the beginning of | ||
2240 | * that RCU read-side critical section. Note that these guarantees include | ||
2241 | * CPUs that are offline, idle, or executing in user mode, as well as CPUs | ||
2242 | * that are executing in the kernel. | ||
2243 | * | ||
2244 | * Furthermore, if CPU A invoked synchronize_sched(), which returned | ||
2245 | * to its caller on CPU B, then both CPU A and CPU B are guaranteed | ||
2246 | * to have executed a full memory barrier during the execution of | ||
2247 | * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but | ||
2248 | * again only if the system has more than one CPU). | ||
2018 | * | 2249 | * |
2019 | * This primitive provides the guarantees made by the (now removed) | 2250 | * This primitive provides the guarantees made by the (now removed) |
2020 | * synchronize_kernel() API. In contrast, synchronize_rcu() only | 2251 | * synchronize_kernel() API. In contrast, synchronize_rcu() only |
@@ -2030,7 +2261,10 @@ void synchronize_sched(void) | |||
2030 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); | 2261 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); |
2031 | if (rcu_blocking_is_gp()) | 2262 | if (rcu_blocking_is_gp()) |
2032 | return; | 2263 | return; |
2033 | wait_rcu_gp(call_rcu_sched); | 2264 | if (rcu_expedited) |
2265 | synchronize_sched_expedited(); | ||
2266 | else | ||
2267 | wait_rcu_gp(call_rcu_sched); | ||
2034 | } | 2268 | } |
2035 | EXPORT_SYMBOL_GPL(synchronize_sched); | 2269 | EXPORT_SYMBOL_GPL(synchronize_sched); |
2036 | 2270 | ||
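
The ordering guarantees documented above are what make the classic update sequence safe: unpublish the pointer, call synchronize_sched() to wait out every reader that might still hold a reference (readers run with preemption disabled), then free. A schematic kernel-style fragment; struct foo, global_foo, and the surrounding update-side locking are illustrative assumptions, not part of this patch:

struct foo {
    int data;
};

static struct foo *global_foo;  /* hypothetical pointer published to readers */

/* Readers access global_foo inside preempt-disabled regions,
 * e.g. under rcu_read_lock_sched(). */

static void retire_foo(void)    /* caller is assumed to serialize updates */
{
    struct foo *old = global_foo;

    rcu_assign_pointer(global_foo, NULL);   /* unpublish */
    synchronize_sched();        /* all pre-existing readers have now finished */
    kfree(old);                 /* so nobody can still be referencing it */
}
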
@@ -2042,6 +2276,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched); | |||
2042 | * read-side critical sections have completed. RCU read-side critical | 2276 | * read-side critical sections have completed. RCU read-side critical |
2043 | * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), | 2277 | * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), |
2044 | * and may be nested. | 2278 | * and may be nested. |
2279 | * | ||
2280 | * See the description of synchronize_sched() for more detailed information | ||
2281 | * on memory ordering guarantees. | ||
2045 | */ | 2282 | */ |
2046 | void synchronize_rcu_bh(void) | 2283 | void synchronize_rcu_bh(void) |
2047 | { | 2284 | { |
@@ -2051,13 +2288,13 @@ void synchronize_rcu_bh(void) | |||
2051 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); | 2288 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); |
2052 | if (rcu_blocking_is_gp()) | 2289 | if (rcu_blocking_is_gp()) |
2053 | return; | 2290 | return; |
2054 | wait_rcu_gp(call_rcu_bh); | 2291 | if (rcu_expedited) |
2292 | synchronize_rcu_bh_expedited(); | ||
2293 | else | ||
2294 | wait_rcu_gp(call_rcu_bh); | ||
2055 | } | 2295 | } |
2056 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); | 2296 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); |
2057 | 2297 | ||
2058 | static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); | ||
2059 | static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); | ||
2060 | |||
2061 | static int synchronize_sched_expedited_cpu_stop(void *data) | 2298 | static int synchronize_sched_expedited_cpu_stop(void *data) |
2062 | { | 2299 | { |
2063 | /* | 2300 | /* |
@@ -2114,10 +2351,32 @@ static int synchronize_sched_expedited_cpu_stop(void *data) | |||
2114 | */ | 2351 | */ |
2115 | void synchronize_sched_expedited(void) | 2352 | void synchronize_sched_expedited(void) |
2116 | { | 2353 | { |
2117 | int firstsnap, s, snap, trycount = 0; | 2354 | long firstsnap, s, snap; |
2355 | int trycount = 0; | ||
2356 | struct rcu_state *rsp = &rcu_sched_state; | ||
2118 | 2357 | ||
2119 | /* Note that atomic_inc_return() implies full memory barrier. */ | 2358 | /* |
2120 | firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); | 2359 | * If we are in danger of counter wrap, just do synchronize_sched(). |
2360 | * By allowing sync_sched_expedited_started to advance no more than | ||
2361 | * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring | ||
2362 | * that more than 3.5 billion CPUs would be required to force a | ||
2363 | * counter wrap on a 32-bit system. Quite a few more CPUs would of | ||
2364 | * course be required on a 64-bit system. | ||
2365 | */ | ||
2366 | if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), | ||
2367 | (ulong)atomic_long_read(&rsp->expedited_done) + | ||
2368 | ULONG_MAX / 8)) { | ||
2369 | synchronize_sched(); | ||
2370 | atomic_long_inc(&rsp->expedited_wrap); | ||
2371 | return; | ||
2372 | } | ||
2373 | |||
2374 | /* | ||
2375 | * Take a ticket. Note that atomic_inc_return() implies a | ||
2376 | * full memory barrier. | ||
2377 | */ | ||
2378 | snap = atomic_long_inc_return(&rsp->expedited_start); | ||
2379 | firstsnap = snap; | ||
2121 | get_online_cpus(); | 2380 | get_online_cpus(); |
2122 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); | 2381 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); |
2123 | 2382 | ||
@@ -2129,48 +2388,65 @@ void synchronize_sched_expedited(void) | |||
2129 | synchronize_sched_expedited_cpu_stop, | 2388 | synchronize_sched_expedited_cpu_stop, |
2130 | NULL) == -EAGAIN) { | 2389 | NULL) == -EAGAIN) { |
2131 | put_online_cpus(); | 2390 | put_online_cpus(); |
2391 | atomic_long_inc(&rsp->expedited_tryfail); | ||
2392 | |||
2393 | /* Check to see if someone else did our work for us. */ | ||
2394 | s = atomic_long_read(&rsp->expedited_done); | ||
2395 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { | ||
2396 | /* ensure test happens before caller kfree */ | ||
2397 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
2398 | atomic_long_inc(&rsp->expedited_workdone1); | ||
2399 | return; | ||
2400 | } | ||
2132 | 2401 | ||
2133 | /* No joy, try again later. Or just synchronize_sched(). */ | 2402 | /* No joy, try again later. Or just synchronize_sched(). */ |
2134 | if (trycount++ < 10) { | 2403 | if (trycount++ < 10) { |
2135 | udelay(trycount * num_online_cpus()); | 2404 | udelay(trycount * num_online_cpus()); |
2136 | } else { | 2405 | } else { |
2137 | synchronize_sched(); | 2406 | wait_rcu_gp(call_rcu_sched); |
2407 | atomic_long_inc(&rsp->expedited_normal); | ||
2138 | return; | 2408 | return; |
2139 | } | 2409 | } |
2140 | 2410 | ||
2141 | /* Check to see if someone else did our work for us. */ | 2411 | /* Recheck to see if someone else did our work for us. */ |
2142 | s = atomic_read(&sync_sched_expedited_done); | 2412 | s = atomic_long_read(&rsp->expedited_done); |
2143 | if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { | 2413 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { |
2144 | smp_mb(); /* ensure test happens before caller kfree */ | 2414 | /* ensure test happens before caller kfree */ |
2415 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
2416 | atomic_long_inc(&rsp->expedited_workdone2); | ||
2145 | return; | 2417 | return; |
2146 | } | 2418 | } |
2147 | 2419 | ||
2148 | /* | 2420 | /* |
2149 | * Refetching sync_sched_expedited_started allows later | 2421 | * Refetching sync_sched_expedited_started allows later |
2150 | * callers to piggyback on our grace period. We subtract | 2422 | * callers to piggyback on our grace period. We retry |
2151 | * 1 to get the same token that the last incrementer got. | 2423 | * after they started, so our grace period works for them, |
2152 | * We retry after they started, so our grace period works | 2424 | * and they started after our first try, so their grace |
2153 | * for them, and they started after our first try, so their | 2425 | * period works for us. |
2154 | * grace period works for us. | ||
2155 | */ | 2426 | */ |
2156 | get_online_cpus(); | 2427 | get_online_cpus(); |
2157 | snap = atomic_read(&sync_sched_expedited_started); | 2428 | snap = atomic_long_read(&rsp->expedited_start); |
2158 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | 2429 | smp_mb(); /* ensure read is before try_stop_cpus(). */ |
2159 | } | 2430 | } |
2431 | atomic_long_inc(&rsp->expedited_stoppedcpus); | ||
2160 | 2432 | ||
2161 | /* | 2433 | /* |
2162 | * Everyone up to our most recent fetch is covered by our grace | 2434 | * Everyone up to our most recent fetch is covered by our grace |
2163 | * period. Update the counter, but only if our work is still | 2435 | * period. Update the counter, but only if our work is still |
2164 | * relevant -- which it won't be if someone who started later | 2436 | * relevant -- which it won't be if someone who started later |
2165 | * than we did beat us to the punch. | 2437 | * than we did already did their update. |
2166 | */ | 2438 | */ |
2167 | do { | 2439 | do { |
2168 | s = atomic_read(&sync_sched_expedited_done); | 2440 | atomic_long_inc(&rsp->expedited_done_tries); |
2169 | if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { | 2441 | s = atomic_long_read(&rsp->expedited_done); |
2170 | smp_mb(); /* ensure test happens before caller kfree */ | 2442 | if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { |
2443 | /* ensure test happens before caller kfree */ | ||
2444 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
2445 | atomic_long_inc(&rsp->expedited_done_lost); | ||
2171 | break; | 2446 | break; |
2172 | } | 2447 | } |
2173 | } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); | 2448 | } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); |
2449 | atomic_long_inc(&rsp->expedited_done_exit); | ||
2174 | 2450 | ||
2175 | put_online_cpus(); | 2451 | put_online_cpus(); |
2176 | } | 2452 | } |
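Taken together, the two hunks above implement a ticket scheme: expedited_start hands out tickets, and expedited_done publishes the highest ticket known to be covered by a completed grace period, so late arrivals can detect that someone else already did their work. The following compact user-space model uses C11 atomics; the names mirror the new rcu_state fields, but this is a sketch of the idea, not the kernel implementation.

/*
 * User-space sketch of the ticket scheme: take a ticket, check whether
 * the published "done" ticket already covers it, and advance "done"
 * with a compare-and-swap only if our snapshot is newer.
 */
#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))

static atomic_ulong expedited_start;	/* models rsp->expedited_start */
static atomic_ulong expedited_done;	/* models rsp->expedited_done */

/* Returns true if someone else's grace period already covers our ticket. */
static bool work_already_done(unsigned long firstsnap)
{
	return ULONG_CMP_GE(atomic_load(&expedited_done), firstsnap);
}

/* Publish "everything up to snap is done", unless someone beat us to it. */
static void advance_done(unsigned long snap)
{
	unsigned long s = atomic_load(&expedited_done);

	do {
		if (ULONG_CMP_GE(s, snap))
			return;		/* lost the race: done is already newer */
	} while (!atomic_compare_exchange_weak(&expedited_done, &s, snap));
}

int main(void)
{
	unsigned long snap = atomic_fetch_add(&expedited_start, 1) + 1;

	printf("ticket %lu, covered? %d\n", snap, work_already_done(snap));
	/* ... the grace-period machinery would run here ... */
	advance_done(snap);
	printf("after advance, covered? %d\n", work_already_done(snap));
	return 0;
}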
@@ -2195,17 +2471,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2195 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 2471 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
2196 | if (rcu_scheduler_fully_active && | 2472 | if (rcu_scheduler_fully_active && |
2197 | rdp->qs_pending && !rdp->passed_quiesce) { | 2473 | rdp->qs_pending && !rdp->passed_quiesce) { |
2198 | |||
2199 | /* | ||
2200 | * If force_quiescent_state() coming soon and this CPU | ||
2201 | * needs a quiescent state, and this is either RCU-sched | ||
2202 | * or RCU-bh, force a local reschedule. | ||
2203 | */ | ||
2204 | rdp->n_rp_qs_pending++; | 2474 | rdp->n_rp_qs_pending++; |
2205 | if (!rdp->preemptible && | ||
2206 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, | ||
2207 | jiffies)) | ||
2208 | set_need_resched(); | ||
2209 | } else if (rdp->qs_pending && rdp->passed_quiesce) { | 2475 | } else if (rdp->qs_pending && rdp->passed_quiesce) { |
2210 | rdp->n_rp_report_qs++; | 2476 | rdp->n_rp_report_qs++; |
2211 | return 1; | 2477 | return 1; |
@@ -2235,13 +2501,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2235 | return 1; | 2501 | return 1; |
2236 | } | 2502 | } |
2237 | 2503 | ||
2238 | /* Has an RCU GP gone long enough to send resched IPIs &c? */ | ||
2239 | if (rcu_gp_in_progress(rsp) && | ||
2240 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { | ||
2241 | rdp->n_rp_need_fqs++; | ||
2242 | return 1; | ||
2243 | } | ||
2244 | |||
2245 | /* nothing to do */ | 2504 | /* nothing to do */ |
2246 | rdp->n_rp_need_nothing++; | 2505 | rdp->n_rp_need_nothing++; |
2247 | return 0; | 2506 | return 0; |
@@ -2326,13 +2585,10 @@ static void rcu_barrier_func(void *type) | |||
2326 | static void _rcu_barrier(struct rcu_state *rsp) | 2585 | static void _rcu_barrier(struct rcu_state *rsp) |
2327 | { | 2586 | { |
2328 | int cpu; | 2587 | int cpu; |
2329 | unsigned long flags; | ||
2330 | struct rcu_data *rdp; | 2588 | struct rcu_data *rdp; |
2331 | struct rcu_data rd; | ||
2332 | unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); | 2589 | unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); |
2333 | unsigned long snap_done; | 2590 | unsigned long snap_done; |
2334 | 2591 | ||
2335 | init_rcu_head_on_stack(&rd.barrier_head); | ||
2336 | _rcu_barrier_trace(rsp, "Begin", -1, snap); | 2592 | _rcu_barrier_trace(rsp, "Begin", -1, snap); |
2337 | 2593 | ||
2338 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 2594 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
@@ -2372,70 +2628,38 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
2372 | /* | 2628 | /* |
2373 | * Initialize the count to one rather than to zero in order to | 2629 | * Initialize the count to one rather than to zero in order to |
2374 | * avoid a too-soon return to zero in case of a short grace period | 2630 | * avoid a too-soon return to zero in case of a short grace period |
2375 | * (or preemption of this task). Also flag this task as doing | 2631 | * (or preemption of this task). Exclude CPU-hotplug operations |
2376 | * an rcu_barrier(). This will prevent anyone else from adopting | 2632 | * to ensure that no offline CPU has callbacks queued. |
2377 | * orphaned callbacks, which could cause otherwise failure if a | ||
2378 | * CPU went offline and quickly came back online. To see this, | ||
2379 | * consider the following sequence of events: | ||
2380 | * | ||
2381 | * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. | ||
2382 | * 2. CPU 1 goes offline, orphaning its callbacks. | ||
2383 | * 3. CPU 0 adopts CPU 1's orphaned callbacks. | ||
2384 | * 4. CPU 1 comes back online. | ||
2385 | * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. | ||
2386 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening | ||
2387 | * us -- but before CPU 1's orphaned callbacks are invoked!!! | ||
2388 | */ | 2633 | */ |
2389 | init_completion(&rsp->barrier_completion); | 2634 | init_completion(&rsp->barrier_completion); |
2390 | atomic_set(&rsp->barrier_cpu_count, 1); | 2635 | atomic_set(&rsp->barrier_cpu_count, 1); |
2391 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 2636 | get_online_cpus(); |
2392 | rsp->rcu_barrier_in_progress = current; | ||
2393 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2394 | 2637 | ||
2395 | /* | 2638 | /* |
2396 | * Force every CPU with callbacks to register a new callback | 2639 | * Force each CPU with callbacks to register a new callback. |
2397 | * that will tell us when all the preceding callbacks have | 2640 | * When that callback is invoked, we will know that all of the |
2398 | * been invoked. If an offline CPU has callbacks, wait for | 2641 | * corresponding CPU's preceding callbacks have been invoked. |
2399 | * it to either come back online or to finish orphaning those | ||
2400 | * callbacks. | ||
2401 | */ | 2642 | */ |
2402 | for_each_possible_cpu(cpu) { | 2643 | for_each_possible_cpu(cpu) { |
2403 | preempt_disable(); | 2644 | if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) |
2645 | continue; | ||
2404 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2646 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2405 | if (cpu_is_offline(cpu)) { | 2647 | if (is_nocb_cpu(cpu)) { |
2406 | _rcu_barrier_trace(rsp, "Offline", cpu, | 2648 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, |
2407 | rsp->n_barrier_done); | 2649 | rsp->n_barrier_done); |
2408 | preempt_enable(); | 2650 | atomic_inc(&rsp->barrier_cpu_count); |
2409 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) | 2651 | __call_rcu(&rdp->barrier_head, rcu_barrier_callback, |
2410 | schedule_timeout_interruptible(1); | 2652 | rsp, cpu, 0); |
2411 | } else if (ACCESS_ONCE(rdp->qlen)) { | 2653 | } else if (ACCESS_ONCE(rdp->qlen)) { |
2412 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, | 2654 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
2413 | rsp->n_barrier_done); | 2655 | rsp->n_barrier_done); |
2414 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); | 2656 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); |
2415 | preempt_enable(); | ||
2416 | } else { | 2657 | } else { |
2417 | _rcu_barrier_trace(rsp, "OnlineNQ", cpu, | 2658 | _rcu_barrier_trace(rsp, "OnlineNQ", cpu, |
2418 | rsp->n_barrier_done); | 2659 | rsp->n_barrier_done); |
2419 | preempt_enable(); | ||
2420 | } | 2660 | } |
2421 | } | 2661 | } |
2422 | 2662 | put_online_cpus(); | |
2423 | /* | ||
2424 | * Now that all online CPUs have rcu_barrier_callback() callbacks | ||
2425 | * posted, we can adopt all of the orphaned callbacks and place | ||
2426 | * an rcu_barrier_callback() callback after them. When that is done, | ||
2427 | * we are guaranteed to have an rcu_barrier_callback() callback | ||
2428 | * following every callback that could possibly have been | ||
2429 | * registered before _rcu_barrier() was called. | ||
2430 | */ | ||
2431 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
2432 | rcu_adopt_orphan_cbs(rsp); | ||
2433 | rsp->rcu_barrier_in_progress = NULL; | ||
2434 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2435 | atomic_inc(&rsp->barrier_cpu_count); | ||
2436 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ | ||
2437 | rd.rsp = rsp; | ||
2438 | rsp->call(&rd.barrier_head, rcu_barrier_callback); | ||
2439 | 2663 | ||
2440 | /* | 2664 | /* |
2441 | * Now that we have an rcu_barrier_callback() callback on each | 2665 | * Now that we have an rcu_barrier_callback() callback on each |
@@ -2456,8 +2680,6 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
2456 | 2680 | ||
2457 | /* Other rcu_barrier() invocations can now safely proceed. */ | 2681 | /* Other rcu_barrier() invocations can now safely proceed. */ |
2458 | mutex_unlock(&rsp->barrier_mutex); | 2682 | mutex_unlock(&rsp->barrier_mutex); |
2459 | |||
2460 | destroy_rcu_head_on_stack(&rd.barrier_head); | ||
2461 | } | 2683 | } |
2462 | 2684 | ||
2463 | /** | 2685 | /** |
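The "initialize the count to one" trick in the rewritten _rcu_barrier() is a reference-count-style guard: the extra reference keeps barrier_cpu_count from dropping to zero while callbacks are still being posted, and the initiator releases it only after the posting loop has finished. The pthread program below models just that counting scheme; it is illustrative only, with complete()/wait_for_completion() stood in for by a condition variable.

/*
 * User-space model (pthreads, not the kernel implementation) of the
 * _rcu_barrier() counting scheme.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int barrier_cpu_count;

static void barrier_callback(void)		/* models rcu_barrier_callback() */
{
	pthread_mutex_lock(&lock);
	if (--barrier_cpu_count == 0)
		pthread_cond_signal(&done);	/* models complete() */
	pthread_mutex_unlock(&lock);
}

static void *fake_cpu(void *arg)
{
	(void)arg;
	barrier_callback();			/* the callback fires "later" */
	return NULL;
}

int main(void)
{
	pthread_t cpus[4];

	barrier_cpu_count = 1;			/* the initiator's reference */
	for (int i = 0; i < 4; i++) {
		pthread_mutex_lock(&lock);
		barrier_cpu_count++;		/* one per posted callback */
		pthread_mutex_unlock(&lock);
		pthread_create(&cpus[i], NULL, fake_cpu, NULL);
	}

	pthread_mutex_lock(&lock);
	barrier_cpu_count--;			/* drop the initial reference */
	while (barrier_cpu_count != 0)		/* models wait_for_completion() */
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
	puts("all callbacks invoked");

	for (int i = 0; i < 4; i++)
		pthread_join(cpus[i], NULL);
	return 0;
}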
@@ -2497,8 +2719,12 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
2497 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 2719 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
2498 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); | 2720 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); |
2499 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | 2721 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); |
2722 | #ifdef CONFIG_RCU_USER_QS | ||
2723 | WARN_ON_ONCE(rdp->dynticks->in_user); | ||
2724 | #endif | ||
2500 | rdp->cpu = cpu; | 2725 | rdp->cpu = cpu; |
2501 | rdp->rsp = rsp; | 2726 | rdp->rsp = rsp; |
2727 | rcu_boot_init_nocb_percpu_data(rdp); | ||
2502 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2728 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
2503 | } | 2729 | } |
2504 | 2730 | ||
@@ -2516,6 +2742,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2516 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 2742 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
2517 | struct rcu_node *rnp = rcu_get_root(rsp); | 2743 | struct rcu_node *rnp = rcu_get_root(rsp); |
2518 | 2744 | ||
2745 | /* Exclude new grace periods. */ | ||
2746 | mutex_lock(&rsp->onoff_mutex); | ||
2747 | |||
2519 | /* Set up local state, ensuring consistent view of global state. */ | 2748 | /* Set up local state, ensuring consistent view of global state. */ |
2520 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2749 | raw_spin_lock_irqsave(&rnp->lock, flags); |
2521 | rdp->beenonline = 1; /* We have now been online. */ | 2750 | rdp->beenonline = 1; /* We have now been online. */ |
@@ -2523,20 +2752,13 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2523 | rdp->qlen_last_fqs_check = 0; | 2752 | rdp->qlen_last_fqs_check = 0; |
2524 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2753 | rdp->n_force_qs_snap = rsp->n_force_qs; |
2525 | rdp->blimit = blimit; | 2754 | rdp->blimit = blimit; |
2755 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | ||
2526 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 2756 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
2527 | atomic_set(&rdp->dynticks->dynticks, | 2757 | atomic_set(&rdp->dynticks->dynticks, |
2528 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 2758 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); |
2529 | rcu_prepare_for_idle_init(cpu); | 2759 | rcu_prepare_for_idle_init(cpu); |
2530 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2760 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
2531 | 2761 | ||
2532 | /* | ||
2533 | * A new grace period might start here. If so, we won't be part | ||
2534 | * of it, but that is OK, as we are currently in a quiescent state. | ||
2535 | */ | ||
2536 | |||
2537 | /* Exclude any attempts to start a new GP on large systems. */ | ||
2538 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | ||
2539 | |||
2540 | /* Add CPU to rcu_node bitmasks. */ | 2762 | /* Add CPU to rcu_node bitmasks. */ |
2541 | rnp = rdp->mynode; | 2763 | rnp = rdp->mynode; |
2542 | mask = rdp->grpmask; | 2764 | mask = rdp->grpmask; |
@@ -2555,14 +2777,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2555 | rdp->completed = rnp->completed; | 2777 | rdp->completed = rnp->completed; |
2556 | rdp->passed_quiesce = 0; | 2778 | rdp->passed_quiesce = 0; |
2557 | rdp->qs_pending = 0; | 2779 | rdp->qs_pending = 0; |
2558 | rdp->passed_quiesce_gpnum = rnp->gpnum - 1; | ||
2559 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); | 2780 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); |
2560 | } | 2781 | } |
2561 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | 2782 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
2562 | rnp = rnp->parent; | 2783 | rnp = rnp->parent; |
2563 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); | 2784 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); |
2785 | local_irq_restore(flags); | ||
2564 | 2786 | ||
2565 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 2787 | mutex_unlock(&rsp->onoff_mutex); |
2566 | } | 2788 | } |
2567 | 2789 | ||
2568 | static void __cpuinit rcu_prepare_cpu(int cpu) | 2790 | static void __cpuinit rcu_prepare_cpu(int cpu) |
@@ -2584,6 +2806,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2584 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 2806 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); |
2585 | struct rcu_node *rnp = rdp->mynode; | 2807 | struct rcu_node *rnp = rdp->mynode; |
2586 | struct rcu_state *rsp; | 2808 | struct rcu_state *rsp; |
2809 | int ret = NOTIFY_OK; | ||
2587 | 2810 | ||
2588 | trace_rcu_utilization("Start CPU hotplug"); | 2811 | trace_rcu_utilization("Start CPU hotplug"); |
2589 | switch (action) { | 2812 | switch (action) { |
@@ -2594,12 +2817,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2594 | break; | 2817 | break; |
2595 | case CPU_ONLINE: | 2818 | case CPU_ONLINE: |
2596 | case CPU_DOWN_FAILED: | 2819 | case CPU_DOWN_FAILED: |
2597 | rcu_node_kthread_setaffinity(rnp, -1); | 2820 | rcu_boost_kthread_setaffinity(rnp, -1); |
2598 | rcu_cpu_kthread_setrt(cpu, 1); | ||
2599 | break; | 2821 | break; |
2600 | case CPU_DOWN_PREPARE: | 2822 | case CPU_DOWN_PREPARE: |
2601 | rcu_node_kthread_setaffinity(rnp, cpu); | 2823 | if (nocb_cpu_expendable(cpu)) |
2602 | rcu_cpu_kthread_setrt(cpu, 0); | 2824 | rcu_boost_kthread_setaffinity(rnp, cpu); |
2825 | else | ||
2826 | ret = NOTIFY_BAD; | ||
2603 | break; | 2827 | break; |
2604 | case CPU_DYING: | 2828 | case CPU_DYING: |
2605 | case CPU_DYING_FROZEN: | 2829 | case CPU_DYING_FROZEN: |
@@ -2623,8 +2847,31 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2623 | break; | 2847 | break; |
2624 | } | 2848 | } |
2625 | trace_rcu_utilization("End CPU hotplug"); | 2849 | trace_rcu_utilization("End CPU hotplug"); |
2626 | return NOTIFY_OK; | 2850 | return ret; |
2851 | } | ||
2852 | |||
2853 | /* | ||
2854 | * Spawn the kthread that handles this RCU flavor's grace periods. | ||
2855 | */ | ||
2856 | static int __init rcu_spawn_gp_kthread(void) | ||
2857 | { | ||
2858 | unsigned long flags; | ||
2859 | struct rcu_node *rnp; | ||
2860 | struct rcu_state *rsp; | ||
2861 | struct task_struct *t; | ||
2862 | |||
2863 | for_each_rcu_flavor(rsp) { | ||
2864 | t = kthread_run(rcu_gp_kthread, rsp, rsp->name); | ||
2865 | BUG_ON(IS_ERR(t)); | ||
2866 | rnp = rcu_get_root(rsp); | ||
2867 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
2868 | rsp->gp_kthread = t; | ||
2869 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
2870 | rcu_spawn_nocb_kthreads(rsp); | ||
2871 | } | ||
2872 | return 0; | ||
2627 | } | 2873 | } |
2874 | early_initcall(rcu_spawn_gp_kthread); | ||
2628 | 2875 | ||
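rcu_spawn_gp_kthread() ties into the new kthread-driven grace-period design: each flavor gets one worker that sleeps on gp_wq until a command bit is set in gp_flags. The pthread sketch below models only that handoff; the names mirror the new fields, but none of this is the kernel code.

/*
 * User-space model of the gp_kthread handoff: a worker blocks until a
 * caller sets RCU_GP_FLAG_INIT and wakes it.  Illustration only.
 */
#include <pthread.h>
#include <stdio.h>

#define RCU_GP_FLAG_INIT 0x1		/* need grace-period initialization */

struct flavor {
	const char *name;
	pthread_mutex_t lock;
	pthread_cond_t gp_wq;		/* models rsp->gp_wq */
	int gp_flags;			/* models rsp->gp_flags */
	pthread_t gp_kthread;		/* models rsp->gp_kthread */
};

static void *rcu_gp_kthread(void *arg)	/* models the per-flavor GP kthread */
{
	struct flavor *rsp = arg;

	pthread_mutex_lock(&rsp->lock);
	while (!(rsp->gp_flags & RCU_GP_FLAG_INIT))	/* models wait_event() */
		pthread_cond_wait(&rsp->gp_wq, &rsp->lock);
	rsp->gp_flags &= ~RCU_GP_FLAG_INIT;
	pthread_mutex_unlock(&rsp->lock);
	printf("%s: grace period started by kthread\n", rsp->name);
	return NULL;
}

int main(void)
{
	static struct flavor rsp = {
		.name  = "rcu_sched",
		.lock  = PTHREAD_MUTEX_INITIALIZER,
		.gp_wq = PTHREAD_COND_INITIALIZER,
	};

	/* rcu_spawn_gp_kthread(): create the worker and record it. */
	pthread_create(&rsp.gp_kthread, NULL, rcu_gp_kthread, &rsp);

	/* A caller requesting a grace period: set the flag, wake the worker. */
	pthread_mutex_lock(&rsp.lock);
	rsp.gp_flags |= RCU_GP_FLAG_INIT;
	pthread_cond_signal(&rsp.gp_wq);
	pthread_mutex_unlock(&rsp.lock);

	pthread_join(rsp.gp_kthread, NULL);
	return 0;
}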
2629 | /* | 2876 | /* |
2630 | * This function is invoked towards the end of the scheduler's initialization | 2877 | * This function is invoked towards the end of the scheduler's initialization |
@@ -2661,7 +2908,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
2661 | int cprv; | 2908 | int cprv; |
2662 | int i; | 2909 | int i; |
2663 | 2910 | ||
2664 | cprv = NR_CPUS; | 2911 | cprv = nr_cpu_ids; |
2665 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | 2912 | for (i = rcu_num_lvls - 1; i >= 0; i--) { |
2666 | ccur = rsp->levelcnt[i]; | 2913 | ccur = rsp->levelcnt[i]; |
2667 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | 2914 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; |
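Switching cprv from NR_CPUS to nr_cpu_ids feeds the actual CPU count into the bottom-up level-spread calculation, which is a ceiling division at each level. The worked example below is plain C with a made-up geometry (levelcnt[] = {1, 4}, nr_cpu_ids = 50) chosen purely for illustration; only the loop body matches the code above.

/* Worked example of the level-spread ceiling division. */
#include <stdio.h>

int main(void)
{
	int levelcnt[] = { 1, 4 };	/* nodes per level, root first (assumed) */
	int levelspread[2];
	int nr_cpu_ids = 50;		/* assumed number of possible CPUs */
	int cprv = nr_cpu_ids;		/* start from the CPUs, not NR_CPUS */
	int ccur;

	for (int i = 1; i >= 0; i--) {
		ccur = levelcnt[i];
		levelspread[i] = (cprv + ccur - 1) / ccur;  /* ceil(cprv / ccur) */
		cprv = ccur;
	}
	/* levelspread[1] (leaf) = ceil(50/4) = 13; levelspread[0] (root) = ceil(4/1) = 4. */
	printf("levelspread = { %d, %d }\n", levelspread[0], levelspread[1]);
	return 0;
}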
@@ -2676,10 +2923,14 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
2676 | static void __init rcu_init_one(struct rcu_state *rsp, | 2923 | static void __init rcu_init_one(struct rcu_state *rsp, |
2677 | struct rcu_data __percpu *rda) | 2924 | struct rcu_data __percpu *rda) |
2678 | { | 2925 | { |
2679 | static char *buf[] = { "rcu_node_level_0", | 2926 | static char *buf[] = { "rcu_node_0", |
2680 | "rcu_node_level_1", | 2927 | "rcu_node_1", |
2681 | "rcu_node_level_2", | 2928 | "rcu_node_2", |
2682 | "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ | 2929 | "rcu_node_3" }; /* Match MAX_RCU_LVLS */ |
2930 | static char *fqs[] = { "rcu_node_fqs_0", | ||
2931 | "rcu_node_fqs_1", | ||
2932 | "rcu_node_fqs_2", | ||
2933 | "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ | ||
2683 | int cpustride = 1; | 2934 | int cpustride = 1; |
2684 | int i; | 2935 | int i; |
2685 | int j; | 2936 | int j; |
@@ -2704,7 +2955,11 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
2704 | raw_spin_lock_init(&rnp->lock); | 2955 | raw_spin_lock_init(&rnp->lock); |
2705 | lockdep_set_class_and_name(&rnp->lock, | 2956 | lockdep_set_class_and_name(&rnp->lock, |
2706 | &rcu_node_class[i], buf[i]); | 2957 | &rcu_node_class[i], buf[i]); |
2707 | rnp->gpnum = 0; | 2958 | raw_spin_lock_init(&rnp->fqslock); |
2959 | lockdep_set_class_and_name(&rnp->fqslock, | ||
2960 | &rcu_fqs_class[i], fqs[i]); | ||
2961 | rnp->gpnum = rsp->gpnum; | ||
2962 | rnp->completed = rsp->completed; | ||
2708 | rnp->qsmask = 0; | 2963 | rnp->qsmask = 0; |
2709 | rnp->qsmaskinit = 0; | 2964 | rnp->qsmaskinit = 0; |
2710 | rnp->grplo = j * cpustride; | 2965 | rnp->grplo = j * cpustride; |
@@ -2727,6 +2982,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
2727 | } | 2982 | } |
2728 | 2983 | ||
2729 | rsp->rda = rda; | 2984 | rsp->rda = rda; |
2985 | init_waitqueue_head(&rsp->gp_wq); | ||
2730 | rnp = rsp->level[rcu_num_lvls - 1]; | 2986 | rnp = rsp->level[rcu_num_lvls - 1]; |
2731 | for_each_possible_cpu(i) { | 2987 | for_each_possible_cpu(i) { |
2732 | while (i > rnp->grphi) | 2988 | while (i > rnp->grphi) |
@@ -2750,7 +3006,8 @@ static void __init rcu_init_geometry(void) | |||
2750 | int rcu_capacity[MAX_RCU_LVLS + 1]; | 3006 | int rcu_capacity[MAX_RCU_LVLS + 1]; |
2751 | 3007 | ||
2752 | /* If the compile-time values are accurate, just leave. */ | 3008 | /* If the compile-time values are accurate, just leave. */ |
2753 | if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) | 3009 | if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && |
3010 | nr_cpu_ids == NR_CPUS) | ||
2754 | return; | 3011 | return; |
2755 | 3012 | ||
2756 | /* | 3013 | /* |
@@ -2806,6 +3063,7 @@ void __init rcu_init(void) | |||
2806 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | 3063 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
2807 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 3064 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
2808 | __rcu_init_preempt(); | 3065 | __rcu_init_preempt(); |
3066 | rcu_init_nocb(); | ||
2809 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 3067 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
2810 | 3068 | ||
2811 | /* | 3069 | /* |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4d29169f2124..4b69291b093d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -102,6 +102,10 @@ struct rcu_dynticks { | |||
102 | /* idle-period nonlazy_posted snapshot. */ | 102 | /* idle-period nonlazy_posted snapshot. */ |
103 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ | 103 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ |
104 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 104 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
105 | #ifdef CONFIG_RCU_USER_QS | ||
106 | bool ignore_user_qs; /* Treat userspace as extended QS or not */ | ||
107 | bool in_user; /* Is the CPU in userland from RCU POV? */ | ||
108 | #endif | ||
105 | }; | 109 | }; |
106 | 110 | ||
107 | /* RCU's kthread states for tracing. */ | 111 | /* RCU's kthread states for tracing. */ |
@@ -196,12 +200,7 @@ struct rcu_node { | |||
196 | /* Refused to boost: not sure why, though. */ | 200 | /* Refused to boost: not sure why, though. */ |
197 | /* This can happen due to race conditions. */ | 201 | /* This can happen due to race conditions. */ |
198 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 202 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
199 | struct task_struct *node_kthread_task; | 203 | raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; |
200 | /* kthread that takes care of this rcu_node */ | ||
201 | /* structure, for example, awakening the */ | ||
202 | /* per-CPU kthreads as needed. */ | ||
203 | unsigned int node_kthread_status; | ||
204 | /* State of node_kthread_task for tracing. */ | ||
205 | } ____cacheline_internodealigned_in_smp; | 204 | } ____cacheline_internodealigned_in_smp; |
206 | 205 | ||
207 | /* | 206 | /* |
@@ -245,8 +244,6 @@ struct rcu_data { | |||
245 | /* in order to detect GP end. */ | 244 | /* in order to detect GP end. */ |
246 | unsigned long gpnum; /* Highest gp number that this CPU */ | 245 | unsigned long gpnum; /* Highest gp number that this CPU */ |
247 | /* is aware of having started. */ | 246 | /* is aware of having started. */ |
248 | unsigned long passed_quiesce_gpnum; | ||
249 | /* gpnum at time of quiescent state. */ | ||
250 | bool passed_quiesce; /* User-mode/idle loop etc. */ | 247 | bool passed_quiesce; /* User-mode/idle loop etc. */ |
251 | bool qs_pending; /* Core waits for quiesc state. */ | 248 | bool qs_pending; /* Core waits for quiesc state. */ |
252 | bool beenonline; /* CPU online at least once. */ | 249 | bool beenonline; /* CPU online at least once. */ |
@@ -290,6 +287,7 @@ struct rcu_data { | |||
290 | long qlen_last_fqs_check; | 287 | long qlen_last_fqs_check; |
291 | /* qlen at last check for QS forcing */ | 288 | /* qlen at last check for QS forcing */ |
292 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | 289 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ |
290 | unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ | ||
293 | unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ | 291 | unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ |
294 | unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ | 292 | unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ |
295 | unsigned long n_force_qs_snap; | 293 | unsigned long n_force_qs_snap; |
@@ -312,11 +310,25 @@ struct rcu_data { | |||
312 | unsigned long n_rp_cpu_needs_gp; | 310 | unsigned long n_rp_cpu_needs_gp; |
313 | unsigned long n_rp_gp_completed; | 311 | unsigned long n_rp_gp_completed; |
314 | unsigned long n_rp_gp_started; | 312 | unsigned long n_rp_gp_started; |
315 | unsigned long n_rp_need_fqs; | ||
316 | unsigned long n_rp_need_nothing; | 313 | unsigned long n_rp_need_nothing; |
317 | 314 | ||
318 | /* 6) _rcu_barrier() callback. */ | 315 | /* 6) _rcu_barrier() and OOM callbacks. */ |
319 | struct rcu_head barrier_head; | 316 | struct rcu_head barrier_head; |
317 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
318 | struct rcu_head oom_head; | ||
319 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
320 | |||
321 | /* 7) Callback offloading. */ | ||
322 | #ifdef CONFIG_RCU_NOCB_CPU | ||
323 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ | ||
324 | struct rcu_head **nocb_tail; | ||
325 | atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ | ||
326 | atomic_long_t nocb_q_count_lazy; /* (approximate). */ | ||
327 | int nocb_p_count; /* # CBs being invoked by kthread */ | ||
328 | int nocb_p_count_lazy; /* (approximate). */ | ||
329 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | ||
330 | struct task_struct *nocb_kthread; | ||
331 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
320 | 332 | ||
321 | int cpu; | 333 | int cpu; |
322 | struct rcu_state *rsp; | 334 | struct rcu_state *rsp; |
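The new ->nocb_head / ->nocb_tail pair describes the usual singly linked callback list with a pointer to the final ->next pointer, which makes enqueue at the tail O(1). The stand-alone C sketch below shows that shape; it is a user-space illustration, not the kernel's queue handling.

/* Sketch of the head + tail-pointer-pointer callback queue. */
#include <stddef.h>
#include <stdio.h>

struct rcu_head {
	struct rcu_head *next;
	void (*func)(struct rcu_head *head);
};

struct nocb_queue {
	struct rcu_head *nocb_head;	/* first queued callback, or NULL */
	struct rcu_head **nocb_tail;	/* points at the last ->next pointer */
};

static void nocb_init(struct nocb_queue *q)
{
	q->nocb_head = NULL;
	q->nocb_tail = &q->nocb_head;	/* empty: tail points at the head field */
}

static void nocb_enqueue(struct nocb_queue *q, struct rcu_head *rhp)
{
	rhp->next = NULL;
	*q->nocb_tail = rhp;		/* link after the current last element */
	q->nocb_tail = &rhp->next;	/* remember the new last ->next pointer */
}

static void print_cb(struct rcu_head *head)
{
	printf("callback %p invoked\n", (void *)head);
}

int main(void)
{
	struct nocb_queue q;
	struct rcu_head a = { .func = print_cb }, b = { .func = print_cb };

	nocb_init(&q);
	nocb_enqueue(&q, &a);
	nocb_enqueue(&q, &b);
	for (struct rcu_head *p = q.nocb_head; p; p = p->next)
		p->func(p);		/* the offload kthread would do this */
	return 0;
}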
@@ -370,26 +382,28 @@ struct rcu_state { | |||
370 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ | 382 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ |
371 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | 383 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ |
372 | void (*func)(struct rcu_head *head)); | 384 | void (*func)(struct rcu_head *head)); |
385 | #ifdef CONFIG_RCU_NOCB_CPU | ||
386 | void (*call_remote)(struct rcu_head *head, | ||
387 | void (*func)(struct rcu_head *head)); | ||
388 | /* call_rcu() flavor, but for */ | ||
389 | /* placing on remote CPU. */ | ||
390 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
373 | 391 | ||
374 | /* The following fields are guarded by the root rcu_node's lock. */ | 392 | /* The following fields are guarded by the root rcu_node's lock. */ |
375 | 393 | ||
376 | u8 fqs_state ____cacheline_internodealigned_in_smp; | 394 | u8 fqs_state ____cacheline_internodealigned_in_smp; |
377 | /* Force QS state. */ | 395 | /* Force QS state. */ |
378 | u8 fqs_active; /* force_quiescent_state() */ | ||
379 | /* is running. */ | ||
380 | u8 fqs_need_gp; /* A CPU was prevented from */ | ||
381 | /* starting a new grace */ | ||
382 | /* period because */ | ||
383 | /* force_quiescent_state() */ | ||
384 | /* was running. */ | ||
385 | u8 boost; /* Subject to priority boost. */ | 396 | u8 boost; /* Subject to priority boost. */ |
386 | unsigned long gpnum; /* Current gp number. */ | 397 | unsigned long gpnum; /* Current gp number. */ |
387 | unsigned long completed; /* # of last completed gp. */ | 398 | unsigned long completed; /* # of last completed gp. */ |
399 | struct task_struct *gp_kthread; /* Task for grace periods. */ | ||
400 | wait_queue_head_t gp_wq; /* Where GP task waits. */ | ||
401 | int gp_flags; /* Commands for GP task. */ | ||
388 | 402 | ||
389 | /* End of fields guarded by root rcu_node's lock. */ | 403 | /* End of fields guarded by root rcu_node's lock. */ |
390 | 404 | ||
391 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 405 | raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; |
392 | /* starting new GP. */ | 406 | /* Protect following fields. */ |
393 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | 407 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ |
394 | /* need a grace period. */ | 408 | /* need a grace period. */ |
395 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | 409 | struct rcu_head **orphan_nxttail; /* Tail of above. */ |
@@ -398,16 +412,29 @@ struct rcu_state { | |||
398 | struct rcu_head **orphan_donetail; /* Tail of above. */ | 412 | struct rcu_head **orphan_donetail; /* Tail of above. */ |
399 | long qlen_lazy; /* Number of lazy callbacks. */ | 413 | long qlen_lazy; /* Number of lazy callbacks. */ |
400 | long qlen; /* Total number of callbacks. */ | 414 | long qlen; /* Total number of callbacks. */ |
401 | struct task_struct *rcu_barrier_in_progress; | 415 | /* End of fields guarded by orphan_lock. */ |
402 | /* Task doing rcu_barrier(), */ | 416 | |
403 | /* or NULL if no barrier. */ | 417 | struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ |
418 | |||
404 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 419 | struct mutex barrier_mutex; /* Guards barrier fields. */ |
405 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | 420 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ |
406 | struct completion barrier_completion; /* Wake at barrier end. */ | 421 | struct completion barrier_completion; /* Wake at barrier end. */ |
407 | unsigned long n_barrier_done; /* ++ at start and end of */ | 422 | unsigned long n_barrier_done; /* ++ at start and end of */ |
408 | /* _rcu_barrier(). */ | 423 | /* _rcu_barrier(). */ |
409 | raw_spinlock_t fqslock; /* Only one task forcing */ | 424 | /* End of fields guarded by barrier_mutex. */ |
410 | /* quiescent states. */ | 425 | |
426 | atomic_long_t expedited_start; /* Starting ticket. */ | ||
427 | atomic_long_t expedited_done; /* Done ticket. */ | ||
428 | atomic_long_t expedited_wrap; /* # near-wrap incidents. */ | ||
429 | atomic_long_t expedited_tryfail; /* # acquisition failures. */ | ||
430 | atomic_long_t expedited_workdone1; /* # done by others #1. */ | ||
431 | atomic_long_t expedited_workdone2; /* # done by others #2. */ | ||
432 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ | ||
433 | atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ | ||
434 | atomic_long_t expedited_done_tries; /* # tries to update _done. */ | ||
435 | atomic_long_t expedited_done_lost; /* # times beaten to _done. */ | ||
436 | atomic_long_t expedited_done_exit; /* # times exited _done loop. */ | ||
437 | |||
411 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 438 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
412 | /* force_quiescent_state(). */ | 439 | /* force_quiescent_state(). */ |
413 | unsigned long n_force_qs; /* Number of calls to */ | 440 | unsigned long n_force_qs; /* Number of calls to */ |
@@ -426,7 +453,13 @@ struct rcu_state { | |||
426 | struct list_head flavors; /* List of RCU flavors. */ | 453 | struct list_head flavors; /* List of RCU flavors. */ |
427 | }; | 454 | }; |
428 | 455 | ||
456 | /* Values for rcu_state structure's gp_flags field. */ | ||
457 | #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ | ||
458 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ | ||
459 | |||
429 | extern struct list_head rcu_struct_flavors; | 460 | extern struct list_head rcu_struct_flavors; |
461 | |||
462 | /* Sequence through rcu_state structures for each RCU flavor. */ | ||
430 | #define for_each_rcu_flavor(rsp) \ | 463 | #define for_each_rcu_flavor(rsp) \ |
431 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) | 464 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) |
432 | 465 | ||
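for_each_rcu_flavor() is a thin wrapper around list_for_each_entry() over the rcu_struct_flavors list. The user-space rendition below (an intrusive doubly linked list plus a container_of()-style macro, GCC typeof assumed) shows how such an iteration macro works; it is an illustration, not the kernel's list implementation.

/* User-space rendition of an intrusive-list iteration macro. */
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_for_each_entry(pos, head, member)				\
	for ((pos) = container_of((head)->next, typeof(*(pos)), member); \
	     &(pos)->member != (head);					\
	     (pos) = container_of((pos)->member.next, typeof(*(pos)), member))

struct flavor {
	const char *name;
	struct list_head flavors;	/* linkage into the flavor list */
};

static struct list_head rcu_struct_flavors = LIST_HEAD_INIT(rcu_struct_flavors);

#define for_each_rcu_flavor(rsp) \
	list_for_each_entry((rsp), &rcu_struct_flavors, flavors)

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

int main(void)
{
	struct flavor sched = { "rcu_sched" }, bh = { "rcu_bh" };
	struct flavor *rsp;

	list_add_tail(&sched.flavors, &rcu_struct_flavors);
	list_add_tail(&bh.flavors, &rcu_struct_flavors);
	for_each_rcu_flavor(rsp)
		printf("%s\n", rsp->name);
	return 0;
}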
@@ -468,7 +501,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | |||
468 | #ifdef CONFIG_HOTPLUG_CPU | 501 | #ifdef CONFIG_HOTPLUG_CPU |
469 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 502 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
470 | unsigned long flags); | 503 | unsigned long flags); |
471 | static void rcu_stop_cpu_kthread(int cpu); | ||
472 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 504 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
473 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 505 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
474 | static int rcu_print_task_stall(struct rcu_node *rnp); | 506 | static int rcu_print_task_stall(struct rcu_node *rnp); |
@@ -491,15 +523,9 @@ static void invoke_rcu_callbacks_kthread(void); | |||
491 | static bool rcu_is_callbacks_kthread(void); | 523 | static bool rcu_is_callbacks_kthread(void); |
492 | #ifdef CONFIG_RCU_BOOST | 524 | #ifdef CONFIG_RCU_BOOST |
493 | static void rcu_preempt_do_callbacks(void); | 525 | static void rcu_preempt_do_callbacks(void); |
494 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
495 | cpumask_var_t cm); | ||
496 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 526 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, |
497 | struct rcu_node *rnp, | 527 | struct rcu_node *rnp); |
498 | int rnp_index); | ||
499 | static void invoke_rcu_node_kthread(struct rcu_node *rnp); | ||
500 | static void rcu_yield(void (*f)(unsigned long), unsigned long arg); | ||
501 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 528 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
502 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt); | ||
503 | static void __cpuinit rcu_prepare_kthreads(int cpu); | 529 | static void __cpuinit rcu_prepare_kthreads(int cpu); |
504 | static void rcu_prepare_for_idle_init(int cpu); | 530 | static void rcu_prepare_for_idle_init(int cpu); |
505 | static void rcu_cleanup_after_idle(int cpu); | 531 | static void rcu_cleanup_after_idle(int cpu); |
@@ -510,5 +536,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | |||
510 | static void print_cpu_stall_info_end(void); | 536 | static void print_cpu_stall_info_end(void); |
511 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | 537 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); |
512 | static void increment_cpu_stall_ticks(void); | 538 | static void increment_cpu_stall_ticks(void); |
539 | static bool is_nocb_cpu(int cpu); | ||
540 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
541 | bool lazy); | ||
542 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
543 | struct rcu_data *rdp); | ||
544 | static bool nocb_cpu_expendable(int cpu); | ||
545 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | ||
546 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | ||
547 | static void init_nocb_callback_list(struct rcu_data *rdp); | ||
548 | static void __init rcu_init_nocb(void); | ||
513 | 549 | ||
514 | #endif /* #ifndef RCU_TREE_NONCORE */ | 550 | #endif /* #ifndef RCU_TREE_NONCORE */ |
551 | |||
552 | #ifdef CONFIG_RCU_TRACE | ||
553 | #ifdef CONFIG_RCU_NOCB_CPU | ||
554 | /* Sum up queue lengths for tracing. */ | ||
555 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | ||
556 | { | ||
557 | *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; | ||
558 | *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; | ||
559 | } | ||
560 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
561 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | ||
562 | { | ||
563 | *ql = 0; | ||
564 | *qll = 0; | ||
565 | } | ||
566 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
567 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7f3244c0df01..f6e5ec2932b4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -25,6 +25,9 @@ | |||
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
28 | #include <linux/gfp.h> | ||
29 | #include <linux/oom.h> | ||
30 | #include <linux/smpboot.h> | ||
28 | 31 | ||
29 | #define RCU_KTHREAD_PRIO 1 | 32 | #define RCU_KTHREAD_PRIO 1 |
30 | 33 | ||
@@ -34,6 +37,14 @@ | |||
34 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO | 37 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO |
35 | #endif | 38 | #endif |
36 | 39 | ||
40 | #ifdef CONFIG_RCU_NOCB_CPU | ||
41 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ | ||
42 | static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ | ||
43 | static bool rcu_nocb_poll; /* Whether offload kthreads poll. */ | ||
44 | module_param(rcu_nocb_poll, bool, 0444); | ||
45 | static char __initdata nocb_buf[NR_CPUS * 5]; | ||
46 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
47 | |||
37 | /* | 48 | /* |
38 | * Check the RCU kernel configuration parameters and print informative | 49 | * Check the RCU kernel configuration parameters and print informative |
39 | * messages about anything out of the ordinary. If you like #ifdef, you | 50 | * messages about anything out of the ordinary. If you like #ifdef, you |
@@ -74,6 +85,18 @@ static void __init rcu_bootup_announce_oddness(void) | |||
74 | printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 85 | printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); |
75 | if (nr_cpu_ids != NR_CPUS) | 86 | if (nr_cpu_ids != NR_CPUS) |
76 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 87 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); |
88 | #ifdef CONFIG_RCU_NOCB_CPU | ||
89 | if (have_rcu_nocb_mask) { | ||
90 | if (cpumask_test_cpu(0, rcu_nocb_mask)) { | ||
91 | cpumask_clear_cpu(0, rcu_nocb_mask); | ||
92 | pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); | ||
93 | } | ||
94 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | ||
95 | pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); | ||
96 | if (rcu_nocb_poll) | ||
97 | pr_info("\tExperimental polled no-CBs CPUs.\n"); | ||
98 | } | ||
99 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
77 | } | 100 | } |
78 | 101 | ||
79 | #ifdef CONFIG_TREE_PREEMPT_RCU | 102 | #ifdef CONFIG_TREE_PREEMPT_RCU |
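The announce code above refuses to let CPU 0 be a no-CBs CPU, clearing it from the mask before printing the result. The toy program below sketches that overall shape in user space: parse a CPU list such as "0-3,6" into a bit mask and force bit 0 clear. The parser is only a stand-in for the kernel's cpulist_parse(); nothing here is the kernel's cpumask code.

/* Toy CPU-list parser plus the "CPU 0 may not be a no-CBs CPU" fixup. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long parse_cpulist(const char *s)
{
	unsigned long mask = 0;
	char buf[64], *tok, *save;

	snprintf(buf, sizeof(buf), "%s", s);
	for (tok = strtok_r(buf, ",", &save); tok; tok = strtok_r(NULL, ",", &save)) {
		int lo, hi;

		if (sscanf(tok, "%d-%d", &lo, &hi) != 2)
			hi = lo = atoi(tok);	/* single CPU, not a range */
		for (int cpu = lo; cpu <= hi; cpu++)
			mask |= 1UL << cpu;
	}
	return mask;
}

int main(void)
{
	unsigned long rcu_nocb_mask = parse_cpulist("0-3,6");

	if (rcu_nocb_mask & 1UL) {		/* cpumask_test_cpu(0, ...) */
		rcu_nocb_mask &= ~1UL;		/* cpumask_clear_cpu(0, ...) */
		printf("CPU 0: illegal no-CBs CPU (cleared).\n");
	}
	printf("no-CBs CPU mask: 0x%lx\n", rcu_nocb_mask);
	return 0;
}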
@@ -118,7 +141,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); | |||
118 | */ | 141 | */ |
119 | void rcu_force_quiescent_state(void) | 142 | void rcu_force_quiescent_state(void) |
120 | { | 143 | { |
121 | force_quiescent_state(&rcu_preempt_state, 0); | 144 | force_quiescent_state(&rcu_preempt_state); |
122 | } | 145 | } |
123 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 146 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
124 | 147 | ||
@@ -136,8 +159,6 @@ static void rcu_preempt_qs(int cpu) | |||
136 | { | 159 | { |
137 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 160 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); |
138 | 161 | ||
139 | rdp->passed_quiesce_gpnum = rdp->gpnum; | ||
140 | barrier(); | ||
141 | if (rdp->passed_quiesce == 0) | 162 | if (rdp->passed_quiesce == 0) |
142 | trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); | 163 | trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); |
143 | rdp->passed_quiesce = 1; | 164 | rdp->passed_quiesce = 1; |
@@ -422,9 +443,11 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | |||
422 | unsigned long flags; | 443 | unsigned long flags; |
423 | struct task_struct *t; | 444 | struct task_struct *t; |
424 | 445 | ||
425 | if (!rcu_preempt_blocked_readers_cgp(rnp)) | ||
426 | return; | ||
427 | raw_spin_lock_irqsave(&rnp->lock, flags); | 446 | raw_spin_lock_irqsave(&rnp->lock, flags); |
447 | if (!rcu_preempt_blocked_readers_cgp(rnp)) { | ||
448 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
449 | return; | ||
450 | } | ||
428 | t = list_entry(rnp->gp_tasks, | 451 | t = list_entry(rnp->gp_tasks, |
429 | struct task_struct, rcu_node_entry); | 452 | struct task_struct, rcu_node_entry); |
430 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) | 453 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) |
@@ -584,17 +607,23 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
584 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | 607 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ |
585 | } | 608 | } |
586 | 609 | ||
610 | rnp->gp_tasks = NULL; | ||
611 | rnp->exp_tasks = NULL; | ||
587 | #ifdef CONFIG_RCU_BOOST | 612 | #ifdef CONFIG_RCU_BOOST |
588 | /* In case root is being boosted and leaf is not. */ | 613 | rnp->boost_tasks = NULL; |
614 | /* | ||
615 | * In case root is being boosted and leaf was not. Make sure | ||
616 | * that we boost the tasks blocking the current grace period | ||
617 | * in this case. | ||
618 | */ | ||
589 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | 619 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ |
590 | if (rnp_root->boost_tasks != NULL && | 620 | if (rnp_root->boost_tasks != NULL && |
591 | rnp_root->boost_tasks != rnp_root->gp_tasks) | 621 | rnp_root->boost_tasks != rnp_root->gp_tasks && |
622 | rnp_root->boost_tasks != rnp_root->exp_tasks) | ||
592 | rnp_root->boost_tasks = rnp_root->gp_tasks; | 623 | rnp_root->boost_tasks = rnp_root->gp_tasks; |
593 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | 624 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ |
594 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 625 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
595 | 626 | ||
596 | rnp->gp_tasks = NULL; | ||
597 | rnp->exp_tasks = NULL; | ||
598 | return retval; | 627 | return retval; |
599 | } | 628 | } |
600 | 629 | ||
@@ -634,7 +663,7 @@ static void rcu_preempt_do_callbacks(void) | |||
634 | */ | 663 | */ |
635 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 664 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
636 | { | 665 | { |
637 | __call_rcu(head, func, &rcu_preempt_state, 0); | 666 | __call_rcu(head, func, &rcu_preempt_state, -1, 0); |
638 | } | 667 | } |
639 | EXPORT_SYMBOL_GPL(call_rcu); | 668 | EXPORT_SYMBOL_GPL(call_rcu); |
640 | 669 | ||
@@ -648,7 +677,7 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
648 | void kfree_call_rcu(struct rcu_head *head, | 677 | void kfree_call_rcu(struct rcu_head *head, |
649 | void (*func)(struct rcu_head *rcu)) | 678 | void (*func)(struct rcu_head *rcu)) |
650 | { | 679 | { |
651 | __call_rcu(head, func, &rcu_preempt_state, 1); | 680 | __call_rcu(head, func, &rcu_preempt_state, -1, 1); |
652 | } | 681 | } |
653 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | 682 | EXPORT_SYMBOL_GPL(kfree_call_rcu); |
654 | 683 | ||
@@ -662,6 +691,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); | |||
662 | * concurrently with new RCU read-side critical sections that began while | 691 | * concurrently with new RCU read-side critical sections that began while |
663 | * synchronize_rcu() was waiting. RCU read-side critical sections are | 692 | * synchronize_rcu() was waiting. RCU read-side critical sections are |
664 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. | 693 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. |
694 | * | ||
695 | * See the description of synchronize_sched() for more detailed information | ||
696 | * on memory ordering guarantees. | ||
665 | */ | 697 | */ |
666 | void synchronize_rcu(void) | 698 | void synchronize_rcu(void) |
667 | { | 699 | { |
@@ -671,12 +703,15 @@ void synchronize_rcu(void) | |||
671 | "Illegal synchronize_rcu() in RCU read-side critical section"); | 703 | "Illegal synchronize_rcu() in RCU read-side critical section"); |
672 | if (!rcu_scheduler_active) | 704 | if (!rcu_scheduler_active) |
673 | return; | 705 | return; |
674 | wait_rcu_gp(call_rcu); | 706 | if (rcu_expedited) |
707 | synchronize_rcu_expedited(); | ||
708 | else | ||
709 | wait_rcu_gp(call_rcu); | ||
675 | } | 710 | } |
676 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 711 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
677 | 712 | ||
678 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); | 713 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); |
679 | static long sync_rcu_preempt_exp_count; | 714 | static unsigned long sync_rcu_preempt_exp_count; |
680 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | 715 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); |
681 | 716 | ||
682 | /* | 717 | /* |
@@ -749,7 +784,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
749 | * grace period for the specified rcu_node structure. If there are no such | 784 | * grace period for the specified rcu_node structure. If there are no such |
750 | * tasks, report it up the rcu_node hierarchy. | 785 | * tasks, report it up the rcu_node hierarchy. |
751 | * | 786 | * |
752 | * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. | 787 | * Caller must hold sync_rcu_preempt_exp_mutex and must exclude |
788 | * CPU hotplug operations. | ||
753 | */ | 789 | */ |
754 | static void | 790 | static void |
755 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | 791 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) |
@@ -791,7 +827,7 @@ void synchronize_rcu_expedited(void) | |||
791 | unsigned long flags; | 827 | unsigned long flags; |
792 | struct rcu_node *rnp; | 828 | struct rcu_node *rnp; |
793 | struct rcu_state *rsp = &rcu_preempt_state; | 829 | struct rcu_state *rsp = &rcu_preempt_state; |
794 | long snap; | 830 | unsigned long snap; |
795 | int trycount = 0; | 831 | int trycount = 0; |
796 | 832 | ||
797 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | 833 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ |
@@ -799,33 +835,47 @@ void synchronize_rcu_expedited(void) | |||
799 | smp_mb(); /* Above access cannot bleed into critical section. */ | 835 | smp_mb(); /* Above access cannot bleed into critical section. */ |
800 | 836 | ||
801 | /* | 837 | /* |
838 | * Block CPU-hotplug operations. This means that any CPU-hotplug | ||
839 | * operation that finds an rcu_node structure with tasks in the | ||
840 | * process of being boosted will know that all tasks blocking | ||
841 | * this expedited grace period will already be in the process of | ||
842 | * being boosted. This simplifies the process of moving tasks | ||
843 | * from leaf to root rcu_node structures. | ||
844 | */ | ||
845 | get_online_cpus(); | ||
846 | |||
847 | /* | ||
802 | * Acquire lock, falling back to synchronize_rcu() if too many | 848 | * Acquire lock, falling back to synchronize_rcu() if too many |
803 | * lock-acquisition failures. Of course, if someone does the | 849 | * lock-acquisition failures. Of course, if someone does the |
804 | * expedited grace period for us, just leave. | 850 | * expedited grace period for us, just leave. |
805 | */ | 851 | */ |
806 | while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { | 852 | while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { |
853 | if (ULONG_CMP_LT(snap, | ||
854 | ACCESS_ONCE(sync_rcu_preempt_exp_count))) { | ||
855 | put_online_cpus(); | ||
856 | goto mb_ret; /* Others did our work for us. */ | ||
857 | } | ||
807 | if (trycount++ < 10) { | 858 | if (trycount++ < 10) { |
808 | udelay(trycount * num_online_cpus()); | 859 | udelay(trycount * num_online_cpus()); |
809 | } else { | 860 | } else { |
810 | synchronize_rcu(); | 861 | put_online_cpus(); |
862 | wait_rcu_gp(call_rcu); | ||
811 | return; | 863 | return; |
812 | } | 864 | } |
813 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) | ||
814 | goto mb_ret; /* Others did our work for us. */ | ||
815 | } | 865 | } |
816 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) | 866 | if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { |
867 | put_online_cpus(); | ||
817 | goto unlock_mb_ret; /* Others did our work for us. */ | 868 | goto unlock_mb_ret; /* Others did our work for us. */ |
869 | } | ||
818 | 870 | ||
819 | /* force all RCU readers onto ->blkd_tasks lists. */ | 871 | /* force all RCU readers onto ->blkd_tasks lists. */ |
820 | synchronize_sched_expedited(); | 872 | synchronize_sched_expedited(); |
821 | 873 | ||
822 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
823 | |||
824 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ | 874 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ |
825 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { | 875 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { |
826 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 876 | raw_spin_lock_irqsave(&rnp->lock, flags); |
827 | rnp->expmask = rnp->qsmaskinit; | 877 | rnp->expmask = rnp->qsmaskinit; |
828 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 878 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
829 | } | 879 | } |
830 | 880 | ||
831 | /* Snapshot current state of ->blkd_tasks lists. */ | 881 | /* Snapshot current state of ->blkd_tasks lists. */ |
@@ -834,7 +884,7 @@ void synchronize_rcu_expedited(void) | |||
834 | if (NUM_RCU_NODES > 1) | 884 | if (NUM_RCU_NODES > 1) |
835 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); | 885 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); |
836 | 886 | ||
837 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 887 | put_online_cpus(); |
838 | 888 | ||
839 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ | 889 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ |
840 | rnp = rcu_get_root(rsp); | 890 | rnp = rcu_get_root(rsp); |
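The rewritten synchronize_rcu_expedited() combines three ideas: snapshot a completion counter, retry mutex_trylock() with growing delays a bounded number of times, and at every step check whether the counter has already passed the snapshot, meaning another caller's expedited grace period covered us. Below is a user-space pthread sketch of that control flow only; exp_count stands in for sync_rcu_preempt_exp_count, and none of this is the kernel code.

/* Sketch of the trylock-with-snapshot retry pattern. */
#include <limits.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))

static pthread_mutex_t exp_mutex = PTHREAD_MUTEX_INITIALIZER;
static atomic_ulong exp_count;		/* completed expedited grace periods */

static void do_expedited(void)
{
	unsigned long snap = atomic_load(&exp_count) + 1;
	int trycount = 0;

	while (pthread_mutex_trylock(&exp_mutex) != 0) {
		if (ULONG_CMP_LT(snap, atomic_load(&exp_count)))
			return;			/* others did our work for us */
		if (trycount++ < 10) {
			usleep(trycount * 10);	/* back off and retry */
		} else {
			puts("fall back to the normal grace period");
			return;
		}
	}
	if (ULONG_CMP_LT(snap, atomic_load(&exp_count))) {
		pthread_mutex_unlock(&exp_mutex);
		return;				/* raced: work already done */
	}
	/* ... the expedited machinery would run here ... */
	atomic_fetch_add(&exp_count, 1);	/* cover everyone up to here */
	pthread_mutex_unlock(&exp_mutex);
}

int main(void)
{
	do_expedited();
	printf("expedited count now %lu\n", atomic_load(&exp_count));
	return 0;
}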
@@ -853,6 +903,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
853 | 903 | ||
854 | /** | 904 | /** |
855 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. | 905 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. |
906 | * | ||
907 | * Note that this primitive does not necessarily wait for an RCU grace period | ||
908 | * to complete. For example, if there are no RCU callbacks queued anywhere | ||
909 | * in the system, then rcu_barrier() is within its rights to return | ||
910 | * immediately, without waiting for anything, much less an RCU grace period. | ||
856 | */ | 911 | */ |
857 | void rcu_barrier(void) | 912 | void rcu_barrier(void) |
858 | { | 913 | { |
@@ -991,7 +1046,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
991 | void kfree_call_rcu(struct rcu_head *head, | 1046 | void kfree_call_rcu(struct rcu_head *head, |
992 | void (*func)(struct rcu_head *rcu)) | 1047 | void (*func)(struct rcu_head *rcu)) |
993 | { | 1048 | { |
994 | __call_rcu(head, func, &rcu_sched_state, 1); | 1049 | __call_rcu(head, func, &rcu_sched_state, -1, 1); |
995 | } | 1050 | } |
996 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | 1051 | EXPORT_SYMBOL_GPL(kfree_call_rcu); |
997 | 1052 | ||
@@ -1069,6 +1124,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) | |||
1069 | 1124 | ||
1070 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | 1125 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
1071 | 1126 | ||
1127 | static void rcu_wake_cond(struct task_struct *t, int status) | ||
1128 | { | ||
1129 | /* | ||
1130 | * If the thread is yielding, only wake it when this | ||
1131 | * is invoked from idle. | ||
1132 | */ | ||
1133 | if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) | ||
1134 | wake_up_process(t); | ||
1135 | } | ||
1136 | |||
1072 | /* | 1137 | /* |
1073 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | 1138 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks |
1074 | * or ->boost_tasks, advancing the pointer to the next task in the | 1139 | * or ->boost_tasks, advancing the pointer to the next task in the |
@@ -1141,17 +1206,6 @@ static int rcu_boost(struct rcu_node *rnp) | |||
1141 | } | 1206 | } |
1142 | 1207 | ||
1143 | /* | 1208 | /* |
1144 | * Timer handler to initiate waking up of boost kthreads that | ||
1145 | * have yielded the CPU due to excessive numbers of tasks to | ||
1146 | * boost. We wake up the per-rcu_node kthread, which in turn | ||
1147 | * will wake up the booster kthread. | ||
1148 | */ | ||
1149 | static void rcu_boost_kthread_timer(unsigned long arg) | ||
1150 | { | ||
1151 | invoke_rcu_node_kthread((struct rcu_node *)arg); | ||
1152 | } | ||
1153 | |||
1154 | /* | ||
1155 | * Priority-boosting kthread. One per leaf rcu_node and one for the | 1209 | * Priority-boosting kthread. One per leaf rcu_node and one for the |
1156 | * root rcu_node. | 1210 | * root rcu_node. |
1157 | */ | 1211 | */ |
@@ -1174,8 +1228,9 @@ static int rcu_boost_kthread(void *arg) | |||
1174 | else | 1228 | else |
1175 | spincnt = 0; | 1229 | spincnt = 0; |
1176 | if (spincnt > 10) { | 1230 | if (spincnt > 10) { |
1231 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; | ||
1177 | trace_rcu_utilization("End boost kthread@rcu_yield"); | 1232 | trace_rcu_utilization("End boost kthread@rcu_yield"); |
1178 | rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); | 1233 | schedule_timeout_interruptible(2); |
1179 | trace_rcu_utilization("Start boost kthread@rcu_yield"); | 1234 | trace_rcu_utilization("Start boost kthread@rcu_yield"); |
1180 | spincnt = 0; | 1235 | spincnt = 0; |
1181 | } | 1236 | } |
@@ -1191,9 +1246,9 @@ static int rcu_boost_kthread(void *arg) | |||
1191 | * kthread to start boosting them. If there is an expedited grace | 1246 | * kthread to start boosting them. If there is an expedited grace |
1192 | * period in progress, it is always time to boost. | 1247 | * period in progress, it is always time to boost. |
1193 | * | 1248 | * |
1194 | * The caller must hold rnp->lock, which this function releases, | 1249 | * The caller must hold rnp->lock, which this function releases. |
1195 | * but irqs remain disabled. The ->boost_kthread_task is immortal, | 1250 | * The ->boost_kthread_task is immortal, so we don't need to worry |
1196 | * so we don't need to worry about it going away. | 1251 | * about it going away. |
1197 | */ | 1252 | */ |
1198 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | 1253 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) |
1199 | { | 1254 | { |
@@ -1213,8 +1268,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | |||
1213 | rnp->boost_tasks = rnp->gp_tasks; | 1268 | rnp->boost_tasks = rnp->gp_tasks; |
1214 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1269 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1215 | t = rnp->boost_kthread_task; | 1270 | t = rnp->boost_kthread_task; |
1216 | if (t != NULL) | 1271 | if (t) |
1217 | wake_up_process(t); | 1272 | rcu_wake_cond(t, rnp->boost_kthread_status); |
1218 | } else { | 1273 | } else { |
1219 | rcu_initiate_boost_trace(rnp); | 1274 | rcu_initiate_boost_trace(rnp); |
1220 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1275 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
@@ -1231,8 +1286,10 @@ static void invoke_rcu_callbacks_kthread(void) | |||
1231 | local_irq_save(flags); | 1286 | local_irq_save(flags); |
1232 | __this_cpu_write(rcu_cpu_has_work, 1); | 1287 | __this_cpu_write(rcu_cpu_has_work, 1); |
1233 | if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && | 1288 | if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && |
1234 | current != __this_cpu_read(rcu_cpu_kthread_task)) | 1289 | current != __this_cpu_read(rcu_cpu_kthread_task)) { |
1235 | wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); | 1290 | rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), |
1291 | __this_cpu_read(rcu_cpu_kthread_status)); | ||
1292 | } | ||
1236 | local_irq_restore(flags); | 1293 | local_irq_restore(flags); |
1237 | } | 1294 | } |
1238 | 1295 | ||
@@ -1245,21 +1302,6 @@ static bool rcu_is_callbacks_kthread(void) | |||
1245 | return __get_cpu_var(rcu_cpu_kthread_task) == current; | 1302 | return __get_cpu_var(rcu_cpu_kthread_task) == current; |
1246 | } | 1303 | } |
1247 | 1304 | ||
1248 | /* | ||
1249 | * Set the affinity of the boost kthread. The CPU-hotplug locks are | ||
1250 | * held, so no one should be messing with the existence of the boost | ||
1251 | * kthread. | ||
1252 | */ | ||
1253 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
1254 | cpumask_var_t cm) | ||
1255 | { | ||
1256 | struct task_struct *t; | ||
1257 | |||
1258 | t = rnp->boost_kthread_task; | ||
1259 | if (t != NULL) | ||
1260 | set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); | ||
1261 | } | ||
1262 | |||
1263 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | 1305 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) |
1264 | 1306 | ||
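The RCU_BOOST_DELAY_JIFFIES definition shown above (unchanged context in this hunk) converts the millisecond CONFIG_RCU_BOOST_DELAY value into jiffies with DIV_ROUND_UP(), so a small nonzero delay never truncates to zero ticks. A quick userspace illustration with made-up numbers follows; the round-up macro is written out locally and only stands in for the kernel's helper.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        int hz = 100;           /* pretend CONFIG_HZ=100, i.e. 10 ms per jiffy */
        int delay_ms = 5;       /* pretend CONFIG_RCU_BOOST_DELAY=5 */

        /* Truncating division would yield 0 jiffies; rounding up keeps a 1-jiffy delay. */
        printf("truncated:  %d jiffies\n", delay_ms * hz / 1000);
        printf("rounded up: %d jiffies\n", DIV_ROUND_UP(delay_ms * hz, 1000));
        return 0;
}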
1265 | /* | 1307 | /* |
@@ -1276,15 +1318,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | |||
1276 | * Returns zero if all is well, a negated errno otherwise. | 1318 | * Returns zero if all is well, a negated errno otherwise. |
1277 | */ | 1319 | */ |
1278 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 1320 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, |
1279 | struct rcu_node *rnp, | 1321 | struct rcu_node *rnp) |
1280 | int rnp_index) | ||
1281 | { | 1322 | { |
1323 | int rnp_index = rnp - &rsp->node[0]; | ||
1282 | unsigned long flags; | 1324 | unsigned long flags; |
1283 | struct sched_param sp; | 1325 | struct sched_param sp; |
1284 | struct task_struct *t; | 1326 | struct task_struct *t; |
1285 | 1327 | ||
1286 | if (&rcu_preempt_state != rsp) | 1328 | if (&rcu_preempt_state != rsp) |
1287 | return 0; | 1329 | return 0; |
1330 | |||
1331 | if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) | ||
1332 | return 0; | ||
1333 | |||
1288 | rsp->boost = 1; | 1334 | rsp->boost = 1; |
1289 | if (rnp->boost_kthread_task != NULL) | 1335 | if (rnp->boost_kthread_task != NULL) |
1290 | return 0; | 1336 | return 0; |
@@ -1301,25 +1347,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
1301 | return 0; | 1347 | return 0; |
1302 | } | 1348 | } |
1303 | 1349 | ||
1304 | #ifdef CONFIG_HOTPLUG_CPU | ||
1305 | |||
1306 | /* | ||
1307 | * Stop RCU's per-CPU kthread when its CPU goes offline. | ||
1308 | */ | ||
1309 | static void rcu_stop_cpu_kthread(int cpu) | ||
1310 | { | ||
1311 | struct task_struct *t; | ||
1312 | |||
1313 | /* Stop the CPU's kthread. */ | ||
1314 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1315 | if (t != NULL) { | ||
1316 | per_cpu(rcu_cpu_kthread_task, cpu) = NULL; | ||
1317 | kthread_stop(t); | ||
1318 | } | ||
1319 | } | ||
1320 | |||
1321 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1322 | |||
1323 | static void rcu_kthread_do_work(void) | 1350 | static void rcu_kthread_do_work(void) |
1324 | { | 1351 | { |
1325 | rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); | 1352 | rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); |
@@ -1327,112 +1354,22 @@ static void rcu_kthread_do_work(void) | |||
1327 | rcu_preempt_do_callbacks(); | 1354 | rcu_preempt_do_callbacks(); |
1328 | } | 1355 | } |
1329 | 1356 | ||
1330 | /* | 1357 | static void rcu_cpu_kthread_setup(unsigned int cpu) |
1331 | * Wake up the specified per-rcu_node-structure kthread. | ||
1332 | * Because the per-rcu_node kthreads are immortal, we don't need | ||
1333 | * to do anything to keep them alive. | ||
1334 | */ | ||
1335 | static void invoke_rcu_node_kthread(struct rcu_node *rnp) | ||
1336 | { | ||
1337 | struct task_struct *t; | ||
1338 | |||
1339 | t = rnp->node_kthread_task; | ||
1340 | if (t != NULL) | ||
1341 | wake_up_process(t); | ||
1342 | } | ||
1343 | |||
1344 | /* | ||
1345 | * Set the specified CPU's kthread to run RT or not, as specified by | ||
1346 | * the to_rt argument. The CPU-hotplug locks are held, so the task | ||
1347 | * is not going away. | ||
1348 | */ | ||
1349 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt) | ||
1350 | { | 1358 | { |
1351 | int policy; | ||
1352 | struct sched_param sp; | 1359 | struct sched_param sp; |
1353 | struct task_struct *t; | ||
1354 | |||
1355 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1356 | if (t == NULL) | ||
1357 | return; | ||
1358 | if (to_rt) { | ||
1359 | policy = SCHED_FIFO; | ||
1360 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1361 | } else { | ||
1362 | policy = SCHED_NORMAL; | ||
1363 | sp.sched_priority = 0; | ||
1364 | } | ||
1365 | sched_setscheduler_nocheck(t, policy, &sp); | ||
1366 | } | ||
1367 | 1360 | ||
1368 | /* | 1361 | sp.sched_priority = RCU_KTHREAD_PRIO; |
1369 | * Timer handler to initiate the waking up of per-CPU kthreads that | 1362 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); |
1370 | * have yielded the CPU due to excess numbers of RCU callbacks. | ||
1371 | * We wake up the per-rcu_node kthread, which in turn will wake up | ||
1372 | * the booster kthread. | ||
1373 | */ | ||
1374 | static void rcu_cpu_kthread_timer(unsigned long arg) | ||
1375 | { | ||
1376 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); | ||
1377 | struct rcu_node *rnp = rdp->mynode; | ||
1378 | |||
1379 | atomic_or(rdp->grpmask, &rnp->wakemask); | ||
1380 | invoke_rcu_node_kthread(rnp); | ||
1381 | } | 1363 | } |
1382 | 1364 | ||
1383 | /* | 1365 | static void rcu_cpu_kthread_park(unsigned int cpu) |
1384 | * Drop to non-real-time priority and yield, but only after posting a | ||
1385 | * timer that will cause us to regain our real-time priority if we | ||
1386 | * remain preempted. Either way, we restore our real-time priority | ||
1387 | * before returning. | ||
1388 | */ | ||
1389 | static void rcu_yield(void (*f)(unsigned long), unsigned long arg) | ||
1390 | { | 1366 | { |
1391 | struct sched_param sp; | 1367 | per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; |
1392 | struct timer_list yield_timer; | ||
1393 | int prio = current->rt_priority; | ||
1394 | |||
1395 | setup_timer_on_stack(&yield_timer, f, arg); | ||
1396 | mod_timer(&yield_timer, jiffies + 2); | ||
1397 | sp.sched_priority = 0; | ||
1398 | sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); | ||
1399 | set_user_nice(current, 19); | ||
1400 | schedule(); | ||
1401 | set_user_nice(current, 0); | ||
1402 | sp.sched_priority = prio; | ||
1403 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | ||
1404 | del_timer(&yield_timer); | ||
1405 | } | 1368 | } |
1406 | 1369 | ||
1407 | /* | 1370 | static int rcu_cpu_kthread_should_run(unsigned int cpu) |
1408 | * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. | ||
1409 | * This can happen while the corresponding CPU is either coming online | ||
1410 | * or going offline. We cannot wait until the CPU is fully online | ||
1411 | * before starting the kthread, because the various notifier functions | ||
1412 | * can wait for RCU grace periods. So we park rcu_cpu_kthread() until | ||
1413 | * the corresponding CPU is online. | ||
1414 | * | ||
1415 | * Return 1 if the kthread needs to stop, 0 otherwise. | ||
1416 | * | ||
1417 | * Caller must disable bh. This function can momentarily enable it. | ||
1418 | */ | ||
1419 | static int rcu_cpu_kthread_should_stop(int cpu) | ||
1420 | { | 1371 | { |
1421 | while (cpu_is_offline(cpu) || | 1372 | return __get_cpu_var(rcu_cpu_has_work); |
1422 | !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || | ||
1423 | smp_processor_id() != cpu) { | ||
1424 | if (kthread_should_stop()) | ||
1425 | return 1; | ||
1426 | per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | ||
1427 | per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); | ||
1428 | local_bh_enable(); | ||
1429 | schedule_timeout_uninterruptible(1); | ||
1430 | if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) | ||
1431 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
1432 | local_bh_disable(); | ||
1433 | } | ||
1434 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
1435 | return 0; | ||
1436 | } | 1373 | } |
1437 | 1374 | ||
1438 | /* | 1375 | /* |
@@ -1440,138 +1377,35 @@ static int rcu_cpu_kthread_should_stop(int cpu) | |||
1440 | * RCU softirq used in flavors and configurations of RCU that do not | 1377 | * RCU softirq used in flavors and configurations of RCU that do not |
1441 | * support RCU priority boosting. | 1378 | * support RCU priority boosting. |
1442 | */ | 1379 | */ |
1443 | static int rcu_cpu_kthread(void *arg) | 1380 | static void rcu_cpu_kthread(unsigned int cpu) |
1444 | { | 1381 | { |
1445 | int cpu = (int)(long)arg; | 1382 | unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); |
1446 | unsigned long flags; | 1383 | char work, *workp = &__get_cpu_var(rcu_cpu_has_work); |
1447 | int spincnt = 0; | 1384 | int spincnt; |
1448 | unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); | ||
1449 | char work; | ||
1450 | char *workp = &per_cpu(rcu_cpu_has_work, cpu); | ||
1451 | 1385 | ||
1452 | trace_rcu_utilization("Start CPU kthread@init"); | 1386 | for (spincnt = 0; spincnt < 10; spincnt++) { |
1453 | for (;;) { | ||
1454 | *statusp = RCU_KTHREAD_WAITING; | ||
1455 | trace_rcu_utilization("End CPU kthread@rcu_wait"); | ||
1456 | rcu_wait(*workp != 0 || kthread_should_stop()); | ||
1457 | trace_rcu_utilization("Start CPU kthread@rcu_wait"); | 1387 | trace_rcu_utilization("Start CPU kthread@rcu_wait"); |
1458 | local_bh_disable(); | 1388 | local_bh_disable(); |
1459 | if (rcu_cpu_kthread_should_stop(cpu)) { | ||
1460 | local_bh_enable(); | ||
1461 | break; | ||
1462 | } | ||
1463 | *statusp = RCU_KTHREAD_RUNNING; | 1389 | *statusp = RCU_KTHREAD_RUNNING; |
1464 | per_cpu(rcu_cpu_kthread_loops, cpu)++; | 1390 | this_cpu_inc(rcu_cpu_kthread_loops); |
1465 | local_irq_save(flags); | 1391 | local_irq_disable(); |
1466 | work = *workp; | 1392 | work = *workp; |
1467 | *workp = 0; | 1393 | *workp = 0; |
1468 | local_irq_restore(flags); | 1394 | local_irq_enable(); |
1469 | if (work) | 1395 | if (work) |
1470 | rcu_kthread_do_work(); | 1396 | rcu_kthread_do_work(); |
1471 | local_bh_enable(); | 1397 | local_bh_enable(); |
1472 | if (*workp != 0) | 1398 | if (*workp == 0) { |
1473 | spincnt++; | 1399 | trace_rcu_utilization("End CPU kthread@rcu_wait"); |
1474 | else | 1400 | *statusp = RCU_KTHREAD_WAITING; |
1475 | spincnt = 0; | 1401 | return; |
1476 | if (spincnt > 10) { | ||
1477 | *statusp = RCU_KTHREAD_YIELDING; | ||
1478 | trace_rcu_utilization("End CPU kthread@rcu_yield"); | ||
1479 | rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); | ||
1480 | trace_rcu_utilization("Start CPU kthread@rcu_yield"); | ||
1481 | spincnt = 0; | ||
1482 | } | ||
1483 | } | ||
1484 | *statusp = RCU_KTHREAD_STOPPED; | ||
1485 | trace_rcu_utilization("End CPU kthread@term"); | ||
1486 | return 0; | ||
1487 | } | ||
1488 | |||
1489 | /* | ||
1490 | * Spawn a per-CPU kthread, setting up affinity and priority. | ||
1491 | * Because the CPU hotplug lock is held, no other CPU will be attempting | ||
1492 | * to manipulate rcu_cpu_kthread_task. There might be another CPU | ||
1493 | * attempting to access it during boot, but the locking in kthread_bind() | ||
1494 | * will enforce sufficient ordering. | ||
1495 | * | ||
1496 | * Please note that we cannot simply refuse to wake up the per-CPU | ||
1497 | * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, | ||
1498 | * which can result in softlockup complaints if the task ends up being | ||
1499 | * idle for more than a couple of minutes. | ||
1500 | * | ||
1501 | * However, please note also that we cannot bind the per-CPU kthread to its | ||
1502 | * CPU until that CPU is fully online. We also cannot wait until the | ||
1503 | * CPU is fully online before we create its per-CPU kthread, as this would | ||
1504 | * deadlock the system when CPU notifiers tried waiting for grace | ||
1505 | * periods. So we bind the per-CPU kthread to its CPU only if the CPU | ||
1506 | * is online. If its CPU is not yet fully online, then the code in | ||
1507 | * rcu_cpu_kthread() will wait until it is fully online, and then do | ||
1508 | * the binding. | ||
1509 | */ | ||
1510 | static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) | ||
1511 | { | ||
1512 | struct sched_param sp; | ||
1513 | struct task_struct *t; | ||
1514 | |||
1515 | if (!rcu_scheduler_fully_active || | ||
1516 | per_cpu(rcu_cpu_kthread_task, cpu) != NULL) | ||
1517 | return 0; | ||
1518 | t = kthread_create_on_node(rcu_cpu_kthread, | ||
1519 | (void *)(long)cpu, | ||
1520 | cpu_to_node(cpu), | ||
1521 | "rcuc/%d", cpu); | ||
1522 | if (IS_ERR(t)) | ||
1523 | return PTR_ERR(t); | ||
1524 | if (cpu_online(cpu)) | ||
1525 | kthread_bind(t, cpu); | ||
1526 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
1527 | WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); | ||
1528 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1529 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1530 | per_cpu(rcu_cpu_kthread_task, cpu) = t; | ||
1531 | wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ | ||
1532 | return 0; | ||
1533 | } | ||
1534 | |||
1535 | /* | ||
1536 | * Per-rcu_node kthread, which is in charge of waking up the per-CPU | ||
1537 | * kthreads when needed. We ignore requests to wake up kthreads | ||
1538 | * for offline CPUs, which is OK because force_quiescent_state() | ||
1539 | * takes care of this case. | ||
1540 | */ | ||
1541 | static int rcu_node_kthread(void *arg) | ||
1542 | { | ||
1543 | int cpu; | ||
1544 | unsigned long flags; | ||
1545 | unsigned long mask; | ||
1546 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
1547 | struct sched_param sp; | ||
1548 | struct task_struct *t; | ||
1549 | |||
1550 | for (;;) { | ||
1551 | rnp->node_kthread_status = RCU_KTHREAD_WAITING; | ||
1552 | rcu_wait(atomic_read(&rnp->wakemask) != 0); | ||
1553 | rnp->node_kthread_status = RCU_KTHREAD_RUNNING; | ||
1554 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1555 | mask = atomic_xchg(&rnp->wakemask, 0); | ||
1556 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
1557 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { | ||
1558 | if ((mask & 0x1) == 0) | ||
1559 | continue; | ||
1560 | preempt_disable(); | ||
1561 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1562 | if (!cpu_online(cpu) || t == NULL) { | ||
1563 | preempt_enable(); | ||
1564 | continue; | ||
1565 | } | ||
1566 | per_cpu(rcu_cpu_has_work, cpu) = 1; | ||
1567 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1568 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1569 | preempt_enable(); | ||
1570 | } | 1402 | } |
1571 | } | 1403 | } |
1572 | /* NOTREACHED */ | 1404 | *statusp = RCU_KTHREAD_YIELDING; |
1573 | rnp->node_kthread_status = RCU_KTHREAD_STOPPED; | 1405 | trace_rcu_utilization("Start CPU kthread@rcu_yield"); |
1574 | return 0; | 1406 | schedule_timeout_interruptible(2); |
1407 | trace_rcu_utilization("End CPU kthread@rcu_yield"); | ||
1408 | *statusp = RCU_KTHREAD_WAITING; | ||
1575 | } | 1409 | } |
1576 | 1410 | ||
1577 | /* | 1411 | /* |
@@ -1583,17 +1417,17 @@ static int rcu_node_kthread(void *arg) | |||
1583 | * no outgoing CPU. If there are no CPUs left in the affinity set, | 1417 | * no outgoing CPU. If there are no CPUs left in the affinity set, |
1584 | * this function allows the kthread to execute on any CPU. | 1418 | * this function allows the kthread to execute on any CPU. |
1585 | */ | 1419 | */ |
1586 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | 1420 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) |
1587 | { | 1421 | { |
1422 | struct task_struct *t = rnp->boost_kthread_task; | ||
1423 | unsigned long mask = rnp->qsmaskinit; | ||
1588 | cpumask_var_t cm; | 1424 | cpumask_var_t cm; |
1589 | int cpu; | 1425 | int cpu; |
1590 | unsigned long mask = rnp->qsmaskinit; | ||
1591 | 1426 | ||
1592 | if (rnp->node_kthread_task == NULL) | 1427 | if (!t) |
1593 | return; | 1428 | return; |
1594 | if (!alloc_cpumask_var(&cm, GFP_KERNEL)) | 1429 | if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) |
1595 | return; | 1430 | return; |
1596 | cpumask_clear(cm); | ||
1597 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | 1431 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) |
1598 | if ((mask & 0x1) && cpu != outgoingcpu) | 1432 | if ((mask & 0x1) && cpu != outgoingcpu) |
1599 | cpumask_set_cpu(cpu, cm); | 1433 | cpumask_set_cpu(cpu, cm); |
@@ -1603,62 +1437,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |||
1603 | cpumask_clear_cpu(cpu, cm); | 1437 | cpumask_clear_cpu(cpu, cm); |
1604 | WARN_ON_ONCE(cpumask_weight(cm) == 0); | 1438 | WARN_ON_ONCE(cpumask_weight(cm) == 0); |
1605 | } | 1439 | } |
1606 | set_cpus_allowed_ptr(rnp->node_kthread_task, cm); | 1440 | set_cpus_allowed_ptr(t, cm); |
1607 | rcu_boost_kthread_setaffinity(rnp, cm); | ||
1608 | free_cpumask_var(cm); | 1441 | free_cpumask_var(cm); |
1609 | } | 1442 | } |
1610 | 1443 | ||
1611 | /* | 1444 | static struct smp_hotplug_thread rcu_cpu_thread_spec = { |
1612 | * Spawn a per-rcu_node kthread, setting priority and affinity. | 1445 | .store = &rcu_cpu_kthread_task, |
1613 | * Called during boot before online/offline can happen, or, if | 1446 | .thread_should_run = rcu_cpu_kthread_should_run, |
1614 | * during runtime, with the main CPU-hotplug locks held. So only | 1447 | .thread_fn = rcu_cpu_kthread, |
1615 | * one of these can be executing at a time. | 1448 | .thread_comm = "rcuc/%u", |
1616 | */ | 1449 | .setup = rcu_cpu_kthread_setup, |
1617 | static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, | 1450 | .park = rcu_cpu_kthread_park, |
1618 | struct rcu_node *rnp) | 1451 | }; |
1619 | { | ||
1620 | unsigned long flags; | ||
1621 | int rnp_index = rnp - &rsp->node[0]; | ||
1622 | struct sched_param sp; | ||
1623 | struct task_struct *t; | ||
1624 | |||
1625 | if (!rcu_scheduler_fully_active || | ||
1626 | rnp->qsmaskinit == 0) | ||
1627 | return 0; | ||
1628 | if (rnp->node_kthread_task == NULL) { | ||
1629 | t = kthread_create(rcu_node_kthread, (void *)rnp, | ||
1630 | "rcun/%d", rnp_index); | ||
1631 | if (IS_ERR(t)) | ||
1632 | return PTR_ERR(t); | ||
1633 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1634 | rnp->node_kthread_task = t; | ||
1635 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1636 | sp.sched_priority = 99; | ||
1637 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1638 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | ||
1639 | } | ||
1640 | return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); | ||
1641 | } | ||
1642 | 1452 | ||
1643 | /* | 1453 | /* |
1644 | * Spawn all kthreads -- called as soon as the scheduler is running. | 1454 | * Spawn all kthreads -- called as soon as the scheduler is running. |
1645 | */ | 1455 | */ |
1646 | static int __init rcu_spawn_kthreads(void) | 1456 | static int __init rcu_spawn_kthreads(void) |
1647 | { | 1457 | { |
1648 | int cpu; | ||
1649 | struct rcu_node *rnp; | 1458 | struct rcu_node *rnp; |
1459 | int cpu; | ||
1650 | 1460 | ||
1651 | rcu_scheduler_fully_active = 1; | 1461 | rcu_scheduler_fully_active = 1; |
1652 | for_each_possible_cpu(cpu) { | 1462 | for_each_possible_cpu(cpu) |
1653 | per_cpu(rcu_cpu_has_work, cpu) = 0; | 1463 | per_cpu(rcu_cpu_has_work, cpu) = 0; |
1654 | if (cpu_online(cpu)) | 1464 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); |
1655 | (void)rcu_spawn_one_cpu_kthread(cpu); | ||
1656 | } | ||
1657 | rnp = rcu_get_root(rcu_state); | 1465 | rnp = rcu_get_root(rcu_state); |
1658 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | 1466 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); |
1659 | if (NUM_RCU_NODES > 1) { | 1467 | if (NUM_RCU_NODES > 1) { |
1660 | rcu_for_each_leaf_node(rcu_state, rnp) | 1468 | rcu_for_each_leaf_node(rcu_state, rnp) |
1661 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | 1469 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); |
1662 | } | 1470 | } |
1663 | return 0; | 1471 | return 0; |
1664 | } | 1472 | } |
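The hunks above carry the core of the kthread conversion: the hand-rolled rcuc/%d creation, CPU binding, priority setting, and hotplug teardown are gone, and rcu_cpu_thread_spec plus a single smpboot_register_percpu_thread() call hand all of that to the generic smpboot facility, which also parks and unparks the threads across CPU hotplug. As a rough idea of how the callbacks fit together, here is a minimal hypothetical smpboot client; the demo_* names, the per-CPU work flag, and the kick helper are invented for illustration and are not part of this patch.

#include <linux/smpboot.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/module.h>

static DEFINE_PER_CPU(struct task_struct *, demo_task);
static DEFINE_PER_CPU(int, demo_has_work);

/* Called with preemption disabled; nonzero means "invoke thread_fn now". */
static int demo_should_run(unsigned int cpu)
{
        return __this_cpu_read(demo_has_work);
}

/* Runs in the per-CPU kthread, already bound to @cpu by the smpboot core. */
static void demo_thread_fn(unsigned int cpu)
{
        __this_cpu_write(demo_has_work, 0);
        pr_info("demo: work handled on CPU %u\n", cpu);
}

static struct smp_hotplug_thread demo_threads = {
        .store                  = &demo_task,
        .thread_should_run      = demo_should_run,
        .thread_fn              = demo_thread_fn,
        .thread_comm            = "demo/%u",
};

/* A producer sets the flag and wakes the stored task, much like rcu_wake_cond(). */
static void __maybe_unused demo_kick(int cpu)
{
        per_cpu(demo_has_work, cpu) = 1;
        wake_up_process(per_cpu(demo_task, cpu));
}

static int __init demo_init(void)
{
        /* One "demo/N" kthread per CPU; hotplug parking is handled for us. */
        return smpboot_register_percpu_thread(&demo_threads);
}
module_init(demo_init);
MODULE_LICENSE("GPL");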
@@ -1670,11 +1478,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) | |||
1670 | struct rcu_node *rnp = rdp->mynode; | 1478 | struct rcu_node *rnp = rdp->mynode; |
1671 | 1479 | ||
1672 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ | 1480 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ |
1673 | if (rcu_scheduler_fully_active) { | 1481 | if (rcu_scheduler_fully_active) |
1674 | (void)rcu_spawn_one_cpu_kthread(cpu); | 1482 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); |
1675 | if (rnp->node_kthread_task == NULL) | ||
1676 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
1677 | } | ||
1678 | } | 1483 | } |
1679 | 1484 | ||
1680 | #else /* #ifdef CONFIG_RCU_BOOST */ | 1485 | #else /* #ifdef CONFIG_RCU_BOOST */ |
@@ -1698,19 +1503,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | |||
1698 | { | 1503 | { |
1699 | } | 1504 | } |
1700 | 1505 | ||
1701 | #ifdef CONFIG_HOTPLUG_CPU | 1506 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) |
1702 | |||
1703 | static void rcu_stop_cpu_kthread(int cpu) | ||
1704 | { | ||
1705 | } | ||
1706 | |||
1707 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1708 | |||
1709 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
1710 | { | ||
1711 | } | ||
1712 | |||
1713 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt) | ||
1714 | { | 1507 | { |
1715 | } | 1508 | } |
1716 | 1509 | ||
@@ -1997,6 +1790,26 @@ static void rcu_prepare_for_idle(int cpu) | |||
1997 | if (!tne) | 1790 | if (!tne) |
1998 | return; | 1791 | return; |
1999 | 1792 | ||
1793 | /* Adaptive-tick mode, where usermode execution is idle to RCU. */ | ||
1794 | if (!is_idle_task(current)) { | ||
1795 | rdtp->dyntick_holdoff = jiffies - 1; | ||
1796 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { | ||
1797 | trace_rcu_prep_idle("User dyntick with callbacks"); | ||
1798 | rdtp->idle_gp_timer_expires = | ||
1799 | round_up(jiffies + RCU_IDLE_GP_DELAY, | ||
1800 | RCU_IDLE_GP_DELAY); | ||
1801 | } else if (rcu_cpu_has_callbacks(cpu)) { | ||
1802 | rdtp->idle_gp_timer_expires = | ||
1803 | round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); | ||
1804 | trace_rcu_prep_idle("User dyntick with lazy callbacks"); | ||
1805 | } else { | ||
1806 | return; | ||
1807 | } | ||
1808 | tp = &rdtp->idle_gp_timer; | ||
1809 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
1810 | return; | ||
1811 | } | ||
1812 | |||
2000 | /* | 1813 | /* |
2001 | * If this is an idle re-entry, for example, due to use of | 1814 | * If this is an idle re-entry, for example, due to use of |
2002 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | 1815 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle |
@@ -2075,16 +1888,16 @@ static void rcu_prepare_for_idle(int cpu) | |||
2075 | #ifdef CONFIG_TREE_PREEMPT_RCU | 1888 | #ifdef CONFIG_TREE_PREEMPT_RCU |
2076 | if (per_cpu(rcu_preempt_data, cpu).nxtlist) { | 1889 | if (per_cpu(rcu_preempt_data, cpu).nxtlist) { |
2077 | rcu_preempt_qs(cpu); | 1890 | rcu_preempt_qs(cpu); |
2078 | force_quiescent_state(&rcu_preempt_state, 0); | 1891 | force_quiescent_state(&rcu_preempt_state); |
2079 | } | 1892 | } |
2080 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1893 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
2081 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { | 1894 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { |
2082 | rcu_sched_qs(cpu); | 1895 | rcu_sched_qs(cpu); |
2083 | force_quiescent_state(&rcu_sched_state, 0); | 1896 | force_quiescent_state(&rcu_sched_state); |
2084 | } | 1897 | } |
2085 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { | 1898 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { |
2086 | rcu_bh_qs(cpu); | 1899 | rcu_bh_qs(cpu); |
2087 | force_quiescent_state(&rcu_bh_state, 0); | 1900 | force_quiescent_state(&rcu_bh_state); |
2088 | } | 1901 | } |
2089 | 1902 | ||
2090 | /* | 1903 | /* |
@@ -2112,6 +1925,88 @@ static void rcu_idle_count_callbacks_posted(void) | |||
2112 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); | 1925 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); |
2113 | } | 1926 | } |
2114 | 1927 | ||
1928 | /* | ||
1929 | * Data for flushing lazy RCU callbacks at OOM time. | ||
1930 | */ | ||
1931 | static atomic_t oom_callback_count; | ||
1932 | static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); | ||
1933 | |||
1934 | /* | ||
1935 | * RCU OOM callback -- decrement the outstanding count and deliver the | ||
1936 | * wake-up if we are the last one. | ||
1937 | */ | ||
1938 | static void rcu_oom_callback(struct rcu_head *rhp) | ||
1939 | { | ||
1940 | if (atomic_dec_and_test(&oom_callback_count)) | ||
1941 | wake_up(&oom_callback_wq); | ||
1942 | } | ||
1943 | |||
1944 | /* | ||
1945 | * Post an rcu_oom_notify callback on the current CPU if it has at | ||
1946 | * least one lazy callback. This will unnecessarily post callbacks | ||
1947 | * to CPUs that already have a non-lazy callback at the end of their | ||
1948 | * callback list, but this is an infrequent operation, so accept some | ||
1949 | * extra overhead to keep things simple. | ||
1950 | */ | ||
1951 | static void rcu_oom_notify_cpu(void *unused) | ||
1952 | { | ||
1953 | struct rcu_state *rsp; | ||
1954 | struct rcu_data *rdp; | ||
1955 | |||
1956 | for_each_rcu_flavor(rsp) { | ||
1957 | rdp = __this_cpu_ptr(rsp->rda); | ||
1958 | if (rdp->qlen_lazy != 0) { | ||
1959 | atomic_inc(&oom_callback_count); | ||
1960 | rsp->call(&rdp->oom_head, rcu_oom_callback); | ||
1961 | } | ||
1962 | } | ||
1963 | } | ||
1964 | |||
1965 | /* | ||
1966 | * If low on memory, ensure that each CPU has a non-lazy callback. | ||
1967 | * This will wake up CPUs that have only lazy callbacks, in turn | ||
1968 | * ensuring that they free up the corresponding memory in a timely manner. | ||
1969 | * Because an uncertain amount of memory will be freed in some uncertain | ||
1970 | * timeframe, we do not claim to have freed anything. | ||
1971 | */ | ||
1972 | static int rcu_oom_notify(struct notifier_block *self, | ||
1973 | unsigned long notused, void *nfreed) | ||
1974 | { | ||
1975 | int cpu; | ||
1976 | |||
1977 | /* Wait for callbacks from earlier instance to complete. */ | ||
1978 | wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); | ||
1979 | |||
1980 | /* | ||
1981 | * Prevent premature wakeup: ensure that all increments happen | ||
1982 | * before there is a chance of the counter reaching zero. | ||
1983 | */ | ||
1984 | atomic_set(&oom_callback_count, 1); | ||
1985 | |||
1986 | get_online_cpus(); | ||
1987 | for_each_online_cpu(cpu) { | ||
1988 | smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); | ||
1989 | cond_resched(); | ||
1990 | } | ||
1991 | put_online_cpus(); | ||
1992 | |||
1993 | /* Unconditionally decrement: no need to wake ourselves up. */ | ||
1994 | atomic_dec(&oom_callback_count); | ||
1995 | |||
1996 | return NOTIFY_OK; | ||
1997 | } | ||
1998 | |||
1999 | static struct notifier_block rcu_oom_nb = { | ||
2000 | .notifier_call = rcu_oom_notify | ||
2001 | }; | ||
2002 | |||
2003 | static int __init rcu_register_oom_notifier(void) | ||
2004 | { | ||
2005 | register_oom_notifier(&rcu_oom_nb); | ||
2006 | return 0; | ||
2007 | } | ||
2008 | early_initcall(rcu_register_oom_notifier); | ||
2009 | |||
2115 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2010 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
2116 | 2011 | ||
2117 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 2012 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
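rcu_oom_notify() above avoids a premature wakeup by seeding oom_callback_count with 1 before posting any callbacks and dropping that extra reference only after the posting loop, so the count cannot reach zero while registrations are still in flight. The same counting trick in a small self-contained userspace program (names invented; C11 atomics and a spin-wait stand in for the kernel's atomic_t and wait_event()):

#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

static atomic_int pending;

/* Each worker stands in for one rcu_oom_callback() invocation. */
static void *worker(void *arg)
{
        atomic_fetch_sub(&pending, 1);
        return NULL;
}

int main(void)
{
        pthread_t tid[4];
        int i;

        /* Bias the count so it cannot reach zero while workers are still being posted. */
        atomic_store(&pending, 1);
        for (i = 0; i < 4; i++) {
                atomic_fetch_add(&pending, 1);
                pthread_create(&tid[i], NULL, worker, NULL);
        }
        /* Drop the bias only once every worker has been accounted for. */
        atomic_fetch_sub(&pending, 1);

        while (atomic_load(&pending) != 0)
                ;       /* the kernel version sleeps in wait_event() instead */

        for (i = 0; i < 4; i++)
                pthread_join(tid[i], NULL);
        printf("all callbacks flushed\n");
        return 0;
}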
@@ -2122,11 +2017,15 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | |||
2122 | { | 2017 | { |
2123 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 2018 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2124 | struct timer_list *tltp = &rdtp->idle_gp_timer; | 2019 | struct timer_list *tltp = &rdtp->idle_gp_timer; |
2020 | char c; | ||
2125 | 2021 | ||
2126 | sprintf(cp, "drain=%d %c timer=%lu", | 2022 | c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; |
2127 | rdtp->dyntick_drain, | 2023 | if (timer_pending(tltp)) |
2128 | rdtp->dyntick_holdoff == jiffies ? 'H' : '.', | 2024 | sprintf(cp, "drain=%d %c timer=%lu", |
2129 | timer_pending(tltp) ? tltp->expires - jiffies : -1); | 2025 | rdtp->dyntick_drain, c, tltp->expires - jiffies); |
2026 | else | ||
2027 | sprintf(cp, "drain=%d %c timer not pending", | ||
2028 | rdtp->dyntick_drain, c); | ||
2130 | } | 2029 | } |
2131 | 2030 | ||
2132 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 2031 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
@@ -2194,11 +2093,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) | |||
2194 | /* Increment ->ticks_this_gp for all flavors of RCU. */ | 2093 | /* Increment ->ticks_this_gp for all flavors of RCU. */ |
2195 | static void increment_cpu_stall_ticks(void) | 2094 | static void increment_cpu_stall_ticks(void) |
2196 | { | 2095 | { |
2197 | __get_cpu_var(rcu_sched_data).ticks_this_gp++; | 2096 | struct rcu_state *rsp; |
2198 | __get_cpu_var(rcu_bh_data).ticks_this_gp++; | 2097 | |
2199 | #ifdef CONFIG_TREE_PREEMPT_RCU | 2098 | for_each_rcu_flavor(rsp) |
2200 | __get_cpu_var(rcu_preempt_data).ticks_this_gp++; | 2099 | __this_cpu_ptr(rsp->rda)->ticks_this_gp++; |
2201 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
2202 | } | 2100 | } |
2203 | 2101 | ||
2204 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | 2102 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ |
@@ -2227,3 +2125,373 @@ static void increment_cpu_stall_ticks(void) | |||
2227 | } | 2125 | } |
2228 | 2126 | ||
2229 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ | 2127 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ |
2128 | |||
2129 | #ifdef CONFIG_RCU_NOCB_CPU | ||
2130 | |||
2131 | /* | ||
2132 | * Offload callback processing from the boot-time-specified set of CPUs | ||
2133 | * specified by rcu_nocb_mask. For each CPU in the set, there is a | ||
2134 | * kthread created that pulls the callbacks from the corresponding CPU, | ||
2135 | * waits for a grace period to elapse, and invokes the callbacks. | ||
2136 | * The no-CBs CPUs do a wake_up() on their kthread when they insert | ||
2137 | * a callback into any empty list, unless the rcu_nocb_poll boot parameter | ||
2138 | * has been specified, in which case each kthread actively polls its | ||
2139 | * CPU. (Which isn't so great for energy efficiency, but which does | ||
2140 | * reduce RCU's overhead on that CPU.) | ||
2141 | * | ||
2142 | * This is intended to be used in conjunction with Frederic Weisbecker's | ||
2143 | * adaptive-idle work, which would seriously reduce OS jitter on CPUs | ||
2144 | * running CPU-bound user-mode computations. | ||
2145 | * | ||
2146 | * Offloading of callback processing could also in theory be used as | ||
2147 | * an energy-efficiency measure because CPUs with no RCU callbacks | ||
2148 | * queued are more aggressive about entering dyntick-idle mode. | ||
2149 | */ | ||
2150 | |||
2151 | |||
2152 | /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ | ||
2153 | static int __init rcu_nocb_setup(char *str) | ||
2154 | { | ||
2155 | alloc_bootmem_cpumask_var(&rcu_nocb_mask); | ||
2156 | have_rcu_nocb_mask = true; | ||
2157 | cpulist_parse(str, rcu_nocb_mask); | ||
2158 | return 1; | ||
2159 | } | ||
2160 | __setup("rcu_nocbs=", rcu_nocb_setup); | ||
2161 | |||
2163 | /* Is the specified CPU a no-CBs CPU? */ | ||
2163 | static bool is_nocb_cpu(int cpu) | ||
2164 | { | ||
2165 | if (have_rcu_nocb_mask) | ||
2166 | return cpumask_test_cpu(cpu, rcu_nocb_mask); | ||
2167 | return false; | ||
2168 | } | ||
2169 | |||
2170 | /* | ||
2171 | * Enqueue the specified string of rcu_head structures onto the specified | ||
2172 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the | ||
2173 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy | ||
2174 | * counts are supplied by rhcount and rhcount_lazy. | ||
2175 | * | ||
2176 | * If warranted, also wake up the kthread servicing this CPU's queues. | ||
2177 | */ | ||
2178 | static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | ||
2179 | struct rcu_head *rhp, | ||
2180 | struct rcu_head **rhtp, | ||
2181 | int rhcount, int rhcount_lazy) | ||
2182 | { | ||
2183 | int len; | ||
2184 | struct rcu_head **old_rhpp; | ||
2185 | struct task_struct *t; | ||
2186 | |||
2187 | /* Enqueue the callback on the nocb list and update counts. */ | ||
2188 | old_rhpp = xchg(&rdp->nocb_tail, rhtp); | ||
2189 | ACCESS_ONCE(*old_rhpp) = rhp; | ||
2190 | atomic_long_add(rhcount, &rdp->nocb_q_count); | ||
2191 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); | ||
2192 | |||
2193 | /* If we are not being polled and there is a kthread, awaken it ... */ | ||
2194 | t = ACCESS_ONCE(rdp->nocb_kthread); | ||
2195 | if (rcu_nocb_poll | !t) | ||
2196 | return; | ||
2197 | len = atomic_long_read(&rdp->nocb_q_count); | ||
2198 | if (old_rhpp == &rdp->nocb_head) { | ||
2199 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ | ||
2200 | rdp->qlen_last_fqs_check = 0; | ||
2201 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | ||
2202 | wake_up_process(t); /* ... or if many callbacks queued. */ | ||
2203 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | ||
2204 | } | ||
2205 | return; | ||
2206 | } | ||
2207 | |||
2208 | /* | ||
2209 | * This is a helper for __call_rcu(), which invokes this when the normal | ||
2210 | * callback queue is inoperable. If this is not a no-CBs CPU, this | ||
2211 | * function returns failure back to __call_rcu(), which can complain | ||
2212 | * appropriately. | ||
2213 | * | ||
2214 | * Otherwise, this function queues the callback where the corresponding | ||
2215 | * "rcuo" kthread can find it. | ||
2216 | */ | ||
2217 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
2218 | bool lazy) | ||
2219 | { | ||
2220 | |||
2221 | if (!is_nocb_cpu(rdp->cpu)) | ||
2222 | return 0; | ||
2223 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); | ||
2224 | return 1; | ||
2225 | } | ||
2226 | |||
2227 | /* | ||
2228 | * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is | ||
2229 | * not a no-CBs CPU. | ||
2230 | */ | ||
2231 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
2232 | struct rcu_data *rdp) | ||
2233 | { | ||
2234 | long ql = rsp->qlen; | ||
2235 | long qll = rsp->qlen_lazy; | ||
2236 | |||
2237 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | ||
2238 | if (!is_nocb_cpu(smp_processor_id())) | ||
2239 | return 0; | ||
2240 | rsp->qlen = 0; | ||
2241 | rsp->qlen_lazy = 0; | ||
2242 | |||
2243 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ | ||
2244 | if (rsp->orphan_donelist != NULL) { | ||
2245 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, | ||
2246 | rsp->orphan_donetail, ql, qll); | ||
2247 | ql = qll = 0; | ||
2248 | rsp->orphan_donelist = NULL; | ||
2249 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
2250 | } | ||
2251 | if (rsp->orphan_nxtlist != NULL) { | ||
2252 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, | ||
2253 | rsp->orphan_nxttail, ql, qll); | ||
2254 | ql = qll = 0; | ||
2255 | rsp->orphan_nxtlist = NULL; | ||
2256 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
2257 | } | ||
2258 | return 1; | ||
2259 | } | ||
2260 | |||
2261 | /* | ||
2262 | * There must be at least one non-no-CBs CPU in operation at any given | ||
2263 | * time, because no-CBs CPUs are not capable of initiating grace periods | ||
2264 | * independently. This function therefore complains if the specified | ||
2265 | * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to | ||
2266 | * avoid offlining the last such CPU. (Recursion is a wonderful thing, | ||
2267 | * but you have to have a base case!) | ||
2268 | */ | ||
2269 | static bool nocb_cpu_expendable(int cpu) | ||
2270 | { | ||
2271 | cpumask_var_t non_nocb_cpus; | ||
2272 | int ret; | ||
2273 | |||
2274 | /* | ||
2275 | * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, | ||
2276 | * then offlining this CPU is harmless. Let it happen. | ||
2277 | */ | ||
2278 | if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) | ||
2279 | return 1; | ||
2280 | |||
2281 | /* If no memory, play it safe and keep the CPU around. */ | ||
2282 | if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) | ||
2283 | return 0; | ||
2284 | cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); | ||
2285 | cpumask_clear_cpu(cpu, non_nocb_cpus); | ||
2286 | ret = !cpumask_empty(non_nocb_cpus); | ||
2287 | free_cpumask_var(non_nocb_cpus); | ||
2288 | return ret; | ||
2289 | } | ||
2290 | |||
2291 | /* | ||
2292 | * Helper structure for remote registry of RCU callbacks. | ||
2293 | * This is needed for when a no-CBs CPU needs to start a grace period. | ||
2294 | * If it just invokes call_rcu(), the resulting callback will be queued, | ||
2295 | * which can result in deadlock. | ||
2296 | */ | ||
2297 | struct rcu_head_remote { | ||
2298 | struct rcu_head *rhp; | ||
2299 | call_rcu_func_t *crf; | ||
2300 | void (*func)(struct rcu_head *rhp); | ||
2301 | }; | ||
2302 | |||
2303 | /* | ||
2304 | * Register a callback as specified by the rcu_head_remote struct. | ||
2305 | * This function is intended to be invoked via smp_call_function_single(). | ||
2306 | */ | ||
2307 | static void call_rcu_local(void *arg) | ||
2308 | { | ||
2309 | struct rcu_head_remote *rhrp = | ||
2310 | container_of(arg, struct rcu_head_remote, rhp); | ||
2311 | |||
2312 | rhrp->crf(rhrp->rhp, rhrp->func); | ||
2313 | } | ||
2314 | |||
2315 | /* | ||
2316 | * Set up an rcu_head_remote structure and then invoke call_rcu_local() | ||
2317 | * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via | ||
2318 | * smp_call_function_single(). | ||
2319 | */ | ||
2320 | static void invoke_crf_remote(struct rcu_head *rhp, | ||
2321 | void (*func)(struct rcu_head *rhp), | ||
2322 | call_rcu_func_t crf) | ||
2323 | { | ||
2324 | struct rcu_head_remote rhr; | ||
2325 | |||
2326 | rhr.rhp = rhp; | ||
2327 | rhr.crf = crf; | ||
2328 | rhr.func = func; | ||
2329 | smp_call_function_single(0, call_rcu_local, &rhr, 1); | ||
2330 | } | ||
2331 | |||
2332 | /* | ||
2333 | * Helper functions to be passed to wait_rcu_gp(), each of which | ||
2334 | * invokes invoke_crf_remote() to register a callback appropriately. | ||
2335 | */ | ||
2336 | static void __maybe_unused | ||
2337 | call_rcu_preempt_remote(struct rcu_head *rhp, | ||
2338 | void (*func)(struct rcu_head *rhp)) | ||
2339 | { | ||
2340 | invoke_crf_remote(rhp, func, call_rcu); | ||
2341 | } | ||
2342 | static void call_rcu_bh_remote(struct rcu_head *rhp, | ||
2343 | void (*func)(struct rcu_head *rhp)) | ||
2344 | { | ||
2345 | invoke_crf_remote(rhp, func, call_rcu_bh); | ||
2346 | } | ||
2347 | static void call_rcu_sched_remote(struct rcu_head *rhp, | ||
2348 | void (*func)(struct rcu_head *rhp)) | ||
2349 | { | ||
2350 | invoke_crf_remote(rhp, func, call_rcu_sched); | ||
2351 | } | ||
2352 | |||
2353 | /* | ||
2354 | * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes | ||
2355 | * callbacks queued by the corresponding no-CBs CPU. | ||
2356 | */ | ||
2357 | static int rcu_nocb_kthread(void *arg) | ||
2358 | { | ||
2359 | int c, cl; | ||
2360 | struct rcu_head *list; | ||
2361 | struct rcu_head *next; | ||
2362 | struct rcu_head **tail; | ||
2363 | struct rcu_data *rdp = arg; | ||
2364 | |||
2365 | /* Each pass through this loop invokes one batch of callbacks */ | ||
2366 | for (;;) { | ||
2367 | /* If not polling, wait for next batch of callbacks. */ | ||
2368 | if (!rcu_nocb_poll) | ||
2369 | wait_event(rdp->nocb_wq, rdp->nocb_head); | ||
2370 | list = ACCESS_ONCE(rdp->nocb_head); | ||
2371 | if (!list) { | ||
2372 | schedule_timeout_interruptible(1); | ||
2373 | continue; | ||
2374 | } | ||
2375 | |||
2376 | /* | ||
2377 | * Extract queued callbacks, update counts, and wait | ||
2378 | * for a grace period to elapse. | ||
2379 | */ | ||
2380 | ACCESS_ONCE(rdp->nocb_head) = NULL; | ||
2381 | tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); | ||
2382 | c = atomic_long_xchg(&rdp->nocb_q_count, 0); | ||
2383 | cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); | ||
2384 | ACCESS_ONCE(rdp->nocb_p_count) += c; | ||
2385 | ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; | ||
2386 | wait_rcu_gp(rdp->rsp->call_remote); | ||
2387 | |||
2388 | /* Each pass through the following loop invokes a callback. */ | ||
2389 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); | ||
2390 | c = cl = 0; | ||
2391 | while (list) { | ||
2392 | next = list->next; | ||
2393 | /* Wait for enqueuing to complete, if needed. */ | ||
2394 | while (next == NULL && &list->next != tail) { | ||
2395 | schedule_timeout_interruptible(1); | ||
2396 | next = list->next; | ||
2397 | } | ||
2398 | debug_rcu_head_unqueue(list); | ||
2399 | local_bh_disable(); | ||
2400 | if (__rcu_reclaim(rdp->rsp->name, list)) | ||
2401 | cl++; | ||
2402 | c++; | ||
2403 | local_bh_enable(); | ||
2404 | list = next; | ||
2405 | } | ||
2406 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | ||
2407 | ACCESS_ONCE(rdp->nocb_p_count) -= c; | ||
2408 | ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; | ||
2409 | rdp->n_nocbs_invoked += c; | ||
2410 | } | ||
2411 | return 0; | ||
2412 | } | ||
2413 | |||
2414 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ | ||
2415 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | ||
2416 | { | ||
2417 | rdp->nocb_tail = &rdp->nocb_head; | ||
2418 | init_waitqueue_head(&rdp->nocb_wq); | ||
2419 | } | ||
2420 | |||
2421 | /* Create a kthread for each RCU flavor for each no-CBs CPU. */ | ||
2422 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | ||
2423 | { | ||
2424 | int cpu; | ||
2425 | struct rcu_data *rdp; | ||
2426 | struct task_struct *t; | ||
2427 | |||
2428 | if (rcu_nocb_mask == NULL) | ||
2429 | return; | ||
2430 | for_each_cpu(cpu, rcu_nocb_mask) { | ||
2431 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2432 | t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); | ||
2433 | BUG_ON(IS_ERR(t)); | ||
2434 | ACCESS_ONCE(rdp->nocb_kthread) = t; | ||
2435 | } | ||
2436 | } | ||
2437 | |||
2438 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ | ||
2439 | static void init_nocb_callback_list(struct rcu_data *rdp) | ||
2440 | { | ||
2441 | if (rcu_nocb_mask == NULL || | ||
2442 | !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) | ||
2443 | return; | ||
2444 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
2445 | } | ||
2446 | |||
2447 | /* Initialize the ->call_remote fields in the rcu_state structures. */ | ||
2448 | static void __init rcu_init_nocb(void) | ||
2449 | { | ||
2450 | #ifdef CONFIG_PREEMPT_RCU | ||
2451 | rcu_preempt_state.call_remote = call_rcu_preempt_remote; | ||
2452 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
2453 | rcu_bh_state.call_remote = call_rcu_bh_remote; | ||
2454 | rcu_sched_state.call_remote = call_rcu_sched_remote; | ||
2455 | } | ||
2456 | |||
2457 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
2458 | |||
2459 | static bool is_nocb_cpu(int cpu) | ||
2460 | { | ||
2461 | return false; | ||
2462 | } | ||
2463 | |||
2464 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
2465 | bool lazy) | ||
2466 | { | ||
2467 | return 0; | ||
2468 | } | ||
2469 | |||
2470 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
2471 | struct rcu_data *rdp) | ||
2472 | { | ||
2473 | return 0; | ||
2474 | } | ||
2475 | |||
2476 | static bool nocb_cpu_expendable(int cpu) | ||
2477 | { | ||
2478 | return 1; | ||
2479 | } | ||
2480 | |||
2481 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | ||
2482 | { | ||
2483 | } | ||
2484 | |||
2485 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | ||
2486 | { | ||
2487 | } | ||
2488 | |||
2489 | static void init_nocb_callback_list(struct rcu_data *rdp) | ||
2490 | { | ||
2491 | } | ||
2492 | |||
2493 | static void __init rcu_init_nocb(void) | ||
2494 | { | ||
2495 | } | ||
2496 | |||
2497 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
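__call_rcu_nocb_enqueue() earlier in this hunk appends to the no-CBs list with a single xchg() on the tail pointer and then links the old tail to the new element; comparing the old tail against &rdp->nocb_head is how it spots the empty-to-nonempty transition that justifies waking the rcuo kthread. A stripped-down userspace rendering of that append follows; the node and queue types are invented, C11 atomic_exchange() stands in for xchg(), and a plain store plays the role of ACCESS_ONCE() since the per-CPU caller is effectively a single producer.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
        struct node *next;
        int val;
};

struct queue {
        struct node *head;              /* first element, NULL while empty */
        _Atomic(struct node **) tail;   /* points at the last ->next, or at &head */
};

/* Append @n; returns true if the queue was empty, i.e. the consumer may be asleep. */
static bool enqueue(struct queue *q, struct node *n)
{
        struct node **old_tail;

        n->next = NULL;
        old_tail = atomic_exchange(&q->tail, &n->next); /* claim the tail slot */
        *old_tail = n;                                  /* link the old tail to us */
        return old_tail == &q->head;
}

int main(void)
{
        struct queue q = { .head = NULL };
        struct node a = { .val = 1 }, b = { .val = 2 };
        struct node *p;

        atomic_store(&q.tail, &q.head);         /* empty queue: tail points at head */
        printf("first append saw empty queue: %d\n", enqueue(&q, &a));
        printf("second append saw empty queue: %d\n", enqueue(&q, &b));
        for (p = q.head; p; p = p->next)        /* the consumer side walks head..tail */
                printf("val=%d\n", p->val);
        return 0;
}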
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index abffb486e94e..0d095dcaa670 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -46,29 +46,58 @@ | |||
46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
47 | #include "rcutree.h" | 47 | #include "rcutree.h" |
48 | 48 | ||
49 | static int show_rcubarrier(struct seq_file *m, void *unused) | 49 | #define ulong2long(a) (*(long *)(&(a))) |
50 | |||
51 | static int r_open(struct inode *inode, struct file *file, | ||
52 | const struct seq_operations *op) | ||
50 | { | 53 | { |
51 | struct rcu_state *rsp; | 54 | int ret = seq_open(file, op); |
55 | if (!ret) { | ||
56 | struct seq_file *m = (struct seq_file *)file->private_data; | ||
57 | m->private = inode->i_private; | ||
58 | } | ||
59 | return ret; | ||
60 | } | ||
61 | |||
62 | static void *r_start(struct seq_file *m, loff_t *pos) | ||
63 | { | ||
64 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
65 | *pos = cpumask_next(*pos - 1, cpu_possible_mask); | ||
66 | if ((*pos) < nr_cpu_ids) | ||
67 | return per_cpu_ptr(rsp->rda, *pos); | ||
68 | return NULL; | ||
69 | } | ||
52 | 70 | ||
53 | for_each_rcu_flavor(rsp) | 71 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
54 | seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", | 72 | { |
55 | rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', | 73 | (*pos)++; |
56 | atomic_read(&rsp->barrier_cpu_count), | 74 | return r_start(m, pos); |
57 | rsp->n_barrier_done); | 75 | } |
76 | |||
77 | static void r_stop(struct seq_file *m, void *v) | ||
78 | { | ||
79 | } | ||
80 | |||
81 | static int show_rcubarrier(struct seq_file *m, void *v) | ||
82 | { | ||
83 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
84 | seq_printf(m, "bcc: %d nbd: %lu\n", | ||
85 | atomic_read(&rsp->barrier_cpu_count), | ||
86 | rsp->n_barrier_done); | ||
58 | return 0; | 87 | return 0; |
59 | } | 88 | } |
60 | 89 | ||
61 | static int rcubarrier_open(struct inode *inode, struct file *file) | 90 | static int rcubarrier_open(struct inode *inode, struct file *file) |
62 | { | 91 | { |
63 | return single_open(file, show_rcubarrier, NULL); | 92 | return single_open(file, show_rcubarrier, inode->i_private); |
64 | } | 93 | } |
65 | 94 | ||
66 | static const struct file_operations rcubarrier_fops = { | 95 | static const struct file_operations rcubarrier_fops = { |
67 | .owner = THIS_MODULE, | 96 | .owner = THIS_MODULE, |
68 | .open = rcubarrier_open, | 97 | .open = rcubarrier_open, |
69 | .read = seq_read, | 98 | .read = seq_read, |
70 | .llseek = seq_lseek, | 99 | .llseek = no_llseek, |
71 | .release = single_release, | 100 | .release = seq_release, |
72 | }; | 101 | }; |
73 | 102 | ||
74 | #ifdef CONFIG_RCU_BOOST | 103 | #ifdef CONFIG_RCU_BOOST |
@@ -84,22 +113,26 @@ static char convert_kthread_status(unsigned int kthread_status) | |||
84 | 113 | ||
85 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | 114 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) |
86 | { | 115 | { |
116 | long ql, qll; | ||
117 | |||
87 | if (!rdp->beenonline) | 118 | if (!rdp->beenonline) |
88 | return; | 119 | return; |
89 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", | 120 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", |
90 | rdp->cpu, | 121 | rdp->cpu, |
91 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 122 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
92 | rdp->completed, rdp->gpnum, | 123 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), |
93 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, | 124 | rdp->passed_quiesce, rdp->qs_pending); |
94 | rdp->qs_pending); | ||
95 | seq_printf(m, " dt=%d/%llx/%d df=%lu", | 125 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
96 | atomic_read(&rdp->dynticks->dynticks), | 126 | atomic_read(&rdp->dynticks->dynticks), |
97 | rdp->dynticks->dynticks_nesting, | 127 | rdp->dynticks->dynticks_nesting, |
98 | rdp->dynticks->dynticks_nmi_nesting, | 128 | rdp->dynticks->dynticks_nmi_nesting, |
99 | rdp->dynticks_fqs); | 129 | rdp->dynticks_fqs); |
100 | seq_printf(m, " of=%lu", rdp->offline_fqs); | 130 | seq_printf(m, " of=%lu", rdp->offline_fqs); |
131 | rcu_nocb_q_lengths(rdp, &ql, &qll); | ||
132 | qll += rdp->qlen_lazy; | ||
133 | ql += rdp->qlen; | ||
101 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", | 134 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", |
102 | rdp->qlen_lazy, rdp->qlen, | 135 | qll, ql, |
103 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | 136 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != |
104 | rdp->nxttail[RCU_NEXT_TAIL]], | 137 | rdp->nxttail[RCU_NEXT_TAIL]], |
105 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | 138 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != |
@@ -108,110 +141,74 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
108 | rdp->nxttail[RCU_WAIT_TAIL]], | 141 | rdp->nxttail[RCU_WAIT_TAIL]], |
109 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | 142 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); |
110 | #ifdef CONFIG_RCU_BOOST | 143 | #ifdef CONFIG_RCU_BOOST |
111 | seq_printf(m, " kt=%d/%c/%d ktl=%x", | 144 | seq_printf(m, " kt=%d/%c ktl=%x", |
112 | per_cpu(rcu_cpu_has_work, rdp->cpu), | 145 | per_cpu(rcu_cpu_has_work, rdp->cpu), |
113 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | 146 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, |
114 | rdp->cpu)), | 147 | rdp->cpu)), |
115 | per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), | ||
116 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); | 148 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); |
117 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 149 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
118 | seq_printf(m, " b=%ld", rdp->blimit); | 150 | seq_printf(m, " b=%ld", rdp->blimit); |
119 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", | 151 | seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", |
120 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | 152 | rdp->n_cbs_invoked, rdp->n_nocbs_invoked, |
153 | rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
121 | } | 154 | } |
122 | 155 | ||
123 | static int show_rcudata(struct seq_file *m, void *unused) | 156 | static int show_rcudata(struct seq_file *m, void *v) |
124 | { | 157 | { |
125 | int cpu; | 158 | print_one_rcu_data(m, (struct rcu_data *)v); |
126 | struct rcu_state *rsp; | ||
127 | |||
128 | for_each_rcu_flavor(rsp) { | ||
129 | seq_printf(m, "%s:\n", rsp->name); | ||
130 | for_each_possible_cpu(cpu) | ||
131 | print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); | ||
132 | } | ||
133 | return 0; | 159 | return 0; |
134 | } | 160 | } |
135 | 161 | ||
162 | static const struct seq_operations rcudate_op = { | ||
163 | .start = r_start, | ||
164 | .next = r_next, | ||
165 | .stop = r_stop, | ||
166 | .show = show_rcudata, | ||
167 | }; | ||
168 | |||
136 | static int rcudata_open(struct inode *inode, struct file *file) | 169 | static int rcudata_open(struct inode *inode, struct file *file) |
137 | { | 170 | { |
138 | return single_open(file, show_rcudata, NULL); | 171 | return r_open(inode, file, &rcudate_op); |
139 | } | 172 | } |
140 | 173 | ||
141 | static const struct file_operations rcudata_fops = { | 174 | static const struct file_operations rcudata_fops = { |
142 | .owner = THIS_MODULE, | 175 | .owner = THIS_MODULE, |
143 | .open = rcudata_open, | 176 | .open = rcudata_open, |
144 | .read = seq_read, | 177 | .read = seq_read, |
145 | .llseek = seq_lseek, | 178 | .llseek = no_llseek, |
146 | .release = single_release, | 179 | .release = seq_release, |
147 | }; | 180 | }; |
148 | 181 | ||
149 | static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | 182 | static int show_rcuexp(struct seq_file *m, void *v) |
150 | { | 183 | { |
151 | if (!rdp->beenonline) | 184 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
152 | return; | 185 | |
153 | seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", | 186 | seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", |
154 | rdp->cpu, | 187 | atomic_long_read(&rsp->expedited_start), |
155 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", | 188 | atomic_long_read(&rsp->expedited_done), |
156 | rdp->completed, rdp->gpnum, | 189 | atomic_long_read(&rsp->expedited_wrap), |
157 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, | 190 | atomic_long_read(&rsp->expedited_tryfail), |
158 | rdp->qs_pending); | 191 | atomic_long_read(&rsp->expedited_workdone1), |
159 | seq_printf(m, ",%d,%llx,%d,%lu", | 192 | atomic_long_read(&rsp->expedited_workdone2), |
160 | atomic_read(&rdp->dynticks->dynticks), | 193 | atomic_long_read(&rsp->expedited_normal), |
161 | rdp->dynticks->dynticks_nesting, | 194 | atomic_long_read(&rsp->expedited_stoppedcpus), |
162 | rdp->dynticks->dynticks_nmi_nesting, | 195 | atomic_long_read(&rsp->expedited_done_tries), |
163 | rdp->dynticks_fqs); | 196 | atomic_long_read(&rsp->expedited_done_lost), |
164 | seq_printf(m, ",%lu", rdp->offline_fqs); | 197 | atomic_long_read(&rsp->expedited_done_exit)); |
165 | seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, | ||
166 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
167 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
168 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
169 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
170 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
171 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
172 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
173 | #ifdef CONFIG_RCU_BOOST | ||
174 | seq_printf(m, ",%d,\"%c\"", | ||
175 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
176 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
177 | rdp->cpu))); | ||
178 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
179 | seq_printf(m, ",%ld", rdp->blimit); | ||
180 | seq_printf(m, ",%lu,%lu,%lu\n", | ||
181 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
182 | } | ||
183 | |||
184 | static int show_rcudata_csv(struct seq_file *m, void *unused) | ||
185 | { | ||
186 | int cpu; | ||
187 | struct rcu_state *rsp; | ||
188 | |||
189 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); | ||
190 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); | ||
191 | seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); | ||
192 | #ifdef CONFIG_RCU_BOOST | ||
193 | seq_puts(m, "\"kt\",\"ktl\""); | ||
194 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
195 | seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); | ||
196 | for_each_rcu_flavor(rsp) { | ||
197 | seq_printf(m, "\"%s:\"\n", rsp->name); | ||
198 | for_each_possible_cpu(cpu) | ||
199 | print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); | ||
200 | } | ||
201 | return 0; | 198 | return 0; |
202 | } | 199 | } |
203 | 200 | ||
204 | static int rcudata_csv_open(struct inode *inode, struct file *file) | 201 | static int rcuexp_open(struct inode *inode, struct file *file) |
205 | { | 202 | { |
206 | return single_open(file, show_rcudata_csv, NULL); | 203 | return single_open(file, show_rcuexp, inode->i_private); |
207 | } | 204 | } |
208 | 205 | ||
209 | static const struct file_operations rcudata_csv_fops = { | 206 | static const struct file_operations rcuexp_fops = { |
210 | .owner = THIS_MODULE, | 207 | .owner = THIS_MODULE, |
211 | .open = rcudata_csv_open, | 208 | .open = rcuexp_open, |
212 | .read = seq_read, | 209 | .read = seq_read, |
213 | .llseek = seq_lseek, | 210 | .llseek = no_llseek, |
214 | .release = single_release, | 211 | .release = seq_release, |
215 | }; | 212 | }; |
216 | 213 | ||
217 | #ifdef CONFIG_RCU_BOOST | 214 | #ifdef CONFIG_RCU_BOOST |
@@ -257,27 +254,11 @@ static const struct file_operations rcu_node_boost_fops = { | |||
257 | .owner = THIS_MODULE, | 254 | .owner = THIS_MODULE, |
258 | .open = rcu_node_boost_open, | 255 | .open = rcu_node_boost_open, |
259 | .read = seq_read, | 256 | .read = seq_read, |
260 | .llseek = seq_lseek, | 257 | .llseek = no_llseek, |
261 | .release = single_release, | 258 | .release = single_release, |
262 | }; | 259 | }; |
263 | 260 | ||
264 | /* | 261 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
265 | * Create the rcuboost debugfs entry. Standard error return. | ||
266 | */ | ||
267 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
268 | { | ||
269 | return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, | ||
270 | &rcu_node_boost_fops); | ||
271 | } | ||
272 | |||
273 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
274 | |||
275 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
276 | { | ||
277 | return 0; /* There cannot be an error if we didn't create it! */ | ||
278 | } | ||
279 | |||
280 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
281 | 262 | ||
282 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | 263 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) |
283 | { | 264 | { |
@@ -286,8 +267,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
286 | struct rcu_node *rnp; | 267 | struct rcu_node *rnp; |
287 | 268 | ||
288 | gpnum = rsp->gpnum; | 269 | gpnum = rsp->gpnum; |
289 | seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", | 270 | seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", |
290 | rsp->name, rsp->completed, gpnum, rsp->fqs_state, | 271 | ulong2long(rsp->completed), ulong2long(gpnum), |
272 | rsp->fqs_state, | ||
291 | (long)(rsp->jiffies_force_qs - jiffies), | 273 | (long)(rsp->jiffies_force_qs - jiffies), |
292 | (int)(jiffies & 0xffff)); | 274 | (int)(jiffies & 0xffff)); |
293 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | 275 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
@@ -309,26 +291,24 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
309 | seq_puts(m, "\n"); | 291 | seq_puts(m, "\n"); |
310 | } | 292 | } |
311 | 293 | ||
312 | static int show_rcuhier(struct seq_file *m, void *unused) | 294 | static int show_rcuhier(struct seq_file *m, void *v) |
313 | { | 295 | { |
314 | struct rcu_state *rsp; | 296 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
315 | 297 | print_one_rcu_state(m, rsp); | |
316 | for_each_rcu_flavor(rsp) | ||
317 | print_one_rcu_state(m, rsp); | ||
318 | return 0; | 298 | return 0; |
319 | } | 299 | } |
320 | 300 | ||
321 | static int rcuhier_open(struct inode *inode, struct file *file) | 301 | static int rcuhier_open(struct inode *inode, struct file *file) |
322 | { | 302 | { |
323 | return single_open(file, show_rcuhier, NULL); | 303 | return single_open(file, show_rcuhier, inode->i_private); |
324 | } | 304 | } |
325 | 305 | ||
326 | static const struct file_operations rcuhier_fops = { | 306 | static const struct file_operations rcuhier_fops = { |
327 | .owner = THIS_MODULE, | 307 | .owner = THIS_MODULE, |
328 | .open = rcuhier_open, | 308 | .open = rcuhier_open, |
329 | .read = seq_read, | 309 | .read = seq_read, |
330 | .llseek = seq_lseek, | 310 | .llseek = no_llseek, |
331 | .release = single_release, | 311 | .release = seq_release, |
332 | }; | 312 | }; |
333 | 313 | ||
334 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | 314 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) |
@@ -341,42 +321,42 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | |||
341 | struct rcu_node *rnp = &rsp->node[0]; | 321 | struct rcu_node *rnp = &rsp->node[0]; |
342 | 322 | ||
343 | raw_spin_lock_irqsave(&rnp->lock, flags); | 323 | raw_spin_lock_irqsave(&rnp->lock, flags); |
344 | completed = rsp->completed; | 324 | completed = ACCESS_ONCE(rsp->completed); |
345 | gpnum = rsp->gpnum; | 325 | gpnum = ACCESS_ONCE(rsp->gpnum); |
346 | if (rsp->completed == rsp->gpnum) | 326 | if (completed == gpnum) |
347 | gpage = 0; | 327 | gpage = 0; |
348 | else | 328 | else |
349 | gpage = jiffies - rsp->gp_start; | 329 | gpage = jiffies - rsp->gp_start; |
350 | gpmax = rsp->gp_max; | 330 | gpmax = rsp->gp_max; |
351 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 331 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
352 | seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", | 332 | seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", |
353 | rsp->name, completed, gpnum, gpage, gpmax); | 333 | ulong2long(completed), ulong2long(gpnum), gpage, gpmax); |
354 | } | 334 | } |
355 | 335 | ||
356 | static int show_rcugp(struct seq_file *m, void *unused) | 336 | static int show_rcugp(struct seq_file *m, void *v) |
357 | { | 337 | { |
358 | struct rcu_state *rsp; | 338 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
359 | 339 | show_one_rcugp(m, rsp); | |
360 | for_each_rcu_flavor(rsp) | ||
361 | show_one_rcugp(m, rsp); | ||
362 | return 0; | 340 | return 0; |
363 | } | 341 | } |
364 | 342 | ||
365 | static int rcugp_open(struct inode *inode, struct file *file) | 343 | static int rcugp_open(struct inode *inode, struct file *file) |
366 | { | 344 | { |
367 | return single_open(file, show_rcugp, NULL); | 345 | return single_open(file, show_rcugp, inode->i_private); |
368 | } | 346 | } |
369 | 347 | ||
370 | static const struct file_operations rcugp_fops = { | 348 | static const struct file_operations rcugp_fops = { |
371 | .owner = THIS_MODULE, | 349 | .owner = THIS_MODULE, |
372 | .open = rcugp_open, | 350 | .open = rcugp_open, |
373 | .read = seq_read, | 351 | .read = seq_read, |
374 | .llseek = seq_lseek, | 352 | .llseek = no_llseek, |
375 | .release = single_release, | 353 | .release = seq_release, |
376 | }; | 354 | }; |
377 | 355 | ||
378 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | 356 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) |
379 | { | 357 | { |
358 | if (!rdp->beenonline) | ||
359 | return; | ||
380 | seq_printf(m, "%3d%cnp=%ld ", | 360 | seq_printf(m, "%3d%cnp=%ld ", |
381 | rdp->cpu, | 361 | rdp->cpu, |
382 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 362 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
@@ -386,41 +366,36 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | |||
386 | rdp->n_rp_report_qs, | 366 | rdp->n_rp_report_qs, |
387 | rdp->n_rp_cb_ready, | 367 | rdp->n_rp_cb_ready, |
388 | rdp->n_rp_cpu_needs_gp); | 368 | rdp->n_rp_cpu_needs_gp); |
389 | seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", | 369 | seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", |
390 | rdp->n_rp_gp_completed, | 370 | rdp->n_rp_gp_completed, |
391 | rdp->n_rp_gp_started, | 371 | rdp->n_rp_gp_started, |
392 | rdp->n_rp_need_fqs, | ||
393 | rdp->n_rp_need_nothing); | 372 | rdp->n_rp_need_nothing); |
394 | } | 373 | } |
395 | 374 | ||
396 | static int show_rcu_pending(struct seq_file *m, void *unused) | 375 | static int show_rcu_pending(struct seq_file *m, void *v) |
397 | { | 376 | { |
398 | int cpu; | 377 | print_one_rcu_pending(m, (struct rcu_data *)v); |
399 | struct rcu_data *rdp; | ||
400 | struct rcu_state *rsp; | ||
401 | |||
402 | for_each_rcu_flavor(rsp) { | ||
403 | seq_printf(m, "%s:\n", rsp->name); | ||
404 | for_each_possible_cpu(cpu) { | ||
405 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
406 | if (rdp->beenonline) | ||
407 | print_one_rcu_pending(m, rdp); | ||
408 | } | ||
409 | } | ||
410 | return 0; | 378 | return 0; |
411 | } | 379 | } |
412 | 380 | ||
381 | static const struct seq_operations rcu_pending_op = { | ||
382 | .start = r_start, | ||
383 | .next = r_next, | ||
384 | .stop = r_stop, | ||
385 | .show = show_rcu_pending, | ||
386 | }; | ||
387 | |||
413 | static int rcu_pending_open(struct inode *inode, struct file *file) | 388 | static int rcu_pending_open(struct inode *inode, struct file *file) |
414 | { | 389 | { |
415 | return single_open(file, show_rcu_pending, NULL); | 390 | return r_open(inode, file, &rcu_pending_op); |
416 | } | 391 | } |
417 | 392 | ||
418 | static const struct file_operations rcu_pending_fops = { | 393 | static const struct file_operations rcu_pending_fops = { |
419 | .owner = THIS_MODULE, | 394 | .owner = THIS_MODULE, |
420 | .open = rcu_pending_open, | 395 | .open = rcu_pending_open, |
421 | .read = seq_read, | 396 | .read = seq_read, |
422 | .llseek = seq_lseek, | 397 | .llseek = no_llseek, |
423 | .release = single_release, | 398 | .release = seq_release, |
424 | }; | 399 | }; |
425 | 400 | ||
426 | static int show_rcutorture(struct seq_file *m, void *unused) | 401 | static int show_rcutorture(struct seq_file *m, void *unused) |
@@ -450,43 +425,58 @@ static struct dentry *rcudir; | |||
450 | 425 | ||
451 | static int __init rcutree_trace_init(void) | 426 | static int __init rcutree_trace_init(void) |
452 | { | 427 | { |
428 | struct rcu_state *rsp; | ||
453 | struct dentry *retval; | 429 | struct dentry *retval; |
430 | struct dentry *rspdir; | ||
454 | 431 | ||
455 | rcudir = debugfs_create_dir("rcu", NULL); | 432 | rcudir = debugfs_create_dir("rcu", NULL); |
456 | if (!rcudir) | 433 | if (!rcudir) |
457 | goto free_out; | 434 | goto free_out; |
458 | 435 | ||
459 | retval = debugfs_create_file("rcubarrier", 0444, rcudir, | 436 | for_each_rcu_flavor(rsp) { |
460 | NULL, &rcubarrier_fops); | 437 | rspdir = debugfs_create_dir(rsp->name, rcudir); |
461 | if (!retval) | 438 | if (!rspdir) |
462 | goto free_out; | 439 | goto free_out; |
463 | 440 | ||
464 | retval = debugfs_create_file("rcudata", 0444, rcudir, | 441 | retval = debugfs_create_file("rcudata", 0444, |
465 | NULL, &rcudata_fops); | 442 | rspdir, rsp, &rcudata_fops); |
466 | if (!retval) | 443 | if (!retval) |
467 | goto free_out; | 444 | goto free_out; |
468 | 445 | ||
469 | retval = debugfs_create_file("rcudata.csv", 0444, rcudir, | 446 | retval = debugfs_create_file("rcuexp", 0444, |
470 | NULL, &rcudata_csv_fops); | 447 | rspdir, rsp, &rcuexp_fops); |
471 | if (!retval) | 448 | if (!retval) |
472 | goto free_out; | 449 | goto free_out; |
473 | 450 | ||
474 | if (rcu_boost_trace_create_file(rcudir)) | 451 | retval = debugfs_create_file("rcu_pending", 0444, |
475 | goto free_out; | 452 | rspdir, rsp, &rcu_pending_fops); |
453 | if (!retval) | ||
454 | goto free_out; | ||
455 | |||
456 | retval = debugfs_create_file("rcubarrier", 0444, | ||
457 | rspdir, rsp, &rcubarrier_fops); | ||
458 | if (!retval) | ||
459 | goto free_out; | ||
476 | 460 | ||
477 | retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); | 461 | #ifdef CONFIG_RCU_BOOST |
478 | if (!retval) | 462 | if (rsp == &rcu_preempt_state) { |
479 | goto free_out; | 463 | retval = debugfs_create_file("rcuboost", 0444, |
464 | rspdir, NULL, &rcu_node_boost_fops); | ||
465 | if (!retval) | ||
466 | goto free_out; | ||
467 | } | ||
468 | #endif | ||
480 | 469 | ||
481 | retval = debugfs_create_file("rcuhier", 0444, rcudir, | 470 | retval = debugfs_create_file("rcugp", 0444, |
482 | NULL, &rcuhier_fops); | 471 | rspdir, rsp, &rcugp_fops); |
483 | if (!retval) | 472 | if (!retval) |
484 | goto free_out; | 473 | goto free_out; |
485 | 474 | ||
486 | retval = debugfs_create_file("rcu_pending", 0444, rcudir, | 475 | retval = debugfs_create_file("rcuhier", 0444, |
487 | NULL, &rcu_pending_fops); | 476 | rspdir, rsp, &rcuhier_fops); |
488 | if (!retval) | 477 | if (!retval) |
489 | goto free_out; | 478 | goto free_out; |
479 | } | ||
490 | 480 | ||
491 | retval = debugfs_create_file("rcutorture", 0444, rcudir, | 481 | retval = debugfs_create_file("rcutorture", 0444, rcudir, |
492 | NULL, &rcutorture_fops); | 482 | NULL, &rcutorture_fops); |
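
The RCU tracing (rcutree_trace.c) hunks above replace the single global debugfs files with one directory per RCU flavor, handing each file its rcu_state through the data argument of debugfs_create_file() and recovering it via single_open() and m->private. A minimal sketch of that pattern, not taken from the patch; my_stats, show_my_stats and "demo" are made-up names:

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

struct my_stats {
	const char *name;
	unsigned long count;
};

static struct my_stats g_stats = { .name = "demo" };

/* single_open() stores its third argument in m->private. */
static int show_my_stats(struct seq_file *m, void *unused)
{
	struct my_stats *st = m->private;

	seq_printf(m, "%s: %lu\n", st->name, st->count);
	return 0;
}

static int my_stats_open(struct inode *inode, struct file *file)
{
	/* inode->i_private is the "data" passed to debugfs_create_file(). */
	return single_open(file, show_my_stats, inode->i_private);
}

static const struct file_operations my_stats_fops = {
	.owner   = THIS_MODULE,
	.open    = my_stats_open,
	.read    = seq_read,
	.llseek  = no_llseek,
	.release = single_release,
};

static int __init my_stats_init(void)
{
	/* The fourth argument becomes inode->i_private for my_stats_open(). */
	debugfs_create_file("my_stats", 0444, NULL, &g_stats, &my_stats_fops);
	return 0;
}
module_init(my_stats_init);
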
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ad581aa2369a..ff55247e7049 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | |||
86 | return __res_counter_charge(counter, val, limit_fail_at, true); | 86 | return __res_counter_charge(counter, val, limit_fail_at, true); |
87 | } | 87 | } |
88 | 88 | ||
89 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | 89 | u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) |
90 | { | 90 | { |
91 | if (WARN_ON(counter->usage < val)) | 91 | if (WARN_ON(counter->usage < val)) |
92 | val = counter->usage; | 92 | val = counter->usage; |
93 | 93 | ||
94 | counter->usage -= val; | 94 | counter->usage -= val; |
95 | return counter->usage; | ||
95 | } | 96 | } |
96 | 97 | ||
97 | void res_counter_uncharge_until(struct res_counter *counter, | 98 | u64 res_counter_uncharge_until(struct res_counter *counter, |
98 | struct res_counter *top, | 99 | struct res_counter *top, |
99 | unsigned long val) | 100 | unsigned long val) |
100 | { | 101 | { |
101 | unsigned long flags; | 102 | unsigned long flags; |
102 | struct res_counter *c; | 103 | struct res_counter *c; |
104 | u64 ret = 0; | ||
103 | 105 | ||
104 | local_irq_save(flags); | 106 | local_irq_save(flags); |
105 | for (c = counter; c != top; c = c->parent) { | 107 | for (c = counter; c != top; c = c->parent) { |
108 | u64 r; | ||
106 | spin_lock(&c->lock); | 109 | spin_lock(&c->lock); |
107 | res_counter_uncharge_locked(c, val); | 110 | r = res_counter_uncharge_locked(c, val); |
111 | if (c == counter) | ||
112 | ret = r; | ||
108 | spin_unlock(&c->lock); | 113 | spin_unlock(&c->lock); |
109 | } | 114 | } |
110 | local_irq_restore(flags); | 115 | local_irq_restore(flags); |
116 | return ret; | ||
111 | } | 117 | } |
112 | 118 | ||
113 | void res_counter_uncharge(struct res_counter *counter, unsigned long val) | 119 | u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) |
114 | { | 120 | { |
115 | res_counter_uncharge_until(counter, NULL, val); | 121 | return res_counter_uncharge_until(counter, NULL, val); |
116 | } | 122 | } |
117 | 123 | ||
118 | static inline unsigned long long * | 124 | static inline unsigned long long * |
@@ -192,25 +198,3 @@ int res_counter_memparse_write_strategy(const char *buf, | |||
192 | *res = PAGE_ALIGN(*res); | 198 | *res = PAGE_ALIGN(*res); |
193 | return 0; | 199 | return 0; |
194 | } | 200 | } |
195 | |||
196 | int res_counter_write(struct res_counter *counter, int member, | ||
197 | const char *buf, write_strategy_fn write_strategy) | ||
198 | { | ||
199 | char *end; | ||
200 | unsigned long flags; | ||
201 | unsigned long long tmp, *val; | ||
202 | |||
203 | if (write_strategy) { | ||
204 | if (write_strategy(buf, &tmp)) | ||
205 | return -EINVAL; | ||
206 | } else { | ||
207 | tmp = simple_strtoull(buf, &end, 10); | ||
208 | if (*end != '\0') | ||
209 | return -EINVAL; | ||
210 | } | ||
211 | spin_lock_irqsave(&counter->lock, flags); | ||
212 | val = res_counter_member(counter, member); | ||
213 | *val = tmp; | ||
214 | spin_unlock_irqrestore(&counter->lock, flags); | ||
215 | return 0; | ||
216 | } | ||
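
The res_counter.c change makes the uncharge helpers return the counter's remaining usage. An illustrative caller, not part of the patch, that uses the return value to notice when the last charge is gone; my_counter_put() and my_release() are hypothetical:

#include <linux/res_counter.h>

static void my_release(void)
{
	/* hypothetical cleanup once usage drops to zero */
}

static void my_counter_put(struct res_counter *cnt, unsigned long val)
{
	/*
	 * res_counter_uncharge() now reports the usage left on "cnt"
	 * after the uncharge, so the caller no longer has to take
	 * cnt->lock again just to re-read ->usage.
	 */
	if (res_counter_uncharge(cnt, val) == 0)
		my_release();
}
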
diff --git a/kernel/resource.c b/kernel/resource.c
index 34d45886ee84..73f35d4b30b9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root, | |||
763 | struct resource *parent = root; | 763 | struct resource *parent = root; |
764 | struct resource *conflict; | 764 | struct resource *conflict; |
765 | struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); | 765 | struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); |
766 | struct resource *next_res = NULL; | ||
766 | 767 | ||
767 | if (!res) | 768 | if (!res) |
768 | return; | 769 | return; |
@@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root, | |||
772 | res->end = end; | 773 | res->end = end; |
773 | res->flags = IORESOURCE_BUSY; | 774 | res->flags = IORESOURCE_BUSY; |
774 | 775 | ||
775 | conflict = __request_resource(parent, res); | 776 | while (1) { |
776 | if (!conflict) | ||
777 | return; | ||
778 | 777 | ||
779 | /* failed, split and try again */ | 778 | conflict = __request_resource(parent, res); |
780 | kfree(res); | 779 | if (!conflict) { |
780 | if (!next_res) | ||
781 | break; | ||
782 | res = next_res; | ||
783 | next_res = NULL; | ||
784 | continue; | ||
785 | } | ||
781 | 786 | ||
782 | /* conflict covered whole area */ | 787 | /* conflict covered whole area */ |
783 | if (conflict->start <= start && conflict->end >= end) | 788 | if (conflict->start <= res->start && |
784 | return; | 789 | conflict->end >= res->end) { |
790 | kfree(res); | ||
791 | WARN_ON(next_res); | ||
792 | break; | ||
793 | } | ||
794 | |||
795 | /* failed, split and try again */ | ||
796 | if (conflict->start > res->start) { | ||
797 | end = res->end; | ||
798 | res->end = conflict->start - 1; | ||
799 | if (conflict->end < end) { | ||
800 | next_res = kzalloc(sizeof(*next_res), | ||
801 | GFP_ATOMIC); | ||
802 | if (!next_res) { | ||
803 | kfree(res); | ||
804 | break; | ||
805 | } | ||
806 | next_res->name = name; | ||
807 | next_res->start = conflict->end + 1; | ||
808 | next_res->end = end; | ||
809 | next_res->flags = IORESOURCE_BUSY; | ||
810 | } | ||
811 | } else { | ||
812 | res->start = conflict->end + 1; | ||
813 | } | ||
814 | } | ||
785 | 815 | ||
786 | if (conflict->start > start) | ||
787 | __reserve_region_with_split(root, start, conflict->start-1, name); | ||
788 | if (conflict->end < end) | ||
789 | __reserve_region_with_split(root, conflict->end+1, end, name); | ||
790 | } | 816 | } |
791 | 817 | ||
792 | void __init reserve_region_with_split(struct resource *root, | 818 | void __init reserve_region_with_split(struct resource *root, |
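
The resource.c hunk replaces the two recursive calls in __reserve_region_with_split() with a loop that keeps at most one pending right-hand piece in next_res. The same shape on a toy interval type, purely illustrative ("range" and "claim" are invented here):

#include <linux/types.h>

struct range { unsigned long start, end; };

/* claim() returns 0 on success, nonzero and fills *conflict on overlap. */
static void claim_with_split(struct range req,
			     int (*claim)(struct range *req, struct range *conflict))
{
	struct range conflict, next;
	bool have_next = false;

	for (;;) {
		if (!claim(&req, &conflict)) {
			if (!have_next)
				break;			/* all pieces claimed */
			req = next;			/* now try the right piece */
			have_next = false;
			continue;
		}
		if (conflict.start <= req.start && conflict.end >= req.end)
			break;				/* fully covered: give up */
		if (conflict.start > req.start) {
			if (conflict.end < req.end) {	/* conflict splits req in two */
				next.start = conflict.end + 1;
				next.end = req.end;
				have_next = true;	/* as in the patch: one pending piece */
			}
			req.end = conflict.start - 1;	/* left piece is retried first */
		} else {
			req.start = conflict.end + 1;	/* only the right piece remains */
		}
	}
}
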
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 173ea52f3af0..f06d249e103b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | |||
11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer |
12 | endif | 12 | endif |
13 | 13 | ||
14 | obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o |
15 | obj-$(CONFIG_SMP) += cpupri.o | 15 | obj-$(CONFIG_SMP) += cpupri.o |
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 649c9f876cb1..257002c13bb0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,6 +72,7 @@ | |||
72 | #include <linux/slab.h> | 72 | #include <linux/slab.h> |
73 | #include <linux/init_task.h> | 73 | #include <linux/init_task.h> |
74 | #include <linux/binfmts.h> | 74 | #include <linux/binfmts.h> |
75 | #include <linux/context_tracking.h> | ||
75 | 76 | ||
76 | #include <asm/switch_to.h> | 77 | #include <asm/switch_to.h> |
77 | #include <asm/tlb.h> | 78 | #include <asm/tlb.h> |
@@ -192,23 +193,10 @@ static void sched_feat_disable(int i) { }; | |||
192 | static void sched_feat_enable(int i) { }; | 193 | static void sched_feat_enable(int i) { }; |
193 | #endif /* HAVE_JUMP_LABEL */ | 194 | #endif /* HAVE_JUMP_LABEL */ |
194 | 195 | ||
195 | static ssize_t | 196 | static int sched_feat_set(char *cmp) |
196 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
197 | size_t cnt, loff_t *ppos) | ||
198 | { | 197 | { |
199 | char buf[64]; | ||
200 | char *cmp; | ||
201 | int neg = 0; | ||
202 | int i; | 198 | int i; |
203 | 199 | int neg = 0; | |
204 | if (cnt > 63) | ||
205 | cnt = 63; | ||
206 | |||
207 | if (copy_from_user(&buf, ubuf, cnt)) | ||
208 | return -EFAULT; | ||
209 | |||
210 | buf[cnt] = 0; | ||
211 | cmp = strstrip(buf); | ||
212 | 200 | ||
213 | if (strncmp(cmp, "NO_", 3) == 0) { | 201 | if (strncmp(cmp, "NO_", 3) == 0) { |
214 | neg = 1; | 202 | neg = 1; |
@@ -228,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
228 | } | 216 | } |
229 | } | 217 | } |
230 | 218 | ||
219 | return i; | ||
220 | } | ||
221 | |||
222 | static ssize_t | ||
223 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
224 | size_t cnt, loff_t *ppos) | ||
225 | { | ||
226 | char buf[64]; | ||
227 | char *cmp; | ||
228 | int i; | ||
229 | |||
230 | if (cnt > 63) | ||
231 | cnt = 63; | ||
232 | |||
233 | if (copy_from_user(&buf, ubuf, cnt)) | ||
234 | return -EFAULT; | ||
235 | |||
236 | buf[cnt] = 0; | ||
237 | cmp = strstrip(buf); | ||
238 | |||
239 | i = sched_feat_set(cmp); | ||
231 | if (i == __SCHED_FEAT_NR) | 240 | if (i == __SCHED_FEAT_NR) |
232 | return -EINVAL; | 241 | return -EINVAL; |
233 | 242 | ||
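
The hunk above splits sched_feat_write() so the parsing core, sched_feat_set(), can also be called from inside the kernel (set_numabalancing_state() uses it further down in this diff). A generic sketch of that split with toy names (my_feature_set, my_feature_write), not the scheduler's actual knob:

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/uaccess.h>

static bool my_feature_enabled;		/* illustrative knob */

/* Core parser, callable from in-kernel code as well as the write path. */
static int my_feature_set(char *cmp)
{
	if (strcmp(cmp, "ON") == 0)
		my_feature_enabled = true;
	else if (strcmp(cmp, "OFF") == 0)
		my_feature_enabled = false;
	else
		return -EINVAL;
	return 0;
}

/* Thin copy_from_user() + strstrip() wrapper, as in sched_feat_write(). */
static ssize_t my_feature_write(struct file *filp, const char __user *ubuf,
				size_t cnt, loff_t *ppos)
{
	char buf[64];
	int ret;

	if (cnt > 63)
		cnt = 63;
	if (copy_from_user(buf, ubuf, cnt))
		return -EFAULT;
	buf[cnt] = 0;

	ret = my_feature_set(strstrip(buf));
	if (ret)
		return ret;

	*ppos += cnt;
	return cnt;
}
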
@@ -505,7 +514,7 @@ static inline void init_hrtick(void) | |||
505 | #ifdef CONFIG_SMP | 514 | #ifdef CONFIG_SMP |
506 | 515 | ||
507 | #ifndef tsk_is_polling | 516 | #ifndef tsk_is_polling |
508 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 517 | #define tsk_is_polling(t) 0 |
509 | #endif | 518 | #endif |
510 | 519 | ||
511 | void resched_task(struct task_struct *p) | 520 | void resched_task(struct task_struct *p) |
@@ -740,126 +749,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
740 | dequeue_task(rq, p, flags); | 749 | dequeue_task(rq, p, flags); |
741 | } | 750 | } |
742 | 751 | ||
743 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
744 | |||
745 | /* | ||
746 | * There are no locks covering percpu hardirq/softirq time. | ||
747 | * They are only modified in account_system_vtime, on corresponding CPU | ||
748 | * with interrupts disabled. So, writes are safe. | ||
749 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
750 | * This may result in other CPU reading this CPU's irq time and can | ||
751 | * race with irq/account_system_vtime on this CPU. We would either get old | ||
752 | * or new value with a side effect of accounting a slice of irq time to wrong | ||
753 | * task when irq is in progress while we read rq->clock. That is a worthy | ||
754 | * compromise in place of having locks on each irq in account_system_time. | ||
755 | */ | ||
756 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
757 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
758 | |||
759 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
760 | static int sched_clock_irqtime; | ||
761 | |||
762 | void enable_sched_clock_irqtime(void) | ||
763 | { | ||
764 | sched_clock_irqtime = 1; | ||
765 | } | ||
766 | |||
767 | void disable_sched_clock_irqtime(void) | ||
768 | { | ||
769 | sched_clock_irqtime = 0; | ||
770 | } | ||
771 | |||
772 | #ifndef CONFIG_64BIT | ||
773 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
774 | |||
775 | static inline void irq_time_write_begin(void) | ||
776 | { | ||
777 | __this_cpu_inc(irq_time_seq.sequence); | ||
778 | smp_wmb(); | ||
779 | } | ||
780 | |||
781 | static inline void irq_time_write_end(void) | ||
782 | { | ||
783 | smp_wmb(); | ||
784 | __this_cpu_inc(irq_time_seq.sequence); | ||
785 | } | ||
786 | |||
787 | static inline u64 irq_time_read(int cpu) | ||
788 | { | ||
789 | u64 irq_time; | ||
790 | unsigned seq; | ||
791 | |||
792 | do { | ||
793 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
794 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
795 | per_cpu(cpu_hardirq_time, cpu); | ||
796 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
797 | |||
798 | return irq_time; | ||
799 | } | ||
800 | #else /* CONFIG_64BIT */ | ||
801 | static inline void irq_time_write_begin(void) | ||
802 | { | ||
803 | } | ||
804 | |||
805 | static inline void irq_time_write_end(void) | ||
806 | { | ||
807 | } | ||
808 | |||
809 | static inline u64 irq_time_read(int cpu) | ||
810 | { | ||
811 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
812 | } | ||
813 | #endif /* CONFIG_64BIT */ | ||
814 | |||
815 | /* | ||
816 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
817 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
818 | */ | ||
819 | void account_system_vtime(struct task_struct *curr) | ||
820 | { | ||
821 | unsigned long flags; | ||
822 | s64 delta; | ||
823 | int cpu; | ||
824 | |||
825 | if (!sched_clock_irqtime) | ||
826 | return; | ||
827 | |||
828 | local_irq_save(flags); | ||
829 | |||
830 | cpu = smp_processor_id(); | ||
831 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
832 | __this_cpu_add(irq_start_time, delta); | ||
833 | |||
834 | irq_time_write_begin(); | ||
835 | /* | ||
836 | * We do not account for softirq time from ksoftirqd here. | ||
837 | * We want to continue accounting softirq time to ksoftirqd thread | ||
838 | * in that case, so as not to confuse scheduler with a special task | ||
839 | * that do not consume any time, but still wants to run. | ||
840 | */ | ||
841 | if (hardirq_count()) | ||
842 | __this_cpu_add(cpu_hardirq_time, delta); | ||
843 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | ||
844 | __this_cpu_add(cpu_softirq_time, delta); | ||
845 | |||
846 | irq_time_write_end(); | ||
847 | local_irq_restore(flags); | ||
848 | } | ||
849 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
850 | |||
851 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
852 | |||
853 | #ifdef CONFIG_PARAVIRT | ||
854 | static inline u64 steal_ticks(u64 steal) | ||
855 | { | ||
856 | if (unlikely(steal > NSEC_PER_SEC)) | ||
857 | return div_u64(steal, TICK_NSEC); | ||
858 | |||
859 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
860 | } | ||
861 | #endif | ||
862 | |||
863 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 752 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
864 | { | 753 | { |
865 | /* | 754 | /* |
@@ -920,43 +809,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
920 | #endif | 809 | #endif |
921 | } | 810 | } |
922 | 811 | ||
923 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
924 | static int irqtime_account_hi_update(void) | ||
925 | { | ||
926 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
927 | unsigned long flags; | ||
928 | u64 latest_ns; | ||
929 | int ret = 0; | ||
930 | |||
931 | local_irq_save(flags); | ||
932 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
933 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) | ||
934 | ret = 1; | ||
935 | local_irq_restore(flags); | ||
936 | return ret; | ||
937 | } | ||
938 | |||
939 | static int irqtime_account_si_update(void) | ||
940 | { | ||
941 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
942 | unsigned long flags; | ||
943 | u64 latest_ns; | ||
944 | int ret = 0; | ||
945 | |||
946 | local_irq_save(flags); | ||
947 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
948 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) | ||
949 | ret = 1; | ||
950 | local_irq_restore(flags); | ||
951 | return ret; | ||
952 | } | ||
953 | |||
954 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
955 | |||
956 | #define sched_clock_irqtime (0) | ||
957 | |||
958 | #endif | ||
959 | |||
960 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 812 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
961 | { | 813 | { |
962 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 814 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
@@ -1079,6 +931,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
1079 | rq->skip_clock_update = 1; | 931 | rq->skip_clock_update = 1; |
1080 | } | 932 | } |
1081 | 933 | ||
934 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
935 | |||
936 | void register_task_migration_notifier(struct notifier_block *n) | ||
937 | { | ||
938 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
939 | } | ||
940 | |||
1082 | #ifdef CONFIG_SMP | 941 | #ifdef CONFIG_SMP |
1083 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 942 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
1084 | { | 943 | { |
@@ -1109,8 +968,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1109 | trace_sched_migrate_task(p, new_cpu); | 968 | trace_sched_migrate_task(p, new_cpu); |
1110 | 969 | ||
1111 | if (task_cpu(p) != new_cpu) { | 970 | if (task_cpu(p) != new_cpu) { |
971 | struct task_migration_notifier tmn; | ||
972 | |||
973 | if (p->sched_class->migrate_task_rq) | ||
974 | p->sched_class->migrate_task_rq(p, new_cpu); | ||
1112 | p->se.nr_migrations++; | 975 | p->se.nr_migrations++; |
1113 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); | 976 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); |
977 | |||
978 | tmn.task = p; | ||
979 | tmn.from_cpu = task_cpu(p); | ||
980 | tmn.to_cpu = new_cpu; | ||
981 | |||
982 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | ||
1114 | } | 983 | } |
1115 | 984 | ||
1116 | __set_task_cpu(p, new_cpu); | 985 | __set_task_cpu(p, new_cpu); |
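
set_task_cpu() now fires an atomic notifier chain whenever a task actually changes CPU. A hedged sketch of a consumer; the callback name and the pr_debug() are made up, while the struct task_migration_notifier fields (task, from_cpu, to_cpu) follow the hunk above:

#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/sched.h>

static int my_migration_cb(struct notifier_block *nb,
			   unsigned long action, void *data)
{
	struct task_migration_notifier *tmn = data;

	pr_debug("pid %d migrating: cpu %d -> %d\n",
		 task_pid_nr(tmn->task), tmn->from_cpu, tmn->to_cpu);
	return NOTIFY_OK;
}

static struct notifier_block my_migration_nb = {
	.notifier_call = my_migration_cb,
};

static void my_migration_notifier_init(void)
{
	/* register_task_migration_notifier() is the hook added above. */
	register_task_migration_notifier(&my_migration_nb);
}
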
@@ -1518,25 +1387,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) | |||
1518 | smp_send_reschedule(cpu); | 1387 | smp_send_reschedule(cpu); |
1519 | } | 1388 | } |
1520 | 1389 | ||
1521 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1522 | static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | ||
1523 | { | ||
1524 | struct rq *rq; | ||
1525 | int ret = 0; | ||
1526 | |||
1527 | rq = __task_rq_lock(p); | ||
1528 | if (p->on_cpu) { | ||
1529 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | ||
1530 | ttwu_do_wakeup(rq, p, wake_flags); | ||
1531 | ret = 1; | ||
1532 | } | ||
1533 | __task_rq_unlock(rq); | ||
1534 | |||
1535 | return ret; | ||
1536 | |||
1537 | } | ||
1538 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
1539 | |||
1540 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1390 | bool cpus_share_cache(int this_cpu, int that_cpu) |
1541 | { | 1391 | { |
1542 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | 1392 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); |
@@ -1597,21 +1447,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
1597 | * If the owning (remote) cpu is still in the middle of schedule() with | 1447 | * If the owning (remote) cpu is still in the middle of schedule() with |
1598 | * this task as prev, wait until its done referencing the task. | 1448 | * this task as prev, wait until its done referencing the task. |
1599 | */ | 1449 | */ |
1600 | while (p->on_cpu) { | 1450 | while (p->on_cpu) |
1601 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1602 | /* | ||
1603 | * In case the architecture enables interrupts in | ||
1604 | * context_switch(), we cannot busy wait, since that | ||
1605 | * would lead to deadlocks when an interrupt hits and | ||
1606 | * tries to wake up @prev. So bail and do a complete | ||
1607 | * remote wakeup. | ||
1608 | */ | ||
1609 | if (ttwu_activate_remote(p, wake_flags)) | ||
1610 | goto stat; | ||
1611 | #else | ||
1612 | cpu_relax(); | 1451 | cpu_relax(); |
1613 | #endif | ||
1614 | } | ||
1615 | /* | 1452 | /* |
1616 | * Pairs with the smp_wmb() in finish_lock_switch(). | 1453 | * Pairs with the smp_wmb() in finish_lock_switch(). |
1617 | */ | 1454 | */ |
@@ -1713,6 +1550,15 @@ static void __sched_fork(struct task_struct *p) | |||
1713 | p->se.vruntime = 0; | 1550 | p->se.vruntime = 0; |
1714 | INIT_LIST_HEAD(&p->se.group_node); | 1551 | INIT_LIST_HEAD(&p->se.group_node); |
1715 | 1552 | ||
1553 | /* | ||
1554 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
1555 | * removed when useful for applications beyond shares distribution (e.g. | ||
1556 | * load-balance). | ||
1557 | */ | ||
1558 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
1559 | p->se.avg.runnable_avg_period = 0; | ||
1560 | p->se.avg.runnable_avg_sum = 0; | ||
1561 | #endif | ||
1716 | #ifdef CONFIG_SCHEDSTATS | 1562 | #ifdef CONFIG_SCHEDSTATS |
1717 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 1563 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
1718 | #endif | 1564 | #endif |
@@ -1722,8 +1568,41 @@ static void __sched_fork(struct task_struct *p) | |||
1722 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1568 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1723 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 1569 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
1724 | #endif | 1570 | #endif |
1571 | |||
1572 | #ifdef CONFIG_NUMA_BALANCING | ||
1573 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { | ||
1574 | p->mm->numa_next_scan = jiffies; | ||
1575 | p->mm->numa_next_reset = jiffies; | ||
1576 | p->mm->numa_scan_seq = 0; | ||
1577 | } | ||
1578 | |||
1579 | p->node_stamp = 0ULL; | ||
1580 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | ||
1581 | p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; | ||
1582 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | ||
1583 | p->numa_work.next = &p->numa_work; | ||
1584 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1725 | } | 1585 | } |
1726 | 1586 | ||
1587 | #ifdef CONFIG_NUMA_BALANCING | ||
1588 | #ifdef CONFIG_SCHED_DEBUG | ||
1589 | void set_numabalancing_state(bool enabled) | ||
1590 | { | ||
1591 | if (enabled) | ||
1592 | sched_feat_set("NUMA"); | ||
1593 | else | ||
1594 | sched_feat_set("NO_NUMA"); | ||
1595 | } | ||
1596 | #else | ||
1597 | __read_mostly bool numabalancing_enabled; | ||
1598 | |||
1599 | void set_numabalancing_state(bool enabled) | ||
1600 | { | ||
1601 | numabalancing_enabled = enabled; | ||
1602 | } | ||
1603 | #endif /* CONFIG_SCHED_DEBUG */ | ||
1604 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1605 | |||
1727 | /* | 1606 | /* |
1728 | * fork()/clone()-time setup: | 1607 | * fork()/clone()-time setup: |
1729 | */ | 1608 | */ |
@@ -1953,14 +1832,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1953 | * Manfred Spraul <manfred@colorfullife.com> | 1832 | * Manfred Spraul <manfred@colorfullife.com> |
1954 | */ | 1833 | */ |
1955 | prev_state = prev->state; | 1834 | prev_state = prev->state; |
1835 | vtime_task_switch(prev); | ||
1956 | finish_arch_switch(prev); | 1836 | finish_arch_switch(prev); |
1957 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1958 | local_irq_disable(); | ||
1959 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
1960 | perf_event_task_sched_in(prev, current); | 1837 | perf_event_task_sched_in(prev, current); |
1961 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1962 | local_irq_enable(); | ||
1963 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
1964 | finish_lock_switch(rq, prev); | 1838 | finish_lock_switch(rq, prev); |
1965 | finish_arch_post_lock_switch(); | 1839 | finish_arch_post_lock_switch(); |
1966 | 1840 | ||
@@ -2080,6 +1954,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2080 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 1954 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
2081 | #endif | 1955 | #endif |
2082 | 1956 | ||
1957 | context_tracking_task_switch(prev, next); | ||
2083 | /* Here we just switch the register state and the stack. */ | 1958 | /* Here we just switch the register state and the stack. */ |
2084 | switch_to(prev, next, prev); | 1959 | switch_to(prev, next, prev); |
2085 | 1960 | ||
@@ -2809,404 +2684,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
2809 | return ns; | 2684 | return ns; |
2810 | } | 2685 | } |
2811 | 2686 | ||
2812 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2813 | struct cgroup_subsys cpuacct_subsys; | ||
2814 | struct cpuacct root_cpuacct; | ||
2815 | #endif | ||
2816 | |||
2817 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
2818 | u64 tmp) | ||
2819 | { | ||
2820 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2821 | struct kernel_cpustat *kcpustat; | ||
2822 | struct cpuacct *ca; | ||
2823 | #endif | ||
2824 | /* | ||
2825 | * Since all updates are sure to touch the root cgroup, we | ||
2826 | * get ourselves ahead and touch it first. If the root cgroup | ||
2827 | * is the only cgroup, then nothing else should be necessary. | ||
2828 | * | ||
2829 | */ | ||
2830 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
2831 | |||
2832 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2833 | if (unlikely(!cpuacct_subsys.active)) | ||
2834 | return; | ||
2835 | |||
2836 | rcu_read_lock(); | ||
2837 | ca = task_ca(p); | ||
2838 | while (ca && (ca != &root_cpuacct)) { | ||
2839 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
2840 | kcpustat->cpustat[index] += tmp; | ||
2841 | ca = parent_ca(ca); | ||
2842 | } | ||
2843 | rcu_read_unlock(); | ||
2844 | #endif | ||
2845 | } | ||
2846 | |||
2847 | |||
2848 | /* | ||
2849 | * Account user cpu time to a process. | ||
2850 | * @p: the process that the cpu time gets accounted to | ||
2851 | * @cputime: the cpu time spent in user space since the last update | ||
2852 | * @cputime_scaled: cputime scaled by cpu frequency | ||
2853 | */ | ||
2854 | void account_user_time(struct task_struct *p, cputime_t cputime, | ||
2855 | cputime_t cputime_scaled) | ||
2856 | { | ||
2857 | int index; | ||
2858 | |||
2859 | /* Add user time to process. */ | ||
2860 | p->utime += cputime; | ||
2861 | p->utimescaled += cputime_scaled; | ||
2862 | account_group_user_time(p, cputime); | ||
2863 | |||
2864 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
2865 | |||
2866 | /* Add user time to cpustat. */ | ||
2867 | task_group_account_field(p, index, (__force u64) cputime); | ||
2868 | |||
2869 | /* Account for user time used */ | ||
2870 | acct_update_integrals(p); | ||
2871 | } | ||
2872 | |||
2873 | /* | ||
2874 | * Account guest cpu time to a process. | ||
2875 | * @p: the process that the cpu time gets accounted to | ||
2876 | * @cputime: the cpu time spent in virtual machine since the last update | ||
2877 | * @cputime_scaled: cputime scaled by cpu frequency | ||
2878 | */ | ||
2879 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | ||
2880 | cputime_t cputime_scaled) | ||
2881 | { | ||
2882 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
2883 | |||
2884 | /* Add guest time to process. */ | ||
2885 | p->utime += cputime; | ||
2886 | p->utimescaled += cputime_scaled; | ||
2887 | account_group_user_time(p, cputime); | ||
2888 | p->gtime += cputime; | ||
2889 | |||
2890 | /* Add guest time to cpustat. */ | ||
2891 | if (TASK_NICE(p) > 0) { | ||
2892 | cpustat[CPUTIME_NICE] += (__force u64) cputime; | ||
2893 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; | ||
2894 | } else { | ||
2895 | cpustat[CPUTIME_USER] += (__force u64) cputime; | ||
2896 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; | ||
2897 | } | ||
2898 | } | ||
2899 | |||
2900 | /* | ||
2901 | * Account system cpu time to a process and desired cpustat field | ||
2902 | * @p: the process that the cpu time gets accounted to | ||
2903 | * @cputime: the cpu time spent in kernel space since the last update | ||
2904 | * @cputime_scaled: cputime scaled by cpu frequency | ||
2905 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
2906 | */ | ||
2907 | static inline | ||
2908 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
2909 | cputime_t cputime_scaled, int index) | ||
2910 | { | ||
2911 | /* Add system time to process. */ | ||
2912 | p->stime += cputime; | ||
2913 | p->stimescaled += cputime_scaled; | ||
2914 | account_group_system_time(p, cputime); | ||
2915 | |||
2916 | /* Add system time to cpustat. */ | ||
2917 | task_group_account_field(p, index, (__force u64) cputime); | ||
2918 | |||
2919 | /* Account for system time used */ | ||
2920 | acct_update_integrals(p); | ||
2921 | } | ||
2922 | |||
2923 | /* | ||
2924 | * Account system cpu time to a process. | ||
2925 | * @p: the process that the cpu time gets accounted to | ||
2926 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
2927 | * @cputime: the cpu time spent in kernel space since the last update | ||
2928 | * @cputime_scaled: cputime scaled by cpu frequency | ||
2929 | */ | ||
2930 | void account_system_time(struct task_struct *p, int hardirq_offset, | ||
2931 | cputime_t cputime, cputime_t cputime_scaled) | ||
2932 | { | ||
2933 | int index; | ||
2934 | |||
2935 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | ||
2936 | account_guest_time(p, cputime, cputime_scaled); | ||
2937 | return; | ||
2938 | } | ||
2939 | |||
2940 | if (hardirq_count() - hardirq_offset) | ||
2941 | index = CPUTIME_IRQ; | ||
2942 | else if (in_serving_softirq()) | ||
2943 | index = CPUTIME_SOFTIRQ; | ||
2944 | else | ||
2945 | index = CPUTIME_SYSTEM; | ||
2946 | |||
2947 | __account_system_time(p, cputime, cputime_scaled, index); | ||
2948 | } | ||
2949 | |||
2950 | /* | ||
2951 | * Account for involuntary wait time. | ||
2952 | * @cputime: the cpu time spent in involuntary wait | ||
2953 | */ | ||
2954 | void account_steal_time(cputime_t cputime) | ||
2955 | { | ||
2956 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
2957 | |||
2958 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; | ||
2959 | } | ||
2960 | |||
2961 | /* | ||
2962 | * Account for idle time. | ||
2963 | * @cputime: the cpu time spent in idle wait | ||
2964 | */ | ||
2965 | void account_idle_time(cputime_t cputime) | ||
2966 | { | ||
2967 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
2968 | struct rq *rq = this_rq(); | ||
2969 | |||
2970 | if (atomic_read(&rq->nr_iowait) > 0) | ||
2971 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; | ||
2972 | else | ||
2973 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | ||
2974 | } | ||
2975 | |||
2976 | static __always_inline bool steal_account_process_tick(void) | ||
2977 | { | ||
2978 | #ifdef CONFIG_PARAVIRT | ||
2979 | if (static_key_false(¶virt_steal_enabled)) { | ||
2980 | u64 steal, st = 0; | ||
2981 | |||
2982 | steal = paravirt_steal_clock(smp_processor_id()); | ||
2983 | steal -= this_rq()->prev_steal_time; | ||
2984 | |||
2985 | st = steal_ticks(steal); | ||
2986 | this_rq()->prev_steal_time += st * TICK_NSEC; | ||
2987 | |||
2988 | account_steal_time(st); | ||
2989 | return st; | ||
2990 | } | ||
2991 | #endif | ||
2992 | return false; | ||
2993 | } | ||
2994 | |||
2995 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
2996 | |||
2997 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
2998 | /* | ||
2999 | * Account a tick to a process and cpustat | ||
3000 | * @p: the process that the cpu time gets accounted to | ||
3001 | * @user_tick: is the tick from userspace | ||
3002 | * @rq: the pointer to rq | ||
3003 | * | ||
3004 | * Tick demultiplexing follows the order | ||
3005 | * - pending hardirq update | ||
3006 | * - pending softirq update | ||
3007 | * - user_time | ||
3008 | * - idle_time | ||
3009 | * - system time | ||
3010 | * - check for guest_time | ||
3011 | * - else account as system_time | ||
3012 | * | ||
3013 | * Check for hardirq is done both for system and user time as there is | ||
3014 | * no timer going off while we are on hardirq and hence we may never get an | ||
3015 | * opportunity to update it solely in system time. | ||
3016 | * p->stime and friends are only updated on system time and not on irq | ||
3017 | * softirq as those do not count in task exec_runtime any more. | ||
3018 | */ | ||
3019 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3020 | struct rq *rq) | ||
3021 | { | ||
3022 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
3023 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
3024 | |||
3025 | if (steal_account_process_tick()) | ||
3026 | return; | ||
3027 | |||
3028 | if (irqtime_account_hi_update()) { | ||
3029 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; | ||
3030 | } else if (irqtime_account_si_update()) { | ||
3031 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; | ||
3032 | } else if (this_cpu_ksoftirqd() == p) { | ||
3033 | /* | ||
3034 | * ksoftirqd time do not get accounted in cpu_softirq_time. | ||
3035 | * So, we have to handle it separately here. | ||
3036 | * Also, p->stime needs to be updated for ksoftirqd. | ||
3037 | */ | ||
3038 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3039 | CPUTIME_SOFTIRQ); | ||
3040 | } else if (user_tick) { | ||
3041 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3042 | } else if (p == rq->idle) { | ||
3043 | account_idle_time(cputime_one_jiffy); | ||
3044 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
3045 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3046 | } else { | ||
3047 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3048 | CPUTIME_SYSTEM); | ||
3049 | } | ||
3050 | } | ||
3051 | |||
3052 | static void irqtime_account_idle_ticks(int ticks) | ||
3053 | { | ||
3054 | int i; | ||
3055 | struct rq *rq = this_rq(); | ||
3056 | |||
3057 | for (i = 0; i < ticks; i++) | ||
3058 | irqtime_account_process_tick(current, 0, rq); | ||
3059 | } | ||
3060 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3061 | static void irqtime_account_idle_ticks(int ticks) {} | ||
3062 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3063 | struct rq *rq) {} | ||
3064 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3065 | |||
3066 | /* | ||
3067 | * Account a single tick of cpu time. | ||
3068 | * @p: the process that the cpu time gets accounted to | ||
3069 | * @user_tick: indicates if the tick is a user or a system tick | ||
3070 | */ | ||
3071 | void account_process_tick(struct task_struct *p, int user_tick) | ||
3072 | { | ||
3073 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
3074 | struct rq *rq = this_rq(); | ||
3075 | |||
3076 | if (sched_clock_irqtime) { | ||
3077 | irqtime_account_process_tick(p, user_tick, rq); | ||
3078 | return; | ||
3079 | } | ||
3080 | |||
3081 | if (steal_account_process_tick()) | ||
3082 | return; | ||
3083 | |||
3084 | if (user_tick) | ||
3085 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3086 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | ||
3087 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | ||
3088 | one_jiffy_scaled); | ||
3089 | else | ||
3090 | account_idle_time(cputime_one_jiffy); | ||
3091 | } | ||
3092 | |||
3093 | /* | ||
3094 | * Account multiple ticks of steal time. | ||
3095 | * @p: the process from which the cpu time has been stolen | ||
3096 | * @ticks: number of stolen ticks | ||
3097 | */ | ||
3098 | void account_steal_ticks(unsigned long ticks) | ||
3099 | { | ||
3100 | account_steal_time(jiffies_to_cputime(ticks)); | ||
3101 | } | ||
3102 | |||
3103 | /* | ||
3104 | * Account multiple ticks of idle time. | ||
3105 | * @ticks: number of stolen ticks | ||
3106 | */ | ||
3107 | void account_idle_ticks(unsigned long ticks) | ||
3108 | { | ||
3109 | |||
3110 | if (sched_clock_irqtime) { | ||
3111 | irqtime_account_idle_ticks(ticks); | ||
3112 | return; | ||
3113 | } | ||
3114 | |||
3115 | account_idle_time(jiffies_to_cputime(ticks)); | ||
3116 | } | ||
3117 | |||
3118 | #endif | ||
3119 | |||
3120 | /* | ||
3121 | * Use precise platform statistics if available: | ||
3122 | */ | ||
3123 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
3124 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
3125 | { | ||
3126 | *ut = p->utime; | ||
3127 | *st = p->stime; | ||
3128 | } | ||
3129 | |||
3130 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
3131 | { | ||
3132 | struct task_cputime cputime; | ||
3133 | |||
3134 | thread_group_cputime(p, &cputime); | ||
3135 | |||
3136 | *ut = cputime.utime; | ||
3137 | *st = cputime.stime; | ||
3138 | } | ||
3139 | #else | ||
3140 | |||
3141 | #ifndef nsecs_to_cputime | ||
3142 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
3143 | #endif | ||
3144 | |||
3145 | static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | ||
3146 | { | ||
3147 | u64 temp = (__force u64) rtime; | ||
3148 | |||
3149 | temp *= (__force u64) utime; | ||
3150 | |||
3151 | if (sizeof(cputime_t) == 4) | ||
3152 | temp = div_u64(temp, (__force u32) total); | ||
3153 | else | ||
3154 | temp = div64_u64(temp, (__force u64) total); | ||
3155 | |||
3156 | return (__force cputime_t) temp; | ||
3157 | } | ||
3158 | |||
3159 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
3160 | { | ||
3161 | cputime_t rtime, utime = p->utime, total = utime + p->stime; | ||
3162 | |||
3163 | /* | ||
3164 | * Use CFS's precise accounting: | ||
3165 | */ | ||
3166 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | ||
3167 | |||
3168 | if (total) | ||
3169 | utime = scale_utime(utime, rtime, total); | ||
3170 | else | ||
3171 | utime = rtime; | ||
3172 | |||
3173 | /* | ||
3174 | * Compare with previous values, to keep monotonicity: | ||
3175 | */ | ||
3176 | p->prev_utime = max(p->prev_utime, utime); | ||
3177 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); | ||
3178 | |||
3179 | *ut = p->prev_utime; | ||
3180 | *st = p->prev_stime; | ||
3181 | } | ||
3182 | |||
3183 | /* | ||
3184 | * Must be called with siglock held. | ||
3185 | */ | ||
3186 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
3187 | { | ||
3188 | struct signal_struct *sig = p->signal; | ||
3189 | struct task_cputime cputime; | ||
3190 | cputime_t rtime, utime, total; | ||
3191 | |||
3192 | thread_group_cputime(p, &cputime); | ||
3193 | |||
3194 | total = cputime.utime + cputime.stime; | ||
3195 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | ||
3196 | |||
3197 | if (total) | ||
3198 | utime = scale_utime(cputime.utime, rtime, total); | ||
3199 | else | ||
3200 | utime = rtime; | ||
3201 | |||
3202 | sig->prev_utime = max(sig->prev_utime, utime); | ||
3203 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); | ||
3204 | |||
3205 | *ut = sig->prev_utime; | ||
3206 | *st = sig->prev_stime; | ||
3207 | } | ||
3208 | #endif | ||
3209 | |||
3210 | /* | 2687 | /* |
3211 | * This function gets called by the timer code, with HZ frequency. | 2688 | * This function gets called by the timer code, with HZ frequency. |
3212 | * We call it with interrupts disabled. | 2689 | * We call it with interrupts disabled. |
@@ -3367,6 +2844,40 @@ pick_next_task(struct rq *rq) | |||
3367 | 2844 | ||
3368 | /* | 2845 | /* |
3369 | * __schedule() is the main scheduler function. | 2846 | * __schedule() is the main scheduler function. |
2847 | * | ||
2848 | * The main means of driving the scheduler and thus entering this function are: | ||
2849 | * | ||
2850 | * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. | ||
2851 | * | ||
2852 | * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return | ||
2853 | * paths. For example, see arch/x86/entry_64.S. | ||
2854 | * | ||
2855 | * To drive preemption between tasks, the scheduler sets the flag in timer | ||
2856 | * interrupt handler scheduler_tick(). | ||
2857 | * | ||
2858 | * 3. Wakeups don't really cause entry into schedule(). They add a | ||
2859 | * task to the run-queue and that's it. | ||
2860 | * | ||
2861 | * Now, if the new task added to the run-queue preempts the current | ||
2862 | * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets | ||
2863 | * called on the nearest possible occasion: | ||
2864 | * | ||
2865 | * - If the kernel is preemptible (CONFIG_PREEMPT=y): | ||
2866 | * | ||
2867 | * - in syscall or exception context, at the next outmost | ||
2868 | * preempt_enable(). (this might be as soon as the wake_up()'s | ||
2869 | * spin_unlock()!) | ||
2870 | * | ||
2871 | * - in IRQ context, return from interrupt-handler to | ||
2872 | * preemptible context | ||
2873 | * | ||
2874 | * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) | ||
2875 | * then at the next: | ||
2876 | * | ||
2877 | * - cond_resched() call | ||
2878 | * - explicit schedule() call | ||
2879 | * - return from syscall or exception to user-space | ||
2880 | * - return from interrupt-handler to user-space | ||
3370 | */ | 2881 | */ |
3371 | static void __sched __schedule(void) | 2882 | static void __sched __schedule(void) |
3372 | { | 2883 | { |
@@ -3468,6 +2979,21 @@ asmlinkage void __sched schedule(void) | |||
3468 | } | 2979 | } |
3469 | EXPORT_SYMBOL(schedule); | 2980 | EXPORT_SYMBOL(schedule); |
3470 | 2981 | ||
2982 | #ifdef CONFIG_CONTEXT_TRACKING | ||
2983 | asmlinkage void __sched schedule_user(void) | ||
2984 | { | ||
2985 | /* | ||
2986 | * If we come here after a random call to set_need_resched(), | ||
2987 | * or we have been woken up remotely but the IPI has not yet arrived, | ||
2988 | * we haven't yet exited the RCU idle mode. Do it here manually until | ||
2989 | * we find a better solution. | ||
2990 | */ | ||
2991 | user_exit(); | ||
2992 | schedule(); | ||
2993 | user_enter(); | ||
2994 | } | ||
2995 | #endif | ||
2996 | |||
3471 | /** | 2997 | /** |
3472 | * schedule_preempt_disabled - called with preemption disabled | 2998 | * schedule_preempt_disabled - called with preemption disabled |
3473 | * | 2999 | * |
@@ -3569,6 +3095,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3569 | /* Catch callers which need to be fixed */ | 3095 | /* Catch callers which need to be fixed */ |
3570 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3096 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3571 | 3097 | ||
3098 | user_exit(); | ||
3572 | do { | 3099 | do { |
3573 | add_preempt_count(PREEMPT_ACTIVE); | 3100 | add_preempt_count(PREEMPT_ACTIVE); |
3574 | local_irq_enable(); | 3101 | local_irq_enable(); |
@@ -4570,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4570 | goto out_free_cpus_allowed; | 4097 | goto out_free_cpus_allowed; |
4571 | } | 4098 | } |
4572 | retval = -EPERM; | 4099 | retval = -EPERM; |
4573 | if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) | 4100 | if (!check_same_owner(p)) { |
4574 | goto out_unlock; | 4101 | rcu_read_lock(); |
4102 | if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { | ||
4103 | rcu_read_unlock(); | ||
4104 | goto out_unlock; | ||
4105 | } | ||
4106 | rcu_read_unlock(); | ||
4107 | } | ||
4575 | 4108 | ||
4576 | retval = security_task_setscheduler(p); | 4109 | retval = security_task_setscheduler(p); |
4577 | if (retval) | 4110 | if (retval) |
@@ -4868,13 +4401,6 @@ again: | |||
4868 | */ | 4401 | */ |
4869 | if (preempt && rq != p_rq) | 4402 | if (preempt && rq != p_rq) |
4870 | resched_task(p_rq->curr); | 4403 | resched_task(p_rq->curr); |
4871 | } else { | ||
4872 | /* | ||
4873 | * We might have set it in task_yield_fair(), but are | ||
4874 | * not going to schedule(), so don't want to skip | ||
4875 | * the next update. | ||
4876 | */ | ||
4877 | rq->skip_clock_update = 0; | ||
4878 | } | 4404 | } |
4879 | 4405 | ||
4880 | out: | 4406 | out: |
@@ -5022,6 +4548,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; | |||
5022 | void sched_show_task(struct task_struct *p) | 4548 | void sched_show_task(struct task_struct *p) |
5023 | { | 4549 | { |
5024 | unsigned long free = 0; | 4550 | unsigned long free = 0; |
4551 | int ppid; | ||
5025 | unsigned state; | 4552 | unsigned state; |
5026 | 4553 | ||
5027 | state = p->state ? __ffs(p->state) + 1 : 0; | 4554 | state = p->state ? __ffs(p->state) + 1 : 0; |
@@ -5041,8 +4568,11 @@ void sched_show_task(struct task_struct *p) | |||
5041 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4568 | #ifdef CONFIG_DEBUG_STACK_USAGE |
5042 | free = stack_not_used(p); | 4569 | free = stack_not_used(p); |
5043 | #endif | 4570 | #endif |
4571 | rcu_read_lock(); | ||
4572 | ppid = task_pid_nr(rcu_dereference(p->real_parent)); | ||
4573 | rcu_read_unlock(); | ||
5044 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, | 4574 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
5045 | task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), | 4575 | task_pid_nr(p), ppid, |
5046 | (unsigned long)task_thread_info(p)->flags); | 4576 | (unsigned long)task_thread_info(p)->flags); |
5047 | 4577 | ||
5048 | show_stack(p, NULL); | 4578 | show_stack(p, NULL); |
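
The sched_show_task() hunk moves the real_parent dereference under rcu_read_lock() and copies the pid out before unlocking. That pattern in isolation (the helper name is illustrative):

#include <linux/rcupdate.h>
#include <linux/sched.h>

static pid_t parent_pid_of(struct task_struct *p)
{
	pid_t ppid;

	rcu_read_lock();
	/* Dereference and copy inside the read-side critical section... */
	ppid = task_pid_nr(rcu_dereference(p->real_parent));
	rcu_read_unlock();

	/* ...then use only the plain copy afterwards. */
	return ppid;
}
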
@@ -5416,16 +4946,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
5416 | *tablep = NULL; | 4946 | *tablep = NULL; |
5417 | } | 4947 | } |
5418 | 4948 | ||
4949 | static int min_load_idx = 0; | ||
4950 | static int max_load_idx = CPU_LOAD_IDX_MAX; | ||
4951 | |||
5419 | static void | 4952 | static void |
5420 | set_table_entry(struct ctl_table *entry, | 4953 | set_table_entry(struct ctl_table *entry, |
5421 | const char *procname, void *data, int maxlen, | 4954 | const char *procname, void *data, int maxlen, |
5422 | umode_t mode, proc_handler *proc_handler) | 4955 | umode_t mode, proc_handler *proc_handler, |
4956 | bool load_idx) | ||
5423 | { | 4957 | { |
5424 | entry->procname = procname; | 4958 | entry->procname = procname; |
5425 | entry->data = data; | 4959 | entry->data = data; |
5426 | entry->maxlen = maxlen; | 4960 | entry->maxlen = maxlen; |
5427 | entry->mode = mode; | 4961 | entry->mode = mode; |
5428 | entry->proc_handler = proc_handler; | 4962 | entry->proc_handler = proc_handler; |
4963 | |||
4964 | if (load_idx) { | ||
4965 | entry->extra1 = &min_load_idx; | ||
4966 | entry->extra2 = &max_load_idx; | ||
4967 | } | ||
5429 | } | 4968 | } |
5430 | 4969 | ||
5431 | static struct ctl_table * | 4970 | static struct ctl_table * |
@@ -5437,30 +4976,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
5437 | return NULL; | 4976 | return NULL; |
5438 | 4977 | ||
5439 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 4978 | set_table_entry(&table[0], "min_interval", &sd->min_interval, |
5440 | sizeof(long), 0644, proc_doulongvec_minmax); | 4979 | sizeof(long), 0644, proc_doulongvec_minmax, false); |
5441 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | 4980 | set_table_entry(&table[1], "max_interval", &sd->max_interval, |
5442 | sizeof(long), 0644, proc_doulongvec_minmax); | 4981 | sizeof(long), 0644, proc_doulongvec_minmax, false); |
5443 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | 4982 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, |
5444 | sizeof(int), 0644, proc_dointvec_minmax); | 4983 | sizeof(int), 0644, proc_dointvec_minmax, true); |
5445 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | 4984 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, |
5446 | sizeof(int), 0644, proc_dointvec_minmax); | 4985 | sizeof(int), 0644, proc_dointvec_minmax, true); |
5447 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | 4986 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, |
5448 | sizeof(int), 0644, proc_dointvec_minmax); | 4987 | sizeof(int), 0644, proc_dointvec_minmax, true); |
5449 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | 4988 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, |
5450 | sizeof(int), 0644, proc_dointvec_minmax); | 4989 | sizeof(int), 0644, proc_dointvec_minmax, true); |
5451 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | 4990 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, |
5452 | sizeof(int), 0644, proc_dointvec_minmax); | 4991 | sizeof(int), 0644, proc_dointvec_minmax, true); |
5453 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | 4992 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, |
5454 | sizeof(int), 0644, proc_dointvec_minmax); | 4993 | sizeof(int), 0644, proc_dointvec_minmax, false); |
5455 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | 4994 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, |
5456 | sizeof(int), 0644, proc_dointvec_minmax); | 4995 | sizeof(int), 0644, proc_dointvec_minmax, false); |
5457 | set_table_entry(&table[9], "cache_nice_tries", | 4996 | set_table_entry(&table[9], "cache_nice_tries", |
5458 | &sd->cache_nice_tries, | 4997 | &sd->cache_nice_tries, |
5459 | sizeof(int), 0644, proc_dointvec_minmax); | 4998 | sizeof(int), 0644, proc_dointvec_minmax, false); |
5460 | set_table_entry(&table[10], "flags", &sd->flags, | 4999 | set_table_entry(&table[10], "flags", &sd->flags, |
5461 | sizeof(int), 0644, proc_dointvec_minmax); | 5000 | sizeof(int), 0644, proc_dointvec_minmax, false); |
5462 | set_table_entry(&table[11], "name", sd->name, | 5001 | set_table_entry(&table[11], "name", sd->name, |
5463 | CORENAME_MAX_SIZE, 0444, proc_dostring); | 5002 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); |
5464 | /* &table[12] is terminator */ | 5003 | /* &table[12] is terminator */ |
5465 | 5004 | ||
5466 | return table; | 5005 | return table; |
@@ -5604,7 +5143,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5604 | migrate_tasks(cpu); | 5143 | migrate_tasks(cpu); |
5605 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | 5144 | BUG_ON(rq->nr_running != 1); /* the migration thread */ |
5606 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5145 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
5146 | break; | ||
5607 | 5147 | ||
5148 | case CPU_DEAD: | ||
5608 | calc_load_migrate(rq); | 5149 | calc_load_migrate(rq); |
5609 | break; | 5150 | break; |
5610 | #endif | 5151 | #endif |
@@ -6537,7 +6078,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | |||
6537 | | 0*SD_BALANCE_FORK | 6078 | | 0*SD_BALANCE_FORK |
6538 | | 0*SD_BALANCE_WAKE | 6079 | | 0*SD_BALANCE_WAKE |
6539 | | 0*SD_WAKE_AFFINE | 6080 | | 0*SD_WAKE_AFFINE |
6540 | | 0*SD_PREFER_LOCAL | ||
6541 | | 0*SD_SHARE_CPUPOWER | 6081 | | 0*SD_SHARE_CPUPOWER |
6542 | | 0*SD_SHARE_PKG_RESOURCES | 6082 | | 0*SD_SHARE_PKG_RESOURCES |
6543 | | 1*SD_SERIALIZE | 6083 | | 1*SD_SERIALIZE |
@@ -6660,6 +6200,17 @@ static void sched_init_numa(void) | |||
6660 | * numbers. | 6200 | * numbers. |
6661 | */ | 6201 | */ |
6662 | 6202 | ||
6203 | /* | ||
6204 | * Here, we should temporarily reset sched_domains_numa_levels to 0. | ||
6205 | * If it fails to allocate memory for array sched_domains_numa_masks[][], | ||
6206 | * the array will contain less than 'level' members. This could be | ||
6207 | * dangerous when we use it to iterate the array sched_domains_numa_masks[][] | ||
6208 | * in other functions. | ||
6209 | * | ||
6210 | * We reset it to 'level' at the end of this function. | ||
6211 | */ | ||
6212 | sched_domains_numa_levels = 0; | ||
6213 | |||
6663 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | 6214 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); |
6664 | if (!sched_domains_numa_masks) | 6215 | if (!sched_domains_numa_masks) |
6665 | return; | 6216 | return; |
@@ -6714,11 +6265,68 @@ static void sched_init_numa(void) | |||
6714 | } | 6265 | } |
6715 | 6266 | ||
6716 | sched_domain_topology = tl; | 6267 | sched_domain_topology = tl; |
6268 | |||
6269 | sched_domains_numa_levels = level; | ||
6270 | } | ||
6271 | |||
6272 | static void sched_domains_numa_masks_set(int cpu) | ||
6273 | { | ||
6274 | int i, j; | ||
6275 | int node = cpu_to_node(cpu); | ||
6276 | |||
6277 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6278 | for (j = 0; j < nr_node_ids; j++) { | ||
6279 | if (node_distance(j, node) <= sched_domains_numa_distance[i]) | ||
6280 | cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
6281 | } | ||
6282 | } | ||
6283 | } | ||
6284 | |||
6285 | static void sched_domains_numa_masks_clear(int cpu) | ||
6286 | { | ||
6287 | int i, j; | ||
6288 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6289 | for (j = 0; j < nr_node_ids; j++) | ||
6290 | cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
6291 | } | ||
6292 | } | ||
6293 | |||
6294 | /* | ||
6295 | * Update sched_domains_numa_masks[level][node] array when new cpus | ||
6296 | * are onlined. | ||
6297 | */ | ||
6298 | static int sched_domains_numa_masks_update(struct notifier_block *nfb, | ||
6299 | unsigned long action, | ||
6300 | void *hcpu) | ||
6301 | { | ||
6302 | int cpu = (long)hcpu; | ||
6303 | |||
6304 | switch (action & ~CPU_TASKS_FROZEN) { | ||
6305 | case CPU_ONLINE: | ||
6306 | sched_domains_numa_masks_set(cpu); | ||
6307 | break; | ||
6308 | |||
6309 | case CPU_DEAD: | ||
6310 | sched_domains_numa_masks_clear(cpu); | ||
6311 | break; | ||
6312 | |||
6313 | default: | ||
6314 | return NOTIFY_DONE; | ||
6315 | } | ||
6316 | |||
6317 | return NOTIFY_OK; | ||
6717 | } | 6318 | } |
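The masks maintained by the two helpers above are keyed by (distance level, node): sched_domains_numa_masks[i][j] holds every online CPU whose node is within sched_domains_numa_distance[i] of node j. A hypothetical consumer, only to illustrate the layout (not something this patch adds):

static bool cpu_within_numa_level(int cpu, int node, int level)
{
	if (level >= sched_domains_numa_levels)
		return false;
	return cpumask_test_cpu(cpu, sched_domains_numa_masks[level][node]);
}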
6718 | #else | 6319 | #else |
6719 | static inline void sched_init_numa(void) | 6320 | static inline void sched_init_numa(void) |
6720 | { | 6321 | { |
6721 | } | 6322 | } |
6323 | |||
6324 | static int sched_domains_numa_masks_update(struct notifier_block *nfb, | ||
6325 | unsigned long action, | ||
6326 | void *hcpu) | ||
6327 | { | ||
6328 | return 0; | ||
6329 | } | ||
6722 | #endif /* CONFIG_NUMA */ | 6330 | #endif /* CONFIG_NUMA */ |
6723 | 6331 | ||
6724 | static int __sdt_alloc(const struct cpumask *cpu_map) | 6332 | static int __sdt_alloc(const struct cpumask *cpu_map) |
@@ -7167,6 +6775,7 @@ void __init sched_init_smp(void) | |||
7167 | mutex_unlock(&sched_domains_mutex); | 6775 | mutex_unlock(&sched_domains_mutex); |
7168 | put_online_cpus(); | 6776 | put_online_cpus(); |
7169 | 6777 | ||
6778 | hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); | ||
7170 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); | 6779 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
7171 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); | 6780 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); |
7172 | 6781 | ||
@@ -7937,7 +7546,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | |||
7937 | struct task_group, css); | 7546 | struct task_group, css); |
7938 | } | 7547 | } |
7939 | 7548 | ||
7940 | static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) | 7549 | static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) |
7941 | { | 7550 | { |
7942 | struct task_group *tg, *parent; | 7551 | struct task_group *tg, *parent; |
7943 | 7552 | ||
@@ -7954,7 +7563,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) | |||
7954 | return &tg->css; | 7563 | return &tg->css; |
7955 | } | 7564 | } |
7956 | 7565 | ||
7957 | static void cpu_cgroup_destroy(struct cgroup *cgrp) | 7566 | static void cpu_cgroup_css_free(struct cgroup *cgrp) |
7958 | { | 7567 | { |
7959 | struct task_group *tg = cgroup_tg(cgrp); | 7568 | struct task_group *tg = cgroup_tg(cgrp); |
7960 | 7569 | ||
@@ -8314,8 +7923,8 @@ static struct cftype cpu_files[] = { | |||
8314 | 7923 | ||
8315 | struct cgroup_subsys cpu_cgroup_subsys = { | 7924 | struct cgroup_subsys cpu_cgroup_subsys = { |
8316 | .name = "cpu", | 7925 | .name = "cpu", |
8317 | .create = cpu_cgroup_create, | 7926 | .css_alloc = cpu_cgroup_css_alloc, |
8318 | .destroy = cpu_cgroup_destroy, | 7927 | .css_free = cpu_cgroup_css_free, |
8319 | .can_attach = cpu_cgroup_can_attach, | 7928 | .can_attach = cpu_cgroup_can_attach, |
8320 | .attach = cpu_cgroup_attach, | 7929 | .attach = cpu_cgroup_attach, |
8321 | .exit = cpu_cgroup_exit, | 7930 | .exit = cpu_cgroup_exit, |
@@ -8335,8 +7944,10 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8335 | * (balbir@in.ibm.com). | 7944 | * (balbir@in.ibm.com). |
8336 | */ | 7945 | */ |
8337 | 7946 | ||
7947 | struct cpuacct root_cpuacct; | ||
7948 | |||
8338 | /* create a new cpu accounting group */ | 7949 | /* create a new cpu accounting group */ |
8339 | static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) | 7950 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) |
8340 | { | 7951 | { |
8341 | struct cpuacct *ca; | 7952 | struct cpuacct *ca; |
8342 | 7953 | ||
@@ -8366,7 +7977,7 @@ out: | |||
8366 | } | 7977 | } |
8367 | 7978 | ||
8368 | /* destroy an existing cpu accounting group */ | 7979 | /* destroy an existing cpu accounting group */ |
8369 | static void cpuacct_destroy(struct cgroup *cgrp) | 7980 | static void cpuacct_css_free(struct cgroup *cgrp) |
8370 | { | 7981 | { |
8371 | struct cpuacct *ca = cgroup_ca(cgrp); | 7982 | struct cpuacct *ca = cgroup_ca(cgrp); |
8372 | 7983 | ||
@@ -8537,9 +8148,15 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
8537 | 8148 | ||
8538 | struct cgroup_subsys cpuacct_subsys = { | 8149 | struct cgroup_subsys cpuacct_subsys = { |
8539 | .name = "cpuacct", | 8150 | .name = "cpuacct", |
8540 | .create = cpuacct_create, | 8151 | .css_alloc = cpuacct_css_alloc, |
8541 | .destroy = cpuacct_destroy, | 8152 | .css_free = cpuacct_css_free, |
8542 | .subsys_id = cpuacct_subsys_id, | 8153 | .subsys_id = cpuacct_subsys_id, |
8543 | .base_cftypes = files, | 8154 | .base_cftypes = files, |
8544 | }; | 8155 | }; |
8545 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8156 | #endif /* CONFIG_CGROUP_CPUACCT */ |
8157 | |||
8158 | void dump_cpu_task(int cpu) | ||
8159 | { | ||
8160 | pr_info("Task dump for CPU %d:\n", cpu); | ||
8161 | sched_show_task(cpu_curr(cpu)); | ||
8162 | } | ||
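dump_cpu_task() is a tiny convenience wrapper so code outside the scheduler (an RCU stall warning, for example) can show what is running on a misbehaving CPU without reaching into cpu_curr() itself. A sketch of a plausible caller (illustrative only):

static void report_stuck_cpu(int cpu)
{
	pr_err("CPU %d is not responding\n", cpu);
	dump_cpu_task(cpu);	/* "Task dump for CPU %d:" plus a stack trace */
}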
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c new file mode 100644 index 000000000000..293b202fcf79 --- /dev/null +++ b/kernel/sched/cputime.c | |||
@@ -0,0 +1,589 @@ | |||
1 | #include <linux/export.h> | ||
2 | #include <linux/sched.h> | ||
3 | #include <linux/tsacct_kern.h> | ||
4 | #include <linux/kernel_stat.h> | ||
5 | #include <linux/static_key.h> | ||
6 | #include "sched.h" | ||
7 | |||
8 | |||
9 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
10 | |||
11 | /* | ||
12 | * There are no locks covering percpu hardirq/softirq time. | ||
13 | * They are only modified in vtime_account, on the corresponding CPU | ||
14 | * with interrupts disabled. So, writes are safe. | ||
15 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
16 | * This may result in another CPU reading this CPU's irq time and can | ||
17 | * race with irq/vtime_account on this CPU. We would either get the old | ||
18 | * or the new value with a side effect of accounting a slice of irq time to the | ||
19 | * wrong task when irq is in progress while we read rq->clock. That is a worthy | ||
20 | * compromise in place of having locks on each irq in account_system_time. | ||
21 | */ | ||
22 | DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
23 | DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
24 | |||
25 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
26 | static int sched_clock_irqtime; | ||
27 | |||
28 | void enable_sched_clock_irqtime(void) | ||
29 | { | ||
30 | sched_clock_irqtime = 1; | ||
31 | } | ||
32 | |||
33 | void disable_sched_clock_irqtime(void) | ||
34 | { | ||
35 | sched_clock_irqtime = 0; | ||
36 | } | ||
37 | |||
38 | #ifndef CONFIG_64BIT | ||
39 | DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
40 | #endif /* CONFIG_64BIT */ | ||
41 | |||
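The irq_time_seq seqcount above only exists on 32-bit, where the two u64 counters cannot be read atomically. A minimal sketch of how a reader takes a consistent snapshot, assuming the usual seqcount retry loop (the real irq_time_read()/irq_time_write_begin() helpers live in the scheduler headers and are not part of this hunk):

static u64 irq_time_read_sketch(int cpu)
{
	unsigned int seq;
	u64 irq_time;

	do {
		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
		irq_time = per_cpu(cpu_hardirq_time, cpu) +
			   per_cpu(cpu_softirq_time, cpu);
	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));

	return irq_time;
}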
42 | /* | ||
43 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
44 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
45 | */ | ||
46 | void irqtime_account_irq(struct task_struct *curr) | ||
47 | { | ||
48 | unsigned long flags; | ||
49 | s64 delta; | ||
50 | int cpu; | ||
51 | |||
52 | if (!sched_clock_irqtime) | ||
53 | return; | ||
54 | |||
55 | local_irq_save(flags); | ||
56 | |||
57 | cpu = smp_processor_id(); | ||
58 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
59 | __this_cpu_add(irq_start_time, delta); | ||
60 | |||
61 | irq_time_write_begin(); | ||
62 | /* | ||
63 | * We do not account for softirq time from ksoftirqd here. | ||
64 | * We want to continue accounting softirq time to ksoftirqd thread | ||
65 | * in that case, so as not to confuse the scheduler with a special task | ||
66 | * that does not consume any time, but still wants to run. | ||
67 | */ | ||
68 | if (hardirq_count()) | ||
69 | __this_cpu_add(cpu_hardirq_time, delta); | ||
70 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | ||
71 | __this_cpu_add(cpu_softirq_time, delta); | ||
72 | |||
73 | irq_time_write_end(); | ||
74 | local_irq_restore(flags); | ||
75 | } | ||
76 | EXPORT_SYMBOL_GPL(irqtime_account_irq); | ||
77 | |||
78 | static int irqtime_account_hi_update(void) | ||
79 | { | ||
80 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
81 | unsigned long flags; | ||
82 | u64 latest_ns; | ||
83 | int ret = 0; | ||
84 | |||
85 | local_irq_save(flags); | ||
86 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
87 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) | ||
88 | ret = 1; | ||
89 | local_irq_restore(flags); | ||
90 | return ret; | ||
91 | } | ||
92 | |||
93 | static int irqtime_account_si_update(void) | ||
94 | { | ||
95 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
96 | unsigned long flags; | ||
97 | u64 latest_ns; | ||
98 | int ret = 0; | ||
99 | |||
100 | local_irq_save(flags); | ||
101 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
102 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) | ||
103 | ret = 1; | ||
104 | local_irq_restore(flags); | ||
105 | return ret; | ||
106 | } | ||
107 | |||
108 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
109 | |||
110 | #define sched_clock_irqtime (0) | ||
111 | |||
112 | #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ | ||
113 | |||
114 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
115 | u64 tmp) | ||
116 | { | ||
117 | #ifdef CONFIG_CGROUP_CPUACCT | ||
118 | struct kernel_cpustat *kcpustat; | ||
119 | struct cpuacct *ca; | ||
120 | #endif | ||
121 | /* | ||
122 | * Since all updates are sure to touch the root cgroup, we | ||
123 | * get ourselves ahead and touch it first. If the root cgroup | ||
124 | * is the only cgroup, then nothing else should be necessary. | ||
125 | * | ||
126 | */ | ||
127 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
128 | |||
129 | #ifdef CONFIG_CGROUP_CPUACCT | ||
130 | if (unlikely(!cpuacct_subsys.active)) | ||
131 | return; | ||
132 | |||
133 | rcu_read_lock(); | ||
134 | ca = task_ca(p); | ||
135 | while (ca && (ca != &root_cpuacct)) { | ||
136 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
137 | kcpustat->cpustat[index] += tmp; | ||
138 | ca = parent_ca(ca); | ||
139 | } | ||
140 | rcu_read_unlock(); | ||
141 | #endif | ||
142 | } | ||
143 | |||
144 | /* | ||
145 | * Account user cpu time to a process. | ||
146 | * @p: the process that the cpu time gets accounted to | ||
147 | * @cputime: the cpu time spent in user space since the last update | ||
148 | * @cputime_scaled: cputime scaled by cpu frequency | ||
149 | */ | ||
150 | void account_user_time(struct task_struct *p, cputime_t cputime, | ||
151 | cputime_t cputime_scaled) | ||
152 | { | ||
153 | int index; | ||
154 | |||
155 | /* Add user time to process. */ | ||
156 | p->utime += cputime; | ||
157 | p->utimescaled += cputime_scaled; | ||
158 | account_group_user_time(p, cputime); | ||
159 | |||
160 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
161 | |||
162 | /* Add user time to cpustat. */ | ||
163 | task_group_account_field(p, index, (__force u64) cputime); | ||
164 | |||
165 | /* Account for user time used */ | ||
166 | acct_update_integrals(p); | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * Account guest cpu time to a process. | ||
171 | * @p: the process that the cpu time gets accounted to | ||
172 | * @cputime: the cpu time spent in virtual machine since the last update | ||
173 | * @cputime_scaled: cputime scaled by cpu frequency | ||
174 | */ | ||
175 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | ||
176 | cputime_t cputime_scaled) | ||
177 | { | ||
178 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
179 | |||
180 | /* Add guest time to process. */ | ||
181 | p->utime += cputime; | ||
182 | p->utimescaled += cputime_scaled; | ||
183 | account_group_user_time(p, cputime); | ||
184 | p->gtime += cputime; | ||
185 | |||
186 | /* Add guest time to cpustat. */ | ||
187 | if (TASK_NICE(p) > 0) { | ||
188 | cpustat[CPUTIME_NICE] += (__force u64) cputime; | ||
189 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; | ||
190 | } else { | ||
191 | cpustat[CPUTIME_USER] += (__force u64) cputime; | ||
192 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; | ||
193 | } | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * Account system cpu time to a process and desired cpustat field | ||
198 | * @p: the process that the cpu time gets accounted to | ||
199 | * @cputime: the cpu time spent in kernel space since the last update | ||
200 | * @cputime_scaled: cputime scaled by cpu frequency | ||
201 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
202 | */ | ||
203 | static inline | ||
204 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
205 | cputime_t cputime_scaled, int index) | ||
206 | { | ||
207 | /* Add system time to process. */ | ||
208 | p->stime += cputime; | ||
209 | p->stimescaled += cputime_scaled; | ||
210 | account_group_system_time(p, cputime); | ||
211 | |||
212 | /* Add system time to cpustat. */ | ||
213 | task_group_account_field(p, index, (__force u64) cputime); | ||
214 | |||
215 | /* Account for system time used */ | ||
216 | acct_update_integrals(p); | ||
217 | } | ||
218 | |||
219 | /* | ||
220 | * Account system cpu time to a process. | ||
221 | * @p: the process that the cpu time gets accounted to | ||
222 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
223 | * @cputime: the cpu time spent in kernel space since the last update | ||
224 | * @cputime_scaled: cputime scaled by cpu frequency | ||
225 | */ | ||
226 | void account_system_time(struct task_struct *p, int hardirq_offset, | ||
227 | cputime_t cputime, cputime_t cputime_scaled) | ||
228 | { | ||
229 | int index; | ||
230 | |||
231 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | ||
232 | account_guest_time(p, cputime, cputime_scaled); | ||
233 | return; | ||
234 | } | ||
235 | |||
236 | if (hardirq_count() - hardirq_offset) | ||
237 | index = CPUTIME_IRQ; | ||
238 | else if (in_serving_softirq()) | ||
239 | index = CPUTIME_SOFTIRQ; | ||
240 | else | ||
241 | index = CPUTIME_SYSTEM; | ||
242 | |||
243 | __account_system_time(p, cputime, cputime_scaled, index); | ||
244 | } | ||
245 | |||
246 | /* | ||
247 | * Account for involuntary wait time. | ||
248 | * @cputime: the cpu time spent in involuntary wait | ||
249 | */ | ||
250 | void account_steal_time(cputime_t cputime) | ||
251 | { | ||
252 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
253 | |||
254 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * Account for idle time. | ||
259 | * @cputime: the cpu time spent in idle wait | ||
260 | */ | ||
261 | void account_idle_time(cputime_t cputime) | ||
262 | { | ||
263 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
264 | struct rq *rq = this_rq(); | ||
265 | |||
266 | if (atomic_read(&rq->nr_iowait) > 0) | ||
267 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; | ||
268 | else | ||
269 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | ||
270 | } | ||
271 | |||
272 | static __always_inline bool steal_account_process_tick(void) | ||
273 | { | ||
274 | #ifdef CONFIG_PARAVIRT | ||
275 | if (static_key_false(¶virt_steal_enabled)) { | ||
276 | u64 steal, st = 0; | ||
277 | |||
278 | steal = paravirt_steal_clock(smp_processor_id()); | ||
279 | steal -= this_rq()->prev_steal_time; | ||
280 | |||
281 | st = steal_ticks(steal); | ||
282 | this_rq()->prev_steal_time += st * TICK_NSEC; | ||
283 | |||
284 | account_steal_time(st); | ||
285 | return st; | ||
286 | } | ||
287 | #endif | ||
288 | return false; | ||
289 | } | ||
290 | |||
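steal_ticks() (not shown in this hunk) converts the raw nanosecond delta reported by the hypervisor into whole ticks, and prev_steal_time only advances by st * TICK_NSEC, so any sub-tick remainder is carried over to a later tick. A small worked example of that carry, assuming HZ=250 (TICK_NSEC == 4ms):

static u64 steal_carry_example(void)
{
	u64 delta = 10 * NSEC_PER_MSEC;		/* 10ms of steal time reported */
	u64 st = div_u64(delta, TICK_NSEC);	/* two whole ticks at HZ=250 */

	/* prev_steal_time moves forward by 8ms; the 2ms returned here is
	 * picked up once enough further steal time has accumulated. */
	return delta - st * TICK_NSEC;
}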
291 | /* | ||
292 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live | ||
293 | * tasks (sum on group iteration) belonging to @tsk's group. | ||
294 | */ | ||
295 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | ||
296 | { | ||
297 | struct signal_struct *sig = tsk->signal; | ||
298 | struct task_struct *t; | ||
299 | |||
300 | times->utime = sig->utime; | ||
301 | times->stime = sig->stime; | ||
302 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
303 | |||
304 | rcu_read_lock(); | ||
305 | /* make sure we can trust tsk->thread_group list */ | ||
306 | if (!likely(pid_alive(tsk))) | ||
307 | goto out; | ||
308 | |||
309 | t = tsk; | ||
310 | do { | ||
311 | times->utime += t->utime; | ||
312 | times->stime += t->stime; | ||
313 | times->sum_exec_runtime += task_sched_runtime(t); | ||
314 | } while_each_thread(tsk, t); | ||
315 | out: | ||
316 | rcu_read_unlock(); | ||
317 | } | ||
318 | |||
319 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
320 | |||
321 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
322 | /* | ||
323 | * Account a tick to a process and cpustat | ||
324 | * @p: the process that the cpu time gets accounted to | ||
325 | * @user_tick: is the tick from userspace | ||
326 | * @rq: the pointer to rq | ||
327 | * | ||
328 | * Tick demultiplexing follows the order | ||
329 | * - pending hardirq update | ||
330 | * - pending softirq update | ||
331 | * - user_time | ||
332 | * - idle_time | ||
333 | * - system time | ||
334 | * - check for guest_time | ||
335 | * - else account as system_time | ||
336 | * | ||
337 | * The check for hardirq is done both for system and user time as there is | ||
338 | * no timer going off while we are on hardirq and hence we may never get an | ||
339 | * opportunity to update it solely in system time. | ||
340 | * p->stime and friends are only updated on system time and not on irq | ||
341 | * softirq as those do not count in task exec_runtime any more. | ||
342 | */ | ||
343 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
344 | struct rq *rq) | ||
345 | { | ||
346 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
347 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
348 | |||
349 | if (steal_account_process_tick()) | ||
350 | return; | ||
351 | |||
352 | if (irqtime_account_hi_update()) { | ||
353 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; | ||
354 | } else if (irqtime_account_si_update()) { | ||
355 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; | ||
356 | } else if (this_cpu_ksoftirqd() == p) { | ||
357 | /* | ||
358 | * ksoftirqd time does not get accounted in cpu_softirq_time. | ||
359 | * So, we have to handle it separately here. | ||
360 | * Also, p->stime needs to be updated for ksoftirqd. | ||
361 | */ | ||
362 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
363 | CPUTIME_SOFTIRQ); | ||
364 | } else if (user_tick) { | ||
365 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
366 | } else if (p == rq->idle) { | ||
367 | account_idle_time(cputime_one_jiffy); | ||
368 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
369 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
370 | } else { | ||
371 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
372 | CPUTIME_SYSTEM); | ||
373 | } | ||
374 | } | ||
375 | |||
376 | static void irqtime_account_idle_ticks(int ticks) | ||
377 | { | ||
378 | int i; | ||
379 | struct rq *rq = this_rq(); | ||
380 | |||
381 | for (i = 0; i < ticks; i++) | ||
382 | irqtime_account_process_tick(current, 0, rq); | ||
383 | } | ||
384 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
385 | static void irqtime_account_idle_ticks(int ticks) {} | ||
386 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
387 | struct rq *rq) {} | ||
388 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
389 | |||
390 | /* | ||
391 | * Account a single tick of cpu time. | ||
392 | * @p: the process that the cpu time gets accounted to | ||
393 | * @user_tick: indicates if the tick is a user or a system tick | ||
394 | */ | ||
395 | void account_process_tick(struct task_struct *p, int user_tick) | ||
396 | { | ||
397 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
398 | struct rq *rq = this_rq(); | ||
399 | |||
400 | if (sched_clock_irqtime) { | ||
401 | irqtime_account_process_tick(p, user_tick, rq); | ||
402 | return; | ||
403 | } | ||
404 | |||
405 | if (steal_account_process_tick()) | ||
406 | return; | ||
407 | |||
408 | if (user_tick) | ||
409 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
410 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | ||
411 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | ||
412 | one_jiffy_scaled); | ||
413 | else | ||
414 | account_idle_time(cputime_one_jiffy); | ||
415 | } | ||
416 | |||
417 | /* | ||
418 | * Account multiple ticks of steal time. | ||
419 | * @p: the process from which the cpu time has been stolen | ||
420 | * @ticks: number of stolen ticks | ||
421 | */ | ||
422 | void account_steal_ticks(unsigned long ticks) | ||
423 | { | ||
424 | account_steal_time(jiffies_to_cputime(ticks)); | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * Account multiple ticks of idle time. | ||
429 | * @ticks: number of stolen ticks | ||
430 | */ | ||
431 | void account_idle_ticks(unsigned long ticks) | ||
432 | { | ||
433 | |||
434 | if (sched_clock_irqtime) { | ||
435 | irqtime_account_idle_ticks(ticks); | ||
436 | return; | ||
437 | } | ||
438 | |||
439 | account_idle_time(jiffies_to_cputime(ticks)); | ||
440 | } | ||
441 | |||
442 | #endif | ||
443 | |||
444 | /* | ||
445 | * Use precise platform statistics if available: | ||
446 | */ | ||
447 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
448 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
449 | { | ||
450 | *ut = p->utime; | ||
451 | *st = p->stime; | ||
452 | } | ||
453 | |||
454 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
455 | { | ||
456 | struct task_cputime cputime; | ||
457 | |||
458 | thread_group_cputime(p, &cputime); | ||
459 | |||
460 | *ut = cputime.utime; | ||
461 | *st = cputime.stime; | ||
462 | } | ||
463 | |||
464 | void vtime_account_system_irqsafe(struct task_struct *tsk) | ||
465 | { | ||
466 | unsigned long flags; | ||
467 | |||
468 | local_irq_save(flags); | ||
469 | vtime_account_system(tsk); | ||
470 | local_irq_restore(flags); | ||
471 | } | ||
472 | EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); | ||
473 | |||
474 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | ||
475 | void vtime_task_switch(struct task_struct *prev) | ||
476 | { | ||
477 | if (is_idle_task(prev)) | ||
478 | vtime_account_idle(prev); | ||
479 | else | ||
480 | vtime_account_system(prev); | ||
481 | |||
482 | vtime_account_user(prev); | ||
483 | arch_vtime_task_switch(prev); | ||
484 | } | ||
485 | #endif | ||
486 | |||
487 | /* | ||
488 | * Archs that account the whole time spent in the idle task | ||
489 | * (outside irq) as idle time can rely on this and just implement | ||
490 | * vtime_account_system() and vtime_account_idle(). Archs that | ||
491 | * have other meaning of the idle time (s390 only includes the | ||
492 | * time spent by the CPU when it's in low power mode) must override | ||
493 | * vtime_account(). | ||
494 | */ | ||
495 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | ||
496 | void vtime_account(struct task_struct *tsk) | ||
497 | { | ||
498 | if (in_interrupt() || !is_idle_task(tsk)) | ||
499 | vtime_account_system(tsk); | ||
500 | else | ||
501 | vtime_account_idle(tsk); | ||
502 | } | ||
503 | EXPORT_SYMBOL_GPL(vtime_account); | ||
504 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | ||
505 | |||
506 | #else | ||
507 | |||
508 | #ifndef nsecs_to_cputime | ||
509 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
510 | #endif | ||
511 | |||
512 | static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | ||
513 | { | ||
514 | u64 temp = (__force u64) rtime; | ||
515 | |||
516 | temp *= (__force u64) utime; | ||
517 | |||
518 | if (sizeof(cputime_t) == 4) | ||
519 | temp = div_u64(temp, (__force u32) total); | ||
520 | else | ||
521 | temp = div64_u64(temp, (__force u64) total); | ||
522 | |||
523 | return (__force cputime_t) temp; | ||
524 | } | ||
525 | |||
526 | /* | ||
527 | * Adjust tick based cputime random precision against scheduler | ||
528 | * runtime accounting. | ||
529 | */ | ||
530 | static void cputime_adjust(struct task_cputime *curr, | ||
531 | struct cputime *prev, | ||
532 | cputime_t *ut, cputime_t *st) | ||
533 | { | ||
534 | cputime_t rtime, utime, total; | ||
535 | |||
536 | utime = curr->utime; | ||
537 | total = utime + curr->stime; | ||
538 | |||
539 | /* | ||
540 | * Tick based cputime accounting depends on random scheduling | ||
541 | * timeslices of a task to be interrupted or not by the timer. | ||
542 | * Depending on these circumstances, the number of these interrupts | ||
543 | * may be over or under-optimistic, matching the real user and system | ||
544 | * cputime with a variable precision. | ||
545 | * | ||
546 | * Fix this by scaling these tick based values against the total | ||
547 | * runtime accounted by the CFS scheduler. | ||
548 | */ | ||
549 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); | ||
550 | |||
551 | if (total) | ||
552 | utime = scale_utime(utime, rtime, total); | ||
553 | else | ||
554 | utime = rtime; | ||
555 | |||
556 | /* | ||
557 | * If the tick based count grows faster than the scheduler one, | ||
558 | * the result of the scaling may go backward. | ||
559 | * Let's enforce monotonicity. | ||
560 | */ | ||
561 | prev->utime = max(prev->utime, utime); | ||
562 | prev->stime = max(prev->stime, rtime - prev->utime); | ||
563 | |||
564 | *ut = prev->utime; | ||
565 | *st = prev->stime; | ||
566 | } | ||
567 | |||
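A worked example of the adjustment above, with made-up numbers: the tick counters report utime = 600 and stime = 200 (total = 800) while sum_exec_runtime converts to rtime = 1000. The user/system split is preserved and only the magnitude is rescaled:

static u64 cputime_adjust_example(void)
{
	u64 utime = 600, total = 800, rtime = 1000;
	u64 scaled_utime = div64_u64(rtime * utime, total);	/* 750 */

	/* stime becomes the remainder (250); cputime_adjust() then takes
	 * max() against the previous sample so that two consecutive reads
	 * of /proc/<pid>/stat never see the values go backwards. */
	return rtime - scaled_utime;
}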
568 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
569 | { | ||
570 | struct task_cputime cputime = { | ||
571 | .utime = p->utime, | ||
572 | .stime = p->stime, | ||
573 | .sum_exec_runtime = p->se.sum_exec_runtime, | ||
574 | }; | ||
575 | |||
576 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); | ||
577 | } | ||
578 | |||
579 | /* | ||
580 | * Must be called with siglock held. | ||
581 | */ | ||
582 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
583 | { | ||
584 | struct task_cputime cputime; | ||
585 | |||
586 | thread_group_cputime(p, &cputime); | ||
587 | cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); | ||
588 | } | ||
589 | #endif | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6f79596e0ea9..2cd3c1b4e582 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec) | |||
61 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) | 61 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) |
62 | { | 62 | { |
63 | struct sched_entity *se = tg->se[cpu]; | 63 | struct sched_entity *se = tg->se[cpu]; |
64 | if (!se) | ||
65 | return; | ||
66 | 64 | ||
67 | #define P(F) \ | 65 | #define P(F) \ |
68 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) | 66 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) |
69 | #define PN(F) \ | 67 | #define PN(F) \ |
70 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | 68 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
71 | 69 | ||
70 | if (!se) { | ||
71 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | ||
72 | P(avg->runnable_avg_sum); | ||
73 | P(avg->runnable_avg_period); | ||
74 | return; | ||
75 | } | ||
76 | |||
77 | |||
72 | PN(se->exec_start); | 78 | PN(se->exec_start); |
73 | PN(se->vruntime); | 79 | PN(se->vruntime); |
74 | PN(se->sum_exec_runtime); | 80 | PN(se->sum_exec_runtime); |
@@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
85 | P(se->statistics.wait_count); | 91 | P(se->statistics.wait_count); |
86 | #endif | 92 | #endif |
87 | P(se->load.weight); | 93 | P(se->load.weight); |
94 | #ifdef CONFIG_SMP | ||
95 | P(se->avg.runnable_avg_sum); | ||
96 | P(se->avg.runnable_avg_period); | ||
97 | P(se->avg.load_avg_contrib); | ||
98 | P(se->avg.decay_count); | ||
99 | #endif | ||
88 | #undef PN | 100 | #undef PN |
89 | #undef P | 101 | #undef P |
90 | } | 102 | } |
@@ -206,14 +218,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 218 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
207 | #ifdef CONFIG_FAIR_GROUP_SCHED | 219 | #ifdef CONFIG_FAIR_GROUP_SCHED |
208 | #ifdef CONFIG_SMP | 220 | #ifdef CONFIG_SMP |
209 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", | 221 | SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", |
210 | SPLIT_NS(cfs_rq->load_avg)); | 222 | cfs_rq->runnable_load_avg); |
211 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", | 223 | SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", |
212 | SPLIT_NS(cfs_rq->load_period)); | 224 | cfs_rq->blocked_load_avg); |
213 | SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", | 225 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", |
214 | cfs_rq->load_contribution); | 226 | atomic64_read(&cfs_rq->tg->load_avg)); |
215 | SEQ_printf(m, " .%-30s: %d\n", "load_tg", | 227 | SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", |
216 | atomic_read(&cfs_rq->tg->load_weight)); | 228 | cfs_rq->tg_load_contrib); |
229 | SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", | ||
230 | cfs_rq->tg_runnable_contrib); | ||
231 | SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", | ||
232 | atomic_read(&cfs_rq->tg->runnable_avg)); | ||
217 | #endif | 233 | #endif |
218 | 234 | ||
219 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 235 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96e2b18b6283..5eea8707234a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -26,6 +26,9 @@ | |||
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/profile.h> | 27 | #include <linux/profile.h> |
28 | #include <linux/interrupt.h> | 28 | #include <linux/interrupt.h> |
29 | #include <linux/mempolicy.h> | ||
30 | #include <linux/migrate.h> | ||
31 | #include <linux/task_work.h> | ||
29 | 32 | ||
30 | #include <trace/events/sched.h> | 33 | #include <trace/events/sched.h> |
31 | 34 | ||
@@ -259,6 +262,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
259 | return grp->my_q; | 262 | return grp->my_q; |
260 | } | 263 | } |
261 | 264 | ||
265 | static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, | ||
266 | int force_update); | ||
267 | |||
262 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 268 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
263 | { | 269 | { |
264 | if (!cfs_rq->on_list) { | 270 | if (!cfs_rq->on_list) { |
@@ -278,6 +284,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | |||
278 | } | 284 | } |
279 | 285 | ||
280 | cfs_rq->on_list = 1; | 286 | cfs_rq->on_list = 1; |
287 | /* We should have no load, but we need to update last_decay. */ | ||
288 | update_cfs_rq_blocked_load(cfs_rq, 0); | ||
281 | } | 289 | } |
282 | } | 290 | } |
283 | 291 | ||
@@ -597,7 +605,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se) | |||
597 | /* | 605 | /* |
598 | * The idea is to set a period in which each task runs once. | 606 | * The idea is to set a period in which each task runs once. |
599 | * | 607 | * |
600 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch | 608 | * When there are too many tasks (sched_nr_latency) we have to stretch |
601 | * this period because otherwise the slices get too small. | 609 | * this period because otherwise the slices get too small. |
602 | * | 610 | * |
603 | * p = (nr <= nl) ? l : l*nr/nl | 611 | * p = (nr <= nl) ? l : l*nr/nl |
@@ -653,9 +661,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
653 | return calc_delta_fair(sched_slice(cfs_rq, se), se); | 661 | return calc_delta_fair(sched_slice(cfs_rq, se), se); |
654 | } | 662 | } |
655 | 663 | ||
656 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); | ||
657 | static void update_cfs_shares(struct cfs_rq *cfs_rq); | ||
658 | |||
659 | /* | 664 | /* |
660 | * Update the current task's runtime statistics. Skip current tasks that | 665 | * Update the current task's runtime statistics. Skip current tasks that |
661 | * are not in our scheduling class. | 666 | * are not in our scheduling class. |
@@ -675,10 +680,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
675 | 680 | ||
676 | curr->vruntime += delta_exec_weighted; | 681 | curr->vruntime += delta_exec_weighted; |
677 | update_min_vruntime(cfs_rq); | 682 | update_min_vruntime(cfs_rq); |
678 | |||
679 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
680 | cfs_rq->load_unacc_exec_time += delta_exec; | ||
681 | #endif | ||
682 | } | 683 | } |
683 | 684 | ||
684 | static void update_curr(struct cfs_rq *cfs_rq) | 685 | static void update_curr(struct cfs_rq *cfs_rq) |
@@ -776,6 +777,230 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
776 | * Scheduling class queueing methods: | 777 | * Scheduling class queueing methods: |
777 | */ | 778 | */ |
778 | 779 | ||
780 | #ifdef CONFIG_NUMA_BALANCING | ||
781 | /* | ||
782 | * numa task sample period in ms | ||
783 | */ | ||
784 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | ||
785 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; | ||
786 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | ||
787 | |||
788 | /* Portion of address space to scan in MB */ | ||
789 | unsigned int sysctl_numa_balancing_scan_size = 256; | ||
790 | |||
791 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | ||
792 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | ||
793 | |||
794 | static void task_numa_placement(struct task_struct *p) | ||
795 | { | ||
796 | int seq; | ||
797 | |||
798 | if (!p->mm) /* for example, ksmd faulting in a user's mm */ | ||
799 | return; | ||
800 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); | ||
801 | if (p->numa_scan_seq == seq) | ||
802 | return; | ||
803 | p->numa_scan_seq = seq; | ||
804 | |||
805 | /* FIXME: Scheduling placement policy hints go here */ | ||
806 | } | ||
807 | |||
808 | /* | ||
809 | * Got a PROT_NONE fault for a page on @node. | ||
810 | */ | ||
811 | void task_numa_fault(int node, int pages, bool migrated) | ||
812 | { | ||
813 | struct task_struct *p = current; | ||
814 | |||
815 | if (!sched_feat_numa(NUMA)) | ||
816 | return; | ||
817 | |||
818 | /* FIXME: Allocate task-specific structure for placement policy here */ | ||
819 | |||
820 | /* | ||
821 | * If pages are properly placed (did not migrate) then scan slower. | ||
822 | * This is reset periodically in case of phase changes | ||
823 | */ | ||
824 | if (!migrated) | ||
825 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | ||
826 | p->numa_scan_period + jiffies_to_msecs(10)); | ||
827 | |||
828 | task_numa_placement(p); | ||
829 | } | ||
830 | |||
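This back-off is the entire feedback loop for now: every fault that did not require a migration pushes the next scan further out until the sysctl maximum is hit, and task_numa_work() periodically drops the period back to the minimum in case the workload changes phase. An illustrative helper showing the effective update, assuming HZ=1000 so jiffies_to_msecs(10) is 10ms (not part of the patch):

static unsigned int numa_scan_backoff(unsigned int period_ms)
{
	/* 100ms -> 110ms -> 120ms -> ... capped at 5000ms by default */
	return min(sysctl_numa_balancing_scan_period_max,
		   period_ms + jiffies_to_msecs(10));
}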
831 | static void reset_ptenuma_scan(struct task_struct *p) | ||
832 | { | ||
833 | ACCESS_ONCE(p->mm->numa_scan_seq)++; | ||
834 | p->mm->numa_scan_offset = 0; | ||
835 | } | ||
836 | |||
837 | /* | ||
838 | * The expensive part of numa migration is done from task_work context. | ||
839 | * Triggered from task_tick_numa(). | ||
840 | */ | ||
841 | void task_numa_work(struct callback_head *work) | ||
842 | { | ||
843 | unsigned long migrate, next_scan, now = jiffies; | ||
844 | struct task_struct *p = current; | ||
845 | struct mm_struct *mm = p->mm; | ||
846 | struct vm_area_struct *vma; | ||
847 | unsigned long start, end; | ||
848 | long pages; | ||
849 | |||
850 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | ||
851 | |||
852 | work->next = work; /* protect against double add */ | ||
853 | /* | ||
854 | * Who cares about NUMA placement when they're dying. | ||
855 | * | ||
856 | * NOTE: make sure not to dereference p->mm before this check, | ||
857 | * exit_task_work() happens _after_ exit_mm() so we could be called | ||
858 | * without p->mm even though we still had it when we enqueued this | ||
859 | * work. | ||
860 | */ | ||
861 | if (p->flags & PF_EXITING) | ||
862 | return; | ||
863 | |||
864 | /* | ||
865 | * We do not care about task placement until a task runs on a node | ||
866 | * other than the first one used by the address space. This is | ||
867 | * largely because migrations are driven by what CPU the task | ||
868 | * is running on. If it's never scheduled on another node, it'll | ||
869 | * not migrate so why bother trapping the fault. | ||
870 | */ | ||
871 | if (mm->first_nid == NUMA_PTE_SCAN_INIT) | ||
872 | mm->first_nid = numa_node_id(); | ||
873 | if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { | ||
874 | /* Are we running on a new node yet? */ | ||
875 | if (numa_node_id() == mm->first_nid && | ||
876 | !sched_feat_numa(NUMA_FORCE)) | ||
877 | return; | ||
878 | |||
879 | mm->first_nid = NUMA_PTE_SCAN_ACTIVE; | ||
880 | } | ||
881 | |||
882 | /* | ||
883 | * Reset the scan period if enough time has gone by. Objective is that | ||
884 | * scanning will be reduced if pages are properly placed. As tasks | ||
885 | * can enter different phases this needs to be re-examined. Lacking | ||
886 | * proper tracking of reference behaviour, this blunt hammer is used. | ||
887 | */ | ||
888 | migrate = mm->numa_next_reset; | ||
889 | if (time_after(now, migrate)) { | ||
890 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
891 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | ||
892 | xchg(&mm->numa_next_reset, next_scan); | ||
893 | } | ||
894 | |||
895 | /* | ||
896 | * Enforce maximal scan/migration frequency.. | ||
897 | */ | ||
898 | migrate = mm->numa_next_scan; | ||
899 | if (time_before(now, migrate)) | ||
900 | return; | ||
901 | |||
902 | if (p->numa_scan_period == 0) | ||
903 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
904 | |||
905 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); | ||
906 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) | ||
907 | return; | ||
908 | |||
909 | /* | ||
910 | * Do not set pte_numa if the current running node is rate-limited. | ||
911 | * This loses statistics on the fault but if we are unwilling to | ||
912 | * migrate to this node, it is less likely we can do useful work | ||
913 | */ | ||
914 | if (migrate_ratelimited(numa_node_id())) | ||
915 | return; | ||
916 | |||
917 | start = mm->numa_scan_offset; | ||
918 | pages = sysctl_numa_balancing_scan_size; | ||
919 | pages <<= 20 - PAGE_SHIFT; /* MB in pages */ | ||
920 | if (!pages) | ||
921 | return; | ||
922 | |||
923 | down_read(&mm->mmap_sem); | ||
924 | vma = find_vma(mm, start); | ||
925 | if (!vma) { | ||
926 | reset_ptenuma_scan(p); | ||
927 | start = 0; | ||
928 | vma = mm->mmap; | ||
929 | } | ||
930 | for (; vma; vma = vma->vm_next) { | ||
931 | if (!vma_migratable(vma)) | ||
932 | continue; | ||
933 | |||
934 | /* Skip small VMAs. They are not likely to be of relevance */ | ||
935 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) | ||
936 | continue; | ||
937 | |||
938 | do { | ||
939 | start = max(start, vma->vm_start); | ||
940 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | ||
941 | end = min(end, vma->vm_end); | ||
942 | pages -= change_prot_numa(vma, start, end); | ||
943 | |||
944 | start = end; | ||
945 | if (pages <= 0) | ||
946 | goto out; | ||
947 | } while (end != vma->vm_end); | ||
948 | } | ||
949 | |||
950 | out: | ||
951 | /* | ||
952 | * It is possible to reach the end of the VMA list but the last few VMAs are | ||
953 | * not guaranteed to be vma_migratable. If they are not, we would find the | ||
954 | * !migratable VMA on the next scan but not reset the scanner to the start | ||
955 | * so check it now. | ||
956 | */ | ||
957 | if (vma) | ||
958 | mm->numa_scan_offset = start; | ||
959 | else | ||
960 | reset_ptenuma_scan(p); | ||
961 | up_read(&mm->mmap_sem); | ||
962 | } | ||
963 | |||
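For scale: the default sysctl_numa_balancing_scan_size of 256MB means that, with 4KB base pages, a pass stops after roughly 65536 pages have been updated by change_prot_numa(), starting from wherever mm->numa_scan_offset left off. The conversion, assuming PAGE_SHIFT == 12:

static long numa_scan_pages_example(void)
{
	long pages = 256;		/* sysctl_numa_balancing_scan_size, in MB */

	pages <<= 20 - PAGE_SHIFT;	/* 256 << 8 == 65536 base pages */
	return pages;
}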
964 | /* | ||
965 | * Drive the periodic memory faults.. | ||
966 | */ | ||
967 | void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||
968 | { | ||
969 | struct callback_head *work = &curr->numa_work; | ||
970 | u64 period, now; | ||
971 | |||
972 | /* | ||
973 | * We don't care about NUMA placement if we don't have memory. | ||
974 | */ | ||
975 | if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) | ||
976 | return; | ||
977 | |||
978 | /* | ||
979 | * Using runtime rather than walltime has the dual advantage that | ||
980 | * we (mostly) drive the selection from busy threads and that the | ||
981 | * task needs to have done some actual work before we bother with | ||
982 | * NUMA placement. | ||
983 | */ | ||
984 | now = curr->se.sum_exec_runtime; | ||
985 | period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; | ||
986 | |||
987 | if (now - curr->node_stamp > period) { | ||
988 | if (!curr->node_stamp) | ||
989 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
990 | curr->node_stamp = now; | ||
991 | |||
992 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | ||
993 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ | ||
994 | task_work_add(curr, work, true); | ||
995 | } | ||
996 | } | ||
997 | } | ||
998 | #else | ||
999 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||
1000 | { | ||
1001 | } | ||
1002 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1003 | |||
779 | static void | 1004 | static void |
780 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1005 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
781 | { | 1006 | { |
@@ -801,72 +1026,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
801 | } | 1026 | } |
802 | 1027 | ||
803 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1028 | #ifdef CONFIG_FAIR_GROUP_SCHED |
804 | /* we need this in update_cfs_load and load-balance functions below */ | ||
805 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | ||
806 | # ifdef CONFIG_SMP | 1029 | # ifdef CONFIG_SMP |
807 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | ||
808 | int global_update) | ||
809 | { | ||
810 | struct task_group *tg = cfs_rq->tg; | ||
811 | long load_avg; | ||
812 | |||
813 | load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); | ||
814 | load_avg -= cfs_rq->load_contribution; | ||
815 | |||
816 | if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { | ||
817 | atomic_add(load_avg, &tg->load_weight); | ||
818 | cfs_rq->load_contribution += load_avg; | ||
819 | } | ||
820 | } | ||
821 | |||
822 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
823 | { | ||
824 | u64 period = sysctl_sched_shares_window; | ||
825 | u64 now, delta; | ||
826 | unsigned long load = cfs_rq->load.weight; | ||
827 | |||
828 | if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) | ||
829 | return; | ||
830 | |||
831 | now = rq_of(cfs_rq)->clock_task; | ||
832 | delta = now - cfs_rq->load_stamp; | ||
833 | |||
834 | /* truncate load history at 4 idle periods */ | ||
835 | if (cfs_rq->load_stamp > cfs_rq->load_last && | ||
836 | now - cfs_rq->load_last > 4 * period) { | ||
837 | cfs_rq->load_period = 0; | ||
838 | cfs_rq->load_avg = 0; | ||
839 | delta = period - 1; | ||
840 | } | ||
841 | |||
842 | cfs_rq->load_stamp = now; | ||
843 | cfs_rq->load_unacc_exec_time = 0; | ||
844 | cfs_rq->load_period += delta; | ||
845 | if (load) { | ||
846 | cfs_rq->load_last = now; | ||
847 | cfs_rq->load_avg += delta * load; | ||
848 | } | ||
849 | |||
850 | /* consider updating load contribution on each fold or truncate */ | ||
851 | if (global_update || cfs_rq->load_period > period | ||
852 | || !cfs_rq->load_period) | ||
853 | update_cfs_rq_load_contribution(cfs_rq, global_update); | ||
854 | |||
855 | while (cfs_rq->load_period > period) { | ||
856 | /* | ||
857 | * Inline assembly required to prevent the compiler | ||
858 | * optimising this loop into a divmod call. | ||
859 | * See __iter_div_u64_rem() for another example of this. | ||
860 | */ | ||
861 | asm("" : "+rm" (cfs_rq->load_period)); | ||
862 | cfs_rq->load_period /= 2; | ||
863 | cfs_rq->load_avg /= 2; | ||
864 | } | ||
865 | |||
866 | if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) | ||
867 | list_del_leaf_cfs_rq(cfs_rq); | ||
868 | } | ||
869 | |||
870 | static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | 1030 | static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) |
871 | { | 1031 | { |
872 | long tg_weight; | 1032 | long tg_weight; |
@@ -876,8 +1036,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | |||
876 | * to gain a more accurate current total weight. See | 1036 | * to gain a more accurate current total weight. See |
877 | * update_cfs_rq_load_contribution(). | 1037 | * update_cfs_rq_load_contribution(). |
878 | */ | 1038 | */ |
879 | tg_weight = atomic_read(&tg->load_weight); | 1039 | tg_weight = atomic64_read(&tg->load_avg); |
880 | tg_weight -= cfs_rq->load_contribution; | 1040 | tg_weight -= cfs_rq->tg_load_contrib; |
881 | tg_weight += cfs_rq->load.weight; | 1041 | tg_weight += cfs_rq->load.weight; |
882 | 1042 | ||
883 | return tg_weight; | 1043 | return tg_weight; |
@@ -901,27 +1061,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | |||
901 | 1061 | ||
902 | return shares; | 1062 | return shares; |
903 | } | 1063 | } |
904 | |||
905 | static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
906 | { | ||
907 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | ||
908 | update_cfs_load(cfs_rq, 0); | ||
909 | update_cfs_shares(cfs_rq); | ||
910 | } | ||
911 | } | ||
912 | # else /* CONFIG_SMP */ | 1064 | # else /* CONFIG_SMP */ |
913 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
914 | { | ||
915 | } | ||
916 | |||
917 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | 1065 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) |
918 | { | 1066 | { |
919 | return tg->shares; | 1067 | return tg->shares; |
920 | } | 1068 | } |
921 | |||
922 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
923 | { | ||
924 | } | ||
925 | # endif /* CONFIG_SMP */ | 1069 | # endif /* CONFIG_SMP */ |
926 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | 1070 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, |
927 | unsigned long weight) | 1071 | unsigned long weight) |
@@ -939,6 +1083,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | |||
939 | account_entity_enqueue(cfs_rq, se); | 1083 | account_entity_enqueue(cfs_rq, se); |
940 | } | 1084 | } |
941 | 1085 | ||
1086 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | ||
1087 | |||
942 | static void update_cfs_shares(struct cfs_rq *cfs_rq) | 1088 | static void update_cfs_shares(struct cfs_rq *cfs_rq) |
943 | { | 1089 | { |
944 | struct task_group *tg; | 1090 | struct task_group *tg; |
@@ -958,18 +1104,477 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
958 | reweight_entity(cfs_rq_of(se), se, shares); | 1104 | reweight_entity(cfs_rq_of(se), se, shares); |
959 | } | 1105 | } |
960 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 1106 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
961 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | 1107 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) |
962 | { | 1108 | { |
963 | } | 1109 | } |
1110 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
964 | 1111 | ||
965 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | 1112 | /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ |
1113 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
1114 | /* | ||
1115 | * We choose a half-life close to 1 scheduling period. | ||
1116 | * Note: The tables below are dependent on this value. | ||
1117 | */ | ||
1118 | #define LOAD_AVG_PERIOD 32 | ||
1119 | #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ | ||
1120 | #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ | ||
1121 | |||
1122 | /* Precomputed fixed inverse multiplies for multiplication by y^n */ | ||
1123 | static const u32 runnable_avg_yN_inv[] = { | ||
1124 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, | ||
1125 | 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, | ||
1126 | 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, | ||
1127 | 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, | ||
1128 | 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, | ||
1129 | 0x85aac367, 0x82cd8698, | ||
1130 | }; | ||
1131 | |||
1132 | /* | ||
1133 | * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent | ||
1134 | * over-estimates when re-combining. | ||
1135 | */ | ||
1136 | static const u32 runnable_avg_yN_sum[] = { | ||
1137 | 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, | ||
1138 | 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, | ||
1139 | 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, | ||
1140 | }; | ||
1141 | |||
1142 | /* | ||
1143 | * Approximate: | ||
1144 | * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) | ||
1145 | */ | ||
1146 | static __always_inline u64 decay_load(u64 val, u64 n) | ||
966 | { | 1147 | { |
1148 | unsigned int local_n; | ||
1149 | |||
1150 | if (!n) | ||
1151 | return val; | ||
1152 | else if (unlikely(n > LOAD_AVG_PERIOD * 63)) | ||
1153 | return 0; | ||
1154 | |||
1155 | /* after bounds checking we can collapse to 32-bit */ | ||
1156 | local_n = n; | ||
1157 | |||
1158 | /* | ||
1159 | * As y^PERIOD = 1/2, we can combine | ||
1160 | * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) | ||
1161 | * With a look-up table which covers k^n (n<PERIOD) | ||
1162 | * | ||
1163 | * To achieve constant time decay_load. | ||
1164 | */ | ||
1165 | if (unlikely(local_n >= LOAD_AVG_PERIOD)) { | ||
1166 | val >>= local_n / LOAD_AVG_PERIOD; | ||
1167 | local_n %= LOAD_AVG_PERIOD; | ||
1168 | } | ||
1169 | |||
1170 | val *= runnable_avg_yN_inv[local_n]; | ||
1171 | /* We don't use SRR here since we always want to round down. */ | ||
1172 | return val >> 32; | ||
967 | } | 1173 | } |
968 | 1174 | ||
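A concrete instance of the split above, assuming LOAD_AVG_PERIOD == 32: for n = 36 the value is first halved once (36 / 32 == 1) and the remaining y^4 is applied through the fixed-point table, runnable_avg_yN_inv[4] == 0xeac0c6e6, roughly 0.917 * 2^32:

static u64 decay_load_example(u64 val)
{
	/* decay_load(1024, 36) == (1024 >> 1) * ~0.917 == 469 */
	return decay_load(val, 36);
}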
969 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | 1175 | /* |
1176 | * For updates fully spanning n periods, the contribution to runnable | ||
1177 | * average will be: \Sum 1024*y^n | ||
1178 | * | ||
1179 | * We can compute this reasonably efficiently by combining: | ||
1180 | * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} | ||
1181 | */ | ||
1182 | static u32 __compute_runnable_contrib(u64 n) | ||
970 | { | 1183 | { |
1184 | u32 contrib = 0; | ||
1185 | |||
1186 | if (likely(n <= LOAD_AVG_PERIOD)) | ||
1187 | return runnable_avg_yN_sum[n]; | ||
1188 | else if (unlikely(n >= LOAD_AVG_MAX_N)) | ||
1189 | return LOAD_AVG_MAX; | ||
1190 | |||
1191 | /* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */ | ||
1192 | do { | ||
1193 | contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ | ||
1194 | contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; | ||
1195 | |||
1196 | n -= LOAD_AVG_PERIOD; | ||
1197 | } while (n > LOAD_AVG_PERIOD); | ||
1198 | |||
1199 | contrib = decay_load(contrib, n); | ||
1200 | return contrib + runnable_avg_yN_sum[n]; | ||
971 | } | 1201 | } |
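
The do/while loop above relies on the split \Sum_{1..n} 1024*y^k = y^r * \Sum_{1..PERIOD} 1024*y^k + \Sum_{1..r} 1024*y^k for n = PERIOD + r. A floating-point sanity check of that identity (illustration only; the kernel performs the same split in fixed point), build with -lm:

#include <stdio.h>
#include <math.h>

int main(void)
{
	const int P = 32, n = 45, r = n - P;
	double y = pow(0.5, 1.0 / P);
	double direct = 0, s_P = 0, s_r = 0;
	int k;

	for (k = 1; k <= n; k++)
		direct += 1024 * pow(y, k);
	for (k = 1; k <= P; k++)
		s_P += 1024 * pow(y, k);
	for (k = 1; k <= r; k++)
		s_r += 1024 * pow(y, k);

	/* both values agree (up to fp rounding) */
	printf("direct = %.3f, split = %.3f\n", direct, pow(y, r) * s_P + s_r);
	return 0;
}
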
972 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 1202 | |
1203 | /* | ||
1204 | * We can represent the historical contribution to runnable average as the | ||
1205 | * coefficients of a geometric series. To do this we sub-divide our runnable | ||
1206 | * history into segments of approximately 1ms (1024us); label the segment that | ||
1207 | * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. | ||
1208 | * | ||
1209 | * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... | ||
1210 | * p0 p1 p2 | ||
1211 | * (now) (~1ms ago) (~2ms ago) | ||
1212 | * | ||
1213 | * Let u_i denote the fraction of p_i that the entity was runnable. | ||
1214 | * | ||
1215 | * We then designate the fractions u_i as our co-efficients, yielding the | ||
1216 | * following representation of historical load: | ||
1217 | * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... | ||
1218 | * | ||
1219 | * We choose y based on the width of a reasonable scheduling period, fixing: | ||
1220 | * y^32 = 0.5 | ||
1221 | * | ||
1222 | * This means that the contribution to load ~32ms ago (u_32) will be weighted | ||
1223 | * approximately half as much as the contribution to load within the last ms | ||
1224 | * (u_0). | ||
1225 | * | ||
1226 | * When a period "rolls over" and we have new u_0`, multiplying the previous | ||
1227 | * sum again by y is sufficient to update: | ||
1228 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | ||
1229 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | ||
1230 | */ | ||
1231 | static __always_inline int __update_entity_runnable_avg(u64 now, | ||
1232 | struct sched_avg *sa, | ||
1233 | int runnable) | ||
1234 | { | ||
1235 | u64 delta, periods; | ||
1236 | u32 runnable_contrib; | ||
1237 | int delta_w, decayed = 0; | ||
1238 | |||
1239 | delta = now - sa->last_runnable_update; | ||
1240 | /* | ||
1241 | * This should only happen when time goes backwards, which it | ||
1242 | * unfortunately does during sched clock init when we swap over to TSC. | ||
1243 | */ | ||
1244 | if ((s64)delta < 0) { | ||
1245 | sa->last_runnable_update = now; | ||
1246 | return 0; | ||
1247 | } | ||
1248 | |||
1249 | /* | ||
1250 | * Use 1024ns as the unit of measurement since it's a reasonable | ||
1251 | * approximation of 1us and fast to compute. | ||
1252 | */ | ||
1253 | delta >>= 10; | ||
1254 | if (!delta) | ||
1255 | return 0; | ||
1256 | sa->last_runnable_update = now; | ||
1257 | |||
1258 | /* delta_w is the amount already accumulated against our next period */ | ||
1259 | delta_w = sa->runnable_avg_period % 1024; | ||
1260 | if (delta + delta_w >= 1024) { | ||
1261 | /* period roll-over */ | ||
1262 | decayed = 1; | ||
1263 | |||
1264 | /* | ||
1265 | * Now that we know we're crossing a period boundary, figure | ||
1266 | * out how much from delta we need to complete the current | ||
1267 | * period and accrue it. | ||
1268 | */ | ||
1269 | delta_w = 1024 - delta_w; | ||
1270 | if (runnable) | ||
1271 | sa->runnable_avg_sum += delta_w; | ||
1272 | sa->runnable_avg_period += delta_w; | ||
1273 | |||
1274 | delta -= delta_w; | ||
1275 | |||
1276 | /* Figure out how many additional periods this update spans */ | ||
1277 | periods = delta / 1024; | ||
1278 | delta %= 1024; | ||
1279 | |||
1280 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | ||
1281 | periods + 1); | ||
1282 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | ||
1283 | periods + 1); | ||
1284 | |||
1285 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | ||
1286 | runnable_contrib = __compute_runnable_contrib(periods); | ||
1287 | if (runnable) | ||
1288 | sa->runnable_avg_sum += runnable_contrib; | ||
1289 | sa->runnable_avg_period += runnable_contrib; | ||
1290 | } | ||
1291 | |||
1292 | /* Remainder of delta accrued against u_0` */ | ||
1293 | if (runnable) | ||
1294 | sa->runnable_avg_sum += delta; | ||
1295 | sa->runnable_avg_period += delta; | ||
1296 | |||
1297 | return decayed; | ||
1298 | } | ||
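
A toy floating-point model of the recurrence documented above (not the kernel's fixed-point code): every 1024us period the accumulated sums are scaled by y and the new period's contribution is added, so an entity alternating between runnable and sleeping periods converges to a runnable fraction of about one half.

#include <stdio.h>
#include <math.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32);
	double runnable_avg_sum = 0, runnable_avg_period = 0;
	int i;

	for (i = 0; i < 1000; i++) {
		int runnable = i & 1;		/* alternate run/sleep periods */

		runnable_avg_sum = runnable_avg_sum * y + (runnable ? 1024 : 0);
		runnable_avg_period = runnable_avg_period * y + 1024;
	}
	/* ~0.505: a bit above 1/2 since the most recent period was runnable */
	printf("%.3f\n", runnable_avg_sum / runnable_avg_period);
	return 0;
}
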
1299 | |||
1300 | /* Synchronize an entity's decay with its parenting cfs_rq. */ | ||
1301 | static inline u64 __synchronize_entity_decay(struct sched_entity *se) | ||
1302 | { | ||
1303 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1304 | u64 decays = atomic64_read(&cfs_rq->decay_counter); | ||
1305 | |||
1306 | decays -= se->avg.decay_count; | ||
1307 | if (!decays) | ||
1308 | return 0; | ||
1309 | |||
1310 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | ||
1311 | se->avg.decay_count = 0; | ||
1312 | |||
1313 | return decays; | ||
1314 | } | ||
1315 | |||
1316 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1317 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | ||
1318 | int force_update) | ||
1319 | { | ||
1320 | struct task_group *tg = cfs_rq->tg; | ||
1321 | s64 tg_contrib; | ||
1322 | |||
1323 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; | ||
1324 | tg_contrib -= cfs_rq->tg_load_contrib; | ||
1325 | |||
1326 | if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { | ||
1327 | atomic64_add(tg_contrib, &tg->load_avg); | ||
1328 | cfs_rq->tg_load_contrib += tg_contrib; | ||
1329 | } | ||
1330 | } | ||
1331 | |||
1332 | /* | ||
1333 | * Aggregate cfs_rq runnable averages into an equivalent task_group | ||
1334 | * representation for computing load contributions. | ||
1335 | */ | ||
1336 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, | ||
1337 | struct cfs_rq *cfs_rq) | ||
1338 | { | ||
1339 | struct task_group *tg = cfs_rq->tg; | ||
1340 | long contrib; | ||
1341 | |||
1342 | /* The fraction of a cpu used by this cfs_rq */ | ||
1343 | contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, | ||
1344 | sa->runnable_avg_period + 1); | ||
1345 | contrib -= cfs_rq->tg_runnable_contrib; | ||
1346 | |||
1347 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | ||
1348 | atomic_add(contrib, &tg->runnable_avg); | ||
1349 | cfs_rq->tg_runnable_contrib += contrib; | ||
1350 | } | ||
1351 | } | ||
1352 | |||
1353 | static inline void __update_group_entity_contrib(struct sched_entity *se) | ||
1354 | { | ||
1355 | struct cfs_rq *cfs_rq = group_cfs_rq(se); | ||
1356 | struct task_group *tg = cfs_rq->tg; | ||
1357 | int runnable_avg; | ||
1358 | |||
1359 | u64 contrib; | ||
1360 | |||
1361 | contrib = cfs_rq->tg_load_contrib * tg->shares; | ||
1362 | se->avg.load_avg_contrib = div64_u64(contrib, | ||
1363 | atomic64_read(&tg->load_avg) + 1); | ||
1364 | |||
1365 | /* | ||
1366 | * For group entities we need to compute a correction term in the case | ||
1367 | * that they are consuming <1 cpu so that we would contribute the same | ||
1368 | * load as a task of equal weight. | ||
1369 | * | ||
1370 | * Explicitly co-ordinating this measurement would be expensive, but | ||
1371 | * fortunately the sum of each cpu's contribution forms a usable | ||
1372 | * lower-bound on the true value. | ||
1373 | * | ||
1374 | * Consider the aggregate of 2 contributions. Either they are disjoint | ||
1375 | * (and the sum represents the true value) or they overlap and we are | ||
1376 | * understating by the aggregate of their overlap. | ||
1377 | * | ||
1378 | * Extending this to N cpus, for a given overlap, the maximum amount we | ||
1379 | * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of | ||
1380 | * cpus that overlap for this interval and w_i is the interval width. | ||
1381 | * | ||
1382 | * On a small machine, the first term is well-bounded, which bounds the | ||
1383 | * total error since w_i is a subset of the period. Whereas on a | ||
1384 | * larger machine, while this first term can be larger, a w_i of | ||
1385 | * consequential size is guaranteed to see n_i*w_i quickly converge to | ||
1386 | * our upper bound of 1-cpu. | ||
1387 | */ | ||
1388 | runnable_avg = atomic_read(&tg->runnable_avg); | ||
1389 | if (runnable_avg < NICE_0_LOAD) { | ||
1390 | se->avg.load_avg_contrib *= runnable_avg; | ||
1391 | se->avg.load_avg_contrib >>= NICE_0_SHIFT; | ||
1392 | } | ||
1393 | } | ||
1394 | #else | ||
1395 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | ||
1396 | int force_update) {} | ||
1397 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, | ||
1398 | struct cfs_rq *cfs_rq) {} | ||
1399 | static inline void __update_group_entity_contrib(struct sched_entity *se) {} | ||
1400 | #endif | ||
1401 | |||
1402 | static inline void __update_task_entity_contrib(struct sched_entity *se) | ||
1403 | { | ||
1404 | u32 contrib; | ||
1405 | |||
1406 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
1407 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | ||
1408 | contrib /= (se->avg.runnable_avg_period + 1); | ||
1409 | se->avg.load_avg_contrib = scale_load(contrib); | ||
1410 | } | ||
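
For a feel of the numbers: a nice-0 task (scaled weight 1024) whose runnable_avg_sum is roughly half of its runnable_avg_period contributes about half its weight. The figures below are made up for illustration; only the arithmetic mirrors the function above.

#include <stdio.h>

int main(void)
{
	unsigned int weight = 1024;			/* nice-0, scaled down */
	unsigned int runnable_avg_sum = 23871;		/* ~50% runnable */
	unsigned int runnable_avg_period = 47742;
	unsigned int contrib;

	/* same arithmetic as __update_task_entity_contrib() */
	contrib = runnable_avg_sum * weight / (runnable_avg_period + 1);
	printf("load_avg_contrib = %u\n", contrib);	/* prints 511 */
	return 0;
}
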
1411 | |||
1412 | /* Compute the current contribution to load_avg by se, return any delta */ | ||
1413 | static long __update_entity_load_avg_contrib(struct sched_entity *se) | ||
1414 | { | ||
1415 | long old_contrib = se->avg.load_avg_contrib; | ||
1416 | |||
1417 | if (entity_is_task(se)) { | ||
1418 | __update_task_entity_contrib(se); | ||
1419 | } else { | ||
1420 | __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); | ||
1421 | __update_group_entity_contrib(se); | ||
1422 | } | ||
1423 | |||
1424 | return se->avg.load_avg_contrib - old_contrib; | ||
1425 | } | ||
1426 | |||
1427 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | ||
1428 | long load_contrib) | ||
1429 | { | ||
1430 | if (likely(load_contrib < cfs_rq->blocked_load_avg)) | ||
1431 | cfs_rq->blocked_load_avg -= load_contrib; | ||
1432 | else | ||
1433 | cfs_rq->blocked_load_avg = 0; | ||
1434 | } | ||
1435 | |||
1436 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | ||
1437 | |||
1438 | /* Update a sched_entity's runnable average */ | ||
1439 | static inline void update_entity_load_avg(struct sched_entity *se, | ||
1440 | int update_cfs_rq) | ||
1441 | { | ||
1442 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1443 | long contrib_delta; | ||
1444 | u64 now; | ||
1445 | |||
1446 | /* | ||
1447 | * For a group entity we need to use their owned cfs_rq_clock_task() in | ||
1448 | * case they are the parent of a throttled hierarchy. | ||
1449 | */ | ||
1450 | if (entity_is_task(se)) | ||
1451 | now = cfs_rq_clock_task(cfs_rq); | ||
1452 | else | ||
1453 | now = cfs_rq_clock_task(group_cfs_rq(se)); | ||
1454 | |||
1455 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | ||
1456 | return; | ||
1457 | |||
1458 | contrib_delta = __update_entity_load_avg_contrib(se); | ||
1459 | |||
1460 | if (!update_cfs_rq) | ||
1461 | return; | ||
1462 | |||
1463 | if (se->on_rq) | ||
1464 | cfs_rq->runnable_load_avg += contrib_delta; | ||
1465 | else | ||
1466 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | ||
1467 | } | ||
1468 | |||
1469 | /* | ||
1470 | * Decay the load contributed by all blocked children and account this so that | ||
1471 | * their contribution may be appropriately discounted when they wake up. | ||
1472 | */ | ||
1473 | static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) | ||
1474 | { | ||
1475 | u64 now = cfs_rq_clock_task(cfs_rq) >> 20; | ||
1476 | u64 decays; | ||
1477 | |||
1478 | decays = now - cfs_rq->last_decay; | ||
1479 | if (!decays && !force_update) | ||
1480 | return; | ||
1481 | |||
1482 | if (atomic64_read(&cfs_rq->removed_load)) { | ||
1483 | u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); | ||
1484 | subtract_blocked_load_contrib(cfs_rq, removed_load); | ||
1485 | } | ||
1486 | |||
1487 | if (decays) { | ||
1488 | cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, | ||
1489 | decays); | ||
1490 | atomic64_add(decays, &cfs_rq->decay_counter); | ||
1491 | cfs_rq->last_decay = now; | ||
1492 | } | ||
1493 | |||
1494 | __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); | ||
1495 | } | ||
1496 | |||
1497 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | ||
1498 | { | ||
1499 | __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); | ||
1500 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | ||
1501 | } | ||
1502 | |||
1503 | /* Add the load generated by se into cfs_rq's child load-average */ | ||
1504 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1505 | struct sched_entity *se, | ||
1506 | int wakeup) | ||
1507 | { | ||
1508 | /* | ||
1509 | * We track migrations using entity decay_count <= 0, on a wake-up | ||
1510 | * migration we use a negative decay count to track the remote decays | ||
1511 | * accumulated while sleeping. | ||
1512 | */ | ||
1513 | if (unlikely(se->avg.decay_count <= 0)) { | ||
1514 | se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; | ||
1515 | if (se->avg.decay_count) { | ||
1516 | /* | ||
1517 | * In a wake-up migration we have to approximate the | ||
1518 | * time sleeping. This is because we can't synchronize | ||
1519 | * clock_task between the two cpus, and it is not | ||
1520 | * guaranteed to be read-safe. Instead, we can | ||
1521 | * approximate this using our carried decays, which are | ||
1522 | * explicitly atomically readable. | ||
1523 | */ | ||
1524 | se->avg.last_runnable_update -= (-se->avg.decay_count) | ||
1525 | << 20; | ||
1526 | update_entity_load_avg(se, 0); | ||
1527 | /* Indicate that we're now synchronized and on-rq */ | ||
1528 | se->avg.decay_count = 0; | ||
1529 | } | ||
1530 | wakeup = 0; | ||
1531 | } else { | ||
1532 | __synchronize_entity_decay(se); | ||
1533 | } | ||
1534 | |||
1535 | /* migrated tasks did not contribute to our blocked load */ | ||
1536 | if (wakeup) { | ||
1537 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); | ||
1538 | update_entity_load_avg(se, 0); | ||
1539 | } | ||
1540 | |||
1541 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | ||
1542 | /* we force update consideration on load-balancer moves */ | ||
1543 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | ||
1544 | } | ||
1545 | |||
1546 | /* | ||
1547 | * Remove se's load from this cfs_rq child load-average, if the entity is | ||
1548 | * transitioning to a blocked state we track its projected decay using | ||
1549 | * blocked_load_avg. | ||
1550 | */ | ||
1551 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1552 | struct sched_entity *se, | ||
1553 | int sleep) | ||
1554 | { | ||
1555 | update_entity_load_avg(se, 1); | ||
1556 | /* we force update consideration on load-balancer moves */ | ||
1557 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | ||
1558 | |||
1559 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | ||
1560 | if (sleep) { | ||
1561 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | ||
1562 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | ||
1563 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ | ||
1564 | } | ||
1565 | #else | ||
1566 | static inline void update_entity_load_avg(struct sched_entity *se, | ||
1567 | int update_cfs_rq) {} | ||
1568 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} | ||
1569 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1570 | struct sched_entity *se, | ||
1571 | int wakeup) {} | ||
1572 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1573 | struct sched_entity *se, | ||
1574 | int sleep) {} | ||
1575 | static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, | ||
1576 | int force_update) {} | ||
1577 | #endif | ||
973 | 1578 | ||
974 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1579 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
975 | { | 1580 | { |
@@ -1096,7 +1701,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1096 | * Update run-time statistics of the 'current'. | 1701 | * Update run-time statistics of the 'current'. |
1097 | */ | 1702 | */ |
1098 | update_curr(cfs_rq); | 1703 | update_curr(cfs_rq); |
1099 | update_cfs_load(cfs_rq, 0); | 1704 | enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); |
1100 | account_entity_enqueue(cfs_rq, se); | 1705 | account_entity_enqueue(cfs_rq, se); |
1101 | update_cfs_shares(cfs_rq); | 1706 | update_cfs_shares(cfs_rq); |
1102 | 1707 | ||
@@ -1171,6 +1776,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1171 | * Update run-time statistics of the 'current'. | 1776 | * Update run-time statistics of the 'current'. |
1172 | */ | 1777 | */ |
1173 | update_curr(cfs_rq); | 1778 | update_curr(cfs_rq); |
1779 | dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); | ||
1174 | 1780 | ||
1175 | update_stats_dequeue(cfs_rq, se); | 1781 | update_stats_dequeue(cfs_rq, se); |
1176 | if (flags & DEQUEUE_SLEEP) { | 1782 | if (flags & DEQUEUE_SLEEP) { |
@@ -1191,7 +1797,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1191 | if (se != cfs_rq->curr) | 1797 | if (se != cfs_rq->curr) |
1192 | __dequeue_entity(cfs_rq, se); | 1798 | __dequeue_entity(cfs_rq, se); |
1193 | se->on_rq = 0; | 1799 | se->on_rq = 0; |
1194 | update_cfs_load(cfs_rq, 0); | ||
1195 | account_entity_dequeue(cfs_rq, se); | 1800 | account_entity_dequeue(cfs_rq, se); |
1196 | 1801 | ||
1197 | /* | 1802 | /* |
@@ -1340,6 +1945,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
1340 | update_stats_wait_start(cfs_rq, prev); | 1945 | update_stats_wait_start(cfs_rq, prev); |
1341 | /* Put 'current' back into the tree. */ | 1946 | /* Put 'current' back into the tree. */ |
1342 | __enqueue_entity(cfs_rq, prev); | 1947 | __enqueue_entity(cfs_rq, prev); |
1948 | /* in !on_rq case, update occurred at dequeue */ | ||
1949 | update_entity_load_avg(prev, 1); | ||
1343 | } | 1950 | } |
1344 | cfs_rq->curr = NULL; | 1951 | cfs_rq->curr = NULL; |
1345 | } | 1952 | } |
@@ -1353,9 +1960,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
1353 | update_curr(cfs_rq); | 1960 | update_curr(cfs_rq); |
1354 | 1961 | ||
1355 | /* | 1962 | /* |
1356 | * Update share accounting for long-running entities. | 1963 | * Ensure that runnable average is periodically updated. |
1357 | */ | 1964 | */ |
1358 | update_entity_shares_tick(cfs_rq); | 1965 | update_entity_load_avg(curr, 1); |
1966 | update_cfs_rq_blocked_load(cfs_rq, 1); | ||
1359 | 1967 | ||
1360 | #ifdef CONFIG_SCHED_HRTICK | 1968 | #ifdef CONFIG_SCHED_HRTICK |
1361 | /* | 1969 | /* |
@@ -1448,6 +2056,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | |||
1448 | return &tg->cfs_bandwidth; | 2056 | return &tg->cfs_bandwidth; |
1449 | } | 2057 | } |
1450 | 2058 | ||
2059 | /* rq->clock_task normalized against any time this cfs_rq has spent throttled */ | ||
2060 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | ||
2061 | { | ||
2062 | if (unlikely(cfs_rq->throttle_count)) | ||
2063 | return cfs_rq->throttled_clock_task; | ||
2064 | |||
2065 | return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; | ||
2066 | } | ||
2067 | |||
1451 | /* returns 0 on failure to allocate runtime */ | 2068 | /* returns 0 on failure to allocate runtime */ |
1452 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 2069 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
1453 | { | 2070 | { |
@@ -1592,14 +2209,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) | |||
1592 | cfs_rq->throttle_count--; | 2209 | cfs_rq->throttle_count--; |
1593 | #ifdef CONFIG_SMP | 2210 | #ifdef CONFIG_SMP |
1594 | if (!cfs_rq->throttle_count) { | 2211 | if (!cfs_rq->throttle_count) { |
1595 | u64 delta = rq->clock_task - cfs_rq->load_stamp; | 2212 | /* adjust cfs_rq_clock_task() */ |
1596 | 2213 | cfs_rq->throttled_clock_task_time += rq->clock_task - | |
1597 | /* leaving throttled state, advance shares averaging windows */ | 2214 | cfs_rq->throttled_clock_task; |
1598 | cfs_rq->load_stamp += delta; | ||
1599 | cfs_rq->load_last += delta; | ||
1600 | |||
1601 | /* update entity weight now that we are on_rq again */ | ||
1602 | update_cfs_shares(cfs_rq); | ||
1603 | } | 2215 | } |
1604 | #endif | 2216 | #endif |
1605 | 2217 | ||
@@ -1611,9 +2223,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) | |||
1611 | struct rq *rq = data; | 2223 | struct rq *rq = data; |
1612 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | 2224 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; |
1613 | 2225 | ||
1614 | /* group is entering throttled state, record last load */ | 2226 | /* group is entering throttled state, stop time */ |
1615 | if (!cfs_rq->throttle_count) | 2227 | if (!cfs_rq->throttle_count) |
1616 | update_cfs_load(cfs_rq, 0); | 2228 | cfs_rq->throttled_clock_task = rq->clock_task; |
1617 | cfs_rq->throttle_count++; | 2229 | cfs_rq->throttle_count++; |
1618 | 2230 | ||
1619 | return 0; | 2231 | return 0; |
@@ -1628,7 +2240,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
1628 | 2240 | ||
1629 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | 2241 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; |
1630 | 2242 | ||
1631 | /* account load preceding throttle */ | 2243 | /* freeze hierarchy runnable averages while throttled */ |
1632 | rcu_read_lock(); | 2244 | rcu_read_lock(); |
1633 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); | 2245 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); |
1634 | rcu_read_unlock(); | 2246 | rcu_read_unlock(); |
@@ -1652,7 +2264,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
1652 | rq->nr_running -= task_delta; | 2264 | rq->nr_running -= task_delta; |
1653 | 2265 | ||
1654 | cfs_rq->throttled = 1; | 2266 | cfs_rq->throttled = 1; |
1655 | cfs_rq->throttled_timestamp = rq->clock; | 2267 | cfs_rq->throttled_clock = rq->clock; |
1656 | raw_spin_lock(&cfs_b->lock); | 2268 | raw_spin_lock(&cfs_b->lock); |
1657 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 2269 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); |
1658 | raw_spin_unlock(&cfs_b->lock); | 2270 | raw_spin_unlock(&cfs_b->lock); |
@@ -1670,10 +2282,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
1670 | 2282 | ||
1671 | cfs_rq->throttled = 0; | 2283 | cfs_rq->throttled = 0; |
1672 | raw_spin_lock(&cfs_b->lock); | 2284 | raw_spin_lock(&cfs_b->lock); |
1673 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; | 2285 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; |
1674 | list_del_rcu(&cfs_rq->throttled_list); | 2286 | list_del_rcu(&cfs_rq->throttled_list); |
1675 | raw_spin_unlock(&cfs_b->lock); | 2287 | raw_spin_unlock(&cfs_b->lock); |
1676 | cfs_rq->throttled_timestamp = 0; | ||
1677 | 2288 | ||
1678 | update_rq_clock(rq); | 2289 | update_rq_clock(rq); |
1679 | /* update hierarchical throttle state */ | 2290 | /* update hierarchical throttle state */ |
@@ -2073,8 +2684,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq) | |||
2073 | } | 2684 | } |
2074 | 2685 | ||
2075 | #else /* CONFIG_CFS_BANDWIDTH */ | 2686 | #else /* CONFIG_CFS_BANDWIDTH */ |
2076 | static __always_inline | 2687 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) |
2077 | void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} | 2688 | { |
2689 | return rq_of(cfs_rq)->clock_task; | ||
2690 | } | ||
2691 | |||
2692 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
2693 | unsigned long delta_exec) {} | ||
2078 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 2694 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
2079 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 2695 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} |
2080 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 2696 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
@@ -2207,12 +2823,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2207 | if (cfs_rq_throttled(cfs_rq)) | 2823 | if (cfs_rq_throttled(cfs_rq)) |
2208 | break; | 2824 | break; |
2209 | 2825 | ||
2210 | update_cfs_load(cfs_rq, 0); | ||
2211 | update_cfs_shares(cfs_rq); | 2826 | update_cfs_shares(cfs_rq); |
2827 | update_entity_load_avg(se, 1); | ||
2212 | } | 2828 | } |
2213 | 2829 | ||
2214 | if (!se) | 2830 | if (!se) { |
2831 | update_rq_runnable_avg(rq, rq->nr_running); | ||
2215 | inc_nr_running(rq); | 2832 | inc_nr_running(rq); |
2833 | } | ||
2216 | hrtick_update(rq); | 2834 | hrtick_update(rq); |
2217 | } | 2835 | } |
2218 | 2836 | ||
@@ -2266,12 +2884,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2266 | if (cfs_rq_throttled(cfs_rq)) | 2884 | if (cfs_rq_throttled(cfs_rq)) |
2267 | break; | 2885 | break; |
2268 | 2886 | ||
2269 | update_cfs_load(cfs_rq, 0); | ||
2270 | update_cfs_shares(cfs_rq); | 2887 | update_cfs_shares(cfs_rq); |
2888 | update_entity_load_avg(se, 1); | ||
2271 | } | 2889 | } |
2272 | 2890 | ||
2273 | if (!se) | 2891 | if (!se) { |
2274 | dec_nr_running(rq); | 2892 | dec_nr_running(rq); |
2893 | update_rq_runnable_avg(rq, 1); | ||
2894 | } | ||
2275 | hrtick_update(rq); | 2895 | hrtick_update(rq); |
2276 | } | 2896 | } |
2277 | 2897 | ||
@@ -2700,7 +3320,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2700 | int prev_cpu = task_cpu(p); | 3320 | int prev_cpu = task_cpu(p); |
2701 | int new_cpu = cpu; | 3321 | int new_cpu = cpu; |
2702 | int want_affine = 0; | 3322 | int want_affine = 0; |
2703 | int want_sd = 1; | ||
2704 | int sync = wake_flags & WF_SYNC; | 3323 | int sync = wake_flags & WF_SYNC; |
2705 | 3324 | ||
2706 | if (p->nr_cpus_allowed == 1) | 3325 | if (p->nr_cpus_allowed == 1) |
@@ -2718,48 +3337,21 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2718 | continue; | 3337 | continue; |
2719 | 3338 | ||
2720 | /* | 3339 | /* |
2721 | * If power savings logic is enabled for a domain, see if we | ||
2722 | * are not overloaded, if so, don't balance wider. | ||
2723 | */ | ||
2724 | if (tmp->flags & (SD_PREFER_LOCAL)) { | ||
2725 | unsigned long power = 0; | ||
2726 | unsigned long nr_running = 0; | ||
2727 | unsigned long capacity; | ||
2728 | int i; | ||
2729 | |||
2730 | for_each_cpu(i, sched_domain_span(tmp)) { | ||
2731 | power += power_of(i); | ||
2732 | nr_running += cpu_rq(i)->cfs.nr_running; | ||
2733 | } | ||
2734 | |||
2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | ||
2736 | |||
2737 | if (nr_running < capacity) | ||
2738 | want_sd = 0; | ||
2739 | } | ||
2740 | |||
2741 | /* | ||
2742 | * If both cpu and prev_cpu are part of this domain, | 3340 | * If both cpu and prev_cpu are part of this domain, |
2743 | * cpu is a valid SD_WAKE_AFFINE target. | 3341 | * cpu is a valid SD_WAKE_AFFINE target. |
2744 | */ | 3342 | */ |
2745 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && | 3343 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && |
2746 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { | 3344 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { |
2747 | affine_sd = tmp; | 3345 | affine_sd = tmp; |
2748 | want_affine = 0; | ||
2749 | } | ||
2750 | |||
2751 | if (!want_sd && !want_affine) | ||
2752 | break; | 3346 | break; |
3347 | } | ||
2753 | 3348 | ||
2754 | if (!(tmp->flags & sd_flag)) | 3349 | if (tmp->flags & sd_flag) |
2755 | continue; | ||
2756 | |||
2757 | if (want_sd) | ||
2758 | sd = tmp; | 3350 | sd = tmp; |
2759 | } | 3351 | } |
2760 | 3352 | ||
2761 | if (affine_sd) { | 3353 | if (affine_sd) { |
2762 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 3354 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) |
2763 | prev_cpu = cpu; | 3355 | prev_cpu = cpu; |
2764 | 3356 | ||
2765 | new_cpu = select_idle_sibling(p, prev_cpu); | 3357 | new_cpu = select_idle_sibling(p, prev_cpu); |
@@ -2809,6 +3401,37 @@ unlock: | |||
2809 | 3401 | ||
2810 | return new_cpu; | 3402 | return new_cpu; |
2811 | } | 3403 | } |
3404 | |||
3405 | /* | ||
3406 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
3407 | * removed when useful for applications beyond shares distribution (e.g. | ||
3408 | * load-balance). | ||
3409 | */ | ||
3410 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
3411 | /* | ||
3412 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | ||
3413 | * cfs_rq_of(p) references at time of call are still valid and identify the | ||
3414 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no | ||
3415 | * other assumptions, including the state of rq->lock, should be made. | ||
3416 | */ | ||
3417 | static void | ||
3418 | migrate_task_rq_fair(struct task_struct *p, int next_cpu) | ||
3419 | { | ||
3420 | struct sched_entity *se = &p->se; | ||
3421 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
3422 | |||
3423 | /* | ||
3424 | * Load tracking: accumulate removed load so that it can be processed | ||
3425 | * when we next update owning cfs_rq under rq->lock. Tasks contribute | ||
3426 | * to blocked load iff they have a positive decay-count. It can never | ||
3427 | * be negative here since on-rq tasks have decay-count == 0. | ||
3428 | */ | ||
3429 | if (se->avg.decay_count) { | ||
3430 | se->avg.decay_count = -__synchronize_entity_decay(se); | ||
3431 | atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); | ||
3432 | } | ||
3433 | } | ||
3434 | #endif | ||
2812 | #endif /* CONFIG_SMP */ | 3435 | #endif /* CONFIG_SMP */ |
2813 | 3436 | ||
2814 | static unsigned long | 3437 | static unsigned long |
@@ -2935,7 +3558,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
2935 | * Batch and idle tasks do not preempt non-idle tasks (their preemption | 3558 | * Batch and idle tasks do not preempt non-idle tasks (their preemption |
2936 | * is driven by the tick): | 3559 | * is driven by the tick): |
2937 | */ | 3560 | */ |
2938 | if (unlikely(p->policy != SCHED_NORMAL)) | 3561 | if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) |
2939 | return; | 3562 | return; |
2940 | 3563 | ||
2941 | find_matching_se(&se, &pse); | 3564 | find_matching_se(&se, &pse); |
@@ -3061,8 +3684,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
3061 | 3684 | ||
3062 | #ifdef CONFIG_SMP | 3685 | #ifdef CONFIG_SMP |
3063 | /************************************************** | 3686 | /************************************************** |
3064 | * Fair scheduling class load-balancing methods: | 3687 | * Fair scheduling class load-balancing methods. |
3065 | */ | 3688 | * |
3689 | * BASICS | ||
3690 | * | ||
3691 | * The purpose of load-balancing is to achieve the same basic fairness the | ||
3692 | * per-cpu scheduler provides, namely to provide a proportional amount of compute | ||
3693 | * time to each task. This is expressed in the following equation: | ||
3694 | * | ||
3695 | * W_i,n/P_i == W_j,n/P_j for all i,j (1) | ||
3696 | * | ||
3697 | * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight | ||
3698 | * W_i,0 is defined as: | ||
3699 | * | ||
3700 | * W_i,0 = \Sum_j w_i,j (2) | ||
3701 | * | ||
3702 | * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight | ||
3703 | * is derived from the nice value as per prio_to_weight[]. | ||
3704 | * | ||
3705 | * The weight average is an exponential decay average of the instantaneous | ||
3706 | * weight: | ||
3707 | * | ||
3708 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) | ||
3709 | * | ||
3710 | * P_i is the cpu power (or compute capacity) of cpu i, typically it is the | ||
3711 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it | ||
3712 | * can also include other factors [XXX]. | ||
3713 | * | ||
3714 | * To achieve this balance we define a measure of imbalance which follows | ||
3715 | * directly from (1): | ||
3716 | * | ||
3717 | * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) | ||
3718 | * | ||
3719 | * We then move tasks around to minimize the imbalance. In the continuous | ||
3720 | * function space it is obvious this converges, in the discrete case we get | ||
3721 | * a few fun cases generally called infeasible weight scenarios. | ||
3722 | * | ||
3723 | * [XXX expand on: | ||
3724 | * - infeasible weights; | ||
3725 | * - local vs global optima in the discrete case. ] | ||
3726 | * | ||
3727 | * | ||
3728 | * SCHED DOMAINS | ||
3729 | * | ||
3730 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) | ||
3731 | * for all i,j solution, we create a tree of cpus that follows the hardware | ||
3732 | * topology where each level pairs two lower groups (or better). This results | ||
3733 | * in O(log n) layers. Furthermore we reduce the number of cpus going up the | ||
3734 | * tree to only the first of the previous level and we decrease the frequency | ||
3735 | * of load-balance at each level inversely proportional to the number of cpus in | ||
3736 | * the groups. | ||
3737 | * | ||
3738 | * This yields: | ||
3739 | * | ||
3740 | * log_2 n 1 n | ||
3741 | * \Sum { --- * --- * 2^i } = O(n) (5) | ||
3742 | * i = 0 2^i 2^i | ||
3743 | * `- size of each group | ||
3744 | * | | `- number of cpus doing load-balance | ||
3745 | * | `- freq | ||
3746 | * `- sum over all levels | ||
3747 | * | ||
3748 | * Coupled with a limit on how many tasks we can migrate every balance pass, | ||
3749 | * this makes (5) the runtime complexity of the balancer. | ||
3750 | * | ||
3751 | * An important property here is that each CPU is still (indirectly) connected | ||
3752 | * to every other cpu in at most O(log n) steps: | ||
3753 | * | ||
3754 | * The adjacency matrix of the resulting graph is given by: | ||
3755 | * | ||
3756 | * log_2 n | ||
3757 | * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) | ||
3758 | * k = 0 | ||
3759 | * | ||
3760 | * And you'll find that: | ||
3761 | * | ||
3762 | * A^(log_2 n)_i,j != 0 for all i,j (7) | ||
3763 | * | ||
3764 | * Showing there's indeed a path between every cpu in at most O(log n) steps. | ||
3765 | * The task movement gives a factor of O(m), giving a convergence complexity | ||
3766 | * of: | ||
3767 | * | ||
3768 | * O(nm log n), n := nr_cpus, m := nr_tasks (8) | ||
3769 | * | ||
3770 | * | ||
3771 | * WORK CONSERVING | ||
3772 | * | ||
3773 | * In order to avoid CPUs going idle while there's still work to do, new idle | ||
3774 | * balancing is more aggressive and has the newly idle cpu iterate up the domain | ||
3775 | * tree itself instead of relying on other CPUs to bring it work. | ||
3776 | * | ||
3777 | * This adds some complexity to both (5) and (8) but it reduces the total idle | ||
3778 | * time. | ||
3779 | * | ||
3780 | * [XXX more?] | ||
3781 | * | ||
3782 | * | ||
3783 | * CGROUPS | ||
3784 | * | ||
3785 | * Cgroups make a horror show out of (2), instead of a simple sum we get: | ||
3786 | * | ||
3787 | * s_k,i | ||
3788 | * W_i,0 = \Sum_j \Prod_k w_k * ----- (9) | ||
3789 | * S_k | ||
3790 | * | ||
3791 | * Where | ||
3792 | * | ||
3793 | * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) | ||
3794 | * | ||
3795 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. | ||
3796 | * | ||
3797 | * The big problem is S_k; it's a global sum needed to compute a local (W_i) | ||
3798 | * property. | ||
3799 | * | ||
3800 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that | ||
3801 | * rewrite all of this once again.] | ||
3802 | */ | ||
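
A tiny numeric illustration of imbalance measure (4), with made-up weights and equal cpu power (not kernel code): three nice-0 tasks on one cpu and one on the other give an imbalance of 2048, and moving a single task of weight 1024 drives it to zero. Build with -lm for fmax()/fmin().

#include <stdio.h>
#include <math.h>

static double imb(double wi, double wj, double avg)
{
	return fmax(avg, wi) - fmin(avg, wj);		/* equation (4) */
}

int main(void)
{
	double W0 = 3 * 1024, W1 = 1 * 1024;		/* W_i,0 per (2) */
	double P0 = 1.0, P1 = 1.0;			/* equal cpu power */
	double avg = (W0 / P0 + W1 / P1) / 2;

	printf("imb = %.0f\n", imb(W0 / P0, W1 / P1, avg));	/* 2048 */

	W0 -= 1024; W1 += 1024;				/* move one nice-0 task */
	avg = (W0 / P0 + W1 / P1) / 2;
	printf("imb = %.0f\n", imb(W0 / P0, W1 / P1, avg));	/* 0 */
	return 0;
}
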
3066 | 3803 | ||
3067 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 3804 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
3068 | 3805 | ||
@@ -3328,52 +4065,58 @@ next: | |||
3328 | /* | 4065 | /* |
3329 | * update tg->load_weight by folding this cpu's load_avg | 4066 | * update tg->load_weight by folding this cpu's load_avg |
3330 | */ | 4067 | */ |
3331 | static int update_shares_cpu(struct task_group *tg, int cpu) | 4068 | static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) |
3332 | { | 4069 | { |
3333 | struct cfs_rq *cfs_rq; | 4070 | struct sched_entity *se = tg->se[cpu]; |
3334 | unsigned long flags; | 4071 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; |
3335 | struct rq *rq; | ||
3336 | |||
3337 | if (!tg->se[cpu]) | ||
3338 | return 0; | ||
3339 | |||
3340 | rq = cpu_rq(cpu); | ||
3341 | cfs_rq = tg->cfs_rq[cpu]; | ||
3342 | |||
3343 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
3344 | |||
3345 | update_rq_clock(rq); | ||
3346 | update_cfs_load(cfs_rq, 1); | ||
3347 | 4072 | ||
3348 | /* | 4073 | /* throttled entities do not contribute to load */ |
3349 | * We need to update shares after updating tg->load_weight in | 4074 | if (throttled_hierarchy(cfs_rq)) |
3350 | * order to adjust the weight of groups with long running tasks. | 4075 | return; |
3351 | */ | ||
3352 | update_cfs_shares(cfs_rq); | ||
3353 | 4076 | ||
3354 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 4077 | update_cfs_rq_blocked_load(cfs_rq, 1); |
3355 | 4078 | ||
3356 | return 0; | 4079 | if (se) { |
4080 | update_entity_load_avg(se, 1); | ||
4081 | /* | ||
4082 | * We pivot on our runnable average having decayed to zero for | ||
4083 | * list removal. This generally implies that all our children | ||
4084 | * have also been removed (modulo rounding error or bandwidth | ||
4085 | * control); however, such cases are rare and we can fix these | ||
4086 | * at enqueue. | ||
4087 | * | ||
4088 | * TODO: fix up out-of-order children on enqueue. | ||
4089 | */ | ||
4090 | if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) | ||
4091 | list_del_leaf_cfs_rq(cfs_rq); | ||
4092 | } else { | ||
4093 | struct rq *rq = rq_of(cfs_rq); | ||
4094 | update_rq_runnable_avg(rq, rq->nr_running); | ||
4095 | } | ||
3357 | } | 4096 | } |
3358 | 4097 | ||
3359 | static void update_shares(int cpu) | 4098 | static void update_blocked_averages(int cpu) |
3360 | { | 4099 | { |
3361 | struct cfs_rq *cfs_rq; | ||
3362 | struct rq *rq = cpu_rq(cpu); | 4100 | struct rq *rq = cpu_rq(cpu); |
4101 | struct cfs_rq *cfs_rq; | ||
4102 | unsigned long flags; | ||
3363 | 4103 | ||
3364 | rcu_read_lock(); | 4104 | raw_spin_lock_irqsave(&rq->lock, flags); |
4105 | update_rq_clock(rq); | ||
3365 | /* | 4106 | /* |
3366 | * Iterates the task_group tree in a bottom up fashion, see | 4107 | * Iterates the task_group tree in a bottom up fashion, see |
3367 | * list_add_leaf_cfs_rq() for details. | 4108 | * list_add_leaf_cfs_rq() for details. |
3368 | */ | 4109 | */ |
3369 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 4110 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
3370 | /* throttled entities do not contribute to load */ | 4111 | /* |
3371 | if (throttled_hierarchy(cfs_rq)) | 4112 | * Note: We may want to consider periodically releasing |
3372 | continue; | 4113 | * rq->lock about these updates so that creating many task |
3373 | 4114 | * groups does not result in continually extending hold time. | |
3374 | update_shares_cpu(cfs_rq->tg, cpu); | 4115 | */ |
4116 | __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); | ||
3375 | } | 4117 | } |
3376 | rcu_read_unlock(); | 4118 | |
4119 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
3377 | } | 4120 | } |
3378 | 4121 | ||
3379 | /* | 4122 | /* |
@@ -3425,7 +4168,7 @@ static unsigned long task_h_load(struct task_struct *p) | |||
3425 | return load; | 4168 | return load; |
3426 | } | 4169 | } |
3427 | #else | 4170 | #else |
3428 | static inline void update_shares(int cpu) | 4171 | static inline void update_blocked_averages(int cpu) |
3429 | { | 4172 | { |
3430 | } | 4173 | } |
3431 | 4174 | ||
@@ -4295,7 +5038,7 @@ redo: | |||
4295 | goto out_balanced; | 5038 | goto out_balanced; |
4296 | } | 5039 | } |
4297 | 5040 | ||
4298 | BUG_ON(busiest == this_rq); | 5041 | BUG_ON(busiest == env.dst_rq); |
4299 | 5042 | ||
4300 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 5043 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
4301 | 5044 | ||
@@ -4316,7 +5059,7 @@ redo: | |||
4316 | update_h_load(env.src_cpu); | 5059 | update_h_load(env.src_cpu); |
4317 | more_balance: | 5060 | more_balance: |
4318 | local_irq_save(flags); | 5061 | local_irq_save(flags); |
4319 | double_rq_lock(this_rq, busiest); | 5062 | double_rq_lock(env.dst_rq, busiest); |
4320 | 5063 | ||
4321 | /* | 5064 | /* |
4322 | * cur_ld_moved - load moved in current iteration | 5065 | * cur_ld_moved - load moved in current iteration |
@@ -4324,7 +5067,7 @@ more_balance: | |||
4324 | */ | 5067 | */ |
4325 | cur_ld_moved = move_tasks(&env); | 5068 | cur_ld_moved = move_tasks(&env); |
4326 | ld_moved += cur_ld_moved; | 5069 | ld_moved += cur_ld_moved; |
4327 | double_rq_unlock(this_rq, busiest); | 5070 | double_rq_unlock(env.dst_rq, busiest); |
4328 | local_irq_restore(flags); | 5071 | local_irq_restore(flags); |
4329 | 5072 | ||
4330 | if (env.flags & LBF_NEED_BREAK) { | 5073 | if (env.flags & LBF_NEED_BREAK) { |
@@ -4360,8 +5103,7 @@ more_balance: | |||
4360 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && | 5103 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && |
4361 | lb_iterations++ < max_lb_iterations) { | 5104 | lb_iterations++ < max_lb_iterations) { |
4362 | 5105 | ||
4363 | this_rq = cpu_rq(env.new_dst_cpu); | 5106 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
4364 | env.dst_rq = this_rq; | ||
4365 | env.dst_cpu = env.new_dst_cpu; | 5107 | env.dst_cpu = env.new_dst_cpu; |
4366 | env.flags &= ~LBF_SOME_PINNED; | 5108 | env.flags &= ~LBF_SOME_PINNED; |
4367 | env.loop = 0; | 5109 | env.loop = 0; |
@@ -4486,12 +5228,14 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
4486 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 5228 | if (this_rq->avg_idle < sysctl_sched_migration_cost) |
4487 | return; | 5229 | return; |
4488 | 5230 | ||
5231 | update_rq_runnable_avg(this_rq, 1); | ||
5232 | |||
4489 | /* | 5233 | /* |
4490 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 5234 | * Drop the rq->lock, but keep IRQ/preempt disabled. |
4491 | */ | 5235 | */ |
4492 | raw_spin_unlock(&this_rq->lock); | 5236 | raw_spin_unlock(&this_rq->lock); |
4493 | 5237 | ||
4494 | update_shares(this_cpu); | 5238 | update_blocked_averages(this_cpu); |
4495 | rcu_read_lock(); | 5239 | rcu_read_lock(); |
4496 | for_each_domain(this_cpu, sd) { | 5240 | for_each_domain(this_cpu, sd) { |
4497 | unsigned long interval; | 5241 | unsigned long interval; |
@@ -4646,7 +5390,7 @@ static void nohz_balancer_kick(int cpu) | |||
4646 | return; | 5390 | return; |
4647 | } | 5391 | } |
4648 | 5392 | ||
4649 | static inline void clear_nohz_tick_stopped(int cpu) | 5393 | static inline void nohz_balance_exit_idle(int cpu) |
4650 | { | 5394 | { |
4651 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | 5395 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { |
4652 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | 5396 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); |
@@ -4686,28 +5430,23 @@ void set_cpu_sd_state_idle(void) | |||
4686 | } | 5430 | } |
4687 | 5431 | ||
4688 | /* | 5432 | /* |
4689 | * This routine will record that this cpu is going idle with tick stopped. | 5433 | * This routine will record that the cpu is going idle with tick stopped. |
4690 | * This info will be used in performing idle load balancing in the future. | 5434 | * This info will be used in performing idle load balancing in the future. |
4691 | */ | 5435 | */ |
4692 | void select_nohz_load_balancer(int stop_tick) | 5436 | void nohz_balance_enter_idle(int cpu) |
4693 | { | 5437 | { |
4694 | int cpu = smp_processor_id(); | ||
4695 | |||
4696 | /* | 5438 | /* |
4697 | * If this cpu is going down, then nothing needs to be done. | 5439 | * If this cpu is going down, then nothing needs to be done. |
4698 | */ | 5440 | */ |
4699 | if (!cpu_active(cpu)) | 5441 | if (!cpu_active(cpu)) |
4700 | return; | 5442 | return; |
4701 | 5443 | ||
4702 | if (stop_tick) { | 5444 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
4703 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | 5445 | return; |
4704 | return; | ||
4705 | 5446 | ||
4706 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | 5447 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
4707 | atomic_inc(&nohz.nr_cpus); | 5448 | atomic_inc(&nohz.nr_cpus); |
4708 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | 5449 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
4709 | } | ||
4710 | return; | ||
4711 | } | 5450 | } |
4712 | 5451 | ||
4713 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | 5452 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, |
@@ -4715,7 +5454,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | |||
4715 | { | 5454 | { |
4716 | switch (action & ~CPU_TASKS_FROZEN) { | 5455 | switch (action & ~CPU_TASKS_FROZEN) { |
4717 | case CPU_DYING: | 5456 | case CPU_DYING: |
4718 | clear_nohz_tick_stopped(smp_processor_id()); | 5457 | nohz_balance_exit_idle(smp_processor_id()); |
4719 | return NOTIFY_OK; | 5458 | return NOTIFY_OK; |
4720 | default: | 5459 | default: |
4721 | return NOTIFY_DONE; | 5460 | return NOTIFY_DONE; |
@@ -4751,7 +5490,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
4751 | int update_next_balance = 0; | 5490 | int update_next_balance = 0; |
4752 | int need_serialize; | 5491 | int need_serialize; |
4753 | 5492 | ||
4754 | update_shares(cpu); | 5493 | update_blocked_averages(cpu); |
4755 | 5494 | ||
4756 | rcu_read_lock(); | 5495 | rcu_read_lock(); |
4757 | for_each_domain(cpu, sd) { | 5496 | for_each_domain(cpu, sd) { |
@@ -4837,14 +5576,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
4837 | if (need_resched()) | 5576 | if (need_resched()) |
4838 | break; | 5577 | break; |
4839 | 5578 | ||
4840 | raw_spin_lock_irq(&this_rq->lock); | 5579 | rq = cpu_rq(balance_cpu); |
4841 | update_rq_clock(this_rq); | 5580 | |
4842 | update_idle_cpu_load(this_rq); | 5581 | raw_spin_lock_irq(&rq->lock); |
4843 | raw_spin_unlock_irq(&this_rq->lock); | 5582 | update_rq_clock(rq); |
5583 | update_idle_cpu_load(rq); | ||
5584 | raw_spin_unlock_irq(&rq->lock); | ||
4844 | 5585 | ||
4845 | rebalance_domains(balance_cpu, CPU_IDLE); | 5586 | rebalance_domains(balance_cpu, CPU_IDLE); |
4846 | 5587 | ||
4847 | rq = cpu_rq(balance_cpu); | ||
4848 | if (time_after(this_rq->next_balance, rq->next_balance)) | 5588 | if (time_after(this_rq->next_balance, rq->next_balance)) |
4849 | this_rq->next_balance = rq->next_balance; | 5589 | this_rq->next_balance = rq->next_balance; |
4850 | } | 5590 | } |
@@ -4875,7 +5615,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
4875 | * busy tick after returning from idle, we will update the busy stats. | 5615 | * busy tick after returning from idle, we will update the busy stats. |
4876 | */ | 5616 | */ |
4877 | set_cpu_sd_state_busy(); | 5617 | set_cpu_sd_state_busy(); |
4878 | clear_nohz_tick_stopped(cpu); | 5618 | nohz_balance_exit_idle(cpu); |
4879 | 5619 | ||
4880 | /* | 5620 | /* |
4881 | * None are in tickless mode and hence no need for NOHZ idle load | 5621 | * None are in tickless mode and hence no need for NOHZ idle load |
@@ -4987,6 +5727,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
4987 | cfs_rq = cfs_rq_of(se); | 5727 | cfs_rq = cfs_rq_of(se); |
4988 | entity_tick(cfs_rq, se, queued); | 5728 | entity_tick(cfs_rq, se, queued); |
4989 | } | 5729 | } |
5730 | |||
5731 | if (sched_feat_numa(NUMA)) | ||
5732 | task_tick_numa(rq, curr); | ||
5733 | |||
5734 | update_rq_runnable_avg(rq, 1); | ||
4990 | } | 5735 | } |
4991 | 5736 | ||
4992 | /* | 5737 | /* |
@@ -5079,6 +5824,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
5079 | place_entity(cfs_rq, se, 0); | 5824 | place_entity(cfs_rq, se, 0); |
5080 | se->vruntime -= cfs_rq->min_vruntime; | 5825 | se->vruntime -= cfs_rq->min_vruntime; |
5081 | } | 5826 | } |
5827 | |||
5828 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
5829 | /* | ||
5830 | * Remove our load from contribution when we leave sched_fair | ||
5831 | * and ensure we don't carry in an old decay_count if we | ||
5832 | * switch back. | ||
5833 | */ | ||
5834 | if (p->se.avg.decay_count) { | ||
5835 | struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); | ||
5836 | __synchronize_entity_decay(&p->se); | ||
5837 | subtract_blocked_load_contrib(cfs_rq, | ||
5838 | p->se.avg.load_avg_contrib); | ||
5839 | } | ||
5840 | #endif | ||
5082 | } | 5841 | } |
5083 | 5842 | ||
5084 | /* | 5843 | /* |
@@ -5125,11 +5884,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
5125 | #ifndef CONFIG_64BIT | 5884 | #ifndef CONFIG_64BIT |
5126 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 5885 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
5127 | #endif | 5886 | #endif |
5887 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
5888 | atomic64_set(&cfs_rq->decay_counter, 1); | ||
5889 | atomic64_set(&cfs_rq->removed_load, 0); | ||
5890 | #endif | ||
5128 | } | 5891 | } |
5129 | 5892 | ||
5130 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5893 | #ifdef CONFIG_FAIR_GROUP_SCHED |
5131 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 5894 | static void task_move_group_fair(struct task_struct *p, int on_rq) |
5132 | { | 5895 | { |
5896 | struct cfs_rq *cfs_rq; | ||
5133 | /* | 5897 | /* |
5134 | * If the task was not on the rq at the time of this cgroup movement | 5898 | * If the task was not on the rq at the time of this cgroup movement |
5135 | * it must have been asleep, sleeping tasks keep their ->vruntime | 5899 | * it must have been asleep, sleeping tasks keep their ->vruntime |
@@ -5161,8 +5925,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
5161 | if (!on_rq) | 5925 | if (!on_rq) |
5162 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; | 5926 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; |
5163 | set_task_rq(p, task_cpu(p)); | 5927 | set_task_rq(p, task_cpu(p)); |
5164 | if (!on_rq) | 5928 | if (!on_rq) { |
5165 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; | 5929 | cfs_rq = cfs_rq_of(&p->se); |
5930 | p->se.vruntime += cfs_rq->min_vruntime; | ||
5931 | #ifdef CONFIG_SMP | ||
5932 | /* | ||
5933 | * migrate_task_rq_fair() will have removed our previous | ||
5934 | * contribution, but we must synchronize for ongoing future | ||
5935 | * decay. | ||
5936 | */ | ||
5937 | p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | ||
5938 | cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; | ||
5939 | #endif | ||
5940 | } | ||
5166 | } | 5941 | } |
5167 | 5942 | ||
5168 | void free_fair_sched_group(struct task_group *tg) | 5943 | void free_fair_sched_group(struct task_group *tg) |
@@ -5247,10 +6022,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
5247 | 6022 | ||
5248 | cfs_rq->tg = tg; | 6023 | cfs_rq->tg = tg; |
5249 | cfs_rq->rq = rq; | 6024 | cfs_rq->rq = rq; |
5250 | #ifdef CONFIG_SMP | ||
5251 | /* allow initial update_cfs_load() to truncate */ | ||
5252 | cfs_rq->load_stamp = 1; | ||
5253 | #endif | ||
5254 | init_cfs_rq_runtime(cfs_rq); | 6025 | init_cfs_rq_runtime(cfs_rq); |
5255 | 6026 | ||
5256 | tg->cfs_rq[cpu] = cfs_rq; | 6027 | tg->cfs_rq[cpu] = cfs_rq; |
@@ -5352,7 +6123,9 @@ const struct sched_class fair_sched_class = { | |||
5352 | 6123 | ||
5353 | #ifdef CONFIG_SMP | 6124 | #ifdef CONFIG_SMP |
5354 | .select_task_rq = select_task_rq_fair, | 6125 | .select_task_rq = select_task_rq_fair, |
5355 | 6126 | #ifdef CONFIG_FAIR_GROUP_SCHED | |
6127 | .migrate_task_rq = migrate_task_rq_fair, | ||
6128 | #endif | ||
5356 | .rq_online = rq_online_fair, | 6129 | .rq_online = rq_online_fair, |
5357 | .rq_offline = rq_offline_fair, | 6130 | .rq_offline = rq_offline_fair, |
5358 | 6131 | ||
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index de00a486c5c6..1ad1d2b5395f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) | |||
12 | SCHED_FEAT(START_DEBIT, true) | 12 | SCHED_FEAT(START_DEBIT, true) |
13 | 13 | ||
14 | /* | 14 | /* |
15 | * Based on load and program behaviour, see if it makes sense to place | ||
16 | * a newly woken task on the same cpu as the task that woke it -- | ||
17 | * improve cache locality. Typically used with SYNC wakeups as | ||
18 | * generated by pipes and the like, see also SYNC_WAKEUPS. | ||
19 | */ | ||
20 | SCHED_FEAT(AFFINE_WAKEUPS, true) | ||
21 | |||
22 | /* | ||
23 | * Prefer to schedule the task we woke last (assuming it failed | 15 | * Prefer to schedule the task we woke last (assuming it failed |
24 | * wakeup-preemption), since its likely going to consume data we | 16 | * wakeup-preemption), since its likely going to consume data we |
25 | * touched, increases cache locality. | 17 | * touched, increases cache locality. |
@@ -40,9 +32,14 @@ SCHED_FEAT(LAST_BUDDY, true) | |||
40 | SCHED_FEAT(CACHE_HOT_BUDDY, true) | 32 | SCHED_FEAT(CACHE_HOT_BUDDY, true) |
41 | 33 | ||
42 | /* | 34 | /* |
35 | * Allow wakeup-time preemption of the current task: | ||
36 | */ | ||
37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) | ||
38 | |||
39 | /* | ||
43 | * Use arch dependent cpu power functions | 40 | * Use arch dependent cpu power functions |
44 | */ | 41 | */ |
45 | SCHED_FEAT(ARCH_POWER, false) | 42 | SCHED_FEAT(ARCH_POWER, true) |
46 | 43 | ||
47 | SCHED_FEAT(HRTICK, false) | 44 | SCHED_FEAT(HRTICK, false) |
48 | SCHED_FEAT(DOUBLE_TICK, false) | 45 | SCHED_FEAT(DOUBLE_TICK, false) |
@@ -69,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true) | |||
69 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 66 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
70 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 67 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
71 | SCHED_FEAT(LB_MIN, false) | 68 | SCHED_FEAT(LB_MIN, false) |
69 | |||
70 | /* | ||
71 | * Apply the automatic NUMA scheduling policy. Enabled automatically | ||
72 | * at runtime if running on a NUMA machine. Can be controlled via | ||
73 | * numa_balancing=. Allow PTE scanning to be forced on UMA machines | ||
74 | * for debugging the core machinery. | ||
75 | */ | ||
76 | #ifdef CONFIG_NUMA_BALANCING | ||
77 | SCHED_FEAT(NUMA, false) | ||
78 | SCHED_FEAT(NUMA_FORCE, false) | ||
79 | #endif | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e0b7ba9c040f..418feb01344e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq) | |||
1632 | if (!next_task) | 1632 | if (!next_task) |
1633 | return 0; | 1633 | return 0; |
1634 | 1634 | ||
1635 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1636 | if (unlikely(task_running(rq, next_task))) | ||
1637 | return 0; | ||
1638 | #endif | ||
1639 | |||
1640 | retry: | 1635 | retry: |
1641 | if (unlikely(next_task == rq->curr)) { | 1636 | if (unlikely(next_task == rq->curr)) { |
1642 | WARN_ON(1); | 1637 | WARN_ON(1); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0848fa36c383..fc886441436a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -112,6 +112,8 @@ struct task_group { | |||
112 | unsigned long shares; | 112 | unsigned long shares; |
113 | 113 | ||
114 | atomic_t load_weight; | 114 | atomic_t load_weight; |
115 | atomic64_t load_avg; | ||
116 | atomic_t runnable_avg; | ||
115 | #endif | 117 | #endif |
116 | 118 | ||
117 | #ifdef CONFIG_RT_GROUP_SCHED | 119 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -222,22 +224,29 @@ struct cfs_rq { | |||
222 | unsigned int nr_spread_over; | 224 | unsigned int nr_spread_over; |
223 | #endif | 225 | #endif |
224 | 226 | ||
227 | #ifdef CONFIG_SMP | ||
228 | /* | ||
229 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
230 | * removed when useful for applications beyond shares distribution (e.g. | ||
231 | * load-balance). | ||
232 | */ | ||
225 | #ifdef CONFIG_FAIR_GROUP_SCHED | 233 | #ifdef CONFIG_FAIR_GROUP_SCHED |
226 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
227 | |||
228 | /* | 234 | /* |
229 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 235 | * CFS Load tracking |
230 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | 236 | * Under CFS, load is tracked on a per-entity basis and aggregated up. |
231 | * (like users, containers etc.) | 237 | * This allows for the description of both thread and group usage (in |
232 | * | 238 | * the FAIR_GROUP_SCHED case). |
233 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
234 | * list is used during load balance. | ||
235 | */ | 239 | */ |
236 | int on_list; | 240 | u64 runnable_load_avg, blocked_load_avg; |
237 | struct list_head leaf_cfs_rq_list; | 241 | atomic64_t decay_counter, removed_load; |
238 | struct task_group *tg; /* group that "owns" this runqueue */ | 242 | u64 last_decay; |
243 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
244 | /* These always depend on CONFIG_FAIR_GROUP_SCHED */ | ||
245 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
246 | u32 tg_runnable_contrib; | ||
247 | u64 tg_load_contrib; | ||
248 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
239 | 249 | ||
240 | #ifdef CONFIG_SMP | ||
241 | /* | 250 | /* |
242 | * h_load = weight * f(tg) | 251 | * h_load = weight * f(tg) |
243 | * | 252 | * |
@@ -245,26 +254,30 @@ struct cfs_rq { | |||
245 | * this group. | 254 | * this group. |
246 | */ | 255 | */ |
247 | unsigned long h_load; | 256 | unsigned long h_load; |
257 | #endif /* CONFIG_SMP */ | ||
258 | |||
259 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
260 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
248 | 261 | ||
249 | /* | 262 | /* |
250 | * Maintaining per-cpu shares distribution for group scheduling | 263 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
264 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
265 | * (like users, containers etc.) | ||
251 | * | 266 | * |
252 | * load_stamp is the last time we updated the load average | 267 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
253 | * load_last is the last time we updated the load average and saw load | 268 | * list is used during load balance. |
254 | * load_unacc_exec_time is currently unaccounted execution time | ||
255 | */ | 269 | */ |
256 | u64 load_avg; | 270 | int on_list; |
257 | u64 load_period; | 271 | struct list_head leaf_cfs_rq_list; |
258 | u64 load_stamp, load_last, load_unacc_exec_time; | 272 | struct task_group *tg; /* group that "owns" this runqueue */ |
259 | 273 | ||
260 | unsigned long load_contribution; | ||
261 | #endif /* CONFIG_SMP */ | ||
262 | #ifdef CONFIG_CFS_BANDWIDTH | 274 | #ifdef CONFIG_CFS_BANDWIDTH |
263 | int runtime_enabled; | 275 | int runtime_enabled; |
264 | u64 runtime_expires; | 276 | u64 runtime_expires; |
265 | s64 runtime_remaining; | 277 | s64 runtime_remaining; |
266 | 278 | ||
267 | u64 throttled_timestamp; | 279 | u64 throttled_clock, throttled_clock_task; |
280 | u64 throttled_clock_task_time; | ||
268 | int throttled, throttle_count; | 281 | int throttled, throttle_count; |
269 | struct list_head throttled_list; | 282 | struct list_head throttled_list; |
270 | #endif /* CONFIG_CFS_BANDWIDTH */ | 283 | #endif /* CONFIG_CFS_BANDWIDTH */ |
@@ -467,6 +480,8 @@ struct rq { | |||
467 | #ifdef CONFIG_SMP | 480 | #ifdef CONFIG_SMP |
468 | struct llist_head wake_list; | 481 | struct llist_head wake_list; |
469 | #endif | 482 | #endif |
483 | |||
484 | struct sched_avg avg; | ||
470 | }; | 485 | }; |
471 | 486 | ||
472 | static inline int cpu_of(struct rq *rq) | 487 | static inline int cpu_of(struct rq *rq) |
@@ -648,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; | |||
648 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | 663 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) |
649 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | 664 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ |
650 | 665 | ||
666 | #ifdef CONFIG_NUMA_BALANCING | ||
667 | #define sched_feat_numa(x) sched_feat(x) | ||
668 | #ifdef CONFIG_SCHED_DEBUG | ||
669 | #define numabalancing_enabled sched_feat_numa(NUMA) | ||
670 | #else | ||
671 | extern bool numabalancing_enabled; | ||
672 | #endif /* CONFIG_SCHED_DEBUG */ | ||
673 | #else | ||
674 | #define sched_feat_numa(x) (0) | ||
675 | #define numabalancing_enabled (0) | ||
676 | #endif /* CONFIG_NUMA_BALANCING */ | ||
677 | |||
651 | static inline u64 global_rt_period(void) | 678 | static inline u64 global_rt_period(void) |
652 | { | 679 | { |
653 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | 680 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; |
@@ -737,11 +764,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
737 | */ | 764 | */ |
738 | next->on_cpu = 1; | 765 | next->on_cpu = 1; |
739 | #endif | 766 | #endif |
740 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
741 | raw_spin_unlock_irq(&rq->lock); | ||
742 | #else | ||
743 | raw_spin_unlock(&rq->lock); | 767 | raw_spin_unlock(&rq->lock); |
744 | #endif | ||
745 | } | 768 | } |
746 | 769 | ||
747 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 770 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
@@ -755,9 +778,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
755 | smp_wmb(); | 778 | smp_wmb(); |
756 | prev->on_cpu = 0; | 779 | prev->on_cpu = 0; |
757 | #endif | 780 | #endif |
758 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
759 | local_irq_enable(); | 781 | local_irq_enable(); |
760 | #endif | ||
761 | } | 782 | } |
762 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 783 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
763 | 784 | ||
@@ -891,6 +912,9 @@ struct cpuacct { | |||
891 | struct kernel_cpustat __percpu *cpustat; | 912 | struct kernel_cpustat __percpu *cpustat; |
892 | }; | 913 | }; |
893 | 914 | ||
915 | extern struct cgroup_subsys cpuacct_subsys; | ||
916 | extern struct cpuacct root_cpuacct; | ||
917 | |||
894 | /* return cpu accounting group corresponding to this container */ | 918 | /* return cpu accounting group corresponding to this container */ |
895 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | 919 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) |
896 | { | 920 | { |
@@ -917,6 +941,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
917 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 941 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
918 | #endif | 942 | #endif |
919 | 943 | ||
944 | #ifdef CONFIG_PARAVIRT | ||
945 | static inline u64 steal_ticks(u64 steal) | ||
946 | { | ||
947 | if (unlikely(steal > NSEC_PER_SEC)) | ||
948 | return div_u64(steal, TICK_NSEC); | ||
949 | |||
950 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
951 | } | ||
952 | #endif | ||
953 | |||
920 | static inline void inc_nr_running(struct rq *rq) | 954 | static inline void inc_nr_running(struct rq *rq) |
921 | { | 955 | { |
922 | rq->nr_running++; | 956 | rq->nr_running++; |
@@ -1156,3 +1190,52 @@ enum rq_nohz_flag_bits { | |||
1156 | 1190 | ||
1157 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | 1191 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |
1158 | #endif | 1192 | #endif |
1193 | |||
1194 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1195 | |||
1196 | DECLARE_PER_CPU(u64, cpu_hardirq_time); | ||
1197 | DECLARE_PER_CPU(u64, cpu_softirq_time); | ||
1198 | |||
1199 | #ifndef CONFIG_64BIT | ||
1200 | DECLARE_PER_CPU(seqcount_t, irq_time_seq); | ||
1201 | |||
1202 | static inline void irq_time_write_begin(void) | ||
1203 | { | ||
1204 | __this_cpu_inc(irq_time_seq.sequence); | ||
1205 | smp_wmb(); | ||
1206 | } | ||
1207 | |||
1208 | static inline void irq_time_write_end(void) | ||
1209 | { | ||
1210 | smp_wmb(); | ||
1211 | __this_cpu_inc(irq_time_seq.sequence); | ||
1212 | } | ||
1213 | |||
1214 | static inline u64 irq_time_read(int cpu) | ||
1215 | { | ||
1216 | u64 irq_time; | ||
1217 | unsigned seq; | ||
1218 | |||
1219 | do { | ||
1220 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
1221 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
1222 | per_cpu(cpu_hardirq_time, cpu); | ||
1223 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1224 | |||
1225 | return irq_time; | ||
1226 | } | ||
1227 | #else /* CONFIG_64BIT */ | ||
1228 | static inline void irq_time_write_begin(void) | ||
1229 | { | ||
1230 | } | ||
1231 | |||
1232 | static inline void irq_time_write_end(void) | ||
1233 | { | ||
1234 | } | ||
1235 | |||
1236 | static inline u64 irq_time_read(int cpu) | ||
1237 | { | ||
1238 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
1239 | } | ||
1240 | #endif /* CONFIG_64BIT */ | ||
1241 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
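
The CONFIG_IRQ_TIME_ACCOUNTING block moved into sched.h keeps per-cpu hard/soft irq time in plain u64s; on 32-bit a 64-bit load can tear, so irq_time_read() retries under a sequence counter that irq_time_write_begin()/irq_time_write_end() bump around each update. The same idea reduced to a self-contained user-space sketch (illustrative names; the kernel uses seqcount_t and the smp_wmb()-based helpers shown above):

#include <stdint.h>
#include <stdio.h>

static volatile unsigned int seq;	/* even: stable, odd: writer active */
static volatile uint32_t val_lo, val_hi;

/* Writer: bump the sequence around the non-atomic 64-bit update. */
static void write_val(uint64_t v)
{
	seq++;				/* odd: readers will retry */
	__sync_synchronize();		/* smp_wmb() analogue */
	val_lo = (uint32_t)v;
	val_hi = (uint32_t)(v >> 32);
	__sync_synchronize();
	seq++;				/* even: value consistent again */
}

/* Reader: retry until the sequence was stable across both halves. */
static uint64_t read_val(void)
{
	unsigned int s;
	uint64_t v;

	do {
		s = seq;
		__sync_synchronize();
		v = ((uint64_t)val_hi << 32) | val_lo;
		__sync_synchronize();
	} while ((s & 1) || s != seq);
	return v;
}

int main(void)
{
	write_val(0x0123456789abcdefULL);
	printf("%llx\n", (unsigned long long)read_val());
	return 0;
}

On 64-bit the whole machinery collapses to a plain addition, exactly as in the #else branch above.
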
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ee376beedaf9..5af44b593770 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall) | |||
396 | #ifdef CONFIG_SECCOMP_FILTER | 396 | #ifdef CONFIG_SECCOMP_FILTER |
397 | case SECCOMP_MODE_FILTER: { | 397 | case SECCOMP_MODE_FILTER: { |
398 | int data; | 398 | int data; |
399 | struct pt_regs *regs = task_pt_regs(current); | ||
399 | ret = seccomp_run_filters(this_syscall); | 400 | ret = seccomp_run_filters(this_syscall); |
400 | data = ret & SECCOMP_RET_DATA; | 401 | data = ret & SECCOMP_RET_DATA; |
401 | ret &= SECCOMP_RET_ACTION; | 402 | ret &= SECCOMP_RET_ACTION; |
402 | switch (ret) { | 403 | switch (ret) { |
403 | case SECCOMP_RET_ERRNO: | 404 | case SECCOMP_RET_ERRNO: |
404 | /* Set the low-order 16-bits as a errno. */ | 405 | /* Set the low-order 16-bits as a errno. */ |
405 | syscall_set_return_value(current, task_pt_regs(current), | 406 | syscall_set_return_value(current, regs, |
406 | -data, 0); | 407 | -data, 0); |
407 | goto skip; | 408 | goto skip; |
408 | case SECCOMP_RET_TRAP: | 409 | case SECCOMP_RET_TRAP: |
409 | /* Show the handler the original registers. */ | 410 | /* Show the handler the original registers. */ |
410 | syscall_rollback(current, task_pt_regs(current)); | 411 | syscall_rollback(current, regs); |
411 | /* Let the filter pass back 16 bits of data. */ | 412 | /* Let the filter pass back 16 bits of data. */ |
412 | seccomp_send_sigsys(this_syscall, data); | 413 | seccomp_send_sigsys(this_syscall, data); |
413 | goto skip; | 414 | goto skip; |
414 | case SECCOMP_RET_TRACE: | 415 | case SECCOMP_RET_TRACE: |
415 | /* Skip these calls if there is no tracer. */ | 416 | /* Skip these calls if there is no tracer. */ |
416 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) | 417 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { |
418 | syscall_set_return_value(current, regs, | ||
419 | -ENOSYS, 0); | ||
417 | goto skip; | 420 | goto skip; |
421 | } | ||
418 | /* Allow the BPF to provide the event message */ | 422 | /* Allow the BPF to provide the event message */ |
419 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | 423 | ptrace_event(PTRACE_EVENT_SECCOMP, data); |
420 | /* | 424 | /* |
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall) | |||
425 | */ | 429 | */ |
426 | if (fatal_signal_pending(current)) | 430 | if (fatal_signal_pending(current)) |
427 | break; | 431 | break; |
432 | if (syscall_get_nr(current, regs) < 0) | ||
433 | goto skip; /* Explicit request to skip. */ | ||
434 | |||
428 | return 0; | 435 | return 0; |
429 | case SECCOMP_RET_ALLOW: | 436 | case SECCOMP_RET_ALLOW: |
430 | return 0; | 437 | return 0; |
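
The seccomp change caches task_pt_regs() once and, more importantly, makes SECCOMP_RET_TRACE behave sanely without a tracer: the syscall now fails with -ENOSYS instead of being silently skipped, and a tracer can still cancel a call by setting the syscall number to -1. The filter side of this interface is ordinary classic BPF installed with prctl(); a minimal user-space sketch that denies one syscall via SECCOMP_RET_ERRNO (the choice of getpid()/EPERM is an arbitrary example, and a real filter should also check seccomp_data.arch):

#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter filter[] = {
		/* A = syscall number from struct seccomp_data. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* getpid() fails with EPERM, everything else is allowed. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};
	long ret;

	/* Needed so an unprivileged process may install a filter. */
	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
		perror("PR_SET_SECCOMP");
		return 1;
	}

	ret = syscall(__NR_getpid);
	printf("getpid() = %ld, errno = %d\n", ret, errno);	/* -1, EPERM */
	return 0;
}
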
diff --git a/kernel/signal.c b/kernel/signal.c index be4f856d52f8..7aaa51d8e5b8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/tty.h> | 18 | #include <linux/tty.h> |
19 | #include <linux/binfmts.h> | 19 | #include <linux/binfmts.h> |
20 | #include <linux/coredump.h> | ||
20 | #include <linux/security.h> | 21 | #include <linux/security.h> |
21 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
22 | #include <linux/ptrace.h> | 23 | #include <linux/ptrace.h> |
@@ -30,6 +31,7 @@ | |||
30 | #include <linux/nsproxy.h> | 31 | #include <linux/nsproxy.h> |
31 | #include <linux/user_namespace.h> | 32 | #include <linux/user_namespace.h> |
32 | #include <linux/uprobes.h> | 33 | #include <linux/uprobes.h> |
34 | #include <linux/compat.h> | ||
33 | #define CREATE_TRACE_POINTS | 35 | #define CREATE_TRACE_POINTS |
34 | #include <trace/events/signal.h> | 36 | #include <trace/events/signal.h> |
35 | 37 | ||
@@ -1158,8 +1160,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1158 | return __send_signal(sig, info, t, group, from_ancestor_ns); | 1160 | return __send_signal(sig, info, t, group, from_ancestor_ns); |
1159 | } | 1161 | } |
1160 | 1162 | ||
1161 | static void print_fatal_signal(struct pt_regs *regs, int signr) | 1163 | static void print_fatal_signal(int signr) |
1162 | { | 1164 | { |
1165 | struct pt_regs *regs = signal_pt_regs(); | ||
1163 | printk("%s/%d: potentially unexpected fatal signal %d.\n", | 1166 | printk("%s/%d: potentially unexpected fatal signal %d.\n", |
1164 | current->comm, task_pid_nr(current), signr); | 1167 | current->comm, task_pid_nr(current), signr); |
1165 | 1168 | ||
@@ -1751,7 +1754,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
1751 | * see comment in do_notify_parent() about the following 4 lines | 1754 | * see comment in do_notify_parent() about the following 4 lines |
1752 | */ | 1755 | */ |
1753 | rcu_read_lock(); | 1756 | rcu_read_lock(); |
1754 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); | 1757 | info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); |
1755 | info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); | 1758 | info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); |
1756 | rcu_read_unlock(); | 1759 | rcu_read_unlock(); |
1757 | 1760 | ||
@@ -1907,7 +1910,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1907 | preempt_disable(); | 1910 | preempt_disable(); |
1908 | read_unlock(&tasklist_lock); | 1911 | read_unlock(&tasklist_lock); |
1909 | preempt_enable_no_resched(); | 1912 | preempt_enable_no_resched(); |
1910 | schedule(); | 1913 | freezable_schedule(); |
1911 | } else { | 1914 | } else { |
1912 | /* | 1915 | /* |
1913 | * By the time we got the lock, our tracer went away. | 1916 | * By the time we got the lock, our tracer went away. |
@@ -1929,13 +1932,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1929 | } | 1932 | } |
1930 | 1933 | ||
1931 | /* | 1934 | /* |
1932 | * While in TASK_TRACED, we were considered "frozen enough". | ||
1933 | * Now that we woke up, it's crucial if we're supposed to be | ||
1934 | * frozen that we freeze now before running anything substantial. | ||
1935 | */ | ||
1936 | try_to_freeze(); | ||
1937 | |||
1938 | /* | ||
1939 | * We are back. Now reacquire the siglock before touching | 1935 | * We are back. Now reacquire the siglock before touching |
1940 | * last_siginfo, so that we are sure to have synchronized with | 1936 | * last_siginfo, so that we are sure to have synchronized with |
1941 | * any signal-sending on another CPU that wants to examine it. | 1937 | * any signal-sending on another CPU that wants to examine it. |
@@ -1971,13 +1967,8 @@ static void ptrace_do_notify(int signr, int exit_code, int why) | |||
1971 | void ptrace_notify(int exit_code) | 1967 | void ptrace_notify(int exit_code) |
1972 | { | 1968 | { |
1973 | BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); | 1969 | BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); |
1974 | if (unlikely(current->task_works)) { | 1970 | if (unlikely(current->task_works)) |
1975 | if (test_and_clear_ti_thread_flag(current_thread_info(), | 1971 | task_work_run(); |
1976 | TIF_NOTIFY_RESUME)) { | ||
1977 | smp_mb__after_clear_bit(); | ||
1978 | task_work_run(); | ||
1979 | } | ||
1980 | } | ||
1981 | 1972 | ||
1982 | spin_lock_irq(¤t->sighand->siglock); | 1973 | spin_lock_irq(¤t->sighand->siglock); |
1983 | ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); | 1974 | ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); |
@@ -2096,7 +2087,7 @@ static bool do_signal_stop(int signr) | |||
2096 | } | 2087 | } |
2097 | 2088 | ||
2098 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ | 2089 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ |
2099 | schedule(); | 2090 | freezable_schedule(); |
2100 | return true; | 2091 | return true; |
2101 | } else { | 2092 | } else { |
2102 | /* | 2093 | /* |
@@ -2142,10 +2133,9 @@ static void do_jobctl_trap(void) | |||
2142 | } | 2133 | } |
2143 | } | 2134 | } |
2144 | 2135 | ||
2145 | static int ptrace_signal(int signr, siginfo_t *info, | 2136 | static int ptrace_signal(int signr, siginfo_t *info) |
2146 | struct pt_regs *regs, void *cookie) | ||
2147 | { | 2137 | { |
2148 | ptrace_signal_deliver(regs, cookie); | 2138 | ptrace_signal_deliver(); |
2149 | /* | 2139 | /* |
2150 | * We do not check sig_kernel_stop(signr) but set this marker | 2140 | * We do not check sig_kernel_stop(signr) but set this marker |
2151 | * unconditionally because we do not know whether debugger will | 2141 | * unconditionally because we do not know whether debugger will |
@@ -2198,26 +2188,20 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
2198 | struct signal_struct *signal = current->signal; | 2188 | struct signal_struct *signal = current->signal; |
2199 | int signr; | 2189 | int signr; |
2200 | 2190 | ||
2201 | if (unlikely(current->task_works)) { | 2191 | if (unlikely(current->task_works)) |
2202 | if (test_and_clear_ti_thread_flag(current_thread_info(), | 2192 | task_work_run(); |
2203 | TIF_NOTIFY_RESUME)) { | ||
2204 | smp_mb__after_clear_bit(); | ||
2205 | task_work_run(); | ||
2206 | } | ||
2207 | } | ||
2208 | 2193 | ||
2209 | if (unlikely(uprobe_deny_signal())) | 2194 | if (unlikely(uprobe_deny_signal())) |
2210 | return 0; | 2195 | return 0; |
2211 | 2196 | ||
2212 | relock: | ||
2213 | /* | 2197 | /* |
2214 | * We'll jump back here after any time we were stopped in TASK_STOPPED. | 2198 | * Do this once, we can't return to user-mode if freezing() == T. |
2215 | * While in TASK_STOPPED, we were considered "frozen enough". | 2199 | * do_signal_stop() and ptrace_stop() do freezable_schedule() and |
2216 | * Now that we woke up, it's crucial if we're supposed to be | 2200 | * thus do not need another check after return. |
2217 | * frozen that we freeze now before running anything substantial. | ||
2218 | */ | 2201 | */ |
2219 | try_to_freeze(); | 2202 | try_to_freeze(); |
2220 | 2203 | ||
2204 | relock: | ||
2221 | spin_lock_irq(&sighand->siglock); | 2205 | spin_lock_irq(&sighand->siglock); |
2222 | /* | 2206 | /* |
2223 | * Every stopped thread goes here after wakeup. Check to see if | 2207 | * Every stopped thread goes here after wakeup. Check to see if |
@@ -2274,8 +2258,7 @@ relock: | |||
2274 | break; /* will return 0 */ | 2258 | break; /* will return 0 */ |
2275 | 2259 | ||
2276 | if (unlikely(current->ptrace) && signr != SIGKILL) { | 2260 | if (unlikely(current->ptrace) && signr != SIGKILL) { |
2277 | signr = ptrace_signal(signr, info, | 2261 | signr = ptrace_signal(signr, info); |
2278 | regs, cookie); | ||
2279 | if (!signr) | 2262 | if (!signr) |
2280 | continue; | 2263 | continue; |
2281 | } | 2264 | } |
@@ -2360,7 +2343,7 @@ relock: | |||
2360 | 2343 | ||
2361 | if (sig_kernel_coredump(signr)) { | 2344 | if (sig_kernel_coredump(signr)) { |
2362 | if (print_fatal_signals) | 2345 | if (print_fatal_signals) |
2363 | print_fatal_signal(regs, info->si_signo); | 2346 | print_fatal_signal(info->si_signo); |
2364 | /* | 2347 | /* |
2365 | * If it was able to dump core, this kills all | 2348 | * If it was able to dump core, this kills all |
2366 | * other threads in the group and synchronizes with | 2349 | * other threads in the group and synchronizes with |
@@ -2369,7 +2352,7 @@ relock: | |||
2369 | * first and our do_group_exit call below will use | 2352 | * first and our do_group_exit call below will use |
2370 | * that value and ignore the one we pass it. | 2353 | * that value and ignore the one we pass it. |
2371 | */ | 2354 | */ |
2372 | do_coredump(info->si_signo, info->si_signo, regs); | 2355 | do_coredump(info); |
2373 | } | 2356 | } |
2374 | 2357 | ||
2375 | /* | 2358 | /* |
@@ -3112,6 +3095,79 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s | |||
3112 | out: | 3095 | out: |
3113 | return error; | 3096 | return error; |
3114 | } | 3097 | } |
3098 | #ifdef CONFIG_GENERIC_SIGALTSTACK | ||
3099 | SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) | ||
3100 | { | ||
3101 | return do_sigaltstack(uss, uoss, current_user_stack_pointer()); | ||
3102 | } | ||
3103 | #endif | ||
3104 | |||
3105 | int restore_altstack(const stack_t __user *uss) | ||
3106 | { | ||
3107 | int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); | ||
3108 | /* squash all but EFAULT for now */ | ||
3109 | return err == -EFAULT ? err : 0; | ||
3110 | } | ||
3111 | |||
3112 | int __save_altstack(stack_t __user *uss, unsigned long sp) | ||
3113 | { | ||
3114 | struct task_struct *t = current; | ||
3115 | return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | | ||
3116 | __put_user(sas_ss_flags(sp), &uss->ss_flags) | | ||
3117 | __put_user(t->sas_ss_size, &uss->ss_size); | ||
3118 | } | ||
3119 | |||
3120 | #ifdef CONFIG_COMPAT | ||
3121 | #ifdef CONFIG_GENERIC_SIGALTSTACK | ||
3122 | asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, | ||
3123 | compat_stack_t __user *uoss_ptr) | ||
3124 | { | ||
3125 | stack_t uss, uoss; | ||
3126 | int ret; | ||
3127 | mm_segment_t seg; | ||
3128 | |||
3129 | if (uss_ptr) { | ||
3130 | compat_stack_t uss32; | ||
3131 | |||
3132 | memset(&uss, 0, sizeof(stack_t)); | ||
3133 | if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) | ||
3134 | return -EFAULT; | ||
3135 | uss.ss_sp = compat_ptr(uss32.ss_sp); | ||
3136 | uss.ss_flags = uss32.ss_flags; | ||
3137 | uss.ss_size = uss32.ss_size; | ||
3138 | } | ||
3139 | seg = get_fs(); | ||
3140 | set_fs(KERNEL_DS); | ||
3141 | ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), | ||
3142 | (stack_t __force __user *) &uoss, | ||
3143 | compat_user_stack_pointer()); | ||
3144 | set_fs(seg); | ||
3145 | if (ret >= 0 && uoss_ptr) { | ||
3146 | if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || | ||
3147 | __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || | ||
3148 | __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || | ||
3149 | __put_user(uoss.ss_size, &uoss_ptr->ss_size)) | ||
3150 | ret = -EFAULT; | ||
3151 | } | ||
3152 | return ret; | ||
3153 | } | ||
3154 | |||
3155 | int compat_restore_altstack(const compat_stack_t __user *uss) | ||
3156 | { | ||
3157 | int err = compat_sys_sigaltstack(uss, NULL); | ||
3158 | /* squash all but -EFAULT for now */ | ||
3159 | return err == -EFAULT ? err : 0; | ||
3160 | } | ||
3161 | |||
3162 | int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) | ||
3163 | { | ||
3164 | struct task_struct *t = current; | ||
3165 | return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | | ||
3166 | __put_user(sas_ss_flags(sp), &uss->ss_flags) | | ||
3167 | __put_user(t->sas_ss_size, &uss->ss_size); | ||
3168 | } | ||
3169 | #endif | ||
3170 | #endif | ||
3115 | 3171 | ||
3116 | #ifdef __ARCH_WANT_SYS_SIGPENDING | 3172 | #ifdef __ARCH_WANT_SYS_SIGPENDING |
3117 | 3173 | ||
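
The signal.c additions also provide a generic sys_sigaltstack/compat_sys_sigaltstack pair plus restore_altstack()/__save_altstack() helpers, so architectures selecting CONFIG_GENERIC_SIGALTSTACK no longer need private copies; the user-visible semantics are unchanged. For reference, the usual pattern these calls serve is delivering a handler on a separate stack:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void handler(int sig)
{
	/* Runs on the alternate stack because of SA_ONSTACK. */
	const char msg[] = "caught signal on the alternate stack\n";

	(void)sig;
	write(STDERR_FILENO, msg, sizeof(msg) - 1);
	_exit(1);
}

int main(void)
{
	stack_t ss = {
		.ss_sp    = malloc(SIGSTKSZ),
		.ss_size  = SIGSTKSZ,
		.ss_flags = 0,
	};
	struct sigaction sa = {
		.sa_handler = handler,
		.sa_flags   = SA_ONSTACK,	/* deliver on the sigaltstack */
	};

	if (sigaltstack(&ss, NULL))	/* ends up in do_sigaltstack() */
		perror("sigaltstack");
	sigaction(SIGSEGV, &sa, NULL);

	raise(SIGSEGV);
	return 0;
}

On architectures that select the generic option, the sigaltstack() call above now enters through the SYSCALL_DEFINE2 added in this hunk.
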
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 98f60c5caa1b..d6c5fc054242 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -1,14 +1,22 @@ | |||
1 | /* | 1 | /* |
2 | * Common SMP CPU bringup/teardown functions | 2 | * Common SMP CPU bringup/teardown functions |
3 | */ | 3 | */ |
4 | #include <linux/cpu.h> | ||
4 | #include <linux/err.h> | 5 | #include <linux/err.h> |
5 | #include <linux/smp.h> | 6 | #include <linux/smp.h> |
6 | #include <linux/init.h> | 7 | #include <linux/init.h> |
8 | #include <linux/list.h> | ||
9 | #include <linux/slab.h> | ||
7 | #include <linux/sched.h> | 10 | #include <linux/sched.h> |
11 | #include <linux/export.h> | ||
8 | #include <linux/percpu.h> | 12 | #include <linux/percpu.h> |
13 | #include <linux/kthread.h> | ||
14 | #include <linux/smpboot.h> | ||
9 | 15 | ||
10 | #include "smpboot.h" | 16 | #include "smpboot.h" |
11 | 17 | ||
18 | #ifdef CONFIG_SMP | ||
19 | |||
12 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | 20 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD |
13 | /* | 21 | /* |
14 | * For the hotplug case we keep the task structs around and reuse | 22 | * For the hotplug case we keep the task structs around and reuse |
@@ -65,3 +73,228 @@ void __init idle_threads_init(void) | |||
65 | } | 73 | } |
66 | } | 74 | } |
67 | #endif | 75 | #endif |
76 | |||
77 | #endif /* #ifdef CONFIG_SMP */ | ||
78 | |||
79 | static LIST_HEAD(hotplug_threads); | ||
80 | static DEFINE_MUTEX(smpboot_threads_lock); | ||
81 | |||
82 | struct smpboot_thread_data { | ||
83 | unsigned int cpu; | ||
84 | unsigned int status; | ||
85 | struct smp_hotplug_thread *ht; | ||
86 | }; | ||
87 | |||
88 | enum { | ||
89 | HP_THREAD_NONE = 0, | ||
90 | HP_THREAD_ACTIVE, | ||
91 | HP_THREAD_PARKED, | ||
92 | }; | ||
93 | |||
94 | /** | ||
95 | * smpboot_thread_fn - percpu hotplug thread loop function | ||
96 | * @data: thread data pointer | ||
97 | * | ||
98 | * Checks for thread stop and park conditions. Calls the necessary | ||
99 | * setup, cleanup, park and unpark functions for the registered | ||
100 | * thread. | ||
101 | * | ||
102 | * Returns 1 when the thread should exit, 0 otherwise. | ||
103 | */ | ||
104 | static int smpboot_thread_fn(void *data) | ||
105 | { | ||
106 | struct smpboot_thread_data *td = data; | ||
107 | struct smp_hotplug_thread *ht = td->ht; | ||
108 | |||
109 | while (1) { | ||
110 | set_current_state(TASK_INTERRUPTIBLE); | ||
111 | preempt_disable(); | ||
112 | if (kthread_should_stop()) { | ||
113 | set_current_state(TASK_RUNNING); | ||
114 | preempt_enable(); | ||
115 | if (ht->cleanup) | ||
116 | ht->cleanup(td->cpu, cpu_online(td->cpu)); | ||
117 | kfree(td); | ||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | if (kthread_should_park()) { | ||
122 | __set_current_state(TASK_RUNNING); | ||
123 | preempt_enable(); | ||
124 | if (ht->park && td->status == HP_THREAD_ACTIVE) { | ||
125 | BUG_ON(td->cpu != smp_processor_id()); | ||
126 | ht->park(td->cpu); | ||
127 | td->status = HP_THREAD_PARKED; | ||
128 | } | ||
129 | kthread_parkme(); | ||
130 | /* We might have been woken for stop */ | ||
131 | continue; | ||
132 | } | ||
133 | |||
134 | BUG_ON(td->cpu != smp_processor_id()); | ||
135 | |||
136 | /* Check for state change setup */ | ||
137 | switch (td->status) { | ||
138 | case HP_THREAD_NONE: | ||
139 | preempt_enable(); | ||
140 | if (ht->setup) | ||
141 | ht->setup(td->cpu); | ||
142 | td->status = HP_THREAD_ACTIVE; | ||
143 | preempt_disable(); | ||
144 | break; | ||
145 | case HP_THREAD_PARKED: | ||
146 | preempt_enable(); | ||
147 | if (ht->unpark) | ||
148 | ht->unpark(td->cpu); | ||
149 | td->status = HP_THREAD_ACTIVE; | ||
150 | preempt_disable(); | ||
151 | break; | ||
152 | } | ||
153 | |||
154 | if (!ht->thread_should_run(td->cpu)) { | ||
155 | preempt_enable(); | ||
156 | schedule(); | ||
157 | } else { | ||
158 | set_current_state(TASK_RUNNING); | ||
159 | preempt_enable(); | ||
160 | ht->thread_fn(td->cpu); | ||
161 | } | ||
162 | } | ||
163 | } | ||
164 | |||
165 | static int | ||
166 | __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | ||
167 | { | ||
168 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | ||
169 | struct smpboot_thread_data *td; | ||
170 | |||
171 | if (tsk) | ||
172 | return 0; | ||
173 | |||
174 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); | ||
175 | if (!td) | ||
176 | return -ENOMEM; | ||
177 | td->cpu = cpu; | ||
178 | td->ht = ht; | ||
179 | |||
180 | tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu, | ||
181 | ht->thread_comm); | ||
182 | if (IS_ERR(tsk)) { | ||
183 | kfree(td); | ||
184 | return PTR_ERR(tsk); | ||
185 | } | ||
186 | |||
187 | get_task_struct(tsk); | ||
188 | *per_cpu_ptr(ht->store, cpu) = tsk; | ||
189 | return 0; | ||
190 | } | ||
191 | |||
192 | int smpboot_create_threads(unsigned int cpu) | ||
193 | { | ||
194 | struct smp_hotplug_thread *cur; | ||
195 | int ret = 0; | ||
196 | |||
197 | mutex_lock(&smpboot_threads_lock); | ||
198 | list_for_each_entry(cur, &hotplug_threads, list) { | ||
199 | ret = __smpboot_create_thread(cur, cpu); | ||
200 | if (ret) | ||
201 | break; | ||
202 | } | ||
203 | mutex_unlock(&smpboot_threads_lock); | ||
204 | return ret; | ||
205 | } | ||
206 | |||
207 | static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | ||
208 | { | ||
209 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | ||
210 | |||
211 | kthread_unpark(tsk); | ||
212 | } | ||
213 | |||
214 | void smpboot_unpark_threads(unsigned int cpu) | ||
215 | { | ||
216 | struct smp_hotplug_thread *cur; | ||
217 | |||
218 | mutex_lock(&smpboot_threads_lock); | ||
219 | list_for_each_entry(cur, &hotplug_threads, list) | ||
220 | smpboot_unpark_thread(cur, cpu); | ||
221 | mutex_unlock(&smpboot_threads_lock); | ||
222 | } | ||
223 | |||
224 | static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | ||
225 | { | ||
226 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | ||
227 | |||
228 | if (tsk) | ||
229 | kthread_park(tsk); | ||
230 | } | ||
231 | |||
232 | void smpboot_park_threads(unsigned int cpu) | ||
233 | { | ||
234 | struct smp_hotplug_thread *cur; | ||
235 | |||
236 | mutex_lock(&smpboot_threads_lock); | ||
237 | list_for_each_entry_reverse(cur, &hotplug_threads, list) | ||
238 | smpboot_park_thread(cur, cpu); | ||
239 | mutex_unlock(&smpboot_threads_lock); | ||
240 | } | ||
241 | |||
242 | static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) | ||
243 | { | ||
244 | unsigned int cpu; | ||
245 | |||
246 | /* We need to destroy also the parked threads of offline cpus */ | ||
247 | for_each_possible_cpu(cpu) { | ||
248 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | ||
249 | |||
250 | if (tsk) { | ||
251 | kthread_stop(tsk); | ||
252 | put_task_struct(tsk); | ||
253 | *per_cpu_ptr(ht->store, cpu) = NULL; | ||
254 | } | ||
255 | } | ||
256 | } | ||
257 | |||
258 | /** | ||
259 | * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug | ||
260 | * @plug_thread: Hotplug thread descriptor | ||
261 | * | ||
262 | * Creates and starts the threads on all online cpus. | ||
263 | */ | ||
264 | int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) | ||
265 | { | ||
266 | unsigned int cpu; | ||
267 | int ret = 0; | ||
268 | |||
269 | mutex_lock(&smpboot_threads_lock); | ||
270 | for_each_online_cpu(cpu) { | ||
271 | ret = __smpboot_create_thread(plug_thread, cpu); | ||
272 | if (ret) { | ||
273 | smpboot_destroy_threads(plug_thread); | ||
274 | goto out; | ||
275 | } | ||
276 | smpboot_unpark_thread(plug_thread, cpu); | ||
277 | } | ||
278 | list_add(&plug_thread->list, &hotplug_threads); | ||
279 | out: | ||
280 | mutex_unlock(&smpboot_threads_lock); | ||
281 | return ret; | ||
282 | } | ||
283 | EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); | ||
284 | |||
285 | /** | ||
286 | * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug | ||
287 | * @plug_thread: Hotplug thread descriptor | ||
288 | * | ||
289 | * Stops all threads on all possible cpus. | ||
290 | */ | ||
291 | void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) | ||
292 | { | ||
293 | get_online_cpus(); | ||
294 | mutex_lock(&smpboot_threads_lock); | ||
295 | list_del(&plug_thread->list); | ||
296 | smpboot_destroy_threads(plug_thread); | ||
297 | mutex_unlock(&smpboot_threads_lock); | ||
298 | put_online_cpus(); | ||
299 | } | ||
300 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); | ||
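
kernel/smpboot.c now owns the create/park/unpark/destroy lifecycle of per-cpu kthreads; a subsystem only fills in a struct smp_hotplug_thread and registers it (the ksoftirqd conversion in kernel/softirq.c below is the first user). A hypothetical minimal client, modelled on that conversion; the my_* names are illustrative, the fields and calls are the ones added above:

#include <linux/init.h>
#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, my_thread);
static DEFINE_PER_CPU(unsigned long, my_pending);

static int my_should_run(unsigned int cpu)
{
	return __this_cpu_read(my_pending) != 0;
}

static void my_thread_fn(unsigned int cpu)
{
	/* Called in TASK_RUNNING on the right CPU; do one unit of work. */
	__this_cpu_write(my_pending, 0);
}

static struct smp_hotplug_thread my_threads = {
	.store			= &my_thread,
	.thread_should_run	= my_should_run,
	.thread_fn		= my_thread_fn,
	.thread_comm		= "my_worker/%u",
};

static int __init my_init(void)
{
	/* Creates one thread per online CPU and unparks them. */
	return smpboot_register_percpu_thread(&my_threads);
}
early_initcall(my_init);

A real user would set its per-cpu state and wake_up_process() the stored task to get thread_fn() invoked; parking and unparking across CPU hotplug is then handled entirely by the smpboot core.
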
diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 6ef9433e1c70..72415a0eb955 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h | |||
@@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_cpu(void) { } | |||
13 | static inline void idle_threads_init(void) { } | 13 | static inline void idle_threads_init(void) { } |
14 | #endif | 14 | #endif |
15 | 15 | ||
16 | int smpboot_create_threads(unsigned int cpu); | ||
17 | void smpboot_park_threads(unsigned int cpu); | ||
18 | void smpboot_unpark_threads(unsigned int cpu); | ||
19 | |||
16 | #endif | 20 | #endif |
diff --git a/kernel/softirq.c b/kernel/softirq.c index b73e681df09e..ed567babe789 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/rcupdate.h> | 23 | #include <linux/rcupdate.h> |
24 | #include <linux/ftrace.h> | 24 | #include <linux/ftrace.h> |
25 | #include <linux/smp.h> | 25 | #include <linux/smp.h> |
26 | #include <linux/smpboot.h> | ||
26 | #include <linux/tick.h> | 27 | #include <linux/tick.h> |
27 | 28 | ||
28 | #define CREATE_TRACE_POINTS | 29 | #define CREATE_TRACE_POINTS |
@@ -220,7 +221,7 @@ asmlinkage void __do_softirq(void) | |||
220 | current->flags &= ~PF_MEMALLOC; | 221 | current->flags &= ~PF_MEMALLOC; |
221 | 222 | ||
222 | pending = local_softirq_pending(); | 223 | pending = local_softirq_pending(); |
223 | account_system_vtime(current); | 224 | vtime_account_irq_enter(current); |
224 | 225 | ||
225 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 226 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
226 | SOFTIRQ_OFFSET); | 227 | SOFTIRQ_OFFSET); |
@@ -271,7 +272,7 @@ restart: | |||
271 | 272 | ||
272 | lockdep_softirq_exit(); | 273 | lockdep_softirq_exit(); |
273 | 274 | ||
274 | account_system_vtime(current); | 275 | vtime_account_irq_exit(current); |
275 | __local_bh_enable(SOFTIRQ_OFFSET); | 276 | __local_bh_enable(SOFTIRQ_OFFSET); |
276 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 277 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
277 | } | 278 | } |
@@ -340,7 +341,7 @@ static inline void invoke_softirq(void) | |||
340 | */ | 341 | */ |
341 | void irq_exit(void) | 342 | void irq_exit(void) |
342 | { | 343 | { |
343 | account_system_vtime(current); | 344 | vtime_account_irq_exit(current); |
344 | trace_hardirq_exit(); | 345 | trace_hardirq_exit(); |
345 | sub_preempt_count(IRQ_EXIT_OFFSET); | 346 | sub_preempt_count(IRQ_EXIT_OFFSET); |
346 | if (!in_interrupt() && local_softirq_pending()) | 347 | if (!in_interrupt() && local_softirq_pending()) |
@@ -742,49 +743,22 @@ void __init softirq_init(void) | |||
742 | open_softirq(HI_SOFTIRQ, tasklet_hi_action); | 743 | open_softirq(HI_SOFTIRQ, tasklet_hi_action); |
743 | } | 744 | } |
744 | 745 | ||
745 | static int run_ksoftirqd(void * __bind_cpu) | 746 | static int ksoftirqd_should_run(unsigned int cpu) |
746 | { | 747 | { |
747 | set_current_state(TASK_INTERRUPTIBLE); | 748 | return local_softirq_pending(); |
748 | 749 | } | |
749 | while (!kthread_should_stop()) { | ||
750 | preempt_disable(); | ||
751 | if (!local_softirq_pending()) { | ||
752 | schedule_preempt_disabled(); | ||
753 | } | ||
754 | |||
755 | __set_current_state(TASK_RUNNING); | ||
756 | |||
757 | while (local_softirq_pending()) { | ||
758 | /* Preempt disable stops cpu going offline. | ||
759 | If already offline, we'll be on wrong CPU: | ||
760 | don't process */ | ||
761 | if (cpu_is_offline((long)__bind_cpu)) | ||
762 | goto wait_to_die; | ||
763 | local_irq_disable(); | ||
764 | if (local_softirq_pending()) | ||
765 | __do_softirq(); | ||
766 | local_irq_enable(); | ||
767 | sched_preempt_enable_no_resched(); | ||
768 | cond_resched(); | ||
769 | preempt_disable(); | ||
770 | rcu_note_context_switch((long)__bind_cpu); | ||
771 | } | ||
772 | preempt_enable(); | ||
773 | set_current_state(TASK_INTERRUPTIBLE); | ||
774 | } | ||
775 | __set_current_state(TASK_RUNNING); | ||
776 | return 0; | ||
777 | 750 | ||
778 | wait_to_die: | 751 | static void run_ksoftirqd(unsigned int cpu) |
779 | preempt_enable(); | 752 | { |
780 | /* Wait for kthread_stop */ | 753 | local_irq_disable(); |
781 | set_current_state(TASK_INTERRUPTIBLE); | 754 | if (local_softirq_pending()) { |
782 | while (!kthread_should_stop()) { | 755 | __do_softirq(); |
783 | schedule(); | 756 | rcu_note_context_switch(cpu); |
784 | set_current_state(TASK_INTERRUPTIBLE); | 757 | local_irq_enable(); |
758 | cond_resched(); | ||
759 | return; | ||
785 | } | 760 | } |
786 | __set_current_state(TASK_RUNNING); | 761 | local_irq_enable(); |
787 | return 0; | ||
788 | } | 762 | } |
789 | 763 | ||
790 | #ifdef CONFIG_HOTPLUG_CPU | 764 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -850,50 +824,14 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
850 | unsigned long action, | 824 | unsigned long action, |
851 | void *hcpu) | 825 | void *hcpu) |
852 | { | 826 | { |
853 | int hotcpu = (unsigned long)hcpu; | ||
854 | struct task_struct *p; | ||
855 | |||
856 | switch (action) { | 827 | switch (action) { |
857 | case CPU_UP_PREPARE: | ||
858 | case CPU_UP_PREPARE_FROZEN: | ||
859 | p = kthread_create_on_node(run_ksoftirqd, | ||
860 | hcpu, | ||
861 | cpu_to_node(hotcpu), | ||
862 | "ksoftirqd/%d", hotcpu); | ||
863 | if (IS_ERR(p)) { | ||
864 | printk("ksoftirqd for %i failed\n", hotcpu); | ||
865 | return notifier_from_errno(PTR_ERR(p)); | ||
866 | } | ||
867 | kthread_bind(p, hotcpu); | ||
868 | per_cpu(ksoftirqd, hotcpu) = p; | ||
869 | break; | ||
870 | case CPU_ONLINE: | ||
871 | case CPU_ONLINE_FROZEN: | ||
872 | wake_up_process(per_cpu(ksoftirqd, hotcpu)); | ||
873 | break; | ||
874 | #ifdef CONFIG_HOTPLUG_CPU | 828 | #ifdef CONFIG_HOTPLUG_CPU |
875 | case CPU_UP_CANCELED: | ||
876 | case CPU_UP_CANCELED_FROZEN: | ||
877 | if (!per_cpu(ksoftirqd, hotcpu)) | ||
878 | break; | ||
879 | /* Unbind so it can run. Fall thru. */ | ||
880 | kthread_bind(per_cpu(ksoftirqd, hotcpu), | ||
881 | cpumask_any(cpu_online_mask)); | ||
882 | case CPU_DEAD: | 829 | case CPU_DEAD: |
883 | case CPU_DEAD_FROZEN: { | 830 | case CPU_DEAD_FROZEN: |
884 | static const struct sched_param param = { | 831 | takeover_tasklets((unsigned long)hcpu); |
885 | .sched_priority = MAX_RT_PRIO-1 | ||
886 | }; | ||
887 | |||
888 | p = per_cpu(ksoftirqd, hotcpu); | ||
889 | per_cpu(ksoftirqd, hotcpu) = NULL; | ||
890 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | ||
891 | kthread_stop(p); | ||
892 | takeover_tasklets(hotcpu); | ||
893 | break; | 832 | break; |
894 | } | ||
895 | #endif /* CONFIG_HOTPLUG_CPU */ | 833 | #endif /* CONFIG_HOTPLUG_CPU */ |
896 | } | 834 | } |
897 | return NOTIFY_OK; | 835 | return NOTIFY_OK; |
898 | } | 836 | } |
899 | 837 | ||
@@ -901,14 +839,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = { | |||
901 | .notifier_call = cpu_callback | 839 | .notifier_call = cpu_callback |
902 | }; | 840 | }; |
903 | 841 | ||
842 | static struct smp_hotplug_thread softirq_threads = { | ||
843 | .store = &ksoftirqd, | ||
844 | .thread_should_run = ksoftirqd_should_run, | ||
845 | .thread_fn = run_ksoftirqd, | ||
846 | .thread_comm = "ksoftirqd/%u", | ||
847 | }; | ||
848 | |||
904 | static __init int spawn_ksoftirqd(void) | 849 | static __init int spawn_ksoftirqd(void) |
905 | { | 850 | { |
906 | void *cpu = (void *)(long)smp_processor_id(); | ||
907 | int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
908 | |||
909 | BUG_ON(err != NOTIFY_OK); | ||
910 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
911 | register_cpu_notifier(&cpu_nfb); | 851 | register_cpu_notifier(&cpu_nfb); |
852 | |||
853 | BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); | ||
854 | |||
912 | return 0; | 855 | return 0; |
913 | } | 856 | } |
914 | early_initcall(spawn_ksoftirqd); | 857 | early_initcall(spawn_ksoftirqd); |
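
After this conversion ksoftirqd's main loop is just the should_run/thread_fn pair above, driven by smpboot_thread_fn(); the only hotplug work left in the notifier is takeover_tasklets(), which re-queues tasklets still pending on a CPU that went offline. The producer side of that queue is the ordinary tasklet API; a minimal sketch (my_tasklet/my_func are illustrative):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/interrupt.h>

static void my_func(unsigned long data)
{
	/* Runs in TASKLET_SOFTIRQ context, via ksoftirqd when the
	 * softirq load gets deferred to the per-cpu thread. */
	pr_info("tasklet ran with data %lu\n", data);
}

static DECLARE_TASKLET(my_tasklet, my_func, 42);

static int __init my_init(void)
{
	tasklet_schedule(&my_tasklet);	/* marks TASKLET_SOFTIRQ pending */
	return 0;
}

static void __exit my_exit(void)
{
	tasklet_kill(&my_tasklet);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
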
diff --git a/kernel/srcu.c b/kernel/srcu.c index 2095be3318d5..2b859828cdc3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -16,8 +16,10 @@ | |||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 | * | 17 | * |
18 | * Copyright (C) IBM Corporation, 2006 | 18 | * Copyright (C) IBM Corporation, 2006 |
19 | * Copyright (C) Fujitsu, 2012 | ||
19 | * | 20 | * |
20 | * Author: Paul McKenney <paulmck@us.ibm.com> | 21 | * Author: Paul McKenney <paulmck@us.ibm.com> |
22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | ||
21 | * | 23 | * |
22 | * For detailed explanation of Read-Copy Update mechanism see - | 24 | * For detailed explanation of Read-Copy Update mechanism see - |
23 | * Documentation/RCU/ *.txt | 25 | * Documentation/RCU/ *.txt |
@@ -34,6 +36,10 @@ | |||
34 | #include <linux/delay.h> | 36 | #include <linux/delay.h> |
35 | #include <linux/srcu.h> | 37 | #include <linux/srcu.h> |
36 | 38 | ||
39 | #include <trace/events/rcu.h> | ||
40 | |||
41 | #include "rcu.h" | ||
42 | |||
37 | /* | 43 | /* |
38 | * Initialize an rcu_batch structure to empty. | 44 | * Initialize an rcu_batch structure to empty. |
39 | */ | 45 | */ |
@@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | |||
92 | } | 98 | } |
93 | } | 99 | } |
94 | 100 | ||
95 | /* single-thread state-machine */ | ||
96 | static void process_srcu(struct work_struct *work); | ||
97 | |||
98 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 101 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
99 | { | 102 | { |
100 | sp->completed = 0; | 103 | sp->completed = 0; |
@@ -379,7 +382,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | |||
379 | rcu_batch_queue(&sp->batch_queue, head); | 382 | rcu_batch_queue(&sp->batch_queue, head); |
380 | if (!sp->running) { | 383 | if (!sp->running) { |
381 | sp->running = true; | 384 | sp->running = true; |
382 | queue_delayed_work(system_nrt_wq, &sp->work, 0); | 385 | schedule_delayed_work(&sp->work, 0); |
383 | } | 386 | } |
384 | spin_unlock_irqrestore(&sp->queue_lock, flags); | 387 | spin_unlock_irqrestore(&sp->queue_lock, flags); |
385 | } | 388 | } |
@@ -464,7 +467,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | |||
464 | */ | 467 | */ |
465 | void synchronize_srcu(struct srcu_struct *sp) | 468 | void synchronize_srcu(struct srcu_struct *sp) |
466 | { | 469 | { |
467 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); | 470 | __synchronize_srcu(sp, rcu_expedited |
471 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT | ||
472 | : SYNCHRONIZE_SRCU_TRYCOUNT); | ||
468 | } | 473 | } |
469 | EXPORT_SYMBOL_GPL(synchronize_srcu); | 474 | EXPORT_SYMBOL_GPL(synchronize_srcu); |
470 | 475 | ||
@@ -631,13 +636,13 @@ static void srcu_reschedule(struct srcu_struct *sp) | |||
631 | } | 636 | } |
632 | 637 | ||
633 | if (pending) | 638 | if (pending) |
634 | queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); | 639 | schedule_delayed_work(&sp->work, SRCU_INTERVAL); |
635 | } | 640 | } |
636 | 641 | ||
637 | /* | 642 | /* |
638 | * This is the work-queue function that handles SRCU grace periods. | 643 | * This is the work-queue function that handles SRCU grace periods. |
639 | */ | 644 | */ |
640 | static void process_srcu(struct work_struct *work) | 645 | void process_srcu(struct work_struct *work) |
641 | { | 646 | { |
642 | struct srcu_struct *sp; | 647 | struct srcu_struct *sp; |
643 | 648 | ||
@@ -648,3 +653,4 @@ static void process_srcu(struct work_struct *work) | |||
648 | srcu_invoke_callbacks(sp); | 653 | srcu_invoke_callbacks(sp); |
649 | srcu_reschedule(sp); | 654 | srcu_reschedule(sp); |
650 | } | 655 | } |
656 | EXPORT_SYMBOL_GPL(process_srcu); | ||
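
The SRCU changes credit Lai Jiangshan's rework, let synchronize_srcu() honour rcu_expedited, switch the state machine from system_nrt_wq to the regular system workqueue, and export process_srcu(). The caller-visible API is unchanged; a minimal sketch of the reader/updater pattern it serves (my_* names are illustrative):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/rcupdate.h>

struct my_data {
	int val;
	struct rcu_head rcu;
};

static struct srcu_struct my_srcu;
static struct my_data __rcu *my_ptr;

static int my_read(void)
{
	struct my_data *p;
	int idx, val = -1;

	idx = srcu_read_lock(&my_srcu);		/* readers may sleep */
	p = srcu_dereference(my_ptr, &my_srcu);
	if (p)
		val = p->val;
	srcu_read_unlock(&my_srcu, idx);
	return val;
}

static void my_free_cb(struct rcu_head *head)
{
	kfree(container_of(head, struct my_data, rcu));
}

static void my_update(int val)		/* caller serializes updates */
{
	struct my_data *new, *old;

	new = kmalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return;
	new->val = val;

	old = rcu_dereference_protected(my_ptr, 1);
	rcu_assign_pointer(my_ptr, new);
	if (old)
		/* Frees old once all SRCU readers are done;
		 * synchronize_srcu(&my_srcu) would wait instead. */
		call_srcu(&my_srcu, &old->rcu, my_free_cb);
}

static int __init my_init(void)
{
	return init_srcu_struct(&my_srcu);
}
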
diff --git a/kernel/sys.c b/kernel/sys.c index 241507f23eca..265b37690421 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); | |||
368 | void kernel_restart(char *cmd) | 368 | void kernel_restart(char *cmd) |
369 | { | 369 | { |
370 | kernel_restart_prepare(cmd); | 370 | kernel_restart_prepare(cmd); |
371 | disable_nonboot_cpus(); | ||
371 | if (!cmd) | 372 | if (!cmd) |
372 | printk(KERN_EMERG "Restarting system.\n"); | 373 | printk(KERN_EMERG "Restarting system.\n"); |
373 | else | 374 | else |
@@ -1045,7 +1046,7 @@ void do_sys_times(struct tms *tms) | |||
1045 | cputime_t tgutime, tgstime, cutime, cstime; | 1046 | cputime_t tgutime, tgstime, cutime, cstime; |
1046 | 1047 | ||
1047 | spin_lock_irq(¤t->sighand->siglock); | 1048 | spin_lock_irq(¤t->sighand->siglock); |
1048 | thread_group_times(current, &tgutime, &tgstime); | 1049 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); |
1049 | cutime = current->signal->cutime; | 1050 | cutime = current->signal->cutime; |
1050 | cstime = current->signal->cstime; | 1051 | cstime = current->signal->cstime; |
1051 | spin_unlock_irq(¤t->sighand->siglock); | 1052 | spin_unlock_irq(¤t->sighand->siglock); |
@@ -1264,15 +1265,16 @@ DECLARE_RWSEM(uts_sem); | |||
1264 | * Work around broken programs that cannot handle "Linux 3.0". | 1265 | * Work around broken programs that cannot handle "Linux 3.0". |
1265 | * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 | 1266 | * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 |
1266 | */ | 1267 | */ |
1267 | static int override_release(char __user *release, int len) | 1268 | static int override_release(char __user *release, size_t len) |
1268 | { | 1269 | { |
1269 | int ret = 0; | 1270 | int ret = 0; |
1270 | char buf[65]; | ||
1271 | 1271 | ||
1272 | if (current->personality & UNAME26) { | 1272 | if (current->personality & UNAME26) { |
1273 | char *rest = UTS_RELEASE; | 1273 | const char *rest = UTS_RELEASE; |
1274 | char buf[65] = { 0 }; | ||
1274 | int ndots = 0; | 1275 | int ndots = 0; |
1275 | unsigned v; | 1276 | unsigned v; |
1277 | size_t copy; | ||
1276 | 1278 | ||
1277 | while (*rest) { | 1279 | while (*rest) { |
1278 | if (*rest == '.' && ++ndots >= 3) | 1280 | if (*rest == '.' && ++ndots >= 3) |
@@ -1282,8 +1284,9 @@ static int override_release(char __user *release, int len) | |||
1282 | rest++; | 1284 | rest++; |
1283 | } | 1285 | } |
1284 | v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; | 1286 | v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; |
1285 | snprintf(buf, len, "2.6.%u%s", v, rest); | 1287 | copy = clamp_t(size_t, len, 1, sizeof(buf)); |
1286 | ret = copy_to_user(release, buf, len); | 1288 | copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); |
1289 | ret = copy_to_user(release, buf, copy + 1); | ||
1287 | } | 1290 | } |
1288 | return ret; | 1291 | return ret; |
1289 | } | 1292 | } |
@@ -1701,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1701 | utime = stime = 0; | 1704 | utime = stime = 0; |
1702 | 1705 | ||
1703 | if (who == RUSAGE_THREAD) { | 1706 | if (who == RUSAGE_THREAD) { |
1704 | task_times(current, &utime, &stime); | 1707 | task_cputime_adjusted(current, &utime, &stime); |
1705 | accumulate_thread_rusage(p, r); | 1708 | accumulate_thread_rusage(p, r); |
1706 | maxrss = p->signal->maxrss; | 1709 | maxrss = p->signal->maxrss; |
1707 | goto out; | 1710 | goto out; |
@@ -1727,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1727 | break; | 1730 | break; |
1728 | 1731 | ||
1729 | case RUSAGE_SELF: | 1732 | case RUSAGE_SELF: |
1730 | thread_group_times(p, &tgutime, &tgstime); | 1733 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
1731 | utime += tgutime; | 1734 | utime += tgutime; |
1732 | stime += tgstime; | 1735 | stime += tgstime; |
1733 | r->ru_nvcsw += p->signal->nvcsw; | 1736 | r->ru_nvcsw += p->signal->nvcsw; |
@@ -1788,15 +1791,15 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1788 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1791 | #ifdef CONFIG_CHECKPOINT_RESTORE |
1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1792 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) |
1790 | { | 1793 | { |
1791 | struct file *exe_file; | 1794 | struct fd exe; |
1792 | struct dentry *dentry; | 1795 | struct dentry *dentry; |
1793 | int err; | 1796 | int err; |
1794 | 1797 | ||
1795 | exe_file = fget(fd); | 1798 | exe = fdget(fd); |
1796 | if (!exe_file) | 1799 | if (!exe.file) |
1797 | return -EBADF; | 1800 | return -EBADF; |
1798 | 1801 | ||
1799 | dentry = exe_file->f_path.dentry; | 1802 | dentry = exe.file->f_path.dentry; |
1800 | 1803 | ||
1801 | /* | 1804 | /* |
1802 | * Because the original mm->exe_file points to executable file, make | 1805 | * Because the original mm->exe_file points to executable file, make |
@@ -1805,7 +1808,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1805 | */ | 1808 | */ |
1806 | err = -EACCES; | 1809 | err = -EACCES; |
1807 | if (!S_ISREG(dentry->d_inode->i_mode) || | 1810 | if (!S_ISREG(dentry->d_inode->i_mode) || |
1808 | exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) | 1811 | exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) |
1809 | goto exit; | 1812 | goto exit; |
1810 | 1813 | ||
1811 | err = inode_permission(dentry->d_inode, MAY_EXEC); | 1814 | err = inode_permission(dentry->d_inode, MAY_EXEC); |
@@ -1839,12 +1842,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1839 | goto exit_unlock; | 1842 | goto exit_unlock; |
1840 | 1843 | ||
1841 | err = 0; | 1844 | err = 0; |
1842 | set_mm_exe_file(mm, exe_file); | 1845 | set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ |
1843 | exit_unlock: | 1846 | exit_unlock: |
1844 | up_write(&mm->mmap_sem); | 1847 | up_write(&mm->mmap_sem); |
1845 | 1848 | ||
1846 | exit: | 1849 | exit: |
1847 | fput(exe_file); | 1850 | fdput(exe); |
1848 | return err; | 1851 | return err; |
1849 | } | 1852 | } |
1850 | 1853 | ||
@@ -2204,7 +2207,7 @@ static int __orderly_poweroff(void) | |||
2204 | return -ENOMEM; | 2207 | return -ENOMEM; |
2205 | } | 2208 | } |
2206 | 2209 | ||
2207 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, | 2210 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, |
2208 | NULL, argv_cleanup, NULL); | 2211 | NULL, argv_cleanup, NULL); |
2209 | if (ret == -ENOMEM) | 2212 | if (ret == -ENOMEM) |
2210 | argv_free(argv); | 2213 | argv_free(argv); |
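
The override_release() rewrite is a bounds fix: the old code formatted into a 65-byte stack buffer using the caller-supplied len and then copied len bytes back to user space, so a large len could overflow the buffer and leak adjacent kernel stack. The new code clamps to the buffer size and copies only what scnprintf() actually wrote, plus the terminating NUL. A user-space sketch of that clamp-and-copy logic; my_scnprintf() is a stand-in mirroring the kernel helper's return semantics, and the "47"/".0-rc1" values are arbitrary examples of the UNAME26 mapping described in the comment above:

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

#define clamp_t(type, val, lo, hi) \
	((type)(val) < (type)(lo) ? (type)(lo) : \
	 (type)(val) > (type)(hi) ? (type)(hi) : (type)(val))

/* Like the kernel's scnprintf(): returns the number of characters
 * actually stored (excluding the NUL), never more than size - 1. */
static size_t my_scnprintf(char *buf, size_t size, const char *fmt, ...)
{
	va_list ap;
	int i;

	va_start(ap, fmt);
	i = vsnprintf(buf, size, fmt, ap);
	va_end(ap);
	if (i < 0)
		return 0;
	return (size_t)i < size ? (size_t)i : size - 1;
}

int main(void)
{
	char buf[65] = { 0 };
	size_t len = 200;	/* caller-supplied length, may exceed buf */
	size_t copy;

	copy = clamp_t(size_t, len, 1, sizeof(buf));
	copy = my_scnprintf(buf, copy, "2.6.%u%s", 47, ".0-rc1");
	/* The kernel then does copy_to_user(release, buf, copy + 1). */
	printf("copies %zu bytes: \"%s\"\n", copy + 1, buf);
	return 0;
}
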
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index dbff751e4086..395084d4ce16 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -25,6 +25,7 @@ cond_syscall(sys_swapoff); | |||
25 | cond_syscall(sys_kexec_load); | 25 | cond_syscall(sys_kexec_load); |
26 | cond_syscall(compat_sys_kexec_load); | 26 | cond_syscall(compat_sys_kexec_load); |
27 | cond_syscall(sys_init_module); | 27 | cond_syscall(sys_init_module); |
28 | cond_syscall(sys_finit_module); | ||
28 | cond_syscall(sys_delete_module); | 29 | cond_syscall(sys_delete_module); |
29 | cond_syscall(sys_socketpair); | 30 | cond_syscall(sys_socketpair); |
30 | cond_syscall(sys_bind); | 31 | cond_syscall(sys_bind); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87174ef59161..c88878db491e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -97,10 +97,12 @@ | |||
97 | extern int sysctl_overcommit_memory; | 97 | extern int sysctl_overcommit_memory; |
98 | extern int sysctl_overcommit_ratio; | 98 | extern int sysctl_overcommit_ratio; |
99 | extern int max_threads; | 99 | extern int max_threads; |
100 | extern int core_uses_pid; | ||
101 | extern int suid_dumpable; | 100 | extern int suid_dumpable; |
101 | #ifdef CONFIG_COREDUMP | ||
102 | extern int core_uses_pid; | ||
102 | extern char core_pattern[]; | 103 | extern char core_pattern[]; |
103 | extern unsigned int core_pipe_limit; | 104 | extern unsigned int core_pipe_limit; |
105 | #endif | ||
104 | extern int pid_max; | 106 | extern int pid_max; |
105 | extern int min_free_kbytes; | 107 | extern int min_free_kbytes; |
106 | extern int pid_max_min, pid_max_max; | 108 | extern int pid_max_min, pid_max_max; |
@@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, | |||
177 | 179 | ||
178 | static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, | 180 | static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, |
179 | void __user *buffer, size_t *lenp, loff_t *ppos); | 181 | void __user *buffer, size_t *lenp, loff_t *ppos); |
182 | #ifdef CONFIG_COREDUMP | ||
180 | static int proc_dostring_coredump(struct ctl_table *table, int write, | 183 | static int proc_dostring_coredump(struct ctl_table *table, int write, |
181 | void __user *buffer, size_t *lenp, loff_t *ppos); | 184 | void __user *buffer, size_t *lenp, loff_t *ppos); |
185 | #endif | ||
182 | 186 | ||
183 | #ifdef CONFIG_MAGIC_SYSRQ | 187 | #ifdef CONFIG_MAGIC_SYSRQ |
184 | /* Note: sysrq code uses it's own private copy */ | 188 | /* Note: sysrq code uses it's own private copy */ |
@@ -252,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ | |||
252 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 256 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
253 | static int min_wakeup_granularity_ns; /* 0 usecs */ | 257 | static int min_wakeup_granularity_ns; /* 0 usecs */ |
254 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 258 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
259 | #ifdef CONFIG_SMP | ||
255 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; | 260 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; |
256 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; | 261 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; |
257 | #endif | 262 | #endif /* CONFIG_SMP */ |
263 | #endif /* CONFIG_SCHED_DEBUG */ | ||
258 | 264 | ||
259 | #ifdef CONFIG_COMPACTION | 265 | #ifdef CONFIG_COMPACTION |
260 | static int min_extfrag_threshold; | 266 | static int min_extfrag_threshold; |
@@ -297,6 +303,7 @@ static struct ctl_table kern_table[] = { | |||
297 | .extra1 = &min_wakeup_granularity_ns, | 303 | .extra1 = &min_wakeup_granularity_ns, |
298 | .extra2 = &max_wakeup_granularity_ns, | 304 | .extra2 = &max_wakeup_granularity_ns, |
299 | }, | 305 | }, |
306 | #ifdef CONFIG_SMP | ||
300 | { | 307 | { |
301 | .procname = "sched_tunable_scaling", | 308 | .procname = "sched_tunable_scaling", |
302 | .data = &sysctl_sched_tunable_scaling, | 309 | .data = &sysctl_sched_tunable_scaling, |
@@ -307,7 +314,7 @@ static struct ctl_table kern_table[] = { | |||
307 | .extra2 = &max_sched_tunable_scaling, | 314 | .extra2 = &max_sched_tunable_scaling, |
308 | }, | 315 | }, |
309 | { | 316 | { |
310 | .procname = "sched_migration_cost", | 317 | .procname = "sched_migration_cost_ns", |
311 | .data = &sysctl_sched_migration_cost, | 318 | .data = &sysctl_sched_migration_cost, |
312 | .maxlen = sizeof(unsigned int), | 319 | .maxlen = sizeof(unsigned int), |
313 | .mode = 0644, | 320 | .mode = 0644, |
@@ -321,14 +328,14 @@ static struct ctl_table kern_table[] = { | |||
321 | .proc_handler = proc_dointvec, | 328 | .proc_handler = proc_dointvec, |
322 | }, | 329 | }, |
323 | { | 330 | { |
324 | .procname = "sched_time_avg", | 331 | .procname = "sched_time_avg_ms", |
325 | .data = &sysctl_sched_time_avg, | 332 | .data = &sysctl_sched_time_avg, |
326 | .maxlen = sizeof(unsigned int), | 333 | .maxlen = sizeof(unsigned int), |
327 | .mode = 0644, | 334 | .mode = 0644, |
328 | .proc_handler = proc_dointvec, | 335 | .proc_handler = proc_dointvec, |
329 | }, | 336 | }, |
330 | { | 337 | { |
331 | .procname = "sched_shares_window", | 338 | .procname = "sched_shares_window_ns", |
332 | .data = &sysctl_sched_shares_window, | 339 | .data = &sysctl_sched_shares_window, |
333 | .maxlen = sizeof(unsigned int), | 340 | .maxlen = sizeof(unsigned int), |
334 | .mode = 0644, | 341 | .mode = 0644, |
@@ -343,7 +350,45 @@ static struct ctl_table kern_table[] = { | |||
343 | .extra1 = &zero, | 350 | .extra1 = &zero, |
344 | .extra2 = &one, | 351 | .extra2 = &one, |
345 | }, | 352 | }, |
346 | #endif | 353 | #endif /* CONFIG_SMP */ |
354 | #ifdef CONFIG_NUMA_BALANCING | ||
355 | { | ||
356 | .procname = "numa_balancing_scan_delay_ms", | ||
357 | .data = &sysctl_numa_balancing_scan_delay, | ||
358 | .maxlen = sizeof(unsigned int), | ||
359 | .mode = 0644, | ||
360 | .proc_handler = proc_dointvec, | ||
361 | }, | ||
362 | { | ||
363 | .procname = "numa_balancing_scan_period_min_ms", | ||
364 | .data = &sysctl_numa_balancing_scan_period_min, | ||
365 | .maxlen = sizeof(unsigned int), | ||
366 | .mode = 0644, | ||
367 | .proc_handler = proc_dointvec, | ||
368 | }, | ||
369 | { | ||
370 | .procname = "numa_balancing_scan_period_reset", | ||
371 | .data = &sysctl_numa_balancing_scan_period_reset, | ||
372 | .maxlen = sizeof(unsigned int), | ||
373 | .mode = 0644, | ||
374 | .proc_handler = proc_dointvec, | ||
375 | }, | ||
376 | { | ||
377 | .procname = "numa_balancing_scan_period_max_ms", | ||
378 | .data = &sysctl_numa_balancing_scan_period_max, | ||
379 | .maxlen = sizeof(unsigned int), | ||
380 | .mode = 0644, | ||
381 | .proc_handler = proc_dointvec, | ||
382 | }, | ||
383 | { | ||
384 | .procname = "numa_balancing_scan_size_mb", | ||
385 | .data = &sysctl_numa_balancing_scan_size, | ||
386 | .maxlen = sizeof(unsigned int), | ||
387 | .mode = 0644, | ||
388 | .proc_handler = proc_dointvec, | ||
389 | }, | ||
390 | #endif /* CONFIG_NUMA_BALANCING */ | ||
391 | #endif /* CONFIG_SCHED_DEBUG */ | ||
347 | { | 392 | { |
348 | .procname = "sched_rt_period_us", | 393 | .procname = "sched_rt_period_us", |
349 | .data = &sysctl_sched_rt_period, | 394 | .data = &sysctl_sched_rt_period, |
@@ -404,6 +449,7 @@ static struct ctl_table kern_table[] = { | |||
404 | .mode = 0644, | 449 | .mode = 0644, |
405 | .proc_handler = proc_dointvec, | 450 | .proc_handler = proc_dointvec, |
406 | }, | 451 | }, |
452 | #ifdef CONFIG_COREDUMP | ||
407 | { | 453 | { |
408 | .procname = "core_uses_pid", | 454 | .procname = "core_uses_pid", |
409 | .data = &core_uses_pid, | 455 | .data = &core_uses_pid, |
@@ -425,6 +471,7 @@ static struct ctl_table kern_table[] = { | |||
425 | .mode = 0644, | 471 | .mode = 0644, |
426 | .proc_handler = proc_dointvec, | 472 | .proc_handler = proc_dointvec, |
427 | }, | 473 | }, |
474 | #endif | ||
428 | #ifdef CONFIG_PROC_SYSCTL | 475 | #ifdef CONFIG_PROC_SYSCTL |
429 | { | 476 | { |
430 | .procname = "tainted", | 477 | .procname = "tainted", |
@@ -559,7 +606,7 @@ static struct ctl_table kern_table[] = { | |||
559 | .extra2 = &one, | 606 | .extra2 = &one, |
560 | }, | 607 | }, |
561 | #endif | 608 | #endif |
562 | #ifdef CONFIG_HOTPLUG | 609 | |
563 | { | 610 | { |
564 | .procname = "hotplug", | 611 | .procname = "hotplug", |
565 | .data = &uevent_helper, | 612 | .data = &uevent_helper, |
@@ -567,7 +614,7 @@ static struct ctl_table kern_table[] = { | |||
567 | .mode = 0644, | 614 | .mode = 0644, |
568 | .proc_handler = proc_dostring, | 615 | .proc_handler = proc_dostring, |
569 | }, | 616 | }, |
570 | #endif | 617 | |
571 | #ifdef CONFIG_CHR_DEV_SG | 618 | #ifdef CONFIG_CHR_DEV_SG |
572 | { | 619 | { |
573 | .procname = "sg-big-buff", | 620 | .procname = "sg-big-buff", |
@@ -1543,8 +1590,7 @@ static struct ctl_table fs_table[] = { | |||
1543 | }; | 1590 | }; |
1544 | 1591 | ||
1545 | static struct ctl_table debug_table[] = { | 1592 | static struct ctl_table debug_table[] = { |
1546 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ | 1593 | #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE |
1547 | defined(CONFIG_S390) || defined(CONFIG_TILE) | ||
1548 | { | 1594 | { |
1549 | .procname = "exception-trace", | 1595 | .procname = "exception-trace", |
1550 | .data = &show_unhandled_signals, | 1596 | .data = &show_unhandled_signals, |
@@ -2036,12 +2082,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, | |||
2036 | 2082 | ||
2037 | static void validate_coredump_safety(void) | 2083 | static void validate_coredump_safety(void) |
2038 | { | 2084 | { |
2085 | #ifdef CONFIG_COREDUMP | ||
2039 | if (suid_dumpable == SUID_DUMPABLE_SAFE && | 2086 | if (suid_dumpable == SUID_DUMPABLE_SAFE && |
2040 | core_pattern[0] != '/' && core_pattern[0] != '|') { | 2087 | core_pattern[0] != '/' && core_pattern[0] != '|') { |
2041 | printk(KERN_WARNING "Unsafe core_pattern used with "\ | 2088 | printk(KERN_WARNING "Unsafe core_pattern used with "\ |
2042 | "suid_dumpable=2. Pipe handler or fully qualified "\ | 2089 | "suid_dumpable=2. Pipe handler or fully qualified "\ |
2043 | "core dump path required.\n"); | 2090 | "core dump path required.\n"); |
2044 | } | 2091 | } |
2092 | #endif | ||
2045 | } | 2093 | } |
2046 | 2094 | ||
2047 | static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, | 2095 | static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, |
@@ -2053,6 +2101,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, | |||
2053 | return error; | 2101 | return error; |
2054 | } | 2102 | } |
2055 | 2103 | ||
2104 | #ifdef CONFIG_COREDUMP | ||
2056 | static int proc_dostring_coredump(struct ctl_table *table, int write, | 2105 | static int proc_dostring_coredump(struct ctl_table *table, int write, |
2057 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2106 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2058 | { | 2107 | { |
@@ -2061,6 +2110,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, | |||
2061 | validate_coredump_safety(); | 2110 | validate_coredump_safety(); |
2062 | return error; | 2111 | return error; |
2063 | } | 2112 | } |
2113 | #endif | ||
2064 | 2114 | ||
2065 | static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, | 2115 | static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, |
2066 | void __user *buffer, | 2116 | void __user *buffer, |
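The sysctl.c hunks above rename several scheduler knobs so the unit is part of the name (sched_migration_cost_ns, sched_time_avg_ms, sched_shares_window_ns) and add the NUMA-balancing entries. As a minimal, hedged sketch (not part of the patch) of what this looks like from userspace, assuming only that the table entries land under /proc/sys/kernel as usual:

#include <stdio.h>

int main(void)
{
	unsigned int cost_ns = 0;
	FILE *f = fopen("/proc/sys/kernel/sched_migration_cost_ns", "r");

	if (!f)
		return 1;	/* e.g. !CONFIG_SMP, or a kernel still using the old name */
	if (fscanf(f, "%u", &cost_ns) == 1)
		printf("sched_migration_cost_ns = %u\n", cost_ns);
	fclose(f);
	return 0;
}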
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 65bdcf198d4e..5a6384450501 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1344 | goto out_putname; | 1344 | goto out_putname; |
1345 | } | 1345 | } |
1346 | 1346 | ||
1347 | mnt = current->nsproxy->pid_ns->proc_mnt; | 1347 | mnt = task_active_pid_ns(current)->proc_mnt; |
1348 | file = file_open_root(mnt->mnt_root, mnt, pathname, flags); | 1348 | file = file_open_root(mnt->mnt_root, mnt, pathname, flags); |
1349 | result = PTR_ERR(file); | 1349 | result = PTR_ERR(file); |
1350 | if (IS_ERR(file)) | 1350 | if (IS_ERR(file)) |
diff --git a/kernel/task_work.c b/kernel/task_work.c index d320d44903bd..65bd3c92d6f3 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c | |||
@@ -2,26 +2,20 @@ | |||
2 | #include <linux/task_work.h> | 2 | #include <linux/task_work.h> |
3 | #include <linux/tracehook.h> | 3 | #include <linux/tracehook.h> |
4 | 4 | ||
5 | static struct callback_head work_exited; /* all we need is ->next == NULL */ | ||
6 | |||
5 | int | 7 | int |
6 | task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) | 8 | task_work_add(struct task_struct *task, struct callback_head *work, bool notify) |
7 | { | 9 | { |
8 | struct callback_head *last, *first; | 10 | struct callback_head *head; |
9 | unsigned long flags; | ||
10 | 11 | ||
11 | /* | 12 | do { |
12 | * Not inserting the new work if the task has already passed | 13 | head = ACCESS_ONCE(task->task_works); |
13 | * exit_task_work() is the responisbility of callers. | 14 | if (unlikely(head == &work_exited)) |
14 | */ | 15 | return -ESRCH; |
15 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 16 | work->next = head; |
16 | last = task->task_works; | 17 | } while (cmpxchg(&task->task_works, head, work) != head); |
17 | first = last ? last->next : twork; | ||
18 | twork->next = first; | ||
19 | if (last) | ||
20 | last->next = twork; | ||
21 | task->task_works = twork; | ||
22 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
23 | 18 | ||
24 | /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ | ||
25 | if (notify) | 19 | if (notify) |
26 | set_notify_resume(task); | 20 | set_notify_resume(task); |
27 | return 0; | 21 | return 0; |
@@ -30,52 +24,69 @@ task_work_add(struct task_struct *task, struct callback_head *twork, bool notify | |||
30 | struct callback_head * | 24 | struct callback_head * |
31 | task_work_cancel(struct task_struct *task, task_work_func_t func) | 25 | task_work_cancel(struct task_struct *task, task_work_func_t func) |
32 | { | 26 | { |
27 | struct callback_head **pprev = &task->task_works; | ||
28 | struct callback_head *work = NULL; | ||
33 | unsigned long flags; | 29 | unsigned long flags; |
34 | struct callback_head *last, *res = NULL; | 30 | /* |
35 | 31 | * If cmpxchg() fails we continue without updating pprev. | |
32 | * Either we raced with task_work_add() which added the | ||
33 | * new entry before this work, we will find it again. Or | ||
34 | * we raced with task_work_run(), *pprev == NULL/exited. | ||
35 | */ | ||
36 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 36 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
37 | last = task->task_works; | 37 | while ((work = ACCESS_ONCE(*pprev))) { |
38 | if (last) { | 38 | read_barrier_depends(); |
39 | struct callback_head *q = last, *p = q->next; | 39 | if (work->func != func) |
40 | while (1) { | 40 | pprev = &work->next; |
41 | if (p->func == func) { | 41 | else if (cmpxchg(pprev, work, work->next) == work) |
42 | q->next = p->next; | 42 | break; |
43 | if (p == last) | ||
44 | task->task_works = q == p ? NULL : q; | ||
45 | res = p; | ||
46 | break; | ||
47 | } | ||
48 | if (p == last) | ||
49 | break; | ||
50 | q = p; | ||
51 | p = q->next; | ||
52 | } | ||
53 | } | 43 | } |
54 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 44 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
55 | return res; | 45 | |
46 | return work; | ||
56 | } | 47 | } |
57 | 48 | ||
58 | void task_work_run(void) | 49 | void task_work_run(void) |
59 | { | 50 | { |
60 | struct task_struct *task = current; | 51 | struct task_struct *task = current; |
61 | struct callback_head *p, *q; | 52 | struct callback_head *work, *head, *next; |
53 | |||
54 | for (;;) { | ||
55 | /* | ||
56 | * work->func() can do task_work_add(), do not set | ||
57 | * work_exited unless the list is empty. | ||
58 | */ | ||
59 | do { | ||
60 | work = ACCESS_ONCE(task->task_works); | ||
61 | head = !work && (task->flags & PF_EXITING) ? | ||
62 | &work_exited : NULL; | ||
63 | } while (cmpxchg(&task->task_works, work, head) != work); | ||
62 | 64 | ||
63 | while (1) { | 65 | if (!work) |
64 | raw_spin_lock_irq(&task->pi_lock); | 66 | break; |
65 | p = task->task_works; | 67 | /* |
66 | task->task_works = NULL; | 68 | * Synchronize with task_work_cancel(). It can't remove |
67 | raw_spin_unlock_irq(&task->pi_lock); | 69 | * the first entry == work, cmpxchg(task_works) should |
70 | * fail, but it can play with *work and other entries. | ||
71 | */ | ||
72 | raw_spin_unlock_wait(&task->pi_lock); | ||
73 | smp_mb(); | ||
68 | 74 | ||
69 | if (unlikely(!p)) | 75 | /* Reverse the list to run the works in fifo order */ |
70 | return; | 76 | head = NULL; |
77 | do { | ||
78 | next = work->next; | ||
79 | work->next = head; | ||
80 | head = work; | ||
81 | work = next; | ||
82 | } while (work); | ||
71 | 83 | ||
72 | q = p->next; /* head */ | 84 | work = head; |
73 | p->next = NULL; /* cut it */ | 85 | do { |
74 | while (q) { | 86 | next = work->next; |
75 | p = q->next; | 87 | work->func(work); |
76 | q->func(q); | 88 | work = next; |
77 | q = p; | ||
78 | cond_resched(); | 89 | cond_resched(); |
79 | } | 90 | } while (work); |
80 | } | 91 | } |
81 | } | 92 | } |
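For context, a minimal and purely illustrative caller of the lockless task_work_add() shown above. The init_task_work() helper and callback_head layout are assumed from <linux/task_work.h>; the -ESRCH return matches the work_exited check in the hunk.

#include <linux/task_work.h>
#include <linux/sched.h>
#include <linux/printk.h>

static void my_callback(struct callback_head *head)
{
	/* invoked from task_work_run(), in the target task's own context */
	pr_info("task_work ran for %s\n", current->comm);
}

static struct callback_head my_work;

static int queue_my_work(void)
{
	init_task_work(&my_work, my_callback);
	/* notify=true raises TIF_NOTIFY_RESUME so the work runs soon */
	return task_work_add(current, &my_work, true);	/* -ESRCH if the task is exiting */
}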
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d0a32796550f..145bb4d3bd4d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/cgroup.h> | 27 | #include <linux/cgroup.h> |
28 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
29 | #include <linux/file.h> | 29 | #include <linux/file.h> |
30 | #include <linux/pid_namespace.h> | ||
30 | #include <net/genetlink.h> | 31 | #include <net/genetlink.h> |
31 | #include <linux/atomic.h> | 32 | #include <linux/atomic.h> |
32 | 33 | ||
@@ -174,7 +175,9 @@ static void send_cpu_listeners(struct sk_buff *skb, | |||
174 | up_write(&listeners->sem); | 175 | up_write(&listeners->sem); |
175 | } | 176 | } |
176 | 177 | ||
177 | static void fill_stats(struct task_struct *tsk, struct taskstats *stats) | 178 | static void fill_stats(struct user_namespace *user_ns, |
179 | struct pid_namespace *pid_ns, | ||
180 | struct task_struct *tsk, struct taskstats *stats) | ||
178 | { | 181 | { |
179 | memset(stats, 0, sizeof(*stats)); | 182 | memset(stats, 0, sizeof(*stats)); |
180 | /* | 183 | /* |
@@ -190,7 +193,7 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats) | |||
190 | stats->version = TASKSTATS_VERSION; | 193 | stats->version = TASKSTATS_VERSION; |
191 | stats->nvcsw = tsk->nvcsw; | 194 | stats->nvcsw = tsk->nvcsw; |
192 | stats->nivcsw = tsk->nivcsw; | 195 | stats->nivcsw = tsk->nivcsw; |
193 | bacct_add_tsk(stats, tsk); | 196 | bacct_add_tsk(user_ns, pid_ns, stats, tsk); |
194 | 197 | ||
195 | /* fill in extended acct fields */ | 198 | /* fill in extended acct fields */ |
196 | xacct_add_tsk(stats, tsk); | 199 | xacct_add_tsk(stats, tsk); |
@@ -207,7 +210,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) | |||
207 | rcu_read_unlock(); | 210 | rcu_read_unlock(); |
208 | if (!tsk) | 211 | if (!tsk) |
209 | return -ESRCH; | 212 | return -ESRCH; |
210 | fill_stats(tsk, stats); | 213 | fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); |
211 | put_task_struct(tsk); | 214 | put_task_struct(tsk); |
212 | return 0; | 215 | return 0; |
213 | } | 216 | } |
@@ -291,6 +294,12 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) | |||
291 | if (!cpumask_subset(mask, cpu_possible_mask)) | 294 | if (!cpumask_subset(mask, cpu_possible_mask)) |
292 | return -EINVAL; | 295 | return -EINVAL; |
293 | 296 | ||
297 | if (current_user_ns() != &init_user_ns) | ||
298 | return -EINVAL; | ||
299 | |||
300 | if (task_active_pid_ns(current) != &init_pid_ns) | ||
301 | return -EINVAL; | ||
302 | |||
294 | if (isadd == REGISTER) { | 303 | if (isadd == REGISTER) { |
295 | for_each_cpu(cpu, mask) { | 304 | for_each_cpu(cpu, mask) { |
296 | s = kmalloc_node(sizeof(struct listener), | 305 | s = kmalloc_node(sizeof(struct listener), |
@@ -415,16 +424,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
415 | struct nlattr *na; | 424 | struct nlattr *na; |
416 | size_t size; | 425 | size_t size; |
417 | u32 fd; | 426 | u32 fd; |
418 | struct file *file; | 427 | struct fd f; |
419 | int fput_needed; | ||
420 | 428 | ||
421 | na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; | 429 | na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; |
422 | if (!na) | 430 | if (!na) |
423 | return -EINVAL; | 431 | return -EINVAL; |
424 | 432 | ||
425 | fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); | 433 | fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); |
426 | file = fget_light(fd, &fput_needed); | 434 | f = fdget(fd); |
427 | if (!file) | 435 | if (!f.file) |
428 | return 0; | 436 | return 0; |
429 | 437 | ||
430 | size = nla_total_size(sizeof(struct cgroupstats)); | 438 | size = nla_total_size(sizeof(struct cgroupstats)); |
@@ -437,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
437 | na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, | 445 | na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, |
438 | sizeof(struct cgroupstats)); | 446 | sizeof(struct cgroupstats)); |
439 | if (na == NULL) { | 447 | if (na == NULL) { |
448 | nlmsg_free(rep_skb); | ||
440 | rc = -EMSGSIZE; | 449 | rc = -EMSGSIZE; |
441 | goto err; | 450 | goto err; |
442 | } | 451 | } |
@@ -444,7 +453,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
444 | stats = nla_data(na); | 453 | stats = nla_data(na); |
445 | memset(stats, 0, sizeof(*stats)); | 454 | memset(stats, 0, sizeof(*stats)); |
446 | 455 | ||
447 | rc = cgroupstats_build(stats, file->f_dentry); | 456 | rc = cgroupstats_build(stats, f.file->f_dentry); |
448 | if (rc < 0) { | 457 | if (rc < 0) { |
449 | nlmsg_free(rep_skb); | 458 | nlmsg_free(rep_skb); |
450 | goto err; | 459 | goto err; |
@@ -453,7 +462,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
453 | rc = send_reply(rep_skb, info); | 462 | rc = send_reply(rep_skb, info); |
454 | 463 | ||
455 | err: | 464 | err: |
456 | fput_light(file, fput_needed); | 465 | fdput(f); |
457 | return rc; | 466 | return rc; |
458 | } | 467 | } |
459 | 468 | ||
@@ -467,7 +476,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info) | |||
467 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); | 476 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); |
468 | if (rc < 0) | 477 | if (rc < 0) |
469 | goto out; | 478 | goto out; |
470 | rc = add_del_listener(info->snd_pid, mask, REGISTER); | 479 | rc = add_del_listener(info->snd_portid, mask, REGISTER); |
471 | out: | 480 | out: |
472 | free_cpumask_var(mask); | 481 | free_cpumask_var(mask); |
473 | return rc; | 482 | return rc; |
@@ -483,7 +492,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info) | |||
483 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); | 492 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); |
484 | if (rc < 0) | 493 | if (rc < 0) |
485 | goto out; | 494 | goto out; |
486 | rc = add_del_listener(info->snd_pid, mask, DEREGISTER); | 495 | rc = add_del_listener(info->snd_portid, mask, DEREGISTER); |
487 | out: | 496 | out: |
488 | free_cpumask_var(mask); | 497 | free_cpumask_var(mask); |
489 | return rc; | 498 | return rc; |
@@ -631,11 +640,12 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
631 | if (rc < 0) | 640 | if (rc < 0) |
632 | return; | 641 | return; |
633 | 642 | ||
634 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); | 643 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, |
644 | task_pid_nr_ns(tsk, &init_pid_ns)); | ||
635 | if (!stats) | 645 | if (!stats) |
636 | goto err; | 646 | goto err; |
637 | 647 | ||
638 | fill_stats(tsk, stats); | 648 | fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); |
639 | 649 | ||
640 | /* | 650 | /* |
641 | * Doesn't matter if tsk is the leader or the last group member leaving | 651 | * Doesn't matter if tsk is the leader or the last group member leaving |
@@ -643,7 +653,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
643 | if (!is_thread_group || !group_dead) | 653 | if (!is_thread_group || !group_dead) |
644 | goto send; | 654 | goto send; |
645 | 655 | ||
646 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); | 656 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, |
657 | task_tgid_nr_ns(tsk, &init_pid_ns)); | ||
647 | if (!stats) | 658 | if (!stats) |
648 | goto err; | 659 | goto err; |
649 | 660 | ||
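The cgroupstats change above is part of the fget_light()/fput_light() to fdget()/fdput() conversion. The general shape of that pattern is sketched below; the surrounding function is hypothetical, and only struct fd, fdget() and fdput() come from the hunk.

#include <linux/file.h>
#include <linux/errno.h>

static int use_user_fd(unsigned int ufd)
{
	struct fd f = fdget(ufd);

	if (!f.file)
		return -EBADF;	/* cgroupstats_user_cmd() returns 0 here instead */
	/* ... use f.file, e.g. f.file->f_dentry as in the hunk above ... */
	fdput(f);	/* drops the reference only if fdget() actually took one */
	return 0;
}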
diff --git a/kernel/time.c b/kernel/time.c index ba744cf80696..d226c6a3fd28 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -30,7 +30,7 @@ | |||
30 | #include <linux/export.h> | 30 | #include <linux/export.h> |
31 | #include <linux/timex.h> | 31 | #include <linux/timex.h> |
32 | #include <linux/capability.h> | 32 | #include <linux/capability.h> |
33 | #include <linux/clocksource.h> | 33 | #include <linux/timekeeper_internal.h> |
34 | #include <linux/errno.h> | 34 | #include <linux/errno.h> |
35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
36 | #include <linux/security.h> | 36 | #include <linux/security.h> |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fd42bd452b75..8601f0db1261 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA | |||
16 | config GENERIC_TIME_VSYSCALL | 16 | config GENERIC_TIME_VSYSCALL |
17 | bool | 17 | bool |
18 | 18 | ||
19 | # Timekeeping vsyscall support | ||
20 | config GENERIC_TIME_VSYSCALL_OLD | ||
21 | bool | ||
22 | |||
19 | # ktime_t scalar 64bit nsec representation | 23 | # ktime_t scalar 64bit nsec representation |
20 | config KTIME_SCALAR | 24 | config KTIME_SCALAR |
21 | bool | 25 | bool |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e2fd74b8e8c2..ff7d9d2ab504 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,4 +1,4 @@ | |||
1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o |
2 | obj-y += timeconv.o posix-clock.o alarmtimer.o | 2 | obj-y += timeconv.o posix-clock.o alarmtimer.o |
3 | 3 | ||
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index aa27d391bfc8..f11d83b12949 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -37,7 +37,6 @@ | |||
37 | static struct alarm_base { | 37 | static struct alarm_base { |
38 | spinlock_t lock; | 38 | spinlock_t lock; |
39 | struct timerqueue_head timerqueue; | 39 | struct timerqueue_head timerqueue; |
40 | struct hrtimer timer; | ||
41 | ktime_t (*gettime)(void); | 40 | ktime_t (*gettime)(void); |
42 | clockid_t base_clockid; | 41 | clockid_t base_clockid; |
43 | } alarm_bases[ALARM_NUMTYPE]; | 42 | } alarm_bases[ALARM_NUMTYPE]; |
@@ -46,6 +45,8 @@ static struct alarm_base { | |||
46 | static ktime_t freezer_delta; | 45 | static ktime_t freezer_delta; |
47 | static DEFINE_SPINLOCK(freezer_delta_lock); | 46 | static DEFINE_SPINLOCK(freezer_delta_lock); |
48 | 47 | ||
48 | static struct wakeup_source *ws; | ||
49 | |||
49 | #ifdef CONFIG_RTC_CLASS | 50 | #ifdef CONFIG_RTC_CLASS |
50 | /* rtc timer and device for setting alarm wakeups at suspend */ | 51 | /* rtc timer and device for setting alarm wakeups at suspend */ |
51 | static struct rtc_timer rtctimer; | 52 | static struct rtc_timer rtctimer; |
@@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { } | |||
130 | * @base: pointer to the base where the timer is being run | 131 | * @base: pointer to the base where the timer is being run |
131 | * @alarm: pointer to alarm being enqueued. | 132 | * @alarm: pointer to alarm being enqueued. |
132 | * | 133 | * |
133 | * Adds alarm to an alarm_base timerqueue and if necessary sets | 134 | * Adds alarm to an alarm_base timerqueue |
134 | * an hrtimer to run. | ||
135 | * | 135 | * |
136 | * Must hold base->lock when calling. | 136 | * Must hold base->lock when calling. |
137 | */ | 137 | */ |
138 | static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) | 138 | static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) |
139 | { | 139 | { |
140 | if (alarm->state & ALARMTIMER_STATE_ENQUEUED) | ||
141 | timerqueue_del(&base->timerqueue, &alarm->node); | ||
142 | |||
140 | timerqueue_add(&base->timerqueue, &alarm->node); | 143 | timerqueue_add(&base->timerqueue, &alarm->node); |
141 | alarm->state |= ALARMTIMER_STATE_ENQUEUED; | 144 | alarm->state |= ALARMTIMER_STATE_ENQUEUED; |
142 | |||
143 | if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { | ||
144 | hrtimer_try_to_cancel(&base->timer); | ||
145 | hrtimer_start(&base->timer, alarm->node.expires, | ||
146 | HRTIMER_MODE_ABS); | ||
147 | } | ||
148 | } | 145 | } |
149 | 146 | ||
150 | /** | 147 | /** |
151 | * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue | 148 | * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue |
152 | * @base: pointer to the base where the timer is running | 149 | * @base: pointer to the base where the timer is running |
153 | * @alarm: pointer to alarm being removed | 150 | * @alarm: pointer to alarm being removed |
154 | * | 151 | * |
155 | * Removes alarm from an alarm_base timerqueue and if necessary sets | 152 | * Removes alarm from an alarm_base timerqueue |
156 | * a new timer to run. | ||
157 | * | 153 | * |
158 | * Must hold base->lock when calling. | 154 | * Must hold base->lock when calling. |
159 | */ | 155 | */ |
160 | static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) | 156 | static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) |
161 | { | 157 | { |
162 | struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); | ||
163 | |||
164 | if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) | 158 | if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) |
165 | return; | 159 | return; |
166 | 160 | ||
167 | timerqueue_del(&base->timerqueue, &alarm->node); | 161 | timerqueue_del(&base->timerqueue, &alarm->node); |
168 | alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; | 162 | alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; |
169 | |||
170 | if (next == &alarm->node) { | ||
171 | hrtimer_try_to_cancel(&base->timer); | ||
172 | next = timerqueue_getnext(&base->timerqueue); | ||
173 | if (!next) | ||
174 | return; | ||
175 | hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); | ||
176 | } | ||
177 | } | 163 | } |
178 | 164 | ||
179 | 165 | ||
@@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) | |||
188 | */ | 174 | */ |
189 | static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | 175 | static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) |
190 | { | 176 | { |
191 | struct alarm_base *base = container_of(timer, struct alarm_base, timer); | 177 | struct alarm *alarm = container_of(timer, struct alarm, timer); |
192 | struct timerqueue_node *next; | 178 | struct alarm_base *base = &alarm_bases[alarm->type]; |
193 | unsigned long flags; | 179 | unsigned long flags; |
194 | ktime_t now; | ||
195 | int ret = HRTIMER_NORESTART; | 180 | int ret = HRTIMER_NORESTART; |
196 | int restart = ALARMTIMER_NORESTART; | 181 | int restart = ALARMTIMER_NORESTART; |
197 | 182 | ||
198 | spin_lock_irqsave(&base->lock, flags); | 183 | spin_lock_irqsave(&base->lock, flags); |
199 | now = base->gettime(); | 184 | alarmtimer_dequeue(base, alarm); |
200 | while ((next = timerqueue_getnext(&base->timerqueue))) { | 185 | spin_unlock_irqrestore(&base->lock, flags); |
201 | struct alarm *alarm; | ||
202 | ktime_t expired = next->expires; | ||
203 | |||
204 | if (expired.tv64 > now.tv64) | ||
205 | break; | ||
206 | |||
207 | alarm = container_of(next, struct alarm, node); | ||
208 | |||
209 | timerqueue_del(&base->timerqueue, &alarm->node); | ||
210 | alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; | ||
211 | |||
212 | alarm->state |= ALARMTIMER_STATE_CALLBACK; | ||
213 | spin_unlock_irqrestore(&base->lock, flags); | ||
214 | if (alarm->function) | ||
215 | restart = alarm->function(alarm, now); | ||
216 | spin_lock_irqsave(&base->lock, flags); | ||
217 | alarm->state &= ~ALARMTIMER_STATE_CALLBACK; | ||
218 | 186 | ||
219 | if (restart != ALARMTIMER_NORESTART) { | 187 | if (alarm->function) |
220 | timerqueue_add(&base->timerqueue, &alarm->node); | 188 | restart = alarm->function(alarm, base->gettime()); |
221 | alarm->state |= ALARMTIMER_STATE_ENQUEUED; | ||
222 | } | ||
223 | } | ||
224 | 189 | ||
225 | if (next) { | 190 | spin_lock_irqsave(&base->lock, flags); |
226 | hrtimer_set_expires(&base->timer, next->expires); | 191 | if (restart != ALARMTIMER_NORESTART) { |
192 | hrtimer_set_expires(&alarm->timer, alarm->node.expires); | ||
193 | alarmtimer_enqueue(base, alarm); | ||
227 | ret = HRTIMER_RESTART; | 194 | ret = HRTIMER_RESTART; |
228 | } | 195 | } |
229 | spin_unlock_irqrestore(&base->lock, flags); | 196 | spin_unlock_irqrestore(&base->lock, flags); |
@@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev) | |||
250 | unsigned long flags; | 217 | unsigned long flags; |
251 | struct rtc_device *rtc; | 218 | struct rtc_device *rtc; |
252 | int i; | 219 | int i; |
220 | int ret; | ||
253 | 221 | ||
254 | spin_lock_irqsave(&freezer_delta_lock, flags); | 222 | spin_lock_irqsave(&freezer_delta_lock, flags); |
255 | min = freezer_delta; | 223 | min = freezer_delta; |
@@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev) | |||
279 | if (min.tv64 == 0) | 247 | if (min.tv64 == 0) |
280 | return 0; | 248 | return 0; |
281 | 249 | ||
282 | /* XXX - Should we enforce a minimum sleep time? */ | 250 | if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { |
283 | WARN_ON(min.tv64 < NSEC_PER_SEC); | 251 | __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); |
252 | return -EBUSY; | ||
253 | } | ||
284 | 254 | ||
285 | /* Setup an rtc timer to fire that far in the future */ | 255 | /* Setup an rtc timer to fire that far in the future */ |
286 | rtc_timer_cancel(rtc, &rtctimer); | 256 | rtc_timer_cancel(rtc, &rtctimer); |
@@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev) | |||
288 | now = rtc_tm_to_ktime(tm); | 258 | now = rtc_tm_to_ktime(tm); |
289 | now = ktime_add(now, min); | 259 | now = ktime_add(now, min); |
290 | 260 | ||
291 | rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); | 261 | /* Set alarm, if in the past reject suspend briefly to handle */ |
292 | 262 | ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); | |
293 | return 0; | 263 | if (ret < 0) |
264 | __pm_wakeup_event(ws, MSEC_PER_SEC); | ||
265 | return ret; | ||
294 | } | 266 | } |
295 | #else | 267 | #else |
296 | static int alarmtimer_suspend(struct device *dev) | 268 | static int alarmtimer_suspend(struct device *dev) |
@@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, | |||
324 | enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) | 296 | enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) |
325 | { | 297 | { |
326 | timerqueue_init(&alarm->node); | 298 | timerqueue_init(&alarm->node); |
299 | hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, | ||
300 | HRTIMER_MODE_ABS); | ||
301 | alarm->timer.function = alarmtimer_fired; | ||
327 | alarm->function = function; | 302 | alarm->function = function; |
328 | alarm->type = type; | 303 | alarm->type = type; |
329 | alarm->state = ALARMTIMER_STATE_INACTIVE; | 304 | alarm->state = ALARMTIMER_STATE_INACTIVE; |
@@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, | |||
334 | * @alarm: ptr to alarm to set | 309 | * @alarm: ptr to alarm to set |
335 | * @start: time to run the alarm | 310 | * @start: time to run the alarm |
336 | */ | 311 | */ |
337 | void alarm_start(struct alarm *alarm, ktime_t start) | 312 | int alarm_start(struct alarm *alarm, ktime_t start) |
338 | { | 313 | { |
339 | struct alarm_base *base = &alarm_bases[alarm->type]; | 314 | struct alarm_base *base = &alarm_bases[alarm->type]; |
340 | unsigned long flags; | 315 | unsigned long flags; |
316 | int ret; | ||
341 | 317 | ||
342 | spin_lock_irqsave(&base->lock, flags); | 318 | spin_lock_irqsave(&base->lock, flags); |
343 | if (alarmtimer_active(alarm)) | ||
344 | alarmtimer_remove(base, alarm); | ||
345 | alarm->node.expires = start; | 319 | alarm->node.expires = start; |
346 | alarmtimer_enqueue(base, alarm); | 320 | alarmtimer_enqueue(base, alarm); |
321 | ret = hrtimer_start(&alarm->timer, alarm->node.expires, | ||
322 | HRTIMER_MODE_ABS); | ||
347 | spin_unlock_irqrestore(&base->lock, flags); | 323 | spin_unlock_irqrestore(&base->lock, flags); |
324 | return ret; | ||
348 | } | 325 | } |
349 | 326 | ||
350 | /** | 327 | /** |
@@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm) | |||
358 | { | 335 | { |
359 | struct alarm_base *base = &alarm_bases[alarm->type]; | 336 | struct alarm_base *base = &alarm_bases[alarm->type]; |
360 | unsigned long flags; | 337 | unsigned long flags; |
361 | int ret = -1; | 338 | int ret; |
362 | spin_lock_irqsave(&base->lock, flags); | ||
363 | |||
364 | if (alarmtimer_callback_running(alarm)) | ||
365 | goto out; | ||
366 | 339 | ||
367 | if (alarmtimer_is_queued(alarm)) { | 340 | spin_lock_irqsave(&base->lock, flags); |
368 | alarmtimer_remove(base, alarm); | 341 | ret = hrtimer_try_to_cancel(&alarm->timer); |
369 | ret = 1; | 342 | if (ret >= 0) |
370 | } else | 343 | alarmtimer_dequeue(base, alarm); |
371 | ret = 0; | ||
372 | out: | ||
373 | spin_unlock_irqrestore(&base->lock, flags); | 344 | spin_unlock_irqrestore(&base->lock, flags); |
374 | return ret; | 345 | return ret; |
375 | } | 346 | } |
@@ -802,10 +773,6 @@ static int __init alarmtimer_init(void) | |||
802 | for (i = 0; i < ALARM_NUMTYPE; i++) { | 773 | for (i = 0; i < ALARM_NUMTYPE; i++) { |
803 | timerqueue_init_head(&alarm_bases[i].timerqueue); | 774 | timerqueue_init_head(&alarm_bases[i].timerqueue); |
804 | spin_lock_init(&alarm_bases[i].lock); | 775 | spin_lock_init(&alarm_bases[i].lock); |
805 | hrtimer_init(&alarm_bases[i].timer, | ||
806 | alarm_bases[i].base_clockid, | ||
807 | HRTIMER_MODE_ABS); | ||
808 | alarm_bases[i].timer.function = alarmtimer_fired; | ||
809 | } | 776 | } |
810 | 777 | ||
811 | error = alarmtimer_rtc_interface_setup(); | 778 | error = alarmtimer_rtc_interface_setup(); |
@@ -821,6 +788,7 @@ static int __init alarmtimer_init(void) | |||
821 | error = PTR_ERR(pdev); | 788 | error = PTR_ERR(pdev); |
822 | goto out_drv; | 789 | goto out_drv; |
823 | } | 790 | } |
791 | ws = wakeup_source_register("alarmtimer"); | ||
824 | return 0; | 792 | return 0; |
825 | 793 | ||
826 | out_drv: | 794 | out_drv: |
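With the rework above, each struct alarm owns its hrtimer and alarm_start() now returns the hrtimer_start() result. A hedged usage sketch follows; ALARM_REALTIME, ktime_get_real() and ktime_set() are assumed from the wider alarmtimer/ktime API rather than from this hunk.

#include <linux/alarmtimer.h>
#include <linux/ktime.h>

static enum alarmtimer_restart my_alarm_fired(struct alarm *a, ktime_t now)
{
	/* one-shot: returning NORESTART keeps the alarm dequeued */
	return ALARMTIMER_NORESTART;
}

static struct alarm my_alarm;

static int arm_five_seconds_out(void)
{
	alarm_init(&my_alarm, ALARM_REALTIME, my_alarm_fired);
	/* absolute expiry: five seconds past the current wall-clock time */
	return alarm_start(&my_alarm,
			   ktime_add(ktime_get_real(), ktime_set(5, 0)));
}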
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 7e1ce012a851..30b6de0d977c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -397,6 +397,30 @@ void clockevents_exchange_device(struct clock_event_device *old, | |||
397 | local_irq_restore(flags); | 397 | local_irq_restore(flags); |
398 | } | 398 | } |
399 | 399 | ||
400 | /** | ||
401 | * clockevents_suspend - suspend clock devices | ||
402 | */ | ||
403 | void clockevents_suspend(void) | ||
404 | { | ||
405 | struct clock_event_device *dev; | ||
406 | |||
407 | list_for_each_entry_reverse(dev, &clockevent_devices, list) | ||
408 | if (dev->suspend) | ||
409 | dev->suspend(dev); | ||
410 | } | ||
411 | |||
412 | /** | ||
413 | * clockevents_resume - resume clock devices | ||
414 | */ | ||
415 | void clockevents_resume(void) | ||
416 | { | ||
417 | struct clock_event_device *dev; | ||
418 | |||
419 | list_for_each_entry(dev, &clockevent_devices, list) | ||
420 | if (dev->resume) | ||
421 | dev->resume(dev); | ||
422 | } | ||
423 | |||
400 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 424 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
401 | /** | 425 | /** |
402 | * clockevents_notify - notification about relevant events | 426 | * clockevents_notify - notification about relevant events |
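The new clockevents_suspend()/clockevents_resume() helpers simply walk clockevent_devices and call per-device hooks, so a driver opts in by filling in ->suspend and ->resume. The device below is purely illustrative, a sketch rather than any real driver.

#include <linux/clockchips.h>

static void my_ce_suspend(struct clock_event_device *dev)
{
	/* quiesce the timer hardware and save whatever state resume needs */
}

static void my_ce_resume(struct clock_event_device *dev)
{
	/* reprogram the timer hardware from the saved state */
}

static struct clock_event_device my_clockevent = {
	.name		= "my-timer",
	.features	= CLOCK_EVT_FEAT_ONESHOT,
	.suspend	= my_ce_suspend,
	.resume		= my_ce_resume,
};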
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 46da0537c10b..7a925ba456fb 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -37,7 +37,7 @@ | |||
37 | * requested HZ value. It is also not recommended | 37 | * requested HZ value. It is also not recommended |
38 | * for "tick-less" systems. | 38 | * for "tick-less" systems. |
39 | */ | 39 | */ |
40 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) | 40 | #define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) |
41 | 41 | ||
42 | /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier | 42 | /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier |
43 | * conversion, the .shift value could be zero. However | 43 | * conversion, the .shift value could be zero. However |
@@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs) | |||
58 | return (cycle_t) jiffies; | 58 | return (cycle_t) jiffies; |
59 | } | 59 | } |
60 | 60 | ||
61 | struct clocksource clocksource_jiffies = { | 61 | static struct clocksource clocksource_jiffies = { |
62 | .name = "jiffies", | 62 | .name = "jiffies", |
63 | .rating = 1, /* lowest valid rating*/ | 63 | .rating = 1, /* lowest valid rating*/ |
64 | .read = jiffies_read, | 64 | .read = jiffies_read, |
@@ -67,6 +67,8 @@ struct clocksource clocksource_jiffies = { | |||
67 | .shift = JIFFIES_SHIFT, | 67 | .shift = JIFFIES_SHIFT, |
68 | }; | 68 | }; |
69 | 69 | ||
70 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | ||
71 | |||
70 | #if (BITS_PER_LONG < 64) | 72 | #if (BITS_PER_LONG < 64) |
71 | u64 get_jiffies_64(void) | 73 | u64 get_jiffies_64(void) |
72 | { | 74 | { |
@@ -74,9 +76,9 @@ u64 get_jiffies_64(void) | |||
74 | u64 ret; | 76 | u64 ret; |
75 | 77 | ||
76 | do { | 78 | do { |
77 | seq = read_seqbegin(&xtime_lock); | 79 | seq = read_seqbegin(&jiffies_lock); |
78 | ret = jiffies_64; | 80 | ret = jiffies_64; |
79 | } while (read_seqretry(&xtime_lock, seq)); | 81 | } while (read_seqretry(&jiffies_lock, seq)); |
80 | return ret; | 82 | return ret; |
81 | } | 83 | } |
82 | EXPORT_SYMBOL(get_jiffies_64); | 84 | EXPORT_SYMBOL(get_jiffies_64); |
@@ -95,3 +97,33 @@ struct clocksource * __init __weak clocksource_default_clock(void) | |||
95 | { | 97 | { |
96 | return &clocksource_jiffies; | 98 | return &clocksource_jiffies; |
97 | } | 99 | } |
100 | |||
101 | struct clocksource refined_jiffies; | ||
102 | |||
103 | int register_refined_jiffies(long cycles_per_second) | ||
104 | { | ||
105 | u64 nsec_per_tick, shift_hz; | ||
106 | long cycles_per_tick; | ||
107 | |||
108 | |||
109 | |||
110 | refined_jiffies = clocksource_jiffies; | ||
111 | refined_jiffies.name = "refined-jiffies"; | ||
112 | refined_jiffies.rating++; | ||
113 | |||
114 | /* Calc cycles per tick */ | ||
115 | cycles_per_tick = (cycles_per_second + HZ/2)/HZ; | ||
116 | /* shift_hz stores hz<<8 for extra accuracy */ | ||
117 | shift_hz = (u64)cycles_per_second << 8; | ||
118 | shift_hz += cycles_per_tick/2; | ||
119 | do_div(shift_hz, cycles_per_tick); | ||
120 | /* Calculate nsec_per_tick using shift_hz */ | ||
121 | nsec_per_tick = (u64)NSEC_PER_SEC << 8; | ||
122 | nsec_per_tick += (u32)shift_hz/2; | ||
123 | do_div(nsec_per_tick, (u32)shift_hz); | ||
124 | |||
125 | refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; | ||
126 | |||
127 | clocksource_register(&refined_jiffies); | ||
128 | return 0; | ||
129 | } | ||
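register_refined_jiffies() above builds a second, slightly better-rated jiffies clocksource whose mult reflects the real tick length. As a rough worked example (an assumption, not from the patch): with a 1193182 Hz interrupt source and HZ=1000, cycles_per_tick comes out to 1193, shift_hz to about 1000.15 Hz in <<8 fixed point, and nsec_per_tick to roughly 999848 ns, which is then shifted up by JIFFIES_SHIFT to form the mult. A hypothetical arch-side call might look like:

#include <linux/init.h>

extern int register_refined_jiffies(long cycles_per_second);	/* added in the hunk above */

/* e.g. called from the architecture's late time init (illustrative only) */
static int __init refine_jiffies_from_pit(void)
{
	/* illustrative rate: the i8253 PIT input clock */
	return register_refined_jiffies(1193182);
}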
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index da6c9ecad4e4..b1600a6973f4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -63,13 +63,13 @@ int tick_is_oneshot_available(void) | |||
63 | static void tick_periodic(int cpu) | 63 | static void tick_periodic(int cpu) |
64 | { | 64 | { |
65 | if (tick_do_timer_cpu == cpu) { | 65 | if (tick_do_timer_cpu == cpu) { |
66 | write_seqlock(&xtime_lock); | 66 | write_seqlock(&jiffies_lock); |
67 | 67 | ||
68 | /* Keep track of the next tick event */ | 68 | /* Keep track of the next tick event */ |
69 | tick_next_period = ktime_add(tick_next_period, tick_period); | 69 | tick_next_period = ktime_add(tick_next_period, tick_period); |
70 | 70 | ||
71 | do_timer(1); | 71 | do_timer(1); |
72 | write_sequnlock(&xtime_lock); | 72 | write_sequnlock(&jiffies_lock); |
73 | } | 73 | } |
74 | 74 | ||
75 | update_process_times(user_mode(get_irq_regs())); | 75 | update_process_times(user_mode(get_irq_regs())); |
@@ -130,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
130 | ktime_t next; | 130 | ktime_t next; |
131 | 131 | ||
132 | do { | 132 | do { |
133 | seq = read_seqbegin(&xtime_lock); | 133 | seq = read_seqbegin(&jiffies_lock); |
134 | next = tick_next_period; | 134 | next = tick_next_period; |
135 | } while (read_seqretry(&xtime_lock, seq)); | 135 | } while (read_seqretry(&jiffies_lock, seq)); |
136 | 136 | ||
137 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 137 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); |
138 | 138 | ||
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 4e265b901fed..cf3e59ed6dc0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -141,4 +141,3 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) | |||
141 | #endif | 141 | #endif |
142 | 142 | ||
143 | extern void do_timer(unsigned long ticks); | 143 | extern void do_timer(unsigned long ticks); |
144 | extern seqlock_t xtime_lock; | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3a9e5d5c1091..d58e552d9fd1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -31,7 +31,7 @@ | |||
31 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 31 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); |
32 | 32 | ||
33 | /* | 33 | /* |
34 | * The time, when the last jiffy update happened. Protected by xtime_lock. | 34 | * The time, when the last jiffy update happened. Protected by jiffies_lock. |
35 | */ | 35 | */ |
36 | static ktime_t last_jiffies_update; | 36 | static ktime_t last_jiffies_update; |
37 | 37 | ||
@@ -49,14 +49,14 @@ static void tick_do_update_jiffies64(ktime_t now) | |||
49 | ktime_t delta; | 49 | ktime_t delta; |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * Do a quick check without holding xtime_lock: | 52 | * Do a quick check without holding jiffies_lock: |
53 | */ | 53 | */ |
54 | delta = ktime_sub(now, last_jiffies_update); | 54 | delta = ktime_sub(now, last_jiffies_update); |
55 | if (delta.tv64 < tick_period.tv64) | 55 | if (delta.tv64 < tick_period.tv64) |
56 | return; | 56 | return; |
57 | 57 | ||
58 | /* Reevaluate with xtime_lock held */ | 58 | /* Reevaluate with jiffies_lock held */ |
59 | write_seqlock(&xtime_lock); | 59 | write_seqlock(&jiffies_lock); |
60 | 60 | ||
61 | delta = ktime_sub(now, last_jiffies_update); | 61 | delta = ktime_sub(now, last_jiffies_update); |
62 | if (delta.tv64 >= tick_period.tv64) { | 62 | if (delta.tv64 >= tick_period.tv64) { |
@@ -79,7 +79,7 @@ static void tick_do_update_jiffies64(ktime_t now) | |||
79 | /* Keep the tick_next_period variable up to date */ | 79 | /* Keep the tick_next_period variable up to date */ |
80 | tick_next_period = ktime_add(last_jiffies_update, tick_period); | 80 | tick_next_period = ktime_add(last_jiffies_update, tick_period); |
81 | } | 81 | } |
82 | write_sequnlock(&xtime_lock); | 82 | write_sequnlock(&jiffies_lock); |
83 | } | 83 | } |
84 | 84 | ||
85 | /* | 85 | /* |
@@ -89,15 +89,58 @@ static ktime_t tick_init_jiffy_update(void) | |||
89 | { | 89 | { |
90 | ktime_t period; | 90 | ktime_t period; |
91 | 91 | ||
92 | write_seqlock(&xtime_lock); | 92 | write_seqlock(&jiffies_lock); |
93 | /* Did we start the jiffies update yet ? */ | 93 | /* Did we start the jiffies update yet ? */ |
94 | if (last_jiffies_update.tv64 == 0) | 94 | if (last_jiffies_update.tv64 == 0) |
95 | last_jiffies_update = tick_next_period; | 95 | last_jiffies_update = tick_next_period; |
96 | period = last_jiffies_update; | 96 | period = last_jiffies_update; |
97 | write_sequnlock(&xtime_lock); | 97 | write_sequnlock(&jiffies_lock); |
98 | return period; | 98 | return period; |
99 | } | 99 | } |
100 | 100 | ||
101 | |||
102 | static void tick_sched_do_timer(ktime_t now) | ||
103 | { | ||
104 | int cpu = smp_processor_id(); | ||
105 | |||
106 | #ifdef CONFIG_NO_HZ | ||
107 | /* | ||
108 | * Check if the do_timer duty was dropped. We don't care about | ||
109 | * concurrency: This happens only when the cpu in charge went | ||
110 | * into a long sleep. If two cpus happen to assign themselves to | ||
111 | * this duty, then the jiffies update is still serialized by | ||
112 | * jiffies_lock. | ||
113 | */ | ||
114 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | ||
115 | tick_do_timer_cpu = cpu; | ||
116 | #endif | ||
117 | |||
118 | /* Check, if the jiffies need an update */ | ||
119 | if (tick_do_timer_cpu == cpu) | ||
120 | tick_do_update_jiffies64(now); | ||
121 | } | ||
122 | |||
123 | static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | ||
124 | { | ||
125 | #ifdef CONFIG_NO_HZ | ||
126 | /* | ||
127 | * When we are idle and the tick is stopped, we have to touch | ||
128 | * the watchdog as we might not schedule for a really long | ||
129 | * time. This happens on complete idle SMP systems while | ||
130 | * waiting on the login prompt. We also increment the "start of | ||
131 | * idle" jiffy stamp so the idle accounting adjustment we do | ||
132 | * when we go busy again does not account too many ticks. | ||
133 | */ | ||
134 | if (ts->tick_stopped) { | ||
135 | touch_softlockup_watchdog(); | ||
136 | if (is_idle_task(current)) | ||
137 | ts->idle_jiffies++; | ||
138 | } | ||
139 | #endif | ||
140 | update_process_times(user_mode(regs)); | ||
141 | profile_tick(CPU_PROFILING); | ||
142 | } | ||
143 | |||
101 | /* | 144 | /* |
102 | * NOHZ - aka dynamic tick functionality | 145 | * NOHZ - aka dynamic tick functionality |
103 | */ | 146 | */ |
@@ -282,11 +325,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
282 | 325 | ||
283 | /* Read jiffies and the time when jiffies were updated last */ | 326 | /* Read jiffies and the time when jiffies were updated last */ |
284 | do { | 327 | do { |
285 | seq = read_seqbegin(&xtime_lock); | 328 | seq = read_seqbegin(&jiffies_lock); |
286 | last_update = last_jiffies_update; | 329 | last_update = last_jiffies_update; |
287 | last_jiffies = jiffies; | 330 | last_jiffies = jiffies; |
288 | time_delta = timekeeping_max_deferment(); | 331 | time_delta = timekeeping_max_deferment(); |
289 | } while (read_seqretry(&xtime_lock, seq)); | 332 | } while (read_seqretry(&jiffies_lock, seq)); |
290 | 333 | ||
291 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || | 334 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || |
292 | arch_needs_cpu(cpu)) { | 335 | arch_needs_cpu(cpu)) { |
@@ -372,7 +415,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
372 | * the scheduler tick in nohz_restart_sched_tick. | 415 | * the scheduler tick in nohz_restart_sched_tick. |
373 | */ | 416 | */ |
374 | if (!ts->tick_stopped) { | 417 | if (!ts->tick_stopped) { |
375 | select_nohz_load_balancer(1); | 418 | nohz_balance_enter_idle(cpu); |
376 | calc_load_enter_idle(); | 419 | calc_load_enter_idle(); |
377 | 420 | ||
378 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 421 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
@@ -436,7 +479,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
436 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { | 479 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { |
437 | static int ratelimit; | 480 | static int ratelimit; |
438 | 481 | ||
439 | if (ratelimit < 10) { | 482 | if (ratelimit < 10 && |
483 | (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { | ||
440 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | 484 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", |
441 | (unsigned int) local_softirq_pending()); | 485 | (unsigned int) local_softirq_pending()); |
442 | ratelimit++; | 486 | ratelimit++; |
@@ -525,6 +569,8 @@ void tick_nohz_irq_exit(void) | |||
525 | if (!ts->inidle) | 569 | if (!ts->inidle) |
526 | return; | 570 | return; |
527 | 571 | ||
572 | /* Cancel the timer because the CPU has already woken up from the C-states */ | ||
573 | menu_hrtimer_cancel(); | ||
528 | __tick_nohz_idle_enter(ts); | 574 | __tick_nohz_idle_enter(ts); |
529 | } | 575 | } |
530 | 576 | ||
@@ -569,7 +615,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
569 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | 615 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) |
570 | { | 616 | { |
571 | /* Update jiffies first */ | 617 | /* Update jiffies first */ |
572 | select_nohz_load_balancer(0); | ||
573 | tick_do_update_jiffies64(now); | 618 | tick_do_update_jiffies64(now); |
574 | update_cpu_load_nohz(); | 619 | update_cpu_load_nohz(); |
575 | 620 | ||
@@ -621,6 +666,8 @@ void tick_nohz_idle_exit(void) | |||
621 | 666 | ||
622 | ts->inidle = 0; | 667 | ts->inidle = 0; |
623 | 668 | ||
669 | /* Cancel the timer because the CPU has already woken up from the C-states */ | ||
670 | menu_hrtimer_cancel(); | ||
624 | if (ts->idle_active || ts->tick_stopped) | 671 | if (ts->idle_active || ts->tick_stopped) |
625 | now = ktime_get(); | 672 | now = ktime_get(); |
626 | 673 | ||
@@ -648,40 +695,12 @@ static void tick_nohz_handler(struct clock_event_device *dev) | |||
648 | { | 695 | { |
649 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 696 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
650 | struct pt_regs *regs = get_irq_regs(); | 697 | struct pt_regs *regs = get_irq_regs(); |
651 | int cpu = smp_processor_id(); | ||
652 | ktime_t now = ktime_get(); | 698 | ktime_t now = ktime_get(); |
653 | 699 | ||
654 | dev->next_event.tv64 = KTIME_MAX; | 700 | dev->next_event.tv64 = KTIME_MAX; |
655 | 701 | ||
656 | /* | 702 | tick_sched_do_timer(now); |
657 | * Check if the do_timer duty was dropped. We don't care about | 703 | tick_sched_handle(ts, regs); |
658 | * concurrency: This happens only when the cpu in charge went | ||
659 | * into a long sleep. If two cpus happen to assign themself to | ||
660 | * this duty, then the jiffies update is still serialized by | ||
661 | * xtime_lock. | ||
662 | */ | ||
663 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | ||
664 | tick_do_timer_cpu = cpu; | ||
665 | |||
666 | /* Check, if the jiffies need an update */ | ||
667 | if (tick_do_timer_cpu == cpu) | ||
668 | tick_do_update_jiffies64(now); | ||
669 | |||
670 | /* | ||
671 | * When we are idle and the tick is stopped, we have to touch | ||
672 | * the watchdog as we might not schedule for a really long | ||
673 | * time. This happens on complete idle SMP systems while | ||
674 | * waiting on the login prompt. We also increment the "start | ||
675 | * of idle" jiffy stamp so the idle accounting adjustment we | ||
676 | * do when we go busy again does not account too much ticks. | ||
677 | */ | ||
678 | if (ts->tick_stopped) { | ||
679 | touch_softlockup_watchdog(); | ||
680 | ts->idle_jiffies++; | ||
681 | } | ||
682 | |||
683 | update_process_times(user_mode(regs)); | ||
684 | profile_tick(CPU_PROFILING); | ||
685 | 704 | ||
686 | while (tick_nohz_reprogram(ts, now)) { | 705 | while (tick_nohz_reprogram(ts, now)) { |
687 | now = ktime_get(); | 706 | now = ktime_get(); |
@@ -794,7 +813,7 @@ void tick_check_idle(int cpu) | |||
794 | #ifdef CONFIG_HIGH_RES_TIMERS | 813 | #ifdef CONFIG_HIGH_RES_TIMERS |
795 | /* | 814 | /* |
796 | * We rearm the timer until we get disabled by the idle code. | 815 | * We rearm the timer until we get disabled by the idle code. |
797 | * Called with interrupts disabled and timer->base->cpu_base->lock held. | 816 | * Called with interrupts disabled. |
798 | */ | 817 | */ |
799 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | 818 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) |
800 | { | 819 | { |
@@ -802,45 +821,15 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
802 | container_of(timer, struct tick_sched, sched_timer); | 821 | container_of(timer, struct tick_sched, sched_timer); |
803 | struct pt_regs *regs = get_irq_regs(); | 822 | struct pt_regs *regs = get_irq_regs(); |
804 | ktime_t now = ktime_get(); | 823 | ktime_t now = ktime_get(); |
805 | int cpu = smp_processor_id(); | ||
806 | 824 | ||
807 | #ifdef CONFIG_NO_HZ | 825 | tick_sched_do_timer(now); |
808 | /* | ||
809 | * Check if the do_timer duty was dropped. We don't care about | ||
810 | * concurrency: This happens only when the cpu in charge went | ||
811 | * into a long sleep. If two cpus happen to assign themself to | ||
812 | * this duty, then the jiffies update is still serialized by | ||
813 | * xtime_lock. | ||
814 | */ | ||
815 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | ||
816 | tick_do_timer_cpu = cpu; | ||
817 | #endif | ||
818 | |||
819 | /* Check, if the jiffies need an update */ | ||
820 | if (tick_do_timer_cpu == cpu) | ||
821 | tick_do_update_jiffies64(now); | ||
822 | 826 | ||
823 | /* | 827 | /* |
824 | * Do not call, when we are not in irq context and have | 828 | * Do not call, when we are not in irq context and have |
825 | * no valid regs pointer | 829 | * no valid regs pointer |
826 | */ | 830 | */ |
827 | if (regs) { | 831 | if (regs) |
828 | /* | 832 | tick_sched_handle(ts, regs); |
829 | * When we are idle and the tick is stopped, we have to touch | ||
830 | * the watchdog as we might not schedule for a really long | ||
831 | * time. This happens on complete idle SMP systems while | ||
832 | * waiting on the login prompt. We also increment the "start of | ||
833 | * idle" jiffy stamp so the idle accounting adjustment we do | ||
834 | * when we go busy again does not account too much ticks. | ||
835 | */ | ||
836 | if (ts->tick_stopped) { | ||
837 | touch_softlockup_watchdog(); | ||
838 | if (idle_cpu(cpu)) | ||
839 | ts->idle_jiffies++; | ||
840 | } | ||
841 | update_process_times(user_mode(regs)); | ||
842 | profile_tick(CPU_PROFILING); | ||
843 | } | ||
844 | 833 | ||
845 | hrtimer_forward(timer, now, tick_period); | 834 | hrtimer_forward(timer, now, tick_period); |
846 | 835 | ||
@@ -874,7 +863,7 @@ void tick_setup_sched_timer(void) | |||
874 | /* Get the next period (per cpu) */ | 863 | /* Get the next period (per cpu) */ |
875 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 864 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
876 | 865 | ||
877 | /* Offset the tick to avert xtime_lock contention. */ | 866 | /* Offset the tick to avert jiffies_lock contention. */ |
878 | if (sched_skew_tick) { | 867 | if (sched_skew_tick) { |
879 | u64 offset = ktime_to_ns(tick_period) >> 1; | 868 | u64 offset = ktime_to_ns(tick_period) >> 1; |
880 | do_div(offset, num_possible_cpus()); | 869 | do_div(offset, num_possible_cpus()); |
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c deleted file mode 100644 index a9ae369925ce..000000000000 --- a/kernel/time/timecompare.c +++ /dev/null | |||
@@ -1,193 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2009 Intel Corporation. | ||
3 | * Author: Patrick Ohly <patrick.ohly@intel.com> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
18 | */ | ||
19 | |||
20 | #include <linux/timecompare.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/slab.h> | ||
23 | #include <linux/math64.h> | ||
24 | #include <linux/kernel.h> | ||
25 | |||
26 | /* | ||
27 | * fixed point arithmetic scale factor for skew | ||
28 | * | ||
29 | * Usually one would measure skew in ppb (parts per billion, 1e9), but | ||
30 | * using a factor of 2 simplifies the math. | ||
31 | */ | ||
32 | #define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) | ||
33 | |||
34 | ktime_t timecompare_transform(struct timecompare *sync, | ||
35 | u64 source_tstamp) | ||
36 | { | ||
37 | u64 nsec; | ||
38 | |||
39 | nsec = source_tstamp + sync->offset; | ||
40 | nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / | ||
41 | TIMECOMPARE_SKEW_RESOLUTION; | ||
42 | |||
43 | return ns_to_ktime(nsec); | ||
44 | } | ||
45 | EXPORT_SYMBOL_GPL(timecompare_transform); | ||
46 | |||
47 | int timecompare_offset(struct timecompare *sync, | ||
48 | s64 *offset, | ||
49 | u64 *source_tstamp) | ||
50 | { | ||
51 | u64 start_source = 0, end_source = 0; | ||
52 | struct { | ||
53 | s64 offset; | ||
54 | s64 duration_target; | ||
55 | } buffer[10], sample, *samples; | ||
56 | int counter = 0, i; | ||
57 | int used; | ||
58 | int index; | ||
59 | int num_samples = sync->num_samples; | ||
60 | |||
61 | if (num_samples > ARRAY_SIZE(buffer)) { | ||
62 | samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); | ||
63 | if (!samples) { | ||
64 | samples = buffer; | ||
65 | num_samples = ARRAY_SIZE(buffer); | ||
66 | } | ||
67 | } else { | ||
68 | samples = buffer; | ||
69 | } | ||
70 | |||
71 | /* run until we have enough valid samples, but do not try forever */ | ||
72 | i = 0; | ||
73 | counter = 0; | ||
74 | while (1) { | ||
75 | u64 ts; | ||
76 | ktime_t start, end; | ||
77 | |||
78 | start = sync->target(); | ||
79 | ts = timecounter_read(sync->source); | ||
80 | end = sync->target(); | ||
81 | |||
82 | if (!i) | ||
83 | start_source = ts; | ||
84 | |||
85 | /* ignore negative durations */ | ||
86 | sample.duration_target = ktime_to_ns(ktime_sub(end, start)); | ||
87 | if (sample.duration_target >= 0) { | ||
88 | /* | ||
89 | * assume symmetric delay to and from source: | ||
90 | * average target time corresponds to measured | ||
91 | * source time | ||
92 | */ | ||
93 | sample.offset = | ||
94 | (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - | ||
95 | ts; | ||
96 | |||
97 | /* simple insertion sort based on duration */ | ||
98 | index = counter - 1; | ||
99 | while (index >= 0) { | ||
100 | if (samples[index].duration_target < | ||
101 | sample.duration_target) | ||
102 | break; | ||
103 | samples[index + 1] = samples[index]; | ||
104 | index--; | ||
105 | } | ||
106 | samples[index + 1] = sample; | ||
107 | counter++; | ||
108 | } | ||
109 | |||
110 | i++; | ||
111 | if (counter >= num_samples || i >= 100000) { | ||
112 | end_source = ts; | ||
113 | break; | ||
114 | } | ||
115 | } | ||
116 | |||
117 | *source_tstamp = (end_source + start_source) / 2; | ||
118 | |||
119 | /* remove outliers by only using 75% of the samples */ | ||
120 | used = counter * 3 / 4; | ||
121 | if (!used) | ||
122 | used = counter; | ||
123 | if (used) { | ||
124 | /* calculate average */ | ||
125 | s64 off = 0; | ||
126 | for (index = 0; index < used; index++) | ||
127 | off += samples[index].offset; | ||
128 | *offset = div_s64(off, used); | ||
129 | } | ||
130 | |||
131 | if (samples && samples != buffer) | ||
132 | kfree(samples); | ||
133 | |||
134 | return used; | ||
135 | } | ||
136 | EXPORT_SYMBOL_GPL(timecompare_offset); | ||
137 | |||
138 | void __timecompare_update(struct timecompare *sync, | ||
139 | u64 source_tstamp) | ||
140 | { | ||
141 | s64 offset; | ||
142 | u64 average_time; | ||
143 | |||
144 | if (!timecompare_offset(sync, &offset, &average_time)) | ||
145 | return; | ||
146 | |||
147 | if (!sync->last_update) { | ||
148 | sync->last_update = average_time; | ||
149 | sync->offset = offset; | ||
150 | sync->skew = 0; | ||
151 | } else { | ||
152 | s64 delta_nsec = average_time - sync->last_update; | ||
153 | |||
154 | /* avoid division by negative or small deltas */ | ||
155 | if (delta_nsec >= 10000) { | ||
156 | s64 delta_offset_nsec = offset - sync->offset; | ||
157 | s64 skew; /* delta_offset_nsec * | ||
158 | TIMECOMPARE_SKEW_RESOLUTION / | ||
159 | delta_nsec */ | ||
160 | u64 divisor; | ||
161 | |||
162 | /* div_s64() is limited to 32 bit divisor */ | ||
163 | skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; | ||
164 | divisor = delta_nsec; | ||
165 | while (unlikely(divisor >= ((s64)1) << 32)) { | ||
166 | /* divide both by 2; beware, right shift | ||
167 | of negative value has undefined | ||
168 | behavior and can only be used for | ||
169 | the positive divisor */ | ||
170 | skew = div_s64(skew, 2); | ||
171 | divisor >>= 1; | ||
172 | } | ||
173 | skew = div_s64(skew, divisor); | ||
174 | |||
175 | /* | ||
176 | * Calculate new overall skew as 4/16 the | ||
177 | * old value and 12/16 the new one. This is | ||
178 | * a rather arbitrary tradeoff between | ||
179 | * only using the latest measurement (0/16 and | ||
180 | * 16/16) and even more weight on past measurements. | ||
181 | */ | ||
182 | #define TIMECOMPARE_NEW_SKEW_PER_16 12 | ||
183 | sync->skew = | ||
184 | div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * | ||
185 | sync->skew + | ||
186 | TIMECOMPARE_NEW_SKEW_PER_16 * skew, | ||
187 | 16); | ||
188 | sync->last_update = average_time; | ||
189 | sync->offset = offset; | ||
190 | } | ||
191 | } | ||
192 | } | ||
193 | EXPORT_SYMBOL_GPL(__timecompare_update); | ||
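The deleted timecompare code estimated the offset between a hardware "source" clock and the system "target" clock by bracketing one source read with two target reads and assuming symmetric delay. A minimal user-space sketch of that estimation step, using CLOCK_MONOTONIC and CLOCK_REALTIME purely as stand-ins for sync->target() and timecounter_read():

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    static int64_t now_ns(clockid_t id)
    {
        struct timespec ts;

        clock_gettime(id, &ts);
        return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    int main(void)
    {
        /* stand-ins: "target" = CLOCK_MONOTONIC, "source" = CLOCK_REALTIME */
        int64_t start = now_ns(CLOCK_MONOTONIC);
        int64_t ts    = now_ns(CLOCK_REALTIME);
        int64_t end   = now_ns(CLOCK_MONOTONIC);

        if (end >= start)   /* ignore negative durations, as the kernel code did */
            printf("offset estimate: %lld ns\n",
                   (long long)((start + end) / 2 - ts));
        return 0;
    }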
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d3b91e75cecd..cbc6acb0db3f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -8,6 +8,7 @@ | |||
8 | * | 8 | * |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/timekeeper_internal.h> | ||
11 | #include <linux/module.h> | 12 | #include <linux/module.h> |
12 | #include <linux/interrupt.h> | 13 | #include <linux/interrupt.h> |
13 | #include <linux/percpu.h> | 14 | #include <linux/percpu.h> |
@@ -20,71 +21,11 @@ | |||
20 | #include <linux/time.h> | 21 | #include <linux/time.h> |
21 | #include <linux/tick.h> | 22 | #include <linux/tick.h> |
22 | #include <linux/stop_machine.h> | 23 | #include <linux/stop_machine.h> |
24 | #include <linux/pvclock_gtod.h> | ||
23 | 25 | ||
24 | /* Structure holding internal timekeeping values. */ | ||
25 | struct timekeeper { | ||
26 | /* Current clocksource used for timekeeping. */ | ||
27 | struct clocksource *clock; | ||
28 | /* NTP adjusted clock multiplier */ | ||
29 | u32 mult; | ||
30 | /* The shift value of the current clocksource. */ | ||
31 | u32 shift; | ||
32 | /* Number of clock cycles in one NTP interval. */ | ||
33 | cycle_t cycle_interval; | ||
34 | /* Number of clock shifted nano seconds in one NTP interval. */ | ||
35 | u64 xtime_interval; | ||
36 | /* shifted nano seconds left over when rounding cycle_interval */ | ||
37 | s64 xtime_remainder; | ||
38 | /* Raw nano seconds accumulated per NTP interval. */ | ||
39 | u32 raw_interval; | ||
40 | |||
41 | /* Current CLOCK_REALTIME time in seconds */ | ||
42 | u64 xtime_sec; | ||
43 | /* Clock shifted nano seconds */ | ||
44 | u64 xtime_nsec; | ||
45 | |||
46 | /* Difference between accumulated time and NTP time in ntp | ||
47 | * shifted nano seconds. */ | ||
48 | s64 ntp_error; | ||
49 | /* Shift conversion between clock shifted nano seconds and | ||
50 | * ntp shifted nano seconds. */ | ||
51 | u32 ntp_error_shift; | ||
52 | |||
53 | /* | ||
54 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected | ||
55 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged | ||
56 | * at zero at system boot time, so wall_to_monotonic will be negative, | ||
57 | * however, we will ALWAYS keep the tv_nsec part positive so we can use | ||
58 | * the usual normalization. | ||
59 | * | ||
60 | * wall_to_monotonic is moved after resume from suspend for the | ||
61 | * monotonic time not to jump. We need to add total_sleep_time to | ||
62 | * wall_to_monotonic to get the real boot based time offset. | ||
63 | * | ||
64 | * - wall_to_monotonic is no longer the boot time, getboottime must be | ||
65 | * used instead. | ||
66 | */ | ||
67 | struct timespec wall_to_monotonic; | ||
68 | /* Offset clock monotonic -> clock realtime */ | ||
69 | ktime_t offs_real; | ||
70 | /* time spent in suspend */ | ||
71 | struct timespec total_sleep_time; | ||
72 | /* Offset clock monotonic -> clock boottime */ | ||
73 | ktime_t offs_boot; | ||
74 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ | ||
75 | struct timespec raw_time; | ||
76 | /* Seqlock for all timekeeper values */ | ||
77 | seqlock_t lock; | ||
78 | }; | ||
79 | 26 | ||
80 | static struct timekeeper timekeeper; | 27 | static struct timekeeper timekeeper; |
81 | 28 | ||
82 | /* | ||
83 | * This read-write spinlock protects us from races in SMP while | ||
84 | * playing with xtime. | ||
85 | */ | ||
86 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | ||
87 | |||
88 | /* flag for if timekeeping is suspended */ | 29 | /* flag for if timekeeping is suspended */ |
89 | int __read_mostly timekeeping_suspended; | 30 | int __read_mostly timekeeping_suspended; |
90 | 31 | ||
@@ -96,15 +37,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) | |||
96 | } | 37 | } |
97 | } | 38 | } |
98 | 39 | ||
99 | static struct timespec tk_xtime(struct timekeeper *tk) | ||
100 | { | ||
101 | struct timespec ts; | ||
102 | |||
103 | ts.tv_sec = tk->xtime_sec; | ||
104 | ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); | ||
105 | return ts; | ||
106 | } | ||
107 | |||
108 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) | 40 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) |
109 | { | 41 | { |
110 | tk->xtime_sec = ts->tv_sec; | 42 | tk->xtime_sec = ts->tv_sec; |
@@ -243,17 +175,63 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
243 | return nsec + arch_gettimeoffset(); | 175 | return nsec + arch_gettimeoffset(); |
244 | } | 176 | } |
245 | 177 | ||
178 | static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); | ||
179 | |||
180 | static void update_pvclock_gtod(struct timekeeper *tk) | ||
181 | { | ||
182 | raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); | ||
183 | } | ||
184 | |||
185 | /** | ||
186 | * pvclock_gtod_register_notifier - register a pvclock timedata update listener | ||
187 | * | ||
188 | * Must hold write on timekeeper.lock | ||
189 | */ | ||
190 | int pvclock_gtod_register_notifier(struct notifier_block *nb) | ||
191 | { | ||
192 | struct timekeeper *tk = &timekeeper; | ||
193 | unsigned long flags; | ||
194 | int ret; | ||
195 | |||
196 | write_seqlock_irqsave(&tk->lock, flags); | ||
197 | ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); | ||
198 | /* update timekeeping data */ | ||
199 | update_pvclock_gtod(tk); | ||
200 | write_sequnlock_irqrestore(&tk->lock, flags); | ||
201 | |||
202 | return ret; | ||
203 | } | ||
204 | EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); | ||
205 | |||
206 | /** | ||
207 | * pvclock_gtod_unregister_notifier - unregister a pvclock | ||
208 | * timedata update listener | ||
209 | * | ||
210 | * Must hold write on timekeeper.lock | ||
211 | */ | ||
212 | int pvclock_gtod_unregister_notifier(struct notifier_block *nb) | ||
213 | { | ||
214 | struct timekeeper *tk = &timekeeper; | ||
215 | unsigned long flags; | ||
216 | int ret; | ||
217 | |||
218 | write_seqlock_irqsave(&tk->lock, flags); | ||
219 | ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); | ||
220 | write_sequnlock_irqrestore(&tk->lock, flags); | ||
221 | |||
222 | return ret; | ||
223 | } | ||
224 | EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); | ||
225 | |||
246 | /* must hold write on timekeeper.lock */ | 226 | /* must hold write on timekeeper.lock */ |
247 | static void timekeeping_update(struct timekeeper *tk, bool clearntp) | 227 | static void timekeeping_update(struct timekeeper *tk, bool clearntp) |
248 | { | 228 | { |
249 | struct timespec xt; | ||
250 | |||
251 | if (clearntp) { | 229 | if (clearntp) { |
252 | tk->ntp_error = 0; | 230 | tk->ntp_error = 0; |
253 | ntp_clear(); | 231 | ntp_clear(); |
254 | } | 232 | } |
255 | xt = tk_xtime(tk); | 233 | update_vsyscall(tk); |
256 | update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); | 234 | update_pvclock_gtod(tk); |
257 | } | 235 | } |
258 | 236 | ||
259 | /** | 237 | /** |
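The new pvclock_gtod notifier chain lets paravirt code learn about every timekeeping update. A hypothetical built-in consumer would hook it roughly as below; the callback body and names are illustrative only. Note the callback is invoked under the timekeeper write seqlock, so it must stay short and must not sleep.

    #include <linux/pvclock_gtod.h>
    #include <linux/notifier.h>
    #include <linux/init.h>

    static int example_gtod_notify(struct notifier_block *nb,
                                   unsigned long unused, void *priv)
    {
        /* priv points at the freshly updated struct timekeeper */
        return NOTIFY_OK;
    }

    static struct notifier_block example_gtod_nb = {
        .notifier_call = example_gtod_notify,
    };

    static int __init example_gtod_init(void)
    {
        return pvclock_gtod_register_notifier(&example_gtod_nb);
    }
    device_initcall(example_gtod_init);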
@@ -776,6 +754,7 @@ static void timekeeping_resume(void) | |||
776 | 754 | ||
777 | read_persistent_clock(&ts); | 755 | read_persistent_clock(&ts); |
778 | 756 | ||
757 | clockevents_resume(); | ||
779 | clocksource_resume(); | 758 | clocksource_resume(); |
780 | 759 | ||
781 | write_seqlock_irqsave(&tk->lock, flags); | 760 | write_seqlock_irqsave(&tk->lock, flags); |
@@ -835,6 +814,7 @@ static int timekeeping_suspend(void) | |||
835 | 814 | ||
836 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 815 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
837 | clocksource_suspend(); | 816 | clocksource_suspend(); |
817 | clockevents_suspend(); | ||
838 | 818 | ||
839 | return 0; | 819 | return 0; |
840 | } | 820 | } |
@@ -1111,7 +1091,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
1111 | accumulate_nsecs_to_secs(tk); | 1091 | accumulate_nsecs_to_secs(tk); |
1112 | 1092 | ||
1113 | /* Accumulate raw time */ | 1093 | /* Accumulate raw time */ |
1114 | raw_nsecs = tk->raw_interval << shift; | 1094 | raw_nsecs = (u64)tk->raw_interval << shift; |
1115 | raw_nsecs += tk->raw_time.tv_nsec; | 1095 | raw_nsecs += tk->raw_time.tv_nsec; |
1116 | if (raw_nsecs >= NSEC_PER_SEC) { | 1096 | if (raw_nsecs >= NSEC_PER_SEC) { |
1117 | u64 raw_secs = raw_nsecs; | 1097 | u64 raw_secs = raw_nsecs; |
@@ -1128,6 +1108,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
1128 | return offset; | 1108 | return offset; |
1129 | } | 1109 | } |
1130 | 1110 | ||
1111 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | ||
1112 | static inline void old_vsyscall_fixup(struct timekeeper *tk) | ||
1113 | { | ||
1114 | s64 remainder; | ||
1115 | |||
1116 | /* | ||
1117 | * Store only full nanoseconds into xtime_nsec after rounding | ||
1118 | * it up and add the remainder to the error difference. | ||
1119 | * XXX - This is necessary to avoid small 1ns inconsistencies caused | ||
1120 | * by truncating the remainder in vsyscalls. However, it causes | ||
1121 | * additional work to be done in timekeeping_adjust(). Once | ||
1122 | * the vsyscall implementations are converted to use xtime_nsec | ||
1123 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD | ||
1124 | * users are removed, this can be killed. | ||
1125 | */ | ||
1126 | remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); | ||
1127 | tk->xtime_nsec -= remainder; | ||
1128 | tk->xtime_nsec += 1ULL << tk->shift; | ||
1129 | tk->ntp_error += remainder << tk->ntp_error_shift; | ||
1130 | |||
1131 | } | ||
1132 | #else | ||
1133 | #define old_vsyscall_fixup(tk) | ||
1134 | #endif | ||
1135 | |||
1136 | |||
1137 | |||
1131 | /** | 1138 | /** |
1132 | * update_wall_time - Uses the current clocksource to increment the wall time | 1139 | * update_wall_time - Uses the current clocksource to increment the wall time |
1133 | * | 1140 | * |
@@ -1139,7 +1146,6 @@ static void update_wall_time(void) | |||
1139 | cycle_t offset; | 1146 | cycle_t offset; |
1140 | int shift = 0, maxshift; | 1147 | int shift = 0, maxshift; |
1141 | unsigned long flags; | 1148 | unsigned long flags; |
1142 | s64 remainder; | ||
1143 | 1149 | ||
1144 | write_seqlock_irqsave(&tk->lock, flags); | 1150 | write_seqlock_irqsave(&tk->lock, flags); |
1145 | 1151 | ||
@@ -1181,20 +1187,11 @@ static void update_wall_time(void) | |||
1181 | /* correct the clock when NTP error is too big */ | 1187 | /* correct the clock when NTP error is too big */ |
1182 | timekeeping_adjust(tk, offset); | 1188 | timekeeping_adjust(tk, offset); |
1183 | 1189 | ||
1184 | |||
1185 | /* | 1190 | /* |
1186 | * Store only full nanoseconds into xtime_nsec after rounding | 1191 | * XXX This can be killed once everyone converts |
1187 | * it up and add the remainder to the error difference. | 1192 | * to the new update_vsyscall. |
1188 | * XXX - This is necessary to avoid small 1ns inconsistnecies caused | 1193 | */ |
1189 | * by truncating the remainder in vsyscalls. However, it causes | 1194 | old_vsyscall_fixup(tk); |
1190 | * additional work to be done in timekeeping_adjust(). Once | ||
1191 | * the vsyscall implementations are converted to use xtime_nsec | ||
1192 | * (shifted nanoseconds), this can be killed. | ||
1193 | */ | ||
1194 | remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); | ||
1195 | tk->xtime_nsec -= remainder; | ||
1196 | tk->xtime_nsec += 1ULL << tk->shift; | ||
1197 | tk->ntp_error += remainder << tk->ntp_error_shift; | ||
1198 | 1195 | ||
1199 | /* | 1196 | /* |
1200 | * Finally, make sure that after the rounding | 1197 | * Finally, make sure that after the rounding |
@@ -1346,9 +1343,7 @@ struct timespec get_monotonic_coarse(void) | |||
1346 | } | 1343 | } |
1347 | 1344 | ||
1348 | /* | 1345 | /* |
1349 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | 1346 | * Must hold jiffies_lock |
1350 | * without sampling the sequence number in xtime_lock. | ||
1351 | * jiffies is defined in the linker script... | ||
1352 | */ | 1347 | */ |
1353 | void do_timer(unsigned long ticks) | 1348 | void do_timer(unsigned long ticks) |
1354 | { | 1349 | { |
@@ -1436,7 +1431,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); | |||
1436 | */ | 1431 | */ |
1437 | void xtime_update(unsigned long ticks) | 1432 | void xtime_update(unsigned long ticks) |
1438 | { | 1433 | { |
1439 | write_seqlock(&xtime_lock); | 1434 | write_seqlock(&jiffies_lock); |
1440 | do_timer(ticks); | 1435 | do_timer(ticks); |
1441 | write_sequnlock(&xtime_lock); | 1436 | write_sequnlock(&jiffies_lock); |
1442 | } | 1437 | } |
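With xtime_lock gone, jiffies_64 and the tick bookkeeping are serialized by jiffies_lock alone: writers take it as in xtime_update() above, readers use the usual seqlock retry loop. The sketch below is essentially what get_jiffies_64() does and is shown only to illustrate the locking rule the new "Must hold jiffies_lock" comment refers to.

    #include <linux/jiffies.h>
    #include <linux/seqlock.h>

    static u64 read_jiffies_64(void)
    {
        unsigned long seq;
        u64 ret;

        do {
            seq = read_seqbegin(&jiffies_lock);
            ret = jiffies_64;
        } while (read_seqretry(&jiffies_lock, seq));

        return ret;
    }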
diff --git a/kernel/timer.c b/kernel/timer.c index 8c5e7b908c68..367d00858482 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64); | |||
63 | #define TVR_SIZE (1 << TVR_BITS) | 63 | #define TVR_SIZE (1 << TVR_BITS) |
64 | #define TVN_MASK (TVN_SIZE - 1) | 64 | #define TVN_MASK (TVN_SIZE - 1) |
65 | #define TVR_MASK (TVR_SIZE - 1) | 65 | #define TVR_MASK (TVR_SIZE - 1) |
66 | #define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) | ||
66 | 67 | ||
67 | struct tvec { | 68 | struct tvec { |
68 | struct list_head vec[TVN_SIZE]; | 69 | struct list_head vec[TVN_SIZE]; |
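MAX_TVAL replaces the hard-coded 0xffffffffUL clamp further down so the limit also holds when CONFIG_BASE_SMALL shrinks the wheel. With the usual geometry (TVR_BITS=8, TVN_BITS=6) it still evaluates to 0xffffffff; the BASE_SMALL values below (TVR_BITS=6, TVN_BITS=4) are assumed from the defaults of this kernel generation.

    #include <stdio.h>

    int main(void)
    {
        /* default wheel: TVR_BITS=8, TVN_BITS=6 -> 2^32 - 1 */
        unsigned long long max_tval = (1ULL << (8 + 4 * 6)) - 1;
        /* CONFIG_BASE_SMALL: TVR_BITS=6, TVN_BITS=4 -> 2^22 - 1 */
        unsigned long long max_tval_small = (1ULL << (6 + 4 * 4)) - 1;

        printf("MAX_TVAL              = 0x%llx\n", max_tval);        /* 0xffffffff */
        printf("MAX_TVAL (BASE_SMALL) = 0x%llx\n", max_tval_small);  /* 0x3fffff */
        return 0;
    }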
@@ -92,24 +93,25 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | |||
92 | /* Functions below help us manage 'deferrable' flag */ | 93 | /* Functions below help us manage 'deferrable' flag */ |
93 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) | 94 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) |
94 | { | 95 | { |
95 | return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); | 96 | return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); |
96 | } | 97 | } |
97 | 98 | ||
98 | static inline struct tvec_base *tbase_get_base(struct tvec_base *base) | 99 | static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) |
99 | { | 100 | { |
100 | return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); | 101 | return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); |
101 | } | 102 | } |
102 | 103 | ||
103 | static inline void timer_set_deferrable(struct timer_list *timer) | 104 | static inline struct tvec_base *tbase_get_base(struct tvec_base *base) |
104 | { | 105 | { |
105 | timer->base = TBASE_MAKE_DEFERRED(timer->base); | 106 | return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); |
106 | } | 107 | } |
107 | 108 | ||
108 | static inline void | 109 | static inline void |
109 | timer_set_base(struct timer_list *timer, struct tvec_base *new_base) | 110 | timer_set_base(struct timer_list *timer, struct tvec_base *new_base) |
110 | { | 111 | { |
111 | timer->base = (struct tvec_base *)((unsigned long)(new_base) | | 112 | unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; |
112 | tbase_get_deferrable(timer->base)); | 113 | |
114 | timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); | ||
113 | } | 115 | } |
114 | 116 | ||
115 | static unsigned long round_jiffies_common(unsigned long j, int cpu, | 117 | static unsigned long round_jiffies_common(unsigned long j, int cpu, |
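timer->base now carries the TIMER_DEFERRABLE and TIMER_IRQSAFE bits in the low bits of the tvec_base pointer, which is why timer_set_base() above masks with TIMER_FLAG_MASK and re-ORs the old flags when a timer migrates. The underlying pointer-tagging trick, sketched outside the kernel with made-up names:

    #include <stdio.h>
    #include <stdint.h>

    #define FLAG_MASK 0x3UL                 /* stand-in for TIMER_FLAG_MASK */

    struct base { int dummy; } __attribute__((aligned(4)));  /* low bits stay free */

    int main(void)
    {
        static struct base b;
        uintptr_t tagged = (uintptr_t)&b | 0x1UL;   /* store one flag bit */

        struct base *real   = (struct base *)(tagged & ~FLAG_MASK);
        unsigned long flags = tagged & FLAG_MASK;

        printf("base=%p flags=%#lx\n", (void *)real, flags);
        return 0;
    }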
@@ -358,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) | |||
358 | vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); | 360 | vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); |
359 | } else { | 361 | } else { |
360 | int i; | 362 | int i; |
361 | /* If the timeout is larger than 0xffffffff on 64-bit | 363 | /* If the timeout is larger than MAX_TVAL (on 64-bit |
362 | * architectures then we use the maximum timeout: | 364 | * architectures or with CONFIG_BASE_SMALL=1) then we |
365 | * use the maximum timeout. | ||
363 | */ | 366 | */ |
364 | if (idx > 0xffffffffUL) { | 367 | if (idx > MAX_TVAL) { |
365 | idx = 0xffffffffUL; | 368 | idx = MAX_TVAL; |
366 | expires = idx + base->timer_jiffies; | 369 | expires = idx + base->timer_jiffies; |
367 | } | 370 | } |
368 | i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; | 371 | i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; |
@@ -563,16 +566,14 @@ static inline void debug_timer_assert_init(struct timer_list *timer) | |||
563 | debug_object_assert_init(timer, &timer_debug_descr); | 566 | debug_object_assert_init(timer, &timer_debug_descr); |
564 | } | 567 | } |
565 | 568 | ||
566 | static void __init_timer(struct timer_list *timer, | 569 | static void do_init_timer(struct timer_list *timer, unsigned int flags, |
567 | const char *name, | 570 | const char *name, struct lock_class_key *key); |
568 | struct lock_class_key *key); | ||
569 | 571 | ||
570 | void init_timer_on_stack_key(struct timer_list *timer, | 572 | void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, |
571 | const char *name, | 573 | const char *name, struct lock_class_key *key) |
572 | struct lock_class_key *key) | ||
573 | { | 574 | { |
574 | debug_object_init_on_stack(timer, &timer_debug_descr); | 575 | debug_object_init_on_stack(timer, &timer_debug_descr); |
575 | __init_timer(timer, name, key); | 576 | do_init_timer(timer, flags, name, key); |
576 | } | 577 | } |
577 | EXPORT_SYMBOL_GPL(init_timer_on_stack_key); | 578 | EXPORT_SYMBOL_GPL(init_timer_on_stack_key); |
578 | 579 | ||
@@ -613,12 +614,13 @@ static inline void debug_assert_init(struct timer_list *timer) | |||
613 | debug_timer_assert_init(timer); | 614 | debug_timer_assert_init(timer); |
614 | } | 615 | } |
615 | 616 | ||
616 | static void __init_timer(struct timer_list *timer, | 617 | static void do_init_timer(struct timer_list *timer, unsigned int flags, |
617 | const char *name, | 618 | const char *name, struct lock_class_key *key) |
618 | struct lock_class_key *key) | ||
619 | { | 619 | { |
620 | struct tvec_base *base = __raw_get_cpu_var(tvec_bases); | ||
621 | |||
620 | timer->entry.next = NULL; | 622 | timer->entry.next = NULL; |
621 | timer->base = __raw_get_cpu_var(tvec_bases); | 623 | timer->base = (void *)((unsigned long)base | flags); |
622 | timer->slack = -1; | 624 | timer->slack = -1; |
623 | #ifdef CONFIG_TIMER_STATS | 625 | #ifdef CONFIG_TIMER_STATS |
624 | timer->start_site = NULL; | 626 | timer->start_site = NULL; |
@@ -628,22 +630,10 @@ static void __init_timer(struct timer_list *timer, | |||
628 | lockdep_init_map(&timer->lockdep_map, name, key, 0); | 630 | lockdep_init_map(&timer->lockdep_map, name, key, 0); |
629 | } | 631 | } |
630 | 632 | ||
631 | void setup_deferrable_timer_on_stack_key(struct timer_list *timer, | ||
632 | const char *name, | ||
633 | struct lock_class_key *key, | ||
634 | void (*function)(unsigned long), | ||
635 | unsigned long data) | ||
636 | { | ||
637 | timer->function = function; | ||
638 | timer->data = data; | ||
639 | init_timer_on_stack_key(timer, name, key); | ||
640 | timer_set_deferrable(timer); | ||
641 | } | ||
642 | EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); | ||
643 | |||
644 | /** | 633 | /** |
645 | * init_timer_key - initialize a timer | 634 | * init_timer_key - initialize a timer |
646 | * @timer: the timer to be initialized | 635 | * @timer: the timer to be initialized |
636 | * @flags: timer flags | ||
647 | * @name: name of the timer | 637 | * @name: name of the timer |
648 | * @key: lockdep class key of the fake lock used for tracking timer | 638 | * @key: lockdep class key of the fake lock used for tracking timer |
649 | * sync lock dependencies | 639 | * sync lock dependencies |
@@ -651,24 +641,14 @@ EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); | |||
651 | * init_timer_key() must be done to a timer prior calling *any* of the | 641 | * init_timer_key() must be done to a timer prior calling *any* of the |
652 | * other timer functions. | 642 | * other timer functions. |
653 | */ | 643 | */ |
654 | void init_timer_key(struct timer_list *timer, | 644 | void init_timer_key(struct timer_list *timer, unsigned int flags, |
655 | const char *name, | 645 | const char *name, struct lock_class_key *key) |
656 | struct lock_class_key *key) | ||
657 | { | 646 | { |
658 | debug_init(timer); | 647 | debug_init(timer); |
659 | __init_timer(timer, name, key); | 648 | do_init_timer(timer, flags, name, key); |
660 | } | 649 | } |
661 | EXPORT_SYMBOL(init_timer_key); | 650 | EXPORT_SYMBOL(init_timer_key); |
662 | 651 | ||
663 | void init_timer_deferrable_key(struct timer_list *timer, | ||
664 | const char *name, | ||
665 | struct lock_class_key *key) | ||
666 | { | ||
667 | init_timer_key(timer, name, key); | ||
668 | timer_set_deferrable(timer); | ||
669 | } | ||
670 | EXPORT_SYMBOL(init_timer_deferrable_key); | ||
671 | |||
672 | static inline void detach_timer(struct timer_list *timer, bool clear_pending) | 652 | static inline void detach_timer(struct timer_list *timer, bool clear_pending) |
673 | { | 653 | { |
674 | struct list_head *entry = &timer->entry; | 654 | struct list_head *entry = &timer->entry; |
@@ -686,7 +666,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) | |||
686 | { | 666 | { |
687 | detach_timer(timer, true); | 667 | detach_timer(timer, true); |
688 | if (!tbase_get_deferrable(timer->base)) | 668 | if (!tbase_get_deferrable(timer->base)) |
689 | timer->base->active_timers--; | 669 | base->active_timers--; |
690 | } | 670 | } |
691 | 671 | ||
692 | static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, | 672 | static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, |
@@ -697,7 +677,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, | |||
697 | 677 | ||
698 | detach_timer(timer, clear_pending); | 678 | detach_timer(timer, clear_pending); |
699 | if (!tbase_get_deferrable(timer->base)) { | 679 | if (!tbase_get_deferrable(timer->base)) { |
700 | timer->base->active_timers--; | 680 | base->active_timers--; |
701 | if (timer->expires == base->next_timer) | 681 | if (timer->expires == base->next_timer) |
702 | base->next_timer = base->timer_jiffies; | 682 | base->next_timer = base->timer_jiffies; |
703 | } | 683 | } |
@@ -1029,14 +1009,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync); | |||
1029 | * | 1009 | * |
1030 | * Synchronization rules: Callers must prevent restarting of the timer, | 1010 | * Synchronization rules: Callers must prevent restarting of the timer, |
1031 | * otherwise this function is meaningless. It must not be called from | 1011 | * otherwise this function is meaningless. It must not be called from |
1032 | * interrupt contexts. The caller must not hold locks which would prevent | 1012 | * interrupt contexts unless the timer is an irqsafe one. The caller must |
1033 | * completion of the timer's handler. The timer's handler must not call | 1013 | * not hold locks which would prevent completion of the timer's |
1034 | * add_timer_on(). Upon exit the timer is not queued and the handler is | 1014 | * handler. The timer's handler must not call add_timer_on(). Upon exit the |
1035 | * not running on any CPU. | 1015 | * timer is not queued and the handler is not running on any CPU. |
1036 | * | 1016 | * |
1037 | * Note: You must not hold locks that are held in interrupt context | 1017 | * Note: For !irqsafe timers, you must not hold locks that are held in |
1038 | * while calling this function. Even if the lock has nothing to do | 1018 | * interrupt context while calling this function. Even if the lock has |
1039 | * with the timer in question. Here's why: | 1019 | * nothing to do with the timer in question. Here's why: |
1040 | * | 1020 | * |
1041 | * CPU0 CPU1 | 1021 | * CPU0 CPU1 |
1042 | * ---- ---- | 1022 | * ---- ---- |
@@ -1073,7 +1053,7 @@ int del_timer_sync(struct timer_list *timer) | |||
1073 | * don't use it in hardirq context, because it | 1053 | * don't use it in hardirq context, because it |
1074 | * could lead to deadlock. | 1054 | * could lead to deadlock. |
1075 | */ | 1055 | */ |
1076 | WARN_ON(in_irq()); | 1056 | WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); |
1077 | for (;;) { | 1057 | for (;;) { |
1078 | int ret = try_to_del_timer_sync(timer); | 1058 | int ret = try_to_del_timer_sync(timer); |
1079 | if (ret >= 0) | 1059 | if (ret >= 0) |
@@ -1180,19 +1160,27 @@ static inline void __run_timers(struct tvec_base *base) | |||
1180 | while (!list_empty(head)) { | 1160 | while (!list_empty(head)) { |
1181 | void (*fn)(unsigned long); | 1161 | void (*fn)(unsigned long); |
1182 | unsigned long data; | 1162 | unsigned long data; |
1163 | bool irqsafe; | ||
1183 | 1164 | ||
1184 | timer = list_first_entry(head, struct timer_list,entry); | 1165 | timer = list_first_entry(head, struct timer_list,entry); |
1185 | fn = timer->function; | 1166 | fn = timer->function; |
1186 | data = timer->data; | 1167 | data = timer->data; |
1168 | irqsafe = tbase_get_irqsafe(timer->base); | ||
1187 | 1169 | ||
1188 | timer_stats_account_timer(timer); | 1170 | timer_stats_account_timer(timer); |
1189 | 1171 | ||
1190 | base->running_timer = timer; | 1172 | base->running_timer = timer; |
1191 | detach_expired_timer(timer, base); | 1173 | detach_expired_timer(timer, base); |
1192 | 1174 | ||
1193 | spin_unlock_irq(&base->lock); | 1175 | if (irqsafe) { |
1194 | call_timer_fn(timer, fn, data); | 1176 | spin_unlock(&base->lock); |
1195 | spin_lock_irq(&base->lock); | 1177 | call_timer_fn(timer, fn, data); |
1178 | spin_lock(&base->lock); | ||
1179 | } else { | ||
1180 | spin_unlock_irq(&base->lock); | ||
1181 | call_timer_fn(timer, fn, data); | ||
1182 | spin_lock_irq(&base->lock); | ||
1183 | } | ||
1196 | } | 1184 | } |
1197 | } | 1185 | } |
1198 | base->running_timer = NULL; | 1186 | base->running_timer = NULL; |
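For a TIMER_IRQSAFE timer, __run_timers() above keeps interrupts disabled across the callback and only drops the base spinlock, which in turn makes del_timer_sync() legal from hard-irq context for that timer. A sketch of setting the flag through the reworked init path; init_timer_key() is normally reached via wrapper macros and is called directly here only to show where the new flags argument goes, and the callback is illustrative.

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    static void example_timer_fn(unsigned long data)
    {
        /* runs with interrupts still disabled because of TIMER_IRQSAFE */
    }

    static struct timer_list example_timer;
    static struct lock_class_key example_timer_key;

    static void example_timer_arm(void)
    {
        init_timer_key(&example_timer, TIMER_IRQSAFE,
                       "example_timer", &example_timer_key);
        example_timer.function = example_timer_fn;
        example_timer.data = 0;
        mod_timer(&example_timer, jiffies + HZ);
    }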
@@ -1791,9 +1779,13 @@ static struct notifier_block __cpuinitdata timers_nb = { | |||
1791 | 1779 | ||
1792 | void __init init_timers(void) | 1780 | void __init init_timers(void) |
1793 | { | 1781 | { |
1794 | int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1782 | int err; |
1795 | (void *)(long)smp_processor_id()); | 1783 | |
1784 | /* ensure there are enough low bits for flags in timer->base pointer */ | ||
1785 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); | ||
1796 | 1786 | ||
1787 | err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | ||
1788 | (void *)(long)smp_processor_id()); | ||
1797 | init_timer_stats(); | 1789 | init_timer_stats(); |
1798 | 1790 | ||
1799 | BUG_ON(err != NOTIFY_OK); | 1791 | BUG_ON(err != NOTIFY_OK); |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8c4c07071cc5..5d89335a485f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS | |||
49 | help | 49 | help |
50 | See Documentation/trace/ftrace-design.txt | 50 | See Documentation/trace/ftrace-design.txt |
51 | 51 | ||
52 | config HAVE_FENTRY | ||
53 | bool | ||
54 | help | ||
55 | Arch supports the gcc option -pg with -mfentry | ||
56 | |||
52 | config HAVE_C_RECORDMCOUNT | 57 | config HAVE_C_RECORDMCOUNT |
53 | bool | 58 | bool |
54 | help | 59 | help |
@@ -57,8 +62,12 @@ config HAVE_C_RECORDMCOUNT | |||
57 | config TRACER_MAX_TRACE | 62 | config TRACER_MAX_TRACE |
58 | bool | 63 | bool |
59 | 64 | ||
65 | config TRACE_CLOCK | ||
66 | bool | ||
67 | |||
60 | config RING_BUFFER | 68 | config RING_BUFFER |
61 | bool | 69 | bool |
70 | select TRACE_CLOCK | ||
62 | 71 | ||
63 | config FTRACE_NMI_ENTER | 72 | config FTRACE_NMI_ENTER |
64 | bool | 73 | bool |
@@ -109,6 +118,8 @@ config TRACING | |||
109 | select NOP_TRACER | 118 | select NOP_TRACER |
110 | select BINARY_PRINTF | 119 | select BINARY_PRINTF |
111 | select EVENT_TRACING | 120 | select EVENT_TRACING |
121 | select TRACE_CLOCK | ||
122 | select IRQ_WORK | ||
112 | 123 | ||
113 | config GENERIC_TRACER | 124 | config GENERIC_TRACER |
114 | bool | 125 | bool |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b831087c8200..d7e2068e4b71 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -5,10 +5,12 @@ ifdef CONFIG_FUNCTION_TRACER | |||
5 | ORIG_CFLAGS := $(KBUILD_CFLAGS) | 5 | ORIG_CFLAGS := $(KBUILD_CFLAGS) |
6 | KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) | 6 | KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) |
7 | 7 | ||
8 | ifdef CONFIG_FTRACE_SELFTEST | ||
8 | # selftest needs instrumentation | 9 | # selftest needs instrumentation |
9 | CFLAGS_trace_selftest_dynamic.o = -pg | 10 | CFLAGS_trace_selftest_dynamic.o = -pg |
10 | obj-y += trace_selftest_dynamic.o | 11 | obj-y += trace_selftest_dynamic.o |
11 | endif | 12 | endif |
13 | endif | ||
12 | 14 | ||
13 | # If unlikely tracing is enabled, do not trace these files | 15 | # If unlikely tracing is enabled, do not trace these files |
14 | ifdef CONFIG_TRACING_BRANCHES | 16 | ifdef CONFIG_TRACING_BRANCHES |
@@ -17,11 +19,7 @@ endif | |||
17 | 19 | ||
18 | CFLAGS_trace_events_filter.o := -I$(src) | 20 | CFLAGS_trace_events_filter.o := -I$(src) |
19 | 21 | ||
20 | # | 22 | obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o |
21 | # Make the trace clocks available generally: it's infrastructure | ||
22 | # relied on by ptrace for example: | ||
23 | # | ||
24 | obj-y += trace_clock.o | ||
25 | 23 | ||
26 | obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o | 24 | obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o |
27 | obj-$(CONFIG_RING_BUFFER) += ring_buffer.o | 25 | obj-$(CONFIG_RING_BUFFER) += ring_buffer.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b4f20fba09fc..3ffe4c5ad3f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -10,7 +10,7 @@ | |||
10 | * Based on code in the latency_tracer, that is: | 10 | * Based on code in the latency_tracer, that is: |
11 | * | 11 | * |
12 | * Copyright (C) 2004-2006 Ingo Molnar | 12 | * Copyright (C) 2004-2006 Ingo Molnar |
13 | * Copyright (C) 2004 William Lee Irwin III | 13 | * Copyright (C) 2004 Nadia Yvette Chambers |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/stop_machine.h> | 16 | #include <linux/stop_machine.h> |
@@ -64,12 +64,20 @@ | |||
64 | 64 | ||
65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) | 65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) |
66 | 66 | ||
67 | static struct ftrace_ops ftrace_list_end __read_mostly = { | ||
68 | .func = ftrace_stub, | ||
69 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
70 | }; | ||
71 | |||
67 | /* ftrace_enabled is a method to turn ftrace on or off */ | 72 | /* ftrace_enabled is a method to turn ftrace on or off */ |
68 | int ftrace_enabled __read_mostly; | 73 | int ftrace_enabled __read_mostly; |
69 | static int last_ftrace_enabled; | 74 | static int last_ftrace_enabled; |
70 | 75 | ||
71 | /* Quick disabling of function tracer. */ | 76 | /* Quick disabling of function tracer. */ |
72 | int function_trace_stop; | 77 | int function_trace_stop __read_mostly; |
78 | |||
79 | /* Current function tracing op */ | ||
80 | struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; | ||
73 | 81 | ||
74 | /* List for set_ftrace_pid's pids. */ | 82 | /* List for set_ftrace_pid's pids. */ |
75 | LIST_HEAD(ftrace_pids); | 83 | LIST_HEAD(ftrace_pids); |
@@ -86,22 +94,43 @@ static int ftrace_disabled __read_mostly; | |||
86 | 94 | ||
87 | static DEFINE_MUTEX(ftrace_lock); | 95 | static DEFINE_MUTEX(ftrace_lock); |
88 | 96 | ||
89 | static struct ftrace_ops ftrace_list_end __read_mostly = { | ||
90 | .func = ftrace_stub, | ||
91 | }; | ||
92 | |||
93 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; | 97 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; |
94 | static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; | 98 | static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; |
95 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | 99 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; |
96 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 100 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
97 | static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; | ||
98 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; | ||
99 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | 101 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; |
100 | static struct ftrace_ops global_ops; | 102 | static struct ftrace_ops global_ops; |
101 | static struct ftrace_ops control_ops; | 103 | static struct ftrace_ops control_ops; |
102 | 104 | ||
103 | static void | 105 | #if ARCH_SUPPORTS_FTRACE_OPS |
104 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); | 106 | static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, |
107 | struct ftrace_ops *op, struct pt_regs *regs); | ||
108 | #else | ||
109 | /* See comment below, where ftrace_ops_list_func is defined */ | ||
110 | static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); | ||
111 | #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) | ||
112 | #endif | ||
113 | |||
114 | /** | ||
115 | * ftrace_nr_registered_ops - return number of ops registered | ||
116 | * | ||
117 | * Returns the number of ftrace_ops registered and tracing functions | ||
118 | */ | ||
119 | int ftrace_nr_registered_ops(void) | ||
120 | { | ||
121 | struct ftrace_ops *ops; | ||
122 | int cnt = 0; | ||
123 | |||
124 | mutex_lock(&ftrace_lock); | ||
125 | |||
126 | for (ops = ftrace_ops_list; | ||
127 | ops != &ftrace_list_end; ops = ops->next) | ||
128 | cnt++; | ||
129 | |||
130 | mutex_unlock(&ftrace_lock); | ||
131 | |||
132 | return cnt; | ||
133 | } | ||
105 | 134 | ||
106 | /* | 135 | /* |
107 | * Traverse the ftrace_global_list, invoking all entries. The reason that we | 136 | * Traverse the ftrace_global_list, invoking all entries. The reason that we |
@@ -112,29 +141,29 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); | |||
112 | * | 141 | * |
113 | * Silly Alpha and silly pointer-speculation compiler optimizations! | 142 | * Silly Alpha and silly pointer-speculation compiler optimizations! |
114 | */ | 143 | */ |
115 | static void ftrace_global_list_func(unsigned long ip, | 144 | static void |
116 | unsigned long parent_ip) | 145 | ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, |
146 | struct ftrace_ops *op, struct pt_regs *regs) | ||
117 | { | 147 | { |
118 | struct ftrace_ops *op; | ||
119 | |||
120 | if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) | 148 | if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) |
121 | return; | 149 | return; |
122 | 150 | ||
123 | trace_recursion_set(TRACE_GLOBAL_BIT); | 151 | trace_recursion_set(TRACE_GLOBAL_BIT); |
124 | op = rcu_dereference_raw(ftrace_global_list); /*see above*/ | 152 | op = rcu_dereference_raw(ftrace_global_list); /*see above*/ |
125 | while (op != &ftrace_list_end) { | 153 | while (op != &ftrace_list_end) { |
126 | op->func(ip, parent_ip); | 154 | op->func(ip, parent_ip, op, regs); |
127 | op = rcu_dereference_raw(op->next); /*see above*/ | 155 | op = rcu_dereference_raw(op->next); /*see above*/ |
128 | }; | 156 | }; |
129 | trace_recursion_clear(TRACE_GLOBAL_BIT); | 157 | trace_recursion_clear(TRACE_GLOBAL_BIT); |
130 | } | 158 | } |
131 | 159 | ||
132 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) | 160 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, |
161 | struct ftrace_ops *op, struct pt_regs *regs) | ||
133 | { | 162 | { |
134 | if (!test_tsk_trace_trace(current)) | 163 | if (!test_tsk_trace_trace(current)) |
135 | return; | 164 | return; |
136 | 165 | ||
137 | ftrace_pid_function(ip, parent_ip); | 166 | ftrace_pid_function(ip, parent_ip, op, regs); |
138 | } | 167 | } |
139 | 168 | ||
140 | static void set_ftrace_pid_function(ftrace_func_t func) | 169 | static void set_ftrace_pid_function(ftrace_func_t func) |
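Every ftrace callback now receives the owning ftrace_ops and a pt_regs pointer in addition to ip/parent_ip, so the list walker can hand each ops its own state. A minimal registration against the new prototype (names illustrative, flag taken from this patch):

    #include <linux/ftrace.h>
    #include <linux/init.h>

    static void example_trace_func(unsigned long ip, unsigned long parent_ip,
                                   struct ftrace_ops *op, struct pt_regs *regs)
    {
        /* must be re-entrant; regs is only valid when saved-regs tracing
         * was requested and the arch supports it */
    }

    static struct ftrace_ops example_ops = {
        .func  = example_trace_func,
        .flags = FTRACE_OPS_FL_RECURSION_SAFE,
    };

    static int __init example_tracer_init(void)
    {
        return register_ftrace_function(&example_ops);
    }
    core_initcall(example_tracer_init);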
@@ -153,25 +182,9 @@ static void set_ftrace_pid_function(ftrace_func_t func) | |||
153 | void clear_ftrace_function(void) | 182 | void clear_ftrace_function(void) |
154 | { | 183 | { |
155 | ftrace_trace_function = ftrace_stub; | 184 | ftrace_trace_function = ftrace_stub; |
156 | __ftrace_trace_function = ftrace_stub; | ||
157 | __ftrace_trace_function_delay = ftrace_stub; | ||
158 | ftrace_pid_function = ftrace_stub; | 185 | ftrace_pid_function = ftrace_stub; |
159 | } | 186 | } |
160 | 187 | ||
161 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
162 | /* | ||
163 | * For those archs that do not test ftrace_trace_stop in their | ||
164 | * mcount call site, we need to do it from C. | ||
165 | */ | ||
166 | static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) | ||
167 | { | ||
168 | if (function_trace_stop) | ||
169 | return; | ||
170 | |||
171 | __ftrace_trace_function(ip, parent_ip); | ||
172 | } | ||
173 | #endif | ||
174 | |||
175 | static void control_ops_disable_all(struct ftrace_ops *ops) | 188 | static void control_ops_disable_all(struct ftrace_ops *ops) |
176 | { | 189 | { |
177 | int cpu; | 190 | int cpu; |
@@ -230,28 +243,27 @@ static void update_ftrace_function(void) | |||
230 | 243 | ||
231 | /* | 244 | /* |
232 | * If we are at the end of the list and this ops is | 245 | * If we are at the end of the list and this ops is |
233 | * not dynamic, then have the mcount trampoline call | 246 | * recursion safe and not dynamic and the arch supports passing ops, |
234 | * the function directly | 247 | * then have the mcount trampoline call the function directly. |
235 | */ | 248 | */ |
236 | if (ftrace_ops_list == &ftrace_list_end || | 249 | if (ftrace_ops_list == &ftrace_list_end || |
237 | (ftrace_ops_list->next == &ftrace_list_end && | 250 | (ftrace_ops_list->next == &ftrace_list_end && |
238 | !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) | 251 | !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && |
252 | (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && | ||
253 | !FTRACE_FORCE_LIST_FUNC)) { | ||
254 | /* Set the ftrace_ops that the arch callback uses */ | ||
255 | if (ftrace_ops_list == &global_ops) | ||
256 | function_trace_op = ftrace_global_list; | ||
257 | else | ||
258 | function_trace_op = ftrace_ops_list; | ||
239 | func = ftrace_ops_list->func; | 259 | func = ftrace_ops_list->func; |
240 | else | 260 | } else { |
261 | /* Just use the default ftrace_ops */ | ||
262 | function_trace_op = &ftrace_list_end; | ||
241 | func = ftrace_ops_list_func; | 263 | func = ftrace_ops_list_func; |
264 | } | ||
242 | 265 | ||
243 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
244 | ftrace_trace_function = func; | 266 | ftrace_trace_function = func; |
245 | #else | ||
246 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
247 | /* do not update till all functions have been modified */ | ||
248 | __ftrace_trace_function_delay = func; | ||
249 | #else | ||
250 | __ftrace_trace_function = func; | ||
251 | #endif | ||
252 | ftrace_trace_function = | ||
253 | (func == ftrace_stub) ? func : ftrace_test_stop_func; | ||
254 | #endif | ||
255 | } | 267 | } |
256 | 268 | ||
257 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | 269 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
@@ -325,6 +337,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
325 | if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) | 337 | if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) |
326 | return -EINVAL; | 338 | return -EINVAL; |
327 | 339 | ||
340 | #ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS | ||
341 | /* | ||
342 | * If the ftrace_ops specifies SAVE_REGS, then it only can be used | ||
343 | * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. | ||
344 | * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. | ||
345 | */ | ||
346 | if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && | ||
347 | !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) | ||
348 | return -EINVAL; | ||
349 | |||
350 | if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) | ||
351 | ops->flags |= FTRACE_OPS_FL_SAVE_REGS; | ||
352 | #endif | ||
353 | |||
328 | if (!core_kernel_data((unsigned long)ops)) | 354 | if (!core_kernel_data((unsigned long)ops)) |
329 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; | 355 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; |
330 | 356 | ||
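Registration now polices the regs-saving flags: FTRACE_OPS_FL_SAVE_REGS alone is rejected on an arch without ARCH_SUPPORTS_FTRACE_SAVE_REGS, while FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED silently degrades to plain tracing. A caller that can live with either behaviour would request it as in this sketch (callback illustrative; register it with register_ftrace_function() as in the previous example):

    #include <linux/ftrace.h>

    static unsigned long example_regs_hits;

    static void example_regs_func(unsigned long ip, unsigned long parent_ip,
                                  struct ftrace_ops *op, struct pt_regs *regs)
    {
        if (regs)
            example_regs_hits++;    /* registers were captured for this hit */
    }

    static struct ftrace_ops example_regs_ops = {
        .func  = example_regs_func,
        .flags = FTRACE_OPS_FL_RECURSION_SAFE |
                 FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED,
    };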
@@ -773,7 +799,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) | |||
773 | } | 799 | } |
774 | 800 | ||
775 | static void | 801 | static void |
776 | function_profile_call(unsigned long ip, unsigned long parent_ip) | 802 | function_profile_call(unsigned long ip, unsigned long parent_ip, |
803 | struct ftrace_ops *ops, struct pt_regs *regs) | ||
777 | { | 804 | { |
778 | struct ftrace_profile_stat *stat; | 805 | struct ftrace_profile_stat *stat; |
779 | struct ftrace_profile *rec; | 806 | struct ftrace_profile *rec; |
@@ -803,7 +830,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) | |||
803 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 830 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
804 | static int profile_graph_entry(struct ftrace_graph_ent *trace) | 831 | static int profile_graph_entry(struct ftrace_graph_ent *trace) |
805 | { | 832 | { |
806 | function_profile_call(trace->func, 0); | 833 | function_profile_call(trace->func, 0, NULL, NULL); |
807 | return 1; | 834 | return 1; |
808 | } | 835 | } |
809 | 836 | ||
@@ -863,6 +890,7 @@ static void unregister_ftrace_profiler(void) | |||
863 | #else | 890 | #else |
864 | static struct ftrace_ops ftrace_profile_ops __read_mostly = { | 891 | static struct ftrace_ops ftrace_profile_ops __read_mostly = { |
865 | .func = function_profile_call, | 892 | .func = function_profile_call, |
893 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
866 | }; | 894 | }; |
867 | 895 | ||
868 | static int register_ftrace_profiler(void) | 896 | static int register_ftrace_profiler(void) |
@@ -1045,6 +1073,7 @@ static struct ftrace_ops global_ops = { | |||
1045 | .func = ftrace_stub, | 1073 | .func = ftrace_stub, |
1046 | .notrace_hash = EMPTY_HASH, | 1074 | .notrace_hash = EMPTY_HASH, |
1047 | .filter_hash = EMPTY_HASH, | 1075 | .filter_hash = EMPTY_HASH, |
1076 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
1048 | }; | 1077 | }; |
1049 | 1078 | ||
1050 | static DEFINE_MUTEX(ftrace_regex_lock); | 1079 | static DEFINE_MUTEX(ftrace_regex_lock); |
@@ -1525,6 +1554,12 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
1525 | rec->flags++; | 1554 | rec->flags++; |
1526 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) | 1555 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) |
1527 | return; | 1556 | return; |
1557 | /* | ||
1558 | * If any ops wants regs saved for this function | ||
1559 | * then all ops will get saved regs. | ||
1560 | */ | ||
1561 | if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) | ||
1562 | rec->flags |= FTRACE_FL_REGS; | ||
1528 | } else { | 1563 | } else { |
1529 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) | 1564 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) |
1530 | return; | 1565 | return; |
@@ -1616,18 +1651,59 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | |||
1616 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) | 1651 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) |
1617 | flag = FTRACE_FL_ENABLED; | 1652 | flag = FTRACE_FL_ENABLED; |
1618 | 1653 | ||
1654 | /* | ||
1655 | * If enabling and the REGS flag does not match the REGS_EN, then | ||
1656 | * do not ignore this record. Set flags to fail the compare against | ||
1657 | * ENABLED. | ||
1658 | */ | ||
1659 | if (flag && | ||
1660 | (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) | ||
1661 | flag |= FTRACE_FL_REGS; | ||
1662 | |||
1619 | /* If the state of this record hasn't changed, then do nothing */ | 1663 | /* If the state of this record hasn't changed, then do nothing */ |
1620 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) | 1664 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) |
1621 | return FTRACE_UPDATE_IGNORE; | 1665 | return FTRACE_UPDATE_IGNORE; |
1622 | 1666 | ||
1623 | if (flag) { | 1667 | if (flag) { |
1624 | if (update) | 1668 | /* Save off if rec is being enabled (for return value) */ |
1669 | flag ^= rec->flags & FTRACE_FL_ENABLED; | ||
1670 | |||
1671 | if (update) { | ||
1625 | rec->flags |= FTRACE_FL_ENABLED; | 1672 | rec->flags |= FTRACE_FL_ENABLED; |
1626 | return FTRACE_UPDATE_MAKE_CALL; | 1673 | if (flag & FTRACE_FL_REGS) { |
1674 | if (rec->flags & FTRACE_FL_REGS) | ||
1675 | rec->flags |= FTRACE_FL_REGS_EN; | ||
1676 | else | ||
1677 | rec->flags &= ~FTRACE_FL_REGS_EN; | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1681 | /* | ||
1682 | * If this record is being updated from a nop, then | ||
1683 | * return UPDATE_MAKE_CALL. | ||
1684 | * Otherwise, if the EN flag is set, then return | ||
1685 | * UPDATE_MODIFY_CALL_REGS to tell the caller to convert | ||
1686 | * from the non-save regs, to a save regs function. | ||
1687 | * Otherwise, | ||
1688 | * return UPDATE_MODIFY_CALL to tell the caller to convert | ||
1689 | * from the save regs, to a non-save regs function. | ||
1690 | */ | ||
1691 | if (flag & FTRACE_FL_ENABLED) | ||
1692 | return FTRACE_UPDATE_MAKE_CALL; | ||
1693 | else if (rec->flags & FTRACE_FL_REGS_EN) | ||
1694 | return FTRACE_UPDATE_MODIFY_CALL_REGS; | ||
1695 | else | ||
1696 | return FTRACE_UPDATE_MODIFY_CALL; | ||
1627 | } | 1697 | } |
1628 | 1698 | ||
1629 | if (update) | 1699 | if (update) { |
1630 | rec->flags &= ~FTRACE_FL_ENABLED; | 1700 | /* If there's no more users, clear all flags */ |
1701 | if (!(rec->flags & ~FTRACE_FL_MASK)) | ||
1702 | rec->flags = 0; | ||
1703 | else | ||
1704 | /* Just disable the record (keep REGS state) */ | ||
1705 | rec->flags &= ~FTRACE_FL_ENABLED; | ||
1706 | } | ||
1631 | 1707 | ||
1632 | return FTRACE_UPDATE_MAKE_NOP; | 1708 | return FTRACE_UPDATE_MAKE_NOP; |
1633 | } | 1709 | } |
@@ -1662,13 +1738,17 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) | |||
1662 | static int | 1738 | static int |
1663 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | 1739 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) |
1664 | { | 1740 | { |
1741 | unsigned long ftrace_old_addr; | ||
1665 | unsigned long ftrace_addr; | 1742 | unsigned long ftrace_addr; |
1666 | int ret; | 1743 | int ret; |
1667 | 1744 | ||
1668 | ftrace_addr = (unsigned long)FTRACE_ADDR; | ||
1669 | |||
1670 | ret = ftrace_update_record(rec, enable); | 1745 | ret = ftrace_update_record(rec, enable); |
1671 | 1746 | ||
1747 | if (rec->flags & FTRACE_FL_REGS) | ||
1748 | ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; | ||
1749 | else | ||
1750 | ftrace_addr = (unsigned long)FTRACE_ADDR; | ||
1751 | |||
1672 | switch (ret) { | 1752 | switch (ret) { |
1673 | case FTRACE_UPDATE_IGNORE: | 1753 | case FTRACE_UPDATE_IGNORE: |
1674 | return 0; | 1754 | return 0; |
@@ -1678,6 +1758,15 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1678 | 1758 | ||
1679 | case FTRACE_UPDATE_MAKE_NOP: | 1759 | case FTRACE_UPDATE_MAKE_NOP: |
1680 | return ftrace_make_nop(NULL, rec, ftrace_addr); | 1760 | return ftrace_make_nop(NULL, rec, ftrace_addr); |
1761 | |||
1762 | case FTRACE_UPDATE_MODIFY_CALL_REGS: | ||
1763 | case FTRACE_UPDATE_MODIFY_CALL: | ||
1764 | if (rec->flags & FTRACE_FL_REGS) | ||
1765 | ftrace_old_addr = (unsigned long)FTRACE_ADDR; | ||
1766 | else | ||
1767 | ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; | ||
1768 | |||
1769 | return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); | ||
1681 | } | 1770 | } |
1682 | 1771 | ||
1683 | return -1; /* unknow ftrace bug */ | 1772 | return -1; /* unknow ftrace bug */ |
@@ -1882,16 +1971,6 @@ static void ftrace_run_update_code(int command) | |||
1882 | */ | 1971 | */ |
1883 | arch_ftrace_update_code(command); | 1972 | arch_ftrace_update_code(command); |
1884 | 1973 | ||
1885 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
1886 | /* | ||
1887 | * For archs that call ftrace_test_stop_func(), we must | ||
1888 | * wait till after we update all the function callers | ||
1889 | * before we update the callback. This keeps different | ||
1890 | * ops that record different functions from corrupting | ||
1891 | * each other. | ||
1892 | */ | ||
1893 | __ftrace_trace_function = __ftrace_trace_function_delay; | ||
1894 | #endif | ||
1895 | function_trace_stop--; | 1974 | function_trace_stop--; |
1896 | 1975 | ||
1897 | ret = ftrace_arch_code_modify_post_process(); | 1976 | ret = ftrace_arch_code_modify_post_process(); |
@@ -2358,7 +2437,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) | |||
2358 | { | 2437 | { |
2359 | iter->pos = 0; | 2438 | iter->pos = 0; |
2360 | iter->func_pos = 0; | 2439 | iter->func_pos = 0; |
2361 | iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); | 2440 | iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); |
2362 | } | 2441 | } |
2363 | 2442 | ||
2364 | static void *t_start(struct seq_file *m, loff_t *pos) | 2443 | static void *t_start(struct seq_file *m, loff_t *pos) |
@@ -2441,8 +2520,9 @@ static int t_show(struct seq_file *m, void *v) | |||
2441 | 2520 | ||
2442 | seq_printf(m, "%ps", (void *)rec->ip); | 2521 | seq_printf(m, "%ps", (void *)rec->ip); |
2443 | if (iter->flags & FTRACE_ITER_ENABLED) | 2522 | if (iter->flags & FTRACE_ITER_ENABLED) |
2444 | seq_printf(m, " (%ld)", | 2523 | seq_printf(m, " (%ld)%s", |
2445 | rec->flags & ~FTRACE_FL_MASK); | 2524 | rec->flags & ~FTRACE_FL_MASK, |
2525 | rec->flags & FTRACE_FL_REGS ? " R" : ""); | ||
2446 | seq_printf(m, "\n"); | 2526 | seq_printf(m, "\n"); |
2447 | 2527 | ||
2448 | return 0; | 2528 | return 0; |
@@ -2595,12 +2675,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file) | |||
2595 | } | 2675 | } |
2596 | 2676 | ||
2597 | loff_t | 2677 | loff_t |
2598 | ftrace_regex_lseek(struct file *file, loff_t offset, int origin) | 2678 | ftrace_regex_lseek(struct file *file, loff_t offset, int whence) |
2599 | { | 2679 | { |
2600 | loff_t ret; | 2680 | loff_t ret; |
2601 | 2681 | ||
2602 | if (file->f_mode & FMODE_READ) | 2682 | if (file->f_mode & FMODE_READ) |
2603 | ret = seq_lseek(file, offset, origin); | 2683 | ret = seq_lseek(file, offset, whence); |
2604 | else | 2684 | else |
2605 | file->f_pos = ret = 1; | 2685 | file->f_pos = ret = 1; |
2606 | 2686 | ||
@@ -2788,10 +2868,10 @@ static int __init ftrace_mod_cmd_init(void) | |||
2788 | { | 2868 | { |
2789 | return register_ftrace_command(&ftrace_mod_cmd); | 2869 | return register_ftrace_command(&ftrace_mod_cmd); |
2790 | } | 2870 | } |
2791 | device_initcall(ftrace_mod_cmd_init); | 2871 | core_initcall(ftrace_mod_cmd_init); |
2792 | 2872 | ||
2793 | static void | 2873 | static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, |
2794 | function_trace_probe_call(unsigned long ip, unsigned long parent_ip) | 2874 | struct ftrace_ops *op, struct pt_regs *pt_regs) |
2795 | { | 2875 | { |
2796 | struct ftrace_func_probe *entry; | 2876 | struct ftrace_func_probe *entry; |
2797 | struct hlist_head *hhd; | 2877 | struct hlist_head *hhd; |
@@ -3162,8 +3242,27 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, | |||
3162 | } | 3242 | } |
3163 | 3243 | ||
3164 | static int | 3244 | static int |
3165 | ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | 3245 | ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) |
3166 | int reset, int enable) | 3246 | { |
3247 | struct ftrace_func_entry *entry; | ||
3248 | |||
3249 | if (!ftrace_location(ip)) | ||
3250 | return -EINVAL; | ||
3251 | |||
3252 | if (remove) { | ||
3253 | entry = ftrace_lookup_ip(hash, ip); | ||
3254 | if (!entry) | ||
3255 | return -ENOENT; | ||
3256 | free_hash_entry(hash, entry); | ||
3257 | return 0; | ||
3258 | } | ||
3259 | |||
3260 | return add_hash_entry(hash, ip); | ||
3261 | } | ||
3262 | |||
3263 | static int | ||
3264 | ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | ||
3265 | unsigned long ip, int remove, int reset, int enable) | ||
3167 | { | 3266 | { |
3168 | struct ftrace_hash **orig_hash; | 3267 | struct ftrace_hash **orig_hash; |
3169 | struct ftrace_hash *hash; | 3268 | struct ftrace_hash *hash; |
@@ -3192,6 +3291,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
3192 | ret = -EINVAL; | 3291 | ret = -EINVAL; |
3193 | goto out_regex_unlock; | 3292 | goto out_regex_unlock; |
3194 | } | 3293 | } |
3294 | if (ip) { | ||
3295 | ret = ftrace_match_addr(hash, ip, remove); | ||
3296 | if (ret < 0) | ||
3297 | goto out_regex_unlock; | ||
3298 | } | ||
3195 | 3299 | ||
3196 | mutex_lock(&ftrace_lock); | 3300 | mutex_lock(&ftrace_lock); |
3197 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | 3301 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); |
@@ -3208,6 +3312,37 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
3208 | return ret; | 3312 | return ret; |
3209 | } | 3313 | } |
3210 | 3314 | ||
3315 | static int | ||
3316 | ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, | ||
3317 | int reset, int enable) | ||
3318 | { | ||
3319 | return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); | ||
3320 | } | ||
3321 | |||
3322 | /** | ||
3323 | * ftrace_set_filter_ip - set a function to filter on in ftrace by address | ||
3324 | * @ops - the ops to set the filter with | ||
3325 | * @ip - the address to add to or remove from the filter. | ||
3326 | * @remove - non zero to remove the ip from the filter | ||
3327 | * @reset - non zero to reset all filters before applying this filter. | ||
3328 | * | ||
3329 | * Filters denote which functions should be enabled when tracing is enabled. | ||
3330 | * If @ip is NULL, it fails to update the filter. | ||
3331 | */ | ||
3332 | int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, | ||
3333 | int remove, int reset) | ||
3334 | { | ||
3335 | return ftrace_set_addr(ops, ip, remove, reset, 1); | ||
3336 | } | ||
3337 | EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); | ||
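ftrace_set_filter_ip() is the new entry point for filtering an ftrace_ops on a single function address rather than on a name glob. As a minimal sketch of how a caller might pair it with the four-argument callback signature introduced in this series (the probe name, the "do_sys_open" target and the module boilerplate are illustrative assumptions, not part of this patch):

    #include <linux/ftrace.h>
    #include <linux/kallsyms.h>
    #include <linux/module.h>

    /* callback: invoked on entry to the filtered function */
    static void my_probe(unsigned long ip, unsigned long parent_ip,
                         struct ftrace_ops *op, struct pt_regs *regs)
    {
            /* regs may be NULL on arches that do not save registers */
    }

    static struct ftrace_ops my_ops = {
            .func  = my_probe,
            .flags = FTRACE_OPS_FL_RECURSION_SAFE,
    };

    static int __init my_probe_init(void)
    {
            /* "do_sys_open" is only an example symbol */
            unsigned long ip = kallsyms_lookup_name("do_sys_open");
            int ret;

            if (!ip)
                    return -ENODEV;

            /* remove=0, reset=0: add this address to the current filter */
            ret = ftrace_set_filter_ip(&my_ops, ip, 0, 0);
            if (ret)
                    return ret;

            return register_ftrace_function(&my_ops);
    }

Calling it again with remove=1 drops the address from the filter, which is the path ftrace_match_addr() above implements via ftrace_lookup_ip()/free_hash_entry().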
3338 | |||
3339 | static int | ||
3340 | ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | ||
3341 | int reset, int enable) | ||
3342 | { | ||
3343 | return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); | ||
3344 | } | ||
3345 | |||
3211 | /** | 3346 | /** |
3212 | * ftrace_set_filter - set a function to filter on in ftrace | 3347 | * ftrace_set_filter - set a function to filter on in ftrace |
3213 | * @ops - the ops to set the filter with | 3348 | * @ops - the ops to set the filter with |
@@ -3912,6 +4047,7 @@ void __init ftrace_init(void) | |||
3912 | 4047 | ||
3913 | static struct ftrace_ops global_ops = { | 4048 | static struct ftrace_ops global_ops = { |
3914 | .func = ftrace_stub, | 4049 | .func = ftrace_stub, |
4050 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
3915 | }; | 4051 | }; |
3916 | 4052 | ||
3917 | static int __init ftrace_nodyn_init(void) | 4053 | static int __init ftrace_nodyn_init(void) |
@@ -3919,7 +4055,7 @@ static int __init ftrace_nodyn_init(void) | |||
3919 | ftrace_enabled = 1; | 4055 | ftrace_enabled = 1; |
3920 | return 0; | 4056 | return 0; |
3921 | } | 4057 | } |
3922 | device_initcall(ftrace_nodyn_init); | 4058 | core_initcall(ftrace_nodyn_init); |
3923 | 4059 | ||
3924 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 4060 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } |
3925 | static inline void ftrace_startup_enable(int command) { } | 4061 | static inline void ftrace_startup_enable(int command) { } |
@@ -3942,10 +4078,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
3942 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 4078 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
3943 | 4079 | ||
3944 | static void | 4080 | static void |
3945 | ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) | 4081 | ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, |
4082 | struct ftrace_ops *op, struct pt_regs *regs) | ||
3946 | { | 4083 | { |
3947 | struct ftrace_ops *op; | ||
3948 | |||
3949 | if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) | 4084 | if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) |
3950 | return; | 4085 | return; |
3951 | 4086 | ||
@@ -3959,7 +4094,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) | |||
3959 | while (op != &ftrace_list_end) { | 4094 | while (op != &ftrace_list_end) { |
3960 | if (!ftrace_function_local_disabled(op) && | 4095 | if (!ftrace_function_local_disabled(op) && |
3961 | ftrace_ops_test(op, ip)) | 4096 | ftrace_ops_test(op, ip)) |
3962 | op->func(ip, parent_ip); | 4097 | op->func(ip, parent_ip, op, regs); |
3963 | 4098 | ||
3964 | op = rcu_dereference_raw(op->next); | 4099 | op = rcu_dereference_raw(op->next); |
3965 | }; | 4100 | }; |
@@ -3969,13 +4104,18 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) | |||
3969 | 4104 | ||
3970 | static struct ftrace_ops control_ops = { | 4105 | static struct ftrace_ops control_ops = { |
3971 | .func = ftrace_ops_control_func, | 4106 | .func = ftrace_ops_control_func, |
4107 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
3972 | }; | 4108 | }; |
3973 | 4109 | ||
3974 | static void | 4110 | static inline void |
3975 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) | 4111 | __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, |
4112 | struct ftrace_ops *ignored, struct pt_regs *regs) | ||
3976 | { | 4113 | { |
3977 | struct ftrace_ops *op; | 4114 | struct ftrace_ops *op; |
3978 | 4115 | ||
4116 | if (function_trace_stop) | ||
4117 | return; | ||
4118 | |||
3979 | if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) | 4119 | if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) |
3980 | return; | 4120 | return; |
3981 | 4121 | ||
@@ -3988,13 +4128,39 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) | |||
3988 | op = rcu_dereference_raw(ftrace_ops_list); | 4128 | op = rcu_dereference_raw(ftrace_ops_list); |
3989 | while (op != &ftrace_list_end) { | 4129 | while (op != &ftrace_list_end) { |
3990 | if (ftrace_ops_test(op, ip)) | 4130 | if (ftrace_ops_test(op, ip)) |
3991 | op->func(ip, parent_ip); | 4131 | op->func(ip, parent_ip, op, regs); |
3992 | op = rcu_dereference_raw(op->next); | 4132 | op = rcu_dereference_raw(op->next); |
3993 | }; | 4133 | }; |
3994 | preempt_enable_notrace(); | 4134 | preempt_enable_notrace(); |
3995 | trace_recursion_clear(TRACE_INTERNAL_BIT); | 4135 | trace_recursion_clear(TRACE_INTERNAL_BIT); |
3996 | } | 4136 | } |
3997 | 4137 | ||
4138 | /* | ||
4139 | * Some archs only support passing ip and parent_ip. Even though | ||
4140 | * the list function ignores the op parameter, we do not want any | ||
4141 | * C side effects, where a function is called without the caller | ||
4142 | * sending a third parameter. | ||
4143 | * Archs are to support both the regs and ftrace_ops at the same time. | ||
4144 | * If they support ftrace_ops, it is assumed they support regs. | ||
4145 | * If call backs want to use regs, they must either check for regs | ||
4146 | * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. | ||
4147 | * Note, ARCH_SUPPORTS_FTRACE_SAVE_REGS expects a full regs to be saved. | ||
4148 | * An architecture can pass partial regs with ftrace_ops and still | ||
4149 | * set ARCH_SUPPORTS_FTRACE_OPS. | ||
4150 | */ | ||
4151 | #if ARCH_SUPPORTS_FTRACE_OPS | ||
4152 | static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | ||
4153 | struct ftrace_ops *op, struct pt_regs *regs) | ||
4154 | { | ||
4155 | __ftrace_ops_list_func(ip, parent_ip, NULL, regs); | ||
4156 | } | ||
4157 | #else | ||
4158 | static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) | ||
4159 | { | ||
4160 | __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); | ||
4161 | } | ||
4162 | #endif | ||
4163 | |||
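In line with that comment, any callback that wants to use the saved registers has to treat them as optional. A hedged sketch of the pattern (the names are illustrative; instruction_pointer() is the generic ptrace accessor):

    #include <linux/ftrace.h>
    #include <linux/ptrace.h>

    static unsigned long my_last_regs_ip;

    static void my_regs_probe(unsigned long ip, unsigned long parent_ip,
                              struct ftrace_ops *op, struct pt_regs *regs)
    {
            /*
             * Arches that only pass ip/parent_ip, or that did not save
             * registers for this call site, hand the callback regs == NULL.
             */
            if (!regs)
                    return;

            /* full (or arch-defined partial) register state is available */
            my_last_regs_ip = instruction_pointer(regs);
    }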
3998 | static void clear_ftrace_swapper(void) | 4164 | static void clear_ftrace_swapper(void) |
3999 | { | 4165 | { |
4000 | struct task_struct *p; | 4166 | struct task_struct *p; |
@@ -4215,7 +4381,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, | |||
4215 | if (strlen(tmp) == 0) | 4381 | if (strlen(tmp) == 0) |
4216 | return 1; | 4382 | return 1; |
4217 | 4383 | ||
4218 | ret = strict_strtol(tmp, 10, &val); | 4384 | ret = kstrtol(tmp, 10, &val); |
4219 | if (ret < 0) | 4385 | if (ret < 0) |
4220 | return ret; | 4386 | return ret; |
4221 | 4387 | ||
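The strict_strtol() to kstrtol() switch here is essentially a rename: kstrtol() takes the same (string, base, result) arguments, returns 0 on success and a negative errno on failure, and only writes the result on success, so the existing ret < 0 check is unchanged. For illustration:

    long val;
    int err;

    err = kstrtol("128", 10, &val);    /* err == 0,       val == 128 */
    err = kstrtol("12x", 10, &val);    /* err == -EINVAL, val left untouched */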
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 49491fa7daa2..ce8514feedcd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -460,9 +460,10 @@ struct ring_buffer_per_cpu { | |||
460 | unsigned long lost_events; | 460 | unsigned long lost_events; |
461 | unsigned long last_overrun; | 461 | unsigned long last_overrun; |
462 | local_t entries_bytes; | 462 | local_t entries_bytes; |
463 | local_t commit_overrun; | ||
464 | local_t overrun; | ||
465 | local_t entries; | 463 | local_t entries; |
464 | local_t overrun; | ||
465 | local_t commit_overrun; | ||
466 | local_t dropped_events; | ||
466 | local_t committing; | 467 | local_t committing; |
467 | local_t commits; | 468 | local_t commits; |
468 | unsigned long read; | 469 | unsigned long read; |
@@ -1396,6 +1397,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
1396 | struct list_head *head_page_with_bit; | 1397 | struct list_head *head_page_with_bit; |
1397 | 1398 | ||
1398 | head_page = &rb_set_head_page(cpu_buffer)->list; | 1399 | head_page = &rb_set_head_page(cpu_buffer)->list; |
1400 | if (!head_page) | ||
1401 | break; | ||
1399 | prev_page = head_page->prev; | 1402 | prev_page = head_page->prev; |
1400 | 1403 | ||
1401 | first_page = pages->next; | 1404 | first_page = pages->next; |
@@ -1567,6 +1570,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, | |||
1567 | 1570 | ||
1568 | put_online_cpus(); | 1571 | put_online_cpus(); |
1569 | } else { | 1572 | } else { |
1573 | /* Make sure this CPU has been initialized */ | ||
1574 | if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) | ||
1575 | goto out; | ||
1576 | |||
1570 | cpu_buffer = buffer->buffers[cpu_id]; | 1577 | cpu_buffer = buffer->buffers[cpu_id]; |
1571 | 1578 | ||
1572 | if (nr_pages == cpu_buffer->nr_pages) | 1579 | if (nr_pages == cpu_buffer->nr_pages) |
@@ -1816,7 +1823,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) | |||
1816 | } | 1823 | } |
1817 | 1824 | ||
1818 | /** | 1825 | /** |
1819 | * ring_buffer_update_event - update event type and data | 1826 | * rb_update_event - update event type and data |
1820 | * @event: the event to update | 1827 | * @event: the event to update |
1821 | * @type: the type of event | 1828 | * @type: the type of event |
1822 | * @length: the size of the event field in the ring buffer | 1829 | * @length: the size of the event field in the ring buffer |
@@ -2151,8 +2158,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
2151 | * If we are not in overwrite mode, | 2158 | * If we are not in overwrite mode, |
2152 | * this is easy, just stop here. | 2159 | * this is easy, just stop here. |
2153 | */ | 2160 | */ |
2154 | if (!(buffer->flags & RB_FL_OVERWRITE)) | 2161 | if (!(buffer->flags & RB_FL_OVERWRITE)) { |
2162 | local_inc(&cpu_buffer->dropped_events); | ||
2155 | goto out_reset; | 2163 | goto out_reset; |
2164 | } | ||
2156 | 2165 | ||
2157 | ret = rb_handle_head_page(cpu_buffer, | 2166 | ret = rb_handle_head_page(cpu_buffer, |
2158 | tail_page, | 2167 | tail_page, |
@@ -2716,8 +2725,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); | |||
2716 | * and not the length of the event which would hold the header. | 2725 | * and not the length of the event which would hold the header. |
2717 | */ | 2726 | */ |
2718 | int ring_buffer_write(struct ring_buffer *buffer, | 2727 | int ring_buffer_write(struct ring_buffer *buffer, |
2719 | unsigned long length, | 2728 | unsigned long length, |
2720 | void *data) | 2729 | void *data) |
2721 | { | 2730 | { |
2722 | struct ring_buffer_per_cpu *cpu_buffer; | 2731 | struct ring_buffer_per_cpu *cpu_buffer; |
2723 | struct ring_buffer_event *event; | 2732 | struct ring_buffer_event *event; |
@@ -2816,7 +2825,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable); | |||
2816 | * to the buffer after this will fail and return NULL. | 2825 | * to the buffer after this will fail and return NULL. |
2817 | * | 2826 | * |
2818 | * This is different than ring_buffer_record_disable() as | 2827 | * This is different than ring_buffer_record_disable() as |
2819 | * it works like an on/off switch, where as the disable() verison | 2828 | * it works like an on/off switch, where as the disable() version |
2820 | * must be paired with a enable(). | 2829 | * must be paired with a enable(). |
2821 | */ | 2830 | */ |
2822 | void ring_buffer_record_off(struct ring_buffer *buffer) | 2831 | void ring_buffer_record_off(struct ring_buffer *buffer) |
@@ -2839,7 +2848,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off); | |||
2839 | * ring_buffer_record_off(). | 2848 | * ring_buffer_record_off(). |
2840 | * | 2849 | * |
2841 | * This is different than ring_buffer_record_enable() as | 2850 | * This is different than ring_buffer_record_enable() as |
2842 | * it works like an on/off switch, where as the enable() verison | 2851 | * it works like an on/off switch, where as the enable() version |
2843 | * must be paired with a disable(). | 2852 | * must be paired with a disable(). |
2844 | */ | 2853 | */ |
2845 | void ring_buffer_record_on(struct ring_buffer *buffer) | 2854 | void ring_buffer_record_on(struct ring_buffer *buffer) |
@@ -2925,12 +2934,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) | |||
2925 | * @buffer: The ring buffer | 2934 | * @buffer: The ring buffer |
2926 | * @cpu: The per CPU buffer to read from. | 2935 | * @cpu: The per CPU buffer to read from. |
2927 | */ | 2936 | */ |
2928 | unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) | 2937 | u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) |
2929 | { | 2938 | { |
2930 | unsigned long flags; | 2939 | unsigned long flags; |
2931 | struct ring_buffer_per_cpu *cpu_buffer; | 2940 | struct ring_buffer_per_cpu *cpu_buffer; |
2932 | struct buffer_page *bpage; | 2941 | struct buffer_page *bpage; |
2933 | unsigned long ret; | 2942 | u64 ret = 0; |
2934 | 2943 | ||
2935 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 2944 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
2936 | return 0; | 2945 | return 0; |
@@ -2945,7 +2954,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) | |||
2945 | bpage = cpu_buffer->reader_page; | 2954 | bpage = cpu_buffer->reader_page; |
2946 | else | 2955 | else |
2947 | bpage = rb_set_head_page(cpu_buffer); | 2956 | bpage = rb_set_head_page(cpu_buffer); |
2948 | ret = bpage->page->time_stamp; | 2957 | if (bpage) |
2958 | ret = bpage->page->time_stamp; | ||
2949 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 2959 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
2950 | 2960 | ||
2951 | return ret; | 2961 | return ret; |
@@ -2991,7 +3001,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) | |||
2991 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); | 3001 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); |
2992 | 3002 | ||
2993 | /** | 3003 | /** |
2994 | * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer | 3004 | * ring_buffer_overrun_cpu - get the number of overruns caused by the ring |
3005 | * buffer wrapping around (only if RB_FL_OVERWRITE is on). | ||
2995 | * @buffer: The ring buffer | 3006 | * @buffer: The ring buffer |
2996 | * @cpu: The per CPU buffer to get the number of overruns from | 3007 | * @cpu: The per CPU buffer to get the number of overruns from |
2997 | */ | 3008 | */ |
@@ -3011,7 +3022,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) | |||
3011 | EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); | 3022 | EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); |
3012 | 3023 | ||
3013 | /** | 3024 | /** |
3014 | * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits | 3025 | * ring_buffer_commit_overrun_cpu - get the number of overruns caused by |
3026 | * commits failing due to the buffer wrapping around while there are uncommitted | ||
3027 | * events, such as during an interrupt storm. | ||
3015 | * @buffer: The ring buffer | 3028 | * @buffer: The ring buffer |
3016 | * @cpu: The per CPU buffer to get the number of overruns from | 3029 | * @cpu: The per CPU buffer to get the number of overruns from |
3017 | */ | 3030 | */ |
@@ -3032,6 +3045,28 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) | |||
3032 | EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); | 3045 | EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); |
3033 | 3046 | ||
3034 | /** | 3047 | /** |
3048 | * ring_buffer_dropped_events_cpu - get the number of dropped events caused by | ||
3049 | * the ring buffer filling up (only if RB_FL_OVERWRITE is off). | ||
3050 | * @buffer: The ring buffer | ||
3051 | * @cpu: The per CPU buffer to get the number of dropped events from | ||
3052 | */ | ||
3053 | unsigned long | ||
3054 | ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) | ||
3055 | { | ||
3056 | struct ring_buffer_per_cpu *cpu_buffer; | ||
3057 | unsigned long ret; | ||
3058 | |||
3059 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
3060 | return 0; | ||
3061 | |||
3062 | cpu_buffer = buffer->buffers[cpu]; | ||
3063 | ret = local_read(&cpu_buffer->dropped_events); | ||
3064 | |||
3065 | return ret; | ||
3066 | } | ||
3067 | EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); | ||
3068 | |||
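Together with the reordered per-cpu counters above, dropped_events separates entries lost because the writer wrapped the buffer (overrun) from entries rejected because overwrite is off and the buffer was full. A hedged sketch of how a ring-buffer consumer might total it (the helper name is illustrative):

    #include <linux/ring_buffer.h>
    #include <linux/cpumask.h>

    static unsigned long total_dropped_events(struct ring_buffer *buffer)
    {
            unsigned long total = 0;
            int cpu;

            /* CPUs outside the buffer's cpumask simply report 0 */
            for_each_online_cpu(cpu)
                    total += ring_buffer_dropped_events_cpu(buffer, cpu);

            return total;
    }

The per-cpu stats file updated later in this patch exposes the same counter as the "dropped events:" line.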
3069 | /** | ||
3035 | * ring_buffer_entries - get the number of entries in a buffer | 3070 | * ring_buffer_entries - get the number of entries in a buffer |
3036 | * @buffer: The ring buffer | 3071 | * @buffer: The ring buffer |
3037 | * | 3072 | * |
@@ -3256,6 +3291,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
3256 | * Splice the empty reader page into the list around the head. | 3291 | * Splice the empty reader page into the list around the head. |
3257 | */ | 3292 | */ |
3258 | reader = rb_set_head_page(cpu_buffer); | 3293 | reader = rb_set_head_page(cpu_buffer); |
3294 | if (!reader) | ||
3295 | goto out; | ||
3259 | cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); | 3296 | cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); |
3260 | cpu_buffer->reader_page->list.prev = reader->list.prev; | 3297 | cpu_buffer->reader_page->list.prev = reader->list.prev; |
3261 | 3298 | ||
@@ -3774,12 +3811,17 @@ void | |||
3774 | ring_buffer_read_finish(struct ring_buffer_iter *iter) | 3811 | ring_buffer_read_finish(struct ring_buffer_iter *iter) |
3775 | { | 3812 | { |
3776 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3813 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3814 | unsigned long flags; | ||
3777 | 3815 | ||
3778 | /* | 3816 | /* |
3779 | * Ring buffer is disabled from recording, here's a good place | 3817 | * Ring buffer is disabled from recording, here's a good place |
3780 | * to check the integrity of the ring buffer. | 3818 | * to check the integrity of the ring buffer. |
3819 | * Must prevent readers from trying to read, as the check | ||
3820 | * clears the HEAD page and readers require it. | ||
3781 | */ | 3821 | */ |
3822 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | ||
3782 | rb_check_pages(cpu_buffer); | 3823 | rb_check_pages(cpu_buffer); |
3824 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | ||
3783 | 3825 | ||
3784 | atomic_dec(&cpu_buffer->record_disabled); | 3826 | atomic_dec(&cpu_buffer->record_disabled); |
3785 | atomic_dec(&cpu_buffer->buffer->resize_disabled); | 3827 | atomic_dec(&cpu_buffer->buffer->resize_disabled); |
@@ -3860,9 +3902,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
3860 | local_set(&cpu_buffer->reader_page->page->commit, 0); | 3902 | local_set(&cpu_buffer->reader_page->page->commit, 0); |
3861 | cpu_buffer->reader_page->read = 0; | 3903 | cpu_buffer->reader_page->read = 0; |
3862 | 3904 | ||
3863 | local_set(&cpu_buffer->commit_overrun, 0); | ||
3864 | local_set(&cpu_buffer->entries_bytes, 0); | 3905 | local_set(&cpu_buffer->entries_bytes, 0); |
3865 | local_set(&cpu_buffer->overrun, 0); | 3906 | local_set(&cpu_buffer->overrun, 0); |
3907 | local_set(&cpu_buffer->commit_overrun, 0); | ||
3908 | local_set(&cpu_buffer->dropped_events, 0); | ||
3866 | local_set(&cpu_buffer->entries, 0); | 3909 | local_set(&cpu_buffer->entries, 0); |
3867 | local_set(&cpu_buffer->committing, 0); | 3910 | local_set(&cpu_buffer->committing, 0); |
3868 | local_set(&cpu_buffer->commits, 0); | 3911 | local_set(&cpu_buffer->commits, 0); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c38c81496ce..e5125677efa0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * | 9 | * |
10 | * Based on code from the latency_tracer, that is: | 10 | * Based on code from the latency_tracer, that is: |
11 | * Copyright (C) 2004-2006 Ingo Molnar | 11 | * Copyright (C) 2004-2006 Ingo Molnar |
12 | * Copyright (C) 2004 William Lee Irwin III | 12 | * Copyright (C) 2004 Nadia Yvette Chambers |
13 | */ | 13 | */ |
14 | #include <linux/ring_buffer.h> | 14 | #include <linux/ring_buffer.h> |
15 | #include <generated/utsrelease.h> | 15 | #include <generated/utsrelease.h> |
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
20 | #include <linux/notifier.h> | 20 | #include <linux/notifier.h> |
21 | #include <linux/irqflags.h> | 21 | #include <linux/irqflags.h> |
22 | #include <linux/irq_work.h> | ||
22 | #include <linux/debugfs.h> | 23 | #include <linux/debugfs.h> |
23 | #include <linux/pagemap.h> | 24 | #include <linux/pagemap.h> |
24 | #include <linux/hardirq.h> | 25 | #include <linux/hardirq.h> |
@@ -78,6 +79,21 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) | |||
78 | } | 79 | } |
79 | 80 | ||
80 | /* | 81 | /* |
82 | * To prevent the comm cache from being overwritten when no | ||
83 | * tracing is active, only save the comm when a trace event | ||
84 | * occurred. | ||
85 | */ | ||
86 | static DEFINE_PER_CPU(bool, trace_cmdline_save); | ||
87 | |||
88 | /* | ||
89 | * When a reader is waiting for data, this variable is | ||
90 | * set to true. | ||
91 | */ | ||
92 | static bool trace_wakeup_needed; | ||
93 | |||
94 | static struct irq_work trace_work_wakeup; | ||
95 | |||
96 | /* | ||
81 | * Kill all tracing for good (never come back). | 97 | * Kill all tracing for good (never come back). |
82 | * It is initialized to 1 but will turn to zero if the initialization | 98 | * It is initialized to 1 but will turn to zero if the initialization |
83 | * of the tracer is successful. But that is the only place that sets | 99 | * of the tracer is successful. But that is the only place that sets |
@@ -139,6 +155,18 @@ static int __init set_ftrace_dump_on_oops(char *str) | |||
139 | } | 155 | } |
140 | __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); | 156 | __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); |
141 | 157 | ||
158 | |||
159 | static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; | ||
160 | static char *trace_boot_options __initdata; | ||
161 | |||
162 | static int __init set_trace_boot_options(char *str) | ||
163 | { | ||
164 | strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); | ||
165 | trace_boot_options = trace_boot_options_buf; | ||
166 | return 0; | ||
167 | } | ||
168 | __setup("trace_options=", set_trace_boot_options); | ||
169 | |||
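This makes trace option flags settable before any tracer has run; booting with, say, trace_options=nooverwrite,nomarkers (an illustrative combination of the flag names from the trace_options[] table) would clear those bits at startup. Only the raw string is stashed in an __initdata buffer here; it is presumably parsed and applied through trace_set_options() once the tracer core comes up, which is not visible in this hunk.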
142 | unsigned long long ns2usecs(cycle_t nsec) | 170 | unsigned long long ns2usecs(cycle_t nsec) |
143 | { | 171 | { |
144 | nsec += 500; | 172 | nsec += 500; |
@@ -198,20 +226,9 @@ static struct trace_array max_tr; | |||
198 | 226 | ||
199 | static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); | 227 | static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); |
200 | 228 | ||
201 | /* tracer_enabled is used to toggle activation of a tracer */ | ||
202 | static int tracer_enabled = 1; | ||
203 | |||
204 | /** | ||
205 | * tracing_is_enabled - return tracer_enabled status | ||
206 | * | ||
207 | * This function is used by other tracers to know the status | ||
208 | * of the tracer_enabled flag. Tracers may use this function | ||
209 | * to know if it should enable their features when starting | ||
210 | * up. See irqsoff tracer for an example (start_irqsoff_tracer). | ||
211 | */ | ||
212 | int tracing_is_enabled(void) | 229 | int tracing_is_enabled(void) |
213 | { | 230 | { |
214 | return tracer_enabled; | 231 | return tracing_is_on(); |
215 | } | 232 | } |
216 | 233 | ||
217 | /* | 234 | /* |
@@ -328,17 +345,23 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
328 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 345 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
329 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 346 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
330 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | | 347 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | |
331 | TRACE_ITER_IRQ_INFO; | 348 | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; |
332 | 349 | ||
333 | static int trace_stop_count; | 350 | static int trace_stop_count; |
334 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); | 351 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); |
335 | 352 | ||
336 | static void wakeup_work_handler(struct work_struct *work) | 353 | /** |
354 | * trace_wake_up - wake up tasks waiting for trace input | ||
355 | * | ||
356 | * Called from irq_work context; wakes up any task that is blocked on the | ||
357 | * trace_wait queue. This is used with trace_poll for tasks polling the | ||
358 | * trace. | ||
359 | */ | ||
360 | static void trace_wake_up(struct irq_work *work) | ||
337 | { | 361 | { |
338 | wake_up(&trace_wait); | 362 | wake_up_all(&trace_wait); |
339 | } | ||
340 | 363 | ||
341 | static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); | 364 | } |
342 | 365 | ||
343 | /** | 366 | /** |
344 | * tracing_on - enable tracing buffers | 367 | * tracing_on - enable tracing buffers |
@@ -393,22 +416,6 @@ int tracing_is_on(void) | |||
393 | } | 416 | } |
394 | EXPORT_SYMBOL_GPL(tracing_is_on); | 417 | EXPORT_SYMBOL_GPL(tracing_is_on); |
395 | 418 | ||
396 | /** | ||
397 | * trace_wake_up - wake up tasks waiting for trace input | ||
398 | * | ||
399 | * Schedules a delayed work to wake up any task that is blocked on the | ||
400 | * trace_wait queue. These is used with trace_poll for tasks polling the | ||
401 | * trace. | ||
402 | */ | ||
403 | void trace_wake_up(void) | ||
404 | { | ||
405 | const unsigned long delay = msecs_to_jiffies(2); | ||
406 | |||
407 | if (trace_flags & TRACE_ITER_BLOCK) | ||
408 | return; | ||
409 | schedule_delayed_work(&wakeup_work, delay); | ||
410 | } | ||
411 | |||
412 | static int __init set_buf_size(char *str) | 419 | static int __init set_buf_size(char *str) |
413 | { | 420 | { |
414 | unsigned long buf_size; | 421 | unsigned long buf_size; |
@@ -426,15 +433,15 @@ __setup("trace_buf_size=", set_buf_size); | |||
426 | 433 | ||
427 | static int __init set_tracing_thresh(char *str) | 434 | static int __init set_tracing_thresh(char *str) |
428 | { | 435 | { |
429 | unsigned long threshhold; | 436 | unsigned long threshold; |
430 | int ret; | 437 | int ret; |
431 | 438 | ||
432 | if (!str) | 439 | if (!str) |
433 | return 0; | 440 | return 0; |
434 | ret = strict_strtoul(str, 0, &threshhold); | 441 | ret = kstrtoul(str, 0, &threshold); |
435 | if (ret < 0) | 442 | if (ret < 0) |
436 | return 0; | 443 | return 0; |
437 | tracing_thresh = threshhold * 1000; | 444 | tracing_thresh = threshold * 1000; |
438 | return 1; | 445 | return 1; |
439 | } | 446 | } |
440 | __setup("tracing_thresh=", set_tracing_thresh); | 447 | __setup("tracing_thresh=", set_tracing_thresh); |
@@ -470,16 +477,19 @@ static const char *trace_options[] = { | |||
470 | "overwrite", | 477 | "overwrite", |
471 | "disable_on_free", | 478 | "disable_on_free", |
472 | "irq-info", | 479 | "irq-info", |
480 | "markers", | ||
473 | NULL | 481 | NULL |
474 | }; | 482 | }; |
475 | 483 | ||
476 | static struct { | 484 | static struct { |
477 | u64 (*func)(void); | 485 | u64 (*func)(void); |
478 | const char *name; | 486 | const char *name; |
487 | int in_ns; /* is this clock in nanoseconds? */ | ||
479 | } trace_clocks[] = { | 488 | } trace_clocks[] = { |
480 | { trace_clock_local, "local" }, | 489 | { trace_clock_local, "local", 1 }, |
481 | { trace_clock_global, "global" }, | 490 | { trace_clock_global, "global", 1 }, |
482 | { trace_clock_counter, "counter" }, | 491 | { trace_clock_counter, "counter", 0 }, |
492 | ARCH_TRACE_CLOCKS | ||
483 | }; | 493 | }; |
484 | 494 | ||
485 | int trace_clock_id; | 495 | int trace_clock_id; |
@@ -756,6 +766,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
756 | } | 766 | } |
757 | #endif /* CONFIG_TRACER_MAX_TRACE */ | 767 | #endif /* CONFIG_TRACER_MAX_TRACE */ |
758 | 768 | ||
769 | static void default_wait_pipe(struct trace_iterator *iter) | ||
770 | { | ||
771 | DEFINE_WAIT(wait); | ||
772 | |||
773 | prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); | ||
774 | |||
775 | /* | ||
776 | * The events can happen in critical sections where | ||
777 | * checking a work queue can cause deadlocks. | ||
778 | * After adding a task to the queue, this flag is set | ||
779 | * so that events know to try to wake up the queue | ||
780 | * using irq_work. | ||
781 | * | ||
782 | * We don't clear it even if the buffer is no longer | ||
783 | * empty. The flag only causes the next event to run | ||
784 | * irq_work to do the work queue wake up. The worst | ||
785 | * that can happen if we race with !trace_empty() is that | ||
786 | * an event will cause an irq_work to try to wake up | ||
787 | * an empty queue. | ||
788 | * | ||
789 | * There's no reason to protect this flag either, as | ||
790 | * the work queue and irq_work logic will do the necessary | ||
791 | * synchronization for the wake ups. The only thing | ||
792 | * that is necessary is that the wake up happens after | ||
793 | * a task has been queued. Spurious wake ups are OK. | ||
794 | */ | ||
795 | trace_wakeup_needed = true; | ||
796 | |||
797 | if (trace_empty(iter)) | ||
798 | schedule(); | ||
799 | |||
800 | finish_wait(&trace_wait, &wait); | ||
801 | } | ||
802 | |||
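So the reader marks trace_wakeup_needed and sleeps, while the writer side (see __buffer_unlock_commit() further down) queues the irq_work, which runs trace_wake_up() from interrupt context, outside the tracing hot path where taking the wait-queue lock would be risky. The one piece not visible in this hunk is the irq_work initialization, which, going by the standard irq_work API, would be a one-time call along the lines of:

    /* presumably done once during tracer setup (not shown in this hunk) */
    init_irq_work(&trace_work_wakeup, trace_wake_up);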
759 | /** | 803 | /** |
760 | * register_tracer - register a tracer with the ftrace system. | 804 | * register_tracer - register a tracer with the ftrace system. |
761 | * @type - the plugin for the tracer | 805 | * @type - the plugin for the tracer |
@@ -874,32 +918,6 @@ int register_tracer(struct tracer *type) | |||
874 | return ret; | 918 | return ret; |
875 | } | 919 | } |
876 | 920 | ||
877 | void unregister_tracer(struct tracer *type) | ||
878 | { | ||
879 | struct tracer **t; | ||
880 | |||
881 | mutex_lock(&trace_types_lock); | ||
882 | for (t = &trace_types; *t; t = &(*t)->next) { | ||
883 | if (*t == type) | ||
884 | goto found; | ||
885 | } | ||
886 | pr_info("Tracer %s not registered\n", type->name); | ||
887 | goto out; | ||
888 | |||
889 | found: | ||
890 | *t = (*t)->next; | ||
891 | |||
892 | if (type == current_trace && tracer_enabled) { | ||
893 | tracer_enabled = 0; | ||
894 | tracing_stop(); | ||
895 | if (current_trace->stop) | ||
896 | current_trace->stop(&global_trace); | ||
897 | current_trace = &nop_trace; | ||
898 | } | ||
899 | out: | ||
900 | mutex_unlock(&trace_types_lock); | ||
901 | } | ||
902 | |||
903 | void tracing_reset(struct trace_array *tr, int cpu) | 921 | void tracing_reset(struct trace_array *tr, int cpu) |
904 | { | 922 | { |
905 | struct ring_buffer *buffer = tr->buffer; | 923 | struct ring_buffer *buffer = tr->buffer; |
@@ -1130,10 +1148,14 @@ void trace_find_cmdline(int pid, char comm[]) | |||
1130 | 1148 | ||
1131 | void tracing_record_cmdline(struct task_struct *tsk) | 1149 | void tracing_record_cmdline(struct task_struct *tsk) |
1132 | { | 1150 | { |
1133 | if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || | 1151 | if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) |
1134 | !tracing_is_on()) | 1152 | return; |
1153 | |||
1154 | if (!__this_cpu_read(trace_cmdline_save)) | ||
1135 | return; | 1155 | return; |
1136 | 1156 | ||
1157 | __this_cpu_write(trace_cmdline_save, false); | ||
1158 | |||
1137 | trace_save_cmdline(tsk); | 1159 | trace_save_cmdline(tsk); |
1138 | } | 1160 | } |
1139 | 1161 | ||
@@ -1177,27 +1199,36 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, | |||
1177 | return event; | 1199 | return event; |
1178 | } | 1200 | } |
1179 | 1201 | ||
1202 | void | ||
1203 | __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) | ||
1204 | { | ||
1205 | __this_cpu_write(trace_cmdline_save, true); | ||
1206 | if (trace_wakeup_needed) { | ||
1207 | trace_wakeup_needed = false; | ||
1208 | /* irq_work_queue() supplies its own memory barriers */ | ||
1209 | irq_work_queue(&trace_work_wakeup); | ||
1210 | } | ||
1211 | ring_buffer_unlock_commit(buffer, event); | ||
1212 | } | ||
1213 | |||
1180 | static inline void | 1214 | static inline void |
1181 | __trace_buffer_unlock_commit(struct ring_buffer *buffer, | 1215 | __trace_buffer_unlock_commit(struct ring_buffer *buffer, |
1182 | struct ring_buffer_event *event, | 1216 | struct ring_buffer_event *event, |
1183 | unsigned long flags, int pc, | 1217 | unsigned long flags, int pc) |
1184 | int wake) | ||
1185 | { | 1218 | { |
1186 | ring_buffer_unlock_commit(buffer, event); | 1219 | __buffer_unlock_commit(buffer, event); |
1187 | 1220 | ||
1188 | ftrace_trace_stack(buffer, flags, 6, pc); | 1221 | ftrace_trace_stack(buffer, flags, 6, pc); |
1189 | ftrace_trace_userstack(buffer, flags, pc); | 1222 | ftrace_trace_userstack(buffer, flags, pc); |
1190 | |||
1191 | if (wake) | ||
1192 | trace_wake_up(); | ||
1193 | } | 1223 | } |
1194 | 1224 | ||
1195 | void trace_buffer_unlock_commit(struct ring_buffer *buffer, | 1225 | void trace_buffer_unlock_commit(struct ring_buffer *buffer, |
1196 | struct ring_buffer_event *event, | 1226 | struct ring_buffer_event *event, |
1197 | unsigned long flags, int pc) | 1227 | unsigned long flags, int pc) |
1198 | { | 1228 | { |
1199 | __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); | 1229 | __trace_buffer_unlock_commit(buffer, event, flags, pc); |
1200 | } | 1230 | } |
1231 | EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); | ||
1201 | 1232 | ||
1202 | struct ring_buffer_event * | 1233 | struct ring_buffer_event * |
1203 | trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, | 1234 | trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, |
@@ -1214,29 +1245,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, | |||
1214 | struct ring_buffer_event *event, | 1245 | struct ring_buffer_event *event, |
1215 | unsigned long flags, int pc) | 1246 | unsigned long flags, int pc) |
1216 | { | 1247 | { |
1217 | __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); | 1248 | __trace_buffer_unlock_commit(buffer, event, flags, pc); |
1218 | } | 1249 | } |
1219 | EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); | 1250 | EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); |
1220 | 1251 | ||
1221 | void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, | 1252 | void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, |
1222 | struct ring_buffer_event *event, | 1253 | struct ring_buffer_event *event, |
1223 | unsigned long flags, int pc) | 1254 | unsigned long flags, int pc, |
1224 | { | 1255 | struct pt_regs *regs) |
1225 | __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); | ||
1226 | } | ||
1227 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); | ||
1228 | |||
1229 | void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, | ||
1230 | struct ring_buffer_event *event, | ||
1231 | unsigned long flags, int pc, | ||
1232 | struct pt_regs *regs) | ||
1233 | { | 1256 | { |
1234 | ring_buffer_unlock_commit(buffer, event); | 1257 | __buffer_unlock_commit(buffer, event); |
1235 | 1258 | ||
1236 | ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); | 1259 | ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); |
1237 | ftrace_trace_userstack(buffer, flags, pc); | 1260 | ftrace_trace_userstack(buffer, flags, pc); |
1238 | } | 1261 | } |
1239 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); | 1262 | EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); |
1240 | 1263 | ||
1241 | void trace_current_buffer_discard_commit(struct ring_buffer *buffer, | 1264 | void trace_current_buffer_discard_commit(struct ring_buffer *buffer, |
1242 | struct ring_buffer_event *event) | 1265 | struct ring_buffer_event *event) |
@@ -1268,7 +1291,7 @@ trace_function(struct trace_array *tr, | |||
1268 | entry->parent_ip = parent_ip; | 1291 | entry->parent_ip = parent_ip; |
1269 | 1292 | ||
1270 | if (!filter_check_discard(call, entry, buffer, event)) | 1293 | if (!filter_check_discard(call, entry, buffer, event)) |
1271 | ring_buffer_unlock_commit(buffer, event); | 1294 | __buffer_unlock_commit(buffer, event); |
1272 | } | 1295 | } |
1273 | 1296 | ||
1274 | void | 1297 | void |
@@ -1361,7 +1384,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
1361 | entry->size = trace.nr_entries; | 1384 | entry->size = trace.nr_entries; |
1362 | 1385 | ||
1363 | if (!filter_check_discard(call, entry, buffer, event)) | 1386 | if (!filter_check_discard(call, entry, buffer, event)) |
1364 | ring_buffer_unlock_commit(buffer, event); | 1387 | __buffer_unlock_commit(buffer, event); |
1365 | 1388 | ||
1366 | out: | 1389 | out: |
1367 | /* Again, don't let gcc optimize things here */ | 1390 | /* Again, don't let gcc optimize things here */ |
@@ -1457,7 +1480,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1457 | 1480 | ||
1458 | save_stack_trace_user(&trace); | 1481 | save_stack_trace_user(&trace); |
1459 | if (!filter_check_discard(call, entry, buffer, event)) | 1482 | if (!filter_check_discard(call, entry, buffer, event)) |
1460 | ring_buffer_unlock_commit(buffer, event); | 1483 | __buffer_unlock_commit(buffer, event); |
1461 | 1484 | ||
1462 | out_drop_count: | 1485 | out_drop_count: |
1463 | __this_cpu_dec(user_stack_count); | 1486 | __this_cpu_dec(user_stack_count); |
@@ -1558,10 +1581,10 @@ static int alloc_percpu_trace_buffer(void) | |||
1558 | return -ENOMEM; | 1581 | return -ENOMEM; |
1559 | } | 1582 | } |
1560 | 1583 | ||
1584 | static int buffers_allocated; | ||
1585 | |||
1561 | void trace_printk_init_buffers(void) | 1586 | void trace_printk_init_buffers(void) |
1562 | { | 1587 | { |
1563 | static int buffers_allocated; | ||
1564 | |||
1565 | if (buffers_allocated) | 1588 | if (buffers_allocated) |
1566 | return; | 1589 | return; |
1567 | 1590 | ||
@@ -1570,7 +1593,38 @@ void trace_printk_init_buffers(void) | |||
1570 | 1593 | ||
1571 | pr_info("ftrace: Allocated trace_printk buffers\n"); | 1594 | pr_info("ftrace: Allocated trace_printk buffers\n"); |
1572 | 1595 | ||
1596 | /* Expand the buffers to set size */ | ||
1597 | tracing_update_buffers(); | ||
1598 | |||
1573 | buffers_allocated = 1; | 1599 | buffers_allocated = 1; |
1600 | |||
1601 | /* | ||
1602 | * trace_printk_init_buffers() can be called by modules. | ||
1603 | * If that happens, then we need to start cmdline recording | ||
1604 | * directly here. If the global_trace.buffer is already | ||
1605 | * allocated here, then this was called by module code. | ||
1606 | */ | ||
1607 | if (global_trace.buffer) | ||
1608 | tracing_start_cmdline_record(); | ||
1609 | } | ||
1610 | |||
1611 | void trace_printk_start_comm(void) | ||
1612 | { | ||
1613 | /* Start tracing comms if trace printk is set */ | ||
1614 | if (!buffers_allocated) | ||
1615 | return; | ||
1616 | tracing_start_cmdline_record(); | ||
1617 | } | ||
1618 | |||
1619 | static void trace_printk_start_stop_comm(int enabled) | ||
1620 | { | ||
1621 | if (!buffers_allocated) | ||
1622 | return; | ||
1623 | |||
1624 | if (enabled) | ||
1625 | tracing_start_cmdline_record(); | ||
1626 | else | ||
1627 | tracing_stop_cmdline_record(); | ||
1574 | } | 1628 | } |
1575 | 1629 | ||
1576 | /** | 1630 | /** |
@@ -1621,7 +1675,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1621 | 1675 | ||
1622 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); | 1676 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); |
1623 | if (!filter_check_discard(call, entry, buffer, event)) { | 1677 | if (!filter_check_discard(call, entry, buffer, event)) { |
1624 | ring_buffer_unlock_commit(buffer, event); | 1678 | __buffer_unlock_commit(buffer, event); |
1625 | ftrace_trace_stack(buffer, flags, 6, pc); | 1679 | ftrace_trace_stack(buffer, flags, 6, pc); |
1626 | } | 1680 | } |
1627 | 1681 | ||
@@ -1692,7 +1746,7 @@ int trace_array_vprintk(struct trace_array *tr, | |||
1692 | memcpy(&entry->buf, tbuffer, len); | 1746 | memcpy(&entry->buf, tbuffer, len); |
1693 | entry->buf[len] = '\0'; | 1747 | entry->buf[len] = '\0'; |
1694 | if (!filter_check_discard(call, entry, buffer, event)) { | 1748 | if (!filter_check_discard(call, entry, buffer, event)) { |
1695 | ring_buffer_unlock_commit(buffer, event); | 1749 | __buffer_unlock_commit(buffer, event); |
1696 | ftrace_trace_stack(buffer, flags, 6, pc); | 1750 | ftrace_trace_stack(buffer, flags, 6, pc); |
1697 | } | 1751 | } |
1698 | out: | 1752 | out: |
@@ -2060,7 +2114,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) | |||
2060 | seq_puts(m, "# -----------------\n"); | 2114 | seq_puts(m, "# -----------------\n"); |
2061 | seq_printf(m, "# | task: %.16s-%d " | 2115 | seq_printf(m, "# | task: %.16s-%d " |
2062 | "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", | 2116 | "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", |
2063 | data->comm, data->pid, data->uid, data->nice, | 2117 | data->comm, data->pid, |
2118 | from_kuid_munged(seq_user_ns(m), data->uid), data->nice, | ||
2064 | data->policy, data->rt_priority); | 2119 | data->policy, data->rt_priority); |
2065 | seq_puts(m, "# -----------------\n"); | 2120 | seq_puts(m, "# -----------------\n"); |
2066 | 2121 | ||
@@ -2424,6 +2479,10 @@ __tracing_open(struct inode *inode, struct file *file) | |||
2424 | if (ring_buffer_overruns(iter->tr->buffer)) | 2479 | if (ring_buffer_overruns(iter->tr->buffer)) |
2425 | iter->iter_flags |= TRACE_FILE_ANNOTATE; | 2480 | iter->iter_flags |= TRACE_FILE_ANNOTATE; |
2426 | 2481 | ||
2482 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ | ||
2483 | if (trace_clocks[trace_clock_id].in_ns) | ||
2484 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | ||
2485 | |||
2427 | /* stop the trace while dumping */ | 2486 | /* stop the trace while dumping */ |
2428 | tracing_stop(); | 2487 | tracing_stop(); |
2429 | 2488 | ||
@@ -2792,26 +2851,19 @@ static void set_tracer_flags(unsigned int mask, int enabled) | |||
2792 | 2851 | ||
2793 | if (mask == TRACE_ITER_OVERWRITE) | 2852 | if (mask == TRACE_ITER_OVERWRITE) |
2794 | ring_buffer_change_overwrite(global_trace.buffer, enabled); | 2853 | ring_buffer_change_overwrite(global_trace.buffer, enabled); |
2854 | |||
2855 | if (mask == TRACE_ITER_PRINTK) | ||
2856 | trace_printk_start_stop_comm(enabled); | ||
2795 | } | 2857 | } |
2796 | 2858 | ||
2797 | static ssize_t | 2859 | static int trace_set_options(char *option) |
2798 | tracing_trace_options_write(struct file *filp, const char __user *ubuf, | ||
2799 | size_t cnt, loff_t *ppos) | ||
2800 | { | 2860 | { |
2801 | char buf[64]; | ||
2802 | char *cmp; | 2861 | char *cmp; |
2803 | int neg = 0; | 2862 | int neg = 0; |
2804 | int ret; | 2863 | int ret = 0; |
2805 | int i; | 2864 | int i; |
2806 | 2865 | ||
2807 | if (cnt >= sizeof(buf)) | 2866 | cmp = strstrip(option); |
2808 | return -EINVAL; | ||
2809 | |||
2810 | if (copy_from_user(&buf, ubuf, cnt)) | ||
2811 | return -EFAULT; | ||
2812 | |||
2813 | buf[cnt] = 0; | ||
2814 | cmp = strstrip(buf); | ||
2815 | 2867 | ||
2816 | if (strncmp(cmp, "no", 2) == 0) { | 2868 | if (strncmp(cmp, "no", 2) == 0) { |
2817 | neg = 1; | 2869 | neg = 1; |
@@ -2830,10 +2882,25 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, | |||
2830 | mutex_lock(&trace_types_lock); | 2882 | mutex_lock(&trace_types_lock); |
2831 | ret = set_tracer_option(current_trace, cmp, neg); | 2883 | ret = set_tracer_option(current_trace, cmp, neg); |
2832 | mutex_unlock(&trace_types_lock); | 2884 | mutex_unlock(&trace_types_lock); |
2833 | if (ret) | ||
2834 | return ret; | ||
2835 | } | 2885 | } |
2836 | 2886 | ||
2887 | return ret; | ||
2888 | } | ||
2889 | |||
2890 | static ssize_t | ||
2891 | tracing_trace_options_write(struct file *filp, const char __user *ubuf, | ||
2892 | size_t cnt, loff_t *ppos) | ||
2893 | { | ||
2894 | char buf[64]; | ||
2895 | |||
2896 | if (cnt >= sizeof(buf)) | ||
2897 | return -EINVAL; | ||
2898 | |||
2899 | if (copy_from_user(&buf, ubuf, cnt)) | ||
2900 | return -EFAULT; | ||
2901 | |||
2902 | trace_set_options(buf); | ||
2903 | |||
2837 | *ppos += cnt; | 2904 | *ppos += cnt; |
2838 | 2905 | ||
2839 | return cnt; | 2906 | return cnt; |
@@ -2938,56 +3005,6 @@ static const struct file_operations tracing_saved_cmdlines_fops = { | |||
2938 | }; | 3005 | }; |
2939 | 3006 | ||
2940 | static ssize_t | 3007 | static ssize_t |
2941 | tracing_ctrl_read(struct file *filp, char __user *ubuf, | ||
2942 | size_t cnt, loff_t *ppos) | ||
2943 | { | ||
2944 | char buf[64]; | ||
2945 | int r; | ||
2946 | |||
2947 | r = sprintf(buf, "%u\n", tracer_enabled); | ||
2948 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
2949 | } | ||
2950 | |||
2951 | static ssize_t | ||
2952 | tracing_ctrl_write(struct file *filp, const char __user *ubuf, | ||
2953 | size_t cnt, loff_t *ppos) | ||
2954 | { | ||
2955 | struct trace_array *tr = filp->private_data; | ||
2956 | unsigned long val; | ||
2957 | int ret; | ||
2958 | |||
2959 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); | ||
2960 | if (ret) | ||
2961 | return ret; | ||
2962 | |||
2963 | val = !!val; | ||
2964 | |||
2965 | mutex_lock(&trace_types_lock); | ||
2966 | if (tracer_enabled ^ val) { | ||
2967 | |||
2968 | /* Only need to warn if this is used to change the state */ | ||
2969 | WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); | ||
2970 | |||
2971 | if (val) { | ||
2972 | tracer_enabled = 1; | ||
2973 | if (current_trace->start) | ||
2974 | current_trace->start(tr); | ||
2975 | tracing_start(); | ||
2976 | } else { | ||
2977 | tracer_enabled = 0; | ||
2978 | tracing_stop(); | ||
2979 | if (current_trace->stop) | ||
2980 | current_trace->stop(tr); | ||
2981 | } | ||
2982 | } | ||
2983 | mutex_unlock(&trace_types_lock); | ||
2984 | |||
2985 | *ppos += cnt; | ||
2986 | |||
2987 | return cnt; | ||
2988 | } | ||
2989 | |||
2990 | static ssize_t | ||
2991 | tracing_set_trace_read(struct file *filp, char __user *ubuf, | 3008 | tracing_set_trace_read(struct file *filp, char __user *ubuf, |
2992 | size_t cnt, loff_t *ppos) | 3009 | size_t cnt, loff_t *ppos) |
2993 | { | 3010 | { |
@@ -3017,6 +3034,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val) | |||
3017 | tr->data[cpu]->entries = val; | 3034 | tr->data[cpu]->entries = val; |
3018 | } | 3035 | } |
3019 | 3036 | ||
3037 | /* resize @tr's buffer to the size of @size_tr's entries */ | ||
3038 | static int resize_buffer_duplicate_size(struct trace_array *tr, | ||
3039 | struct trace_array *size_tr, int cpu_id) | ||
3040 | { | ||
3041 | int cpu, ret = 0; | ||
3042 | |||
3043 | if (cpu_id == RING_BUFFER_ALL_CPUS) { | ||
3044 | for_each_tracing_cpu(cpu) { | ||
3045 | ret = ring_buffer_resize(tr->buffer, | ||
3046 | size_tr->data[cpu]->entries, cpu); | ||
3047 | if (ret < 0) | ||
3048 | break; | ||
3049 | tr->data[cpu]->entries = size_tr->data[cpu]->entries; | ||
3050 | } | ||
3051 | } else { | ||
3052 | ret = ring_buffer_resize(tr->buffer, | ||
3053 | size_tr->data[cpu_id]->entries, cpu_id); | ||
3054 | if (ret == 0) | ||
3055 | tr->data[cpu_id]->entries = | ||
3056 | size_tr->data[cpu_id]->entries; | ||
3057 | } | ||
3058 | |||
3059 | return ret; | ||
3060 | } | ||
3061 | |||
3020 | static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | 3062 | static int __tracing_resize_ring_buffer(unsigned long size, int cpu) |
3021 | { | 3063 | { |
3022 | int ret; | 3064 | int ret; |
@@ -3028,6 +3070,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | |||
3028 | */ | 3070 | */ |
3029 | ring_buffer_expanded = 1; | 3071 | ring_buffer_expanded = 1; |
3030 | 3072 | ||
3073 | /* May be called before buffers are initialized */ | ||
3074 | if (!global_trace.buffer) | ||
3075 | return 0; | ||
3076 | |||
3031 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); | 3077 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); |
3032 | if (ret < 0) | 3078 | if (ret < 0) |
3033 | return ret; | 3079 | return ret; |
@@ -3037,23 +3083,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | |||
3037 | 3083 | ||
3038 | ret = ring_buffer_resize(max_tr.buffer, size, cpu); | 3084 | ret = ring_buffer_resize(max_tr.buffer, size, cpu); |
3039 | if (ret < 0) { | 3085 | if (ret < 0) { |
3040 | int r = 0; | 3086 | int r = resize_buffer_duplicate_size(&global_trace, |
3041 | 3087 | &global_trace, cpu); | |
3042 | if (cpu == RING_BUFFER_ALL_CPUS) { | ||
3043 | int i; | ||
3044 | for_each_tracing_cpu(i) { | ||
3045 | r = ring_buffer_resize(global_trace.buffer, | ||
3046 | global_trace.data[i]->entries, | ||
3047 | i); | ||
3048 | if (r < 0) | ||
3049 | break; | ||
3050 | } | ||
3051 | } else { | ||
3052 | r = ring_buffer_resize(global_trace.buffer, | ||
3053 | global_trace.data[cpu]->entries, | ||
3054 | cpu); | ||
3055 | } | ||
3056 | |||
3057 | if (r < 0) { | 3088 | if (r < 0) { |
3058 | /* | 3089 | /* |
3059 | * AARGH! We are left with different | 3090 | * AARGH! We are left with different |
@@ -3191,17 +3222,11 @@ static int tracing_set_tracer(const char *buf) | |||
3191 | 3222 | ||
3192 | topts = create_trace_option_files(t); | 3223 | topts = create_trace_option_files(t); |
3193 | if (t->use_max_tr) { | 3224 | if (t->use_max_tr) { |
3194 | int cpu; | ||
3195 | /* we need to make per cpu buffer sizes equivalent */ | 3225 | /* we need to make per cpu buffer sizes equivalent */ |
3196 | for_each_tracing_cpu(cpu) { | 3226 | ret = resize_buffer_duplicate_size(&max_tr, &global_trace, |
3197 | ret = ring_buffer_resize(max_tr.buffer, | 3227 | RING_BUFFER_ALL_CPUS); |
3198 | global_trace.data[cpu]->entries, | 3228 | if (ret < 0) |
3199 | cpu); | 3229 | goto out; |
3200 | if (ret < 0) | ||
3201 | goto out; | ||
3202 | max_tr.data[cpu]->entries = | ||
3203 | global_trace.data[cpu]->entries; | ||
3204 | } | ||
3205 | } | 3230 | } |
3206 | 3231 | ||
3207 | if (t->init) { | 3232 | if (t->init) { |
@@ -3323,6 +3348,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
3323 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | 3348 | if (trace_flags & TRACE_ITER_LATENCY_FMT) |
3324 | iter->iter_flags |= TRACE_FILE_LAT_FMT; | 3349 | iter->iter_flags |= TRACE_FILE_LAT_FMT; |
3325 | 3350 | ||
3351 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ | ||
3352 | if (trace_clocks[trace_clock_id].in_ns) | ||
3353 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | ||
3354 | |||
3326 | iter->cpu_file = cpu_file; | 3355 | iter->cpu_file = cpu_file; |
3327 | iter->tr = &global_trace; | 3356 | iter->tr = &global_trace; |
3328 | mutex_init(&iter->mutex); | 3357 | mutex_init(&iter->mutex); |
@@ -3383,19 +3412,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) | |||
3383 | } | 3412 | } |
3384 | } | 3413 | } |
3385 | 3414 | ||
3386 | |||
3387 | void default_wait_pipe(struct trace_iterator *iter) | ||
3388 | { | ||
3389 | DEFINE_WAIT(wait); | ||
3390 | |||
3391 | prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); | ||
3392 | |||
3393 | if (trace_empty(iter)) | ||
3394 | schedule(); | ||
3395 | |||
3396 | finish_wait(&trace_wait, &wait); | ||
3397 | } | ||
3398 | |||
3399 | /* | 3415 | /* |
3400 | * This is a make-shift waitqueue. | 3416 | * This is a make-shift waitqueue. |
3401 | * A tracer might use this callback on some rare cases: | 3417 | * A tracer might use this callback on some rare cases: |
@@ -3436,7 +3452,7 @@ static int tracing_wait_pipe(struct file *filp) | |||
3436 | return -EINTR; | 3452 | return -EINTR; |
3437 | 3453 | ||
3438 | /* | 3454 | /* |
3439 | * We block until we read something and tracing is disabled. | 3455 | * We block until we read something and tracing is enabled. |
3440 | * We still block if tracing is disabled, but we have never | 3456 | * We still block if tracing is disabled, but we have never |
3441 | * read anything. This allows a user to cat this file, and | 3457 | * read anything. This allows a user to cat this file, and |
3442 | * then enable tracing. But after we have read something, | 3458 | * then enable tracing. But after we have read something, |
@@ -3444,7 +3460,7 @@ static int tracing_wait_pipe(struct file *filp) | |||
3444 | * | 3460 | * |
3445 | * iter->pos will be 0 if we haven't read anything. | 3461 | * iter->pos will be 0 if we haven't read anything. |
3446 | */ | 3462 | */ |
3447 | if (!tracer_enabled && iter->pos) | 3463 | if (tracing_is_enabled() && iter->pos) |
3448 | break; | 3464 | break; |
3449 | } | 3465 | } |
3450 | 3466 | ||
@@ -3886,6 +3902,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3886 | if (tracing_disabled) | 3902 | if (tracing_disabled) |
3887 | return -EINVAL; | 3903 | return -EINVAL; |
3888 | 3904 | ||
3905 | if (!(trace_flags & TRACE_ITER_MARKERS)) | ||
3906 | return -EINVAL; | ||
3907 | |||
3889 | if (cnt > TRACE_BUF_SIZE) | 3908 | if (cnt > TRACE_BUF_SIZE) |
3890 | cnt = TRACE_BUF_SIZE; | 3909 | cnt = TRACE_BUF_SIZE; |
3891 | 3910 | ||
@@ -3950,7 +3969,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3950 | } else | 3969 | } else |
3951 | entry->buf[cnt] = '\0'; | 3970 | entry->buf[cnt] = '\0'; |
3952 | 3971 | ||
3953 | ring_buffer_unlock_commit(buffer, event); | 3972 | __buffer_unlock_commit(buffer, event); |
3954 | 3973 | ||
3955 | written = cnt; | 3974 | written = cnt; |
3956 | 3975 | ||
@@ -4011,6 +4030,14 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, | |||
4011 | if (max_tr.buffer) | 4030 | if (max_tr.buffer) |
4012 | ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); | 4031 | ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); |
4013 | 4032 | ||
4033 | /* | ||
4034 | * The new clock may not be consistent with the previous clock. | ||
4035 | * Reset the buffer so that it doesn't have incomparable timestamps. | ||
4036 | */ | ||
4037 | tracing_reset_online_cpus(&global_trace); | ||
4038 | if (max_tr.buffer) | ||
4039 | tracing_reset_online_cpus(&max_tr); | ||
4040 | |||
4014 | mutex_unlock(&trace_types_lock); | 4041 | mutex_unlock(&trace_types_lock); |
4015 | 4042 | ||
4016 | *fpos += cnt; | 4043 | *fpos += cnt; |
@@ -4032,13 +4059,6 @@ static const struct file_operations tracing_max_lat_fops = { | |||
4032 | .llseek = generic_file_llseek, | 4059 | .llseek = generic_file_llseek, |
4033 | }; | 4060 | }; |
4034 | 4061 | ||
4035 | static const struct file_operations tracing_ctrl_fops = { | ||
4036 | .open = tracing_open_generic, | ||
4037 | .read = tracing_ctrl_read, | ||
4038 | .write = tracing_ctrl_write, | ||
4039 | .llseek = generic_file_llseek, | ||
4040 | }; | ||
4041 | |||
4042 | static const struct file_operations set_tracer_fops = { | 4062 | static const struct file_operations set_tracer_fops = { |
4043 | .open = tracing_open_generic, | 4063 | .open = tracing_open_generic, |
4044 | .read = tracing_set_trace_read, | 4064 | .read = tracing_set_trace_read, |
@@ -4195,12 +4215,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, | |||
4195 | buf->private = 0; | 4215 | buf->private = 0; |
4196 | } | 4216 | } |
4197 | 4217 | ||
4198 | static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, | ||
4199 | struct pipe_buffer *buf) | ||
4200 | { | ||
4201 | return 1; | ||
4202 | } | ||
4203 | |||
4204 | static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, | 4218 | static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, |
4205 | struct pipe_buffer *buf) | 4219 | struct pipe_buffer *buf) |
4206 | { | 4220 | { |
@@ -4216,7 +4230,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = { | |||
4216 | .unmap = generic_pipe_buf_unmap, | 4230 | .unmap = generic_pipe_buf_unmap, |
4217 | .confirm = generic_pipe_buf_confirm, | 4231 | .confirm = generic_pipe_buf_confirm, |
4218 | .release = buffer_pipe_buf_release, | 4232 | .release = buffer_pipe_buf_release, |
4219 | .steal = buffer_pipe_buf_steal, | 4233 | .steal = generic_pipe_buf_steal, |
4220 | .get = buffer_pipe_buf_get, | 4234 | .get = buffer_pipe_buf_get, |
4221 | }; | 4235 | }; |
4222 | 4236 | ||
@@ -4261,13 +4275,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4261 | return -ENOMEM; | 4275 | return -ENOMEM; |
4262 | 4276 | ||
4263 | if (*ppos & (PAGE_SIZE - 1)) { | 4277 | if (*ppos & (PAGE_SIZE - 1)) { |
4264 | WARN_ONCE(1, "Ftrace: previous read must page-align\n"); | ||
4265 | ret = -EINVAL; | 4278 | ret = -EINVAL; |
4266 | goto out; | 4279 | goto out; |
4267 | } | 4280 | } |
4268 | 4281 | ||
4269 | if (len & (PAGE_SIZE - 1)) { | 4282 | if (len & (PAGE_SIZE - 1)) { |
4270 | WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); | ||
4271 | if (len < PAGE_SIZE) { | 4283 | if (len < PAGE_SIZE) { |
4272 | ret = -EINVAL; | 4284 | ret = -EINVAL; |
4273 | goto out; | 4285 | goto out; |
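Both alignment tests use the usual power-of-two mask trick: x & (PAGE_SIZE - 1) is non-zero exactly when x is not a multiple of PAGE_SIZE. A stand-alone illustration (PAGE_SIZE hard-coded for the example; the kernel value is per-arch):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL   /* assumption for the example */

    /* Non-zero low bits mean the offset is not page aligned. */
    static int page_aligned(unsigned long off)
    {
        return (off & (PAGE_SIZE - 1)) == 0;
    }

    int main(void)
    {
        printf("%d %d\n", page_aligned(8192), page_aligned(4100)); /* 1 0 */
        return 0;
    }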
@@ -4378,13 +4390,27 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
4378 | cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); | 4390 | cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); |
4379 | trace_seq_printf(s, "bytes: %ld\n", cnt); | 4391 | trace_seq_printf(s, "bytes: %ld\n", cnt); |
4380 | 4392 | ||
4381 | t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); | 4393 | if (trace_clocks[trace_clock_id].in_ns) { |
4382 | usec_rem = do_div(t, USEC_PER_SEC); | 4394 | /* local or global for trace_clock */ |
4383 | trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); | 4395 | t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); |
4396 | usec_rem = do_div(t, USEC_PER_SEC); | ||
4397 | trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", | ||
4398 | t, usec_rem); | ||
4399 | |||
4400 | t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); | ||
4401 | usec_rem = do_div(t, USEC_PER_SEC); | ||
4402 | trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); | ||
4403 | } else { | ||
4404 | /* counter or tsc mode for trace_clock */ | ||
4405 | trace_seq_printf(s, "oldest event ts: %llu\n", | ||
4406 | ring_buffer_oldest_event_ts(tr->buffer, cpu)); | ||
4407 | |||
4408 | trace_seq_printf(s, "now ts: %llu\n", | ||
4409 | ring_buffer_time_stamp(tr->buffer, cpu)); | ||
4410 | } | ||
4384 | 4411 | ||
4385 | t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); | 4412 | cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); |
4386 | usec_rem = do_div(t, USEC_PER_SEC); | 4413 | trace_seq_printf(s, "dropped events: %ld\n", cnt); |
4387 | trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); | ||
4388 | 4414 | ||
4389 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | 4415 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); |
4390 | 4416 | ||
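The stats file now prints seconds.microseconds only when the selected clock counts nanoseconds; counter/tsc clocks are printed raw. A userspace sketch of that split, with plain 64-bit division standing in for ns2usecs()/do_div():

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    #define USEC_PER_SEC  1000000ULL
    #define NSEC_PER_USEC 1000ULL

    static void print_ts(uint64_t ts, int in_ns)
    {
        if (in_ns) {
            uint64_t us = ts / NSEC_PER_USEC;        /* ns2usecs() */
            printf("ts: %5" PRIu64 ".%06" PRIu64 "\n",
                   us / USEC_PER_SEC, us % USEC_PER_SEC);
        } else {
            /* counter or tsc mode: the value has no time unit */
            printf("ts: %" PRIu64 "\n", ts);
        }
    }

    int main(void)
    {
        print_ts(1234567890123ULL, 1);  /* 1234.567890 */
        print_ts(42ULL, 0);             /* 42 */
        return 0;
    }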
@@ -4816,9 +4842,6 @@ static __init int tracer_init_debugfs(void) | |||
4816 | 4842 | ||
4817 | d_tracer = tracing_init_dentry(); | 4843 | d_tracer = tracing_init_dentry(); |
4818 | 4844 | ||
4819 | trace_create_file("tracing_enabled", 0644, d_tracer, | ||
4820 | &global_trace, &tracing_ctrl_fops); | ||
4821 | |||
4822 | trace_create_file("trace_options", 0644, d_tracer, | 4845 | trace_create_file("trace_options", 0644, d_tracer, |
4823 | NULL, &tracing_iter_fops); | 4846 | NULL, &tracing_iter_fops); |
4824 | 4847 | ||
@@ -5090,6 +5113,7 @@ __init static int tracer_alloc_buffers(void) | |||
5090 | 5113 | ||
5091 | /* Only allocate trace_printk buffers if a trace_printk exists */ | 5114 | /* Only allocate trace_printk buffers if a trace_printk exists */ |
5092 | if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) | 5115 | if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) |
5116 | /* Must be called before global_trace.buffer is allocated */ | ||
5093 | trace_printk_init_buffers(); | 5117 | trace_printk_init_buffers(); |
5094 | 5118 | ||
5095 | /* To save memory, keep the ring buffer size to its minimum */ | 5119 | /* To save memory, keep the ring buffer size to its minimum */ |
@@ -5137,6 +5161,7 @@ __init static int tracer_alloc_buffers(void) | |||
5137 | #endif | 5161 | #endif |
5138 | 5162 | ||
5139 | trace_init_cmdlines(); | 5163 | trace_init_cmdlines(); |
5164 | init_irq_work(&trace_work_wakeup, trace_wake_up); | ||
5140 | 5165 | ||
5141 | register_tracer(&nop_trace); | 5166 | register_tracer(&nop_trace); |
5142 | current_trace = &nop_trace; | 5167 | current_trace = &nop_trace; |
@@ -5148,6 +5173,13 @@ __init static int tracer_alloc_buffers(void) | |||
5148 | 5173 | ||
5149 | register_die_notifier(&trace_die_notifier); | 5174 | register_die_notifier(&trace_die_notifier); |
5150 | 5175 | ||
5176 | while (trace_boot_options) { | ||
5177 | char *option; | ||
5178 | |||
5179 | option = strsep(&trace_boot_options, ","); | ||
5180 | trace_set_options(option); | ||
5181 | } | ||
5182 | |||
5151 | return 0; | 5183 | return 0; |
5152 | 5184 | ||
5153 | out_free_cpumask: | 5185 | out_free_cpumask: |
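The boot-option loop added here splits a comma-separated string with strsep() and applies each token. The same idiom as a stand-alone program (strsep() is a BSD/glibc extension; set_option() is a stub standing in for trace_set_options()):

    #include <stdio.h>
    #include <string.h>

    /* Stub for trace_set_options() in the hunk above. */
    static void set_option(const char *opt)
    {
        printf("option: %s\n", opt);
    }

    int main(void)
    {
        char buf[] = "sym-addr,noirq-info,stacktrace";
        char *rest = buf;
        char *tok;

        /* strsep() walks the string in place, returning NULL when done. */
        while ((tok = strsep(&rest, ",")) != NULL) {
            if (*tok)               /* skip empty fields like "a,,b" */
                set_option(tok);
        }
        return 0;
    }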
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 55e1f7f0db12..c75d7988902c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -147,7 +147,7 @@ struct trace_array_cpu { | |||
147 | unsigned long skipped_entries; | 147 | unsigned long skipped_entries; |
148 | cycle_t preempt_timestamp; | 148 | cycle_t preempt_timestamp; |
149 | pid_t pid; | 149 | pid_t pid; |
150 | uid_t uid; | 150 | kuid_t uid; |
151 | char comm[TASK_COMM_LEN]; | 151 | char comm[TASK_COMM_LEN]; |
152 | }; | 152 | }; |
153 | 153 | ||
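The uid field switches to kuid_t, the kernel's wrapper type that stops raw and namespace-translated uids from being mixed up silently. The general pattern, reduced to userspace with illustrative names (this is not the kernel's definition, just the same idea):

    #include <stdio.h>
    #include <sys/types.h>

    /* Opaque wrapper: a plain uid_t no longer converts to it implicitly. */
    typedef struct {
        uid_t val;
    } wrapped_uid_t;

    static wrapped_uid_t wrap_uid(uid_t uid)
    {
        wrapped_uid_t k = { .val = uid };
        return k;
    }

    static uid_t unwrap_uid(wrapped_uid_t k)
    {
        return k.val;
    }

    int main(void)
    {
        wrapped_uid_t k = wrap_uid(1000);

        /* k = 1000;  -- would now be a compile error, which is the point */
        printf("uid %u\n", (unsigned)unwrap_uid(k));
        return 0;
    }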
@@ -285,8 +285,8 @@ struct tracer { | |||
285 | int (*set_flag)(u32 old_flags, u32 bit, int set); | 285 | int (*set_flag)(u32 old_flags, u32 bit, int set); |
286 | struct tracer *next; | 286 | struct tracer *next; |
287 | struct tracer_flags *flags; | 287 | struct tracer_flags *flags; |
288 | int print_max; | 288 | bool print_max; |
289 | int use_max_tr; | 289 | bool use_max_tr; |
290 | }; | 290 | }; |
291 | 291 | ||
292 | 292 | ||
@@ -327,7 +327,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) | |||
327 | 327 | ||
328 | int tracer_init(struct tracer *t, struct trace_array *tr); | 328 | int tracer_init(struct tracer *t, struct trace_array *tr); |
329 | int tracing_is_enabled(void); | 329 | int tracing_is_enabled(void); |
330 | void trace_wake_up(void); | ||
331 | void tracing_reset(struct trace_array *tr, int cpu); | 330 | void tracing_reset(struct trace_array *tr, int cpu); |
332 | void tracing_reset_online_cpus(struct trace_array *tr); | 331 | void tracing_reset_online_cpus(struct trace_array *tr); |
333 | void tracing_reset_current(int cpu); | 332 | void tracing_reset_current(int cpu); |
@@ -349,9 +348,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, | |||
349 | unsigned long len, | 348 | unsigned long len, |
350 | unsigned long flags, | 349 | unsigned long flags, |
351 | int pc); | 350 | int pc); |
352 | void trace_buffer_unlock_commit(struct ring_buffer *buffer, | ||
353 | struct ring_buffer_event *event, | ||
354 | unsigned long flags, int pc); | ||
355 | 351 | ||
356 | struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, | 352 | struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, |
357 | struct trace_array_cpu *data); | 353 | struct trace_array_cpu *data); |
@@ -359,6 +355,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, | |||
359 | struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, | 355 | struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, |
360 | int *ent_cpu, u64 *ent_ts); | 356 | int *ent_cpu, u64 *ent_ts); |
361 | 357 | ||
358 | void __buffer_unlock_commit(struct ring_buffer *buffer, | ||
359 | struct ring_buffer_event *event); | ||
360 | |||
362 | int trace_empty(struct trace_iterator *iter); | 361 | int trace_empty(struct trace_iterator *iter); |
363 | 362 | ||
364 | void *trace_find_next_entry_inc(struct trace_iterator *iter); | 363 | void *trace_find_next_entry_inc(struct trace_iterator *iter); |
@@ -367,7 +366,6 @@ void trace_init_global_iter(struct trace_iterator *iter); | |||
367 | 366 | ||
368 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); | 367 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); |
369 | 368 | ||
370 | void default_wait_pipe(struct trace_iterator *iter); | ||
371 | void poll_wait_pipe(struct trace_iterator *iter); | 369 | void poll_wait_pipe(struct trace_iterator *iter); |
372 | 370 | ||
373 | void ftrace(struct trace_array *tr, | 371 | void ftrace(struct trace_array *tr, |
@@ -407,12 +405,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr); | |||
407 | void tracing_stop_sched_switch_record(void); | 405 | void tracing_stop_sched_switch_record(void); |
408 | void tracing_start_sched_switch_record(void); | 406 | void tracing_start_sched_switch_record(void); |
409 | int register_tracer(struct tracer *type); | 407 | int register_tracer(struct tracer *type); |
410 | void unregister_tracer(struct tracer *type); | ||
411 | int is_tracing_stopped(void); | 408 | int is_tracing_stopped(void); |
412 | enum trace_file_type { | ||
413 | TRACE_FILE_LAT_FMT = 1, | ||
414 | TRACE_FILE_ANNOTATE = 2, | ||
415 | }; | ||
416 | 409 | ||
417 | extern cpumask_var_t __read_mostly tracing_buffer_mask; | 410 | extern cpumask_var_t __read_mostly tracing_buffer_mask; |
418 | 411 | ||
@@ -472,11 +465,11 @@ extern void trace_find_cmdline(int pid, char comm[]); | |||
472 | 465 | ||
473 | #ifdef CONFIG_DYNAMIC_FTRACE | 466 | #ifdef CONFIG_DYNAMIC_FTRACE |
474 | extern unsigned long ftrace_update_tot_cnt; | 467 | extern unsigned long ftrace_update_tot_cnt; |
468 | #endif | ||
475 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func | 469 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func |
476 | extern int DYN_FTRACE_TEST_NAME(void); | 470 | extern int DYN_FTRACE_TEST_NAME(void); |
477 | #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 | 471 | #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 |
478 | extern int DYN_FTRACE_TEST_NAME2(void); | 472 | extern int DYN_FTRACE_TEST_NAME2(void); |
479 | #endif | ||
480 | 473 | ||
481 | extern int ring_buffer_expanded; | 474 | extern int ring_buffer_expanded; |
482 | extern bool tracing_selftest_disabled; | 475 | extern bool tracing_selftest_disabled; |
@@ -680,6 +673,7 @@ enum trace_iterator_flags { | |||
680 | TRACE_ITER_OVERWRITE = 0x200000, | 673 | TRACE_ITER_OVERWRITE = 0x200000, |
681 | TRACE_ITER_STOP_ON_FREE = 0x400000, | 674 | TRACE_ITER_STOP_ON_FREE = 0x400000, |
682 | TRACE_ITER_IRQ_INFO = 0x800000, | 675 | TRACE_ITER_IRQ_INFO = 0x800000, |
676 | TRACE_ITER_MARKERS = 0x1000000, | ||
683 | }; | 677 | }; |
684 | 678 | ||
685 | /* | 679 | /* |
@@ -840,6 +834,7 @@ extern const char *__start___trace_bprintk_fmt[]; | |||
840 | extern const char *__stop___trace_bprintk_fmt[]; | 834 | extern const char *__stop___trace_bprintk_fmt[]; |
841 | 835 | ||
842 | void trace_printk_init_buffers(void); | 836 | void trace_printk_init_buffers(void); |
837 | void trace_printk_start_comm(void); | ||
843 | 838 | ||
844 | #undef FTRACE_ENTRY | 839 | #undef FTRACE_ENTRY |
845 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ | 840 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8d3538b4ea5f..95e96842ed29 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
@@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) | |||
77 | entry->correct = val == expect; | 77 | entry->correct = val == expect; |
78 | 78 | ||
79 | if (!filter_check_discard(call, entry, buffer, event)) | 79 | if (!filter_check_discard(call, entry, buffer, event)) |
80 | ring_buffer_unlock_commit(buffer, event); | 80 | __buffer_unlock_commit(buffer, event); |
81 | 81 | ||
82 | out: | 82 | out: |
83 | atomic_dec(&tr->data[cpu]->disabled); | 83 | atomic_dec(&tr->data[cpu]->disabled); |
@@ -199,7 +199,7 @@ __init static int init_branch_tracer(void) | |||
199 | } | 199 | } |
200 | return register_tracer(&branch_trace); | 200 | return register_tracer(&branch_trace); |
201 | } | 201 | } |
202 | device_initcall(init_branch_tracer); | 202 | core_initcall(init_branch_tracer); |
203 | 203 | ||
204 | #else | 204 | #else |
205 | static inline | 205 | static inline |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 8a6d2ee2086c..84b1e045faba 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -258,7 +258,8 @@ EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); | |||
258 | 258 | ||
259 | #ifdef CONFIG_FUNCTION_TRACER | 259 | #ifdef CONFIG_FUNCTION_TRACER |
260 | static void | 260 | static void |
261 | perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) | 261 | perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, |
262 | struct ftrace_ops *ops, struct pt_regs *pt_regs) | ||
262 | { | 263 | { |
263 | struct ftrace_entry *entry; | 264 | struct ftrace_entry *entry; |
264 | struct hlist_head *head; | 265 | struct hlist_head *head; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 29111da1d100..880073d0b946 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -491,19 +491,6 @@ static void t_stop(struct seq_file *m, void *p) | |||
491 | mutex_unlock(&event_mutex); | 491 | mutex_unlock(&event_mutex); |
492 | } | 492 | } |
493 | 493 | ||
494 | static int | ||
495 | ftrace_event_seq_open(struct inode *inode, struct file *file) | ||
496 | { | ||
497 | const struct seq_operations *seq_ops; | ||
498 | |||
499 | if ((file->f_mode & FMODE_WRITE) && | ||
500 | (file->f_flags & O_TRUNC)) | ||
501 | ftrace_clear_events(); | ||
502 | |||
503 | seq_ops = inode->i_private; | ||
504 | return seq_open(file, seq_ops); | ||
505 | } | ||
506 | |||
507 | static ssize_t | 494 | static ssize_t |
508 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | 495 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, |
509 | loff_t *ppos) | 496 | loff_t *ppos) |
@@ -980,6 +967,9 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) | |||
980 | return r; | 967 | return r; |
981 | } | 968 | } |
982 | 969 | ||
970 | static int ftrace_event_avail_open(struct inode *inode, struct file *file); | ||
971 | static int ftrace_event_set_open(struct inode *inode, struct file *file); | ||
972 | |||
983 | static const struct seq_operations show_event_seq_ops = { | 973 | static const struct seq_operations show_event_seq_ops = { |
984 | .start = t_start, | 974 | .start = t_start, |
985 | .next = t_next, | 975 | .next = t_next, |
@@ -995,14 +985,14 @@ static const struct seq_operations show_set_event_seq_ops = { | |||
995 | }; | 985 | }; |
996 | 986 | ||
997 | static const struct file_operations ftrace_avail_fops = { | 987 | static const struct file_operations ftrace_avail_fops = { |
998 | .open = ftrace_event_seq_open, | 988 | .open = ftrace_event_avail_open, |
999 | .read = seq_read, | 989 | .read = seq_read, |
1000 | .llseek = seq_lseek, | 990 | .llseek = seq_lseek, |
1001 | .release = seq_release, | 991 | .release = seq_release, |
1002 | }; | 992 | }; |
1003 | 993 | ||
1004 | static const struct file_operations ftrace_set_event_fops = { | 994 | static const struct file_operations ftrace_set_event_fops = { |
1005 | .open = ftrace_event_seq_open, | 995 | .open = ftrace_event_set_open, |
1006 | .read = seq_read, | 996 | .read = seq_read, |
1007 | .write = ftrace_event_write, | 997 | .write = ftrace_event_write, |
1008 | .llseek = seq_lseek, | 998 | .llseek = seq_lseek, |
@@ -1078,6 +1068,26 @@ static struct dentry *event_trace_events_dir(void) | |||
1078 | return d_events; | 1068 | return d_events; |
1079 | } | 1069 | } |
1080 | 1070 | ||
1071 | static int | ||
1072 | ftrace_event_avail_open(struct inode *inode, struct file *file) | ||
1073 | { | ||
1074 | const struct seq_operations *seq_ops = &show_event_seq_ops; | ||
1075 | |||
1076 | return seq_open(file, seq_ops); | ||
1077 | } | ||
1078 | |||
1079 | static int | ||
1080 | ftrace_event_set_open(struct inode *inode, struct file *file) | ||
1081 | { | ||
1082 | const struct seq_operations *seq_ops = &show_set_event_seq_ops; | ||
1083 | |||
1084 | if ((file->f_mode & FMODE_WRITE) && | ||
1085 | (file->f_flags & O_TRUNC)) | ||
1086 | ftrace_clear_events(); | ||
1087 | |||
1088 | return seq_open(file, seq_ops); | ||
1089 | } | ||
1090 | |||
1081 | static struct dentry * | 1091 | static struct dentry * |
1082 | event_subsystem_dir(const char *name, struct dentry *d_events) | 1092 | event_subsystem_dir(const char *name, struct dentry *d_events) |
1083 | { | 1093 | { |
@@ -1199,6 +1209,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
1199 | return 0; | 1209 | return 0; |
1200 | } | 1210 | } |
1201 | 1211 | ||
1212 | static void event_remove(struct ftrace_event_call *call) | ||
1213 | { | ||
1214 | ftrace_event_enable_disable(call, 0); | ||
1215 | if (call->event.funcs) | ||
1216 | __unregister_ftrace_event(&call->event); | ||
1217 | list_del(&call->list); | ||
1218 | } | ||
1219 | |||
1220 | static int event_init(struct ftrace_event_call *call) | ||
1221 | { | ||
1222 | int ret = 0; | ||
1223 | |||
1224 | if (WARN_ON(!call->name)) | ||
1225 | return -EINVAL; | ||
1226 | |||
1227 | if (call->class->raw_init) { | ||
1228 | ret = call->class->raw_init(call); | ||
1229 | if (ret < 0 && ret != -ENOSYS) | ||
1230 | pr_warn("Could not initialize trace events/%s\n", | ||
1231 | call->name); | ||
1232 | } | ||
1233 | |||
1234 | return ret; | ||
1235 | } | ||
1236 | |||
1202 | static int | 1237 | static int |
1203 | __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, | 1238 | __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, |
1204 | const struct file_operations *id, | 1239 | const struct file_operations *id, |
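event_init(), factored out above, treats -ENOSYS from an optional raw_init callback as "nothing to do" and only warns on real failures. A compact userspace sketch of that convention (struct and function names are illustrative):

    #include <errno.h>
    #include <stdio.h>

    struct event {
        const char *name;
        int (*raw_init)(struct event *ev);   /* optional */
    };

    static int do_event_init(struct event *ev)
    {
        int ret = 0;

        if (!ev->name)
            return -EINVAL;

        if (ev->raw_init) {
            ret = ev->raw_init(ev);
            /* -ENOSYS means "not implemented", which is not an error here */
            if (ret < 0 && ret != -ENOSYS)
                fprintf(stderr, "could not initialize %s\n", ev->name);
        }
        return ret;
    }

    static int init_ok(struct event *ev)    { (void)ev; return 0; }
    static int init_nosys(struct event *ev) { (void)ev; return -ENOSYS; }

    int main(void)
    {
        struct event a = { "a", init_ok };
        struct event b = { "b", init_nosys };

        /* first call returns 0, second returns -ENOSYS (negative) */
        printf("%d %d\n", do_event_init(&a), do_event_init(&b));
        return 0;
    }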
@@ -1209,19 +1244,9 @@ __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, | |||
1209 | struct dentry *d_events; | 1244 | struct dentry *d_events; |
1210 | int ret; | 1245 | int ret; |
1211 | 1246 | ||
1212 | /* The linker may leave blanks */ | 1247 | ret = event_init(call); |
1213 | if (!call->name) | 1248 | if (ret < 0) |
1214 | return -EINVAL; | 1249 | return ret; |
1215 | |||
1216 | if (call->class->raw_init) { | ||
1217 | ret = call->class->raw_init(call); | ||
1218 | if (ret < 0) { | ||
1219 | if (ret != -ENOSYS) | ||
1220 | pr_warning("Could not initialize trace events/%s\n", | ||
1221 | call->name); | ||
1222 | return ret; | ||
1223 | } | ||
1224 | } | ||
1225 | 1250 | ||
1226 | d_events = event_trace_events_dir(); | 1251 | d_events = event_trace_events_dir(); |
1227 | if (!d_events) | 1252 | if (!d_events) |
@@ -1272,13 +1297,10 @@ static void remove_subsystem_dir(const char *name) | |||
1272 | */ | 1297 | */ |
1273 | static void __trace_remove_event_call(struct ftrace_event_call *call) | 1298 | static void __trace_remove_event_call(struct ftrace_event_call *call) |
1274 | { | 1299 | { |
1275 | ftrace_event_enable_disable(call, 0); | 1300 | event_remove(call); |
1276 | if (call->event.funcs) | ||
1277 | __unregister_ftrace_event(&call->event); | ||
1278 | debugfs_remove_recursive(call->dir); | ||
1279 | list_del(&call->list); | ||
1280 | trace_destroy_fields(call); | 1301 | trace_destroy_fields(call); |
1281 | destroy_preds(call); | 1302 | destroy_preds(call); |
1303 | debugfs_remove_recursive(call->dir); | ||
1282 | remove_subsystem_dir(call->class->system); | 1304 | remove_subsystem_dir(call->class->system); |
1283 | } | 1305 | } |
1284 | 1306 | ||
@@ -1450,30 +1472,59 @@ static __init int setup_trace_event(char *str) | |||
1450 | } | 1472 | } |
1451 | __setup("trace_event=", setup_trace_event); | 1473 | __setup("trace_event=", setup_trace_event); |
1452 | 1474 | ||
1475 | static __init int event_trace_enable(void) | ||
1476 | { | ||
1477 | struct ftrace_event_call **iter, *call; | ||
1478 | char *buf = bootup_event_buf; | ||
1479 | char *token; | ||
1480 | int ret; | ||
1481 | |||
1482 | for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { | ||
1483 | |||
1484 | call = *iter; | ||
1485 | ret = event_init(call); | ||
1486 | if (!ret) | ||
1487 | list_add(&call->list, &ftrace_events); | ||
1488 | } | ||
1489 | |||
1490 | while (true) { | ||
1491 | token = strsep(&buf, ","); | ||
1492 | |||
1493 | if (!token) | ||
1494 | break; | ||
1495 | if (!*token) | ||
1496 | continue; | ||
1497 | |||
1498 | ret = ftrace_set_clr_event(token, 1); | ||
1499 | if (ret) | ||
1500 | pr_warn("Failed to enable trace event: %s\n", token); | ||
1501 | } | ||
1502 | |||
1503 | trace_printk_start_comm(); | ||
1504 | |||
1505 | return 0; | ||
1506 | } | ||
1507 | |||
1453 | static __init int event_trace_init(void) | 1508 | static __init int event_trace_init(void) |
1454 | { | 1509 | { |
1455 | struct ftrace_event_call **call; | 1510 | struct ftrace_event_call *call; |
1456 | struct dentry *d_tracer; | 1511 | struct dentry *d_tracer; |
1457 | struct dentry *entry; | 1512 | struct dentry *entry; |
1458 | struct dentry *d_events; | 1513 | struct dentry *d_events; |
1459 | int ret; | 1514 | int ret; |
1460 | char *buf = bootup_event_buf; | ||
1461 | char *token; | ||
1462 | 1515 | ||
1463 | d_tracer = tracing_init_dentry(); | 1516 | d_tracer = tracing_init_dentry(); |
1464 | if (!d_tracer) | 1517 | if (!d_tracer) |
1465 | return 0; | 1518 | return 0; |
1466 | 1519 | ||
1467 | entry = debugfs_create_file("available_events", 0444, d_tracer, | 1520 | entry = debugfs_create_file("available_events", 0444, d_tracer, |
1468 | (void *)&show_event_seq_ops, | 1521 | NULL, &ftrace_avail_fops); |
1469 | &ftrace_avail_fops); | ||
1470 | if (!entry) | 1522 | if (!entry) |
1471 | pr_warning("Could not create debugfs " | 1523 | pr_warning("Could not create debugfs " |
1472 | "'available_events' entry\n"); | 1524 | "'available_events' entry\n"); |
1473 | 1525 | ||
1474 | entry = debugfs_create_file("set_event", 0644, d_tracer, | 1526 | entry = debugfs_create_file("set_event", 0644, d_tracer, |
1475 | (void *)&show_set_event_seq_ops, | 1527 | NULL, &ftrace_set_event_fops); |
1476 | &ftrace_set_event_fops); | ||
1477 | if (!entry) | 1528 | if (!entry) |
1478 | pr_warning("Could not create debugfs " | 1529 | pr_warning("Could not create debugfs " |
1479 | "'set_event' entry\n"); | 1530 | "'set_event' entry\n"); |
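for_each_event() in event_trace_enable() walks entries the linker collected between __start_ftrace_events and __stop_ftrace_events. The same mechanism is available to ordinary ELF programs: objects placed in a named section get automatic __start_<sec>/__stop_<sec> bounding symbols. A sketch assuming a GCC/Clang toolchain with a GNU-style linker (section and struct names are made up for the example):

    #include <stdio.h>

    struct event {
        const char *name;
    };

    /* The linker gathers every object placed in "evsec" into one array
     * and provides __start_evsec/__stop_evsec bounding symbols. */
    #define DEFINE_EVENT(ident, nm) \
        static const struct event ident \
        __attribute__((section("evsec"), used)) = { .name = nm }

    DEFINE_EVENT(ev_open,  "open");
    DEFINE_EVENT(ev_close, "close");

    extern const struct event __start_evsec[];
    extern const struct event __stop_evsec[];

    int main(void)
    {
        const struct event *ev;

        for (ev = __start_evsec; ev < __stop_evsec; ev++)
            printf("registered: %s\n", ev->name);
        return 0;
    }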
@@ -1497,24 +1548,19 @@ static __init int event_trace_init(void) | |||
1497 | if (trace_define_common_fields()) | 1548 | if (trace_define_common_fields()) |
1498 | pr_warning("tracing: Failed to allocate common fields"); | 1549 | pr_warning("tracing: Failed to allocate common fields"); |
1499 | 1550 | ||
1500 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { | 1551 | /* |
1501 | __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, | 1552 | * Early initialization already enabled ftrace event. |
1553 | * Now it's only necessary to create the event directory. | ||
1554 | */ | ||
1555 | list_for_each_entry(call, &ftrace_events, list) { | ||
1556 | |||
1557 | ret = event_create_dir(call, d_events, | ||
1558 | &ftrace_event_id_fops, | ||
1502 | &ftrace_enable_fops, | 1559 | &ftrace_enable_fops, |
1503 | &ftrace_event_filter_fops, | 1560 | &ftrace_event_filter_fops, |
1504 | &ftrace_event_format_fops); | 1561 | &ftrace_event_format_fops); |
1505 | } | 1562 | if (ret < 0) |
1506 | 1563 | event_remove(call); | |
1507 | while (true) { | ||
1508 | token = strsep(&buf, ","); | ||
1509 | |||
1510 | if (!token) | ||
1511 | break; | ||
1512 | if (!*token) | ||
1513 | continue; | ||
1514 | |||
1515 | ret = ftrace_set_clr_event(token, 1); | ||
1516 | if (ret) | ||
1517 | pr_warning("Failed to enable trace event: %s\n", token); | ||
1518 | } | 1564 | } |
1519 | 1565 | ||
1520 | ret = register_module_notifier(&trace_module_nb); | 1566 | ret = register_module_notifier(&trace_module_nb); |
@@ -1523,6 +1569,7 @@ static __init int event_trace_init(void) | |||
1523 | 1569 | ||
1524 | return 0; | 1570 | return 0; |
1525 | } | 1571 | } |
1572 | core_initcall(event_trace_enable); | ||
1526 | fs_initcall(event_trace_init); | 1573 | fs_initcall(event_trace_init); |
1527 | 1574 | ||
1528 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 1575 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
@@ -1646,9 +1693,11 @@ static __init void event_trace_self_tests(void) | |||
1646 | event_test_stuff(); | 1693 | event_test_stuff(); |
1647 | 1694 | ||
1648 | ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); | 1695 | ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); |
1649 | if (WARN_ON_ONCE(ret)) | 1696 | if (WARN_ON_ONCE(ret)) { |
1650 | pr_warning("error disabling system %s\n", | 1697 | pr_warning("error disabling system %s\n", |
1651 | system->name); | 1698 | system->name); |
1699 | continue; | ||
1700 | } | ||
1652 | 1701 | ||
1653 | pr_cont("OK\n"); | 1702 | pr_cont("OK\n"); |
1654 | } | 1703 | } |
@@ -1681,7 +1730,8 @@ static __init void event_trace_self_tests(void) | |||
1681 | static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); | 1730 | static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); |
1682 | 1731 | ||
1683 | static void | 1732 | static void |
1684 | function_test_events_call(unsigned long ip, unsigned long parent_ip) | 1733 | function_test_events_call(unsigned long ip, unsigned long parent_ip, |
1734 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
1685 | { | 1735 | { |
1686 | struct ring_buffer_event *event; | 1736 | struct ring_buffer_event *event; |
1687 | struct ring_buffer *buffer; | 1737 | struct ring_buffer *buffer; |
@@ -1710,7 +1760,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) | |||
1710 | entry->ip = ip; | 1760 | entry->ip = ip; |
1711 | entry->parent_ip = parent_ip; | 1761 | entry->parent_ip = parent_ip; |
1712 | 1762 | ||
1713 | trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); | 1763 | trace_buffer_unlock_commit(buffer, event, flags, pc); |
1714 | 1764 | ||
1715 | out: | 1765 | out: |
1716 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); | 1766 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); |
@@ -1720,6 +1770,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) | |||
1720 | static struct ftrace_ops trace_ops __initdata = | 1770 | static struct ftrace_ops trace_ops __initdata = |
1721 | { | 1771 | { |
1722 | .func = function_test_events_call, | 1772 | .func = function_test_events_call, |
1773 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
1723 | }; | 1774 | }; |
1724 | 1775 | ||
1725 | static __init void event_trace_self_test_with_function(void) | 1776 | static __init void event_trace_self_test_with_function(void) |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 431dba8b7542..e5b0ca8b8d4d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -1000,9 +1000,9 @@ static int init_pred(struct filter_parse_state *ps, | |||
1000 | } | 1000 | } |
1001 | } else { | 1001 | } else { |
1002 | if (field->is_signed) | 1002 | if (field->is_signed) |
1003 | ret = strict_strtoll(pred->regex.pattern, 0, &val); | 1003 | ret = kstrtoll(pred->regex.pattern, 0, &val); |
1004 | else | 1004 | else |
1005 | ret = strict_strtoull(pred->regex.pattern, 0, &val); | 1005 | ret = kstrtoull(pred->regex.pattern, 0, &val); |
1006 | if (ret) { | 1006 | if (ret) { |
1007 | parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); | 1007 | parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); |
1008 | return -EINVAL; | 1008 | return -EINVAL; |
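strict_strtoll()/strict_strtoull() are renamed to kstrtoll()/kstrtoull(), which fail on trailing garbage rather than silently stopping at it. A rough userspace analog of that strictness built on strtoll() (the helper name is invented for the example):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Rough userspace analog of kstrtoll(): the whole string must be a number. */
    static int strict_strtoll_compat(const char *s, int base, long long *res)
    {
        char *end;
        long long val;

        errno = 0;
        val = strtoll(s, &end, base);
        if (errno == ERANGE)
            return -ERANGE;
        if (end == s || *end != '\0')   /* empty string or trailing garbage */
            return -EINVAL;
        *res = val;
        return 0;
    }

    int main(void)
    {
        long long v;

        printf("%d\n", strict_strtoll_compat("123", 0, &v));   /* 0 */
        printf("%d\n", strict_strtoll_compat("123x", 0, &v));  /* -EINVAL */
        return 0;
    }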
@@ -2002,7 +2002,7 @@ static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, | |||
2002 | static int __ftrace_function_set_filter(int filter, char *buf, int len, | 2002 | static int __ftrace_function_set_filter(int filter, char *buf, int len, |
2003 | struct function_filter_data *data) | 2003 | struct function_filter_data *data) |
2004 | { | 2004 | { |
2005 | int i, re_cnt, ret; | 2005 | int i, re_cnt, ret = -EINVAL; |
2006 | int *reset; | 2006 | int *reset; |
2007 | char **re; | 2007 | char **re; |
2008 | 2008 | ||
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a426f410c060..8e3ad8082ab7 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -7,13 +7,12 @@ | |||
7 | * Based on code from the latency_tracer, that is: | 7 | * Based on code from the latency_tracer, that is: |
8 | * | 8 | * |
9 | * Copyright (C) 2004-2006 Ingo Molnar | 9 | * Copyright (C) 2004-2006 Ingo Molnar |
10 | * Copyright (C) 2004 William Lee Irwin III | 10 | * Copyright (C) 2004 Nadia Yvette Chambers |
11 | */ | 11 | */ |
12 | #include <linux/ring_buffer.h> | 12 | #include <linux/ring_buffer.h> |
13 | #include <linux/debugfs.h> | 13 | #include <linux/debugfs.h> |
14 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
15 | #include <linux/ftrace.h> | 15 | #include <linux/ftrace.h> |
16 | #include <linux/pstore.h> | ||
17 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
18 | 17 | ||
19 | #include "trace.h" | 18 | #include "trace.h" |
@@ -49,7 +48,8 @@ static void function_trace_start(struct trace_array *tr) | |||
49 | } | 48 | } |
50 | 49 | ||
51 | static void | 50 | static void |
52 | function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | 51 | function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, |
52 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
53 | { | 53 | { |
54 | struct trace_array *tr = func_trace; | 54 | struct trace_array *tr = func_trace; |
55 | struct trace_array_cpu *data; | 55 | struct trace_array_cpu *data; |
@@ -75,16 +75,17 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | |||
75 | preempt_enable_notrace(); | 75 | preempt_enable_notrace(); |
76 | } | 76 | } |
77 | 77 | ||
78 | /* Our two options */ | 78 | /* Our option */ |
79 | enum { | 79 | enum { |
80 | TRACE_FUNC_OPT_STACK = 0x1, | 80 | TRACE_FUNC_OPT_STACK = 0x1, |
81 | TRACE_FUNC_OPT_PSTORE = 0x2, | ||
82 | }; | 81 | }; |
83 | 82 | ||
84 | static struct tracer_flags func_flags; | 83 | static struct tracer_flags func_flags; |
85 | 84 | ||
86 | static void | 85 | static void |
87 | function_trace_call(unsigned long ip, unsigned long parent_ip) | 86 | function_trace_call(unsigned long ip, unsigned long parent_ip, |
87 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
88 | |||
88 | { | 89 | { |
89 | struct trace_array *tr = func_trace; | 90 | struct trace_array *tr = func_trace; |
90 | struct trace_array_cpu *data; | 91 | struct trace_array_cpu *data; |
@@ -106,12 +107,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) | |||
106 | disabled = atomic_inc_return(&data->disabled); | 107 | disabled = atomic_inc_return(&data->disabled); |
107 | 108 | ||
108 | if (likely(disabled == 1)) { | 109 | if (likely(disabled == 1)) { |
109 | /* | ||
110 | * So far tracing doesn't support multiple buffers, so | ||
111 | * we make an explicit call for now. | ||
112 | */ | ||
113 | if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) | ||
114 | pstore_ftrace_call(ip, parent_ip); | ||
115 | pc = preempt_count(); | 110 | pc = preempt_count(); |
116 | trace_function(tr, ip, parent_ip, flags, pc); | 111 | trace_function(tr, ip, parent_ip, flags, pc); |
117 | } | 112 | } |
@@ -121,7 +116,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) | |||
121 | } | 116 | } |
122 | 117 | ||
123 | static void | 118 | static void |
124 | function_stack_trace_call(unsigned long ip, unsigned long parent_ip) | 119 | function_stack_trace_call(unsigned long ip, unsigned long parent_ip, |
120 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
125 | { | 121 | { |
126 | struct trace_array *tr = func_trace; | 122 | struct trace_array *tr = func_trace; |
127 | struct trace_array_cpu *data; | 123 | struct trace_array_cpu *data; |
@@ -164,22 +160,19 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
164 | static struct ftrace_ops trace_ops __read_mostly = | 160 | static struct ftrace_ops trace_ops __read_mostly = |
165 | { | 161 | { |
166 | .func = function_trace_call, | 162 | .func = function_trace_call, |
167 | .flags = FTRACE_OPS_FL_GLOBAL, | 163 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, |
168 | }; | 164 | }; |
169 | 165 | ||
170 | static struct ftrace_ops trace_stack_ops __read_mostly = | 166 | static struct ftrace_ops trace_stack_ops __read_mostly = |
171 | { | 167 | { |
172 | .func = function_stack_trace_call, | 168 | .func = function_stack_trace_call, |
173 | .flags = FTRACE_OPS_FL_GLOBAL, | 169 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, |
174 | }; | 170 | }; |
175 | 171 | ||
176 | static struct tracer_opt func_opts[] = { | 172 | static struct tracer_opt func_opts[] = { |
177 | #ifdef CONFIG_STACKTRACE | 173 | #ifdef CONFIG_STACKTRACE |
178 | { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, | 174 | { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, |
179 | #endif | 175 | #endif |
180 | #ifdef CONFIG_PSTORE_FTRACE | ||
181 | { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, | ||
182 | #endif | ||
183 | { } /* Always set a last empty entry */ | 176 | { } /* Always set a last empty entry */ |
184 | }; | 177 | }; |
185 | 178 | ||
@@ -232,8 +225,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) | |||
232 | } | 225 | } |
233 | 226 | ||
234 | break; | 227 | break; |
235 | case TRACE_FUNC_OPT_PSTORE: | ||
236 | break; | ||
237 | default: | 228 | default: |
238 | return -EINVAL; | 229 | return -EINVAL; |
239 | } | 230 | } |
@@ -375,7 +366,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, | |||
375 | * We use the callback data field (which is a pointer) | 366 | * We use the callback data field (which is a pointer) |
376 | * as our counter. | 367 | * as our counter. |
377 | */ | 368 | */ |
378 | ret = strict_strtoul(number, 0, (unsigned long *)&count); | 369 | ret = kstrtoul(number, 0, (unsigned long *)&count); |
379 | if (ret) | 370 | if (ret) |
380 | return ret; | 371 | return ret; |
381 | 372 | ||
@@ -420,5 +411,4 @@ static __init int init_function_trace(void) | |||
420 | init_func_cmd_traceon(); | 411 | init_func_cmd_traceon(); |
421 | return register_tracer(&function_trace); | 412 | return register_tracer(&function_trace); |
422 | } | 413 | } |
423 | device_initcall(init_function_trace); | 414 | core_initcall(init_function_trace); |
424 | |||
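Every ftrace callback in this series gains two parameters, the ftrace_ops that registered it and a pt_regs pointer, so each tracer's handler must change in lock-step with the callback type. The mechanics of such a signature change, shown with stand-in userspace types:

    #include <stdio.h>

    struct ops;     /* stands in for struct ftrace_ops */
    struct regs;    /* stands in for struct pt_regs    */

    /* New callback shape: old callers passed only (ip, parent_ip). */
    typedef void (*trace_func_t)(unsigned long ip, unsigned long parent_ip,
                                 struct ops *op, struct regs *regs);

    static void count_calls(unsigned long ip, unsigned long parent_ip,
                            struct ops *op, struct regs *regs)
    {
        static int calls;
        (void)parent_ip; (void)op; (void)regs;
        printf("call %d at %#lx\n", ++calls, ip);
    }

    static void dispatch(trace_func_t fn, unsigned long ip)
    {
        /* Call sites that have nothing to pass hand over NULL for now. */
        fn(ip, 0, NULL, NULL);
    }

    int main(void)
    {
        dispatch(count_calls, 0x1000);
        dispatch(count_calls, 0x2000);
        return 0;
    }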
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ce27c8ba8d31..4edb4b74eb7e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -143,7 +143,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, | |||
143 | return; | 143 | return; |
144 | } | 144 | } |
145 | 145 | ||
146 | #ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST | 146 | #if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) |
147 | /* | 147 | /* |
148 | * The arch may choose to record the frame pointer used | 148 | * The arch may choose to record the frame pointer used |
149 | * and check it here to make sure that it is what we expect it | 149 | * and check it here to make sure that it is what we expect it |
@@ -154,6 +154,9 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, | |||
154 | * | 154 | * |
155 | * Currently, x86_32 with optimize for size (-Os) makes the latest | 155 | * Currently, x86_32 with optimize for size (-Os) makes the latest |
156 | * gcc do the above. | 156 | * gcc do the above. |
157 | * | ||
158 | * Note, -mfentry does not use frame pointers, and this test | ||
159 | * is not needed if CC_USING_FENTRY is set. | ||
157 | */ | 160 | */ |
158 | if (unlikely(current->ret_stack[index].fp != frame_pointer)) { | 161 | if (unlikely(current->ret_stack[index].fp != frame_pointer)) { |
159 | ftrace_graph_stop(); | 162 | ftrace_graph_stop(); |
@@ -220,7 +223,7 @@ int __trace_graph_entry(struct trace_array *tr, | |||
220 | entry = ring_buffer_event_data(event); | 223 | entry = ring_buffer_event_data(event); |
221 | entry->graph_ent = *trace; | 224 | entry->graph_ent = *trace; |
222 | if (!filter_current_check_discard(buffer, call, entry, event)) | 225 | if (!filter_current_check_discard(buffer, call, entry, event)) |
223 | ring_buffer_unlock_commit(buffer, event); | 226 | __buffer_unlock_commit(buffer, event); |
224 | 227 | ||
225 | return 1; | 228 | return 1; |
226 | } | 229 | } |
@@ -324,7 +327,7 @@ void __trace_graph_return(struct trace_array *tr, | |||
324 | entry = ring_buffer_event_data(event); | 327 | entry = ring_buffer_event_data(event); |
325 | entry->ret = *trace; | 328 | entry->ret = *trace; |
326 | if (!filter_current_check_discard(buffer, call, entry, event)) | 329 | if (!filter_current_check_discard(buffer, call, entry, event)) |
327 | ring_buffer_unlock_commit(buffer, event); | 330 | __buffer_unlock_commit(buffer, event); |
328 | } | 331 | } |
329 | 332 | ||
330 | void trace_graph_return(struct ftrace_graph_ret *trace) | 333 | void trace_graph_return(struct ftrace_graph_ret *trace) |
@@ -1471,4 +1474,4 @@ static __init int init_graph_trace(void) | |||
1471 | return register_tracer(&graph_trace); | 1474 | return register_tracer(&graph_trace); |
1472 | } | 1475 | } |
1473 | 1476 | ||
1474 | device_initcall(init_graph_trace); | 1477 | core_initcall(init_graph_trace); |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 99d20e920368..713a2cac4881 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * From code in the latency_tracer, that is: | 7 | * From code in the latency_tracer, that is: |
8 | * | 8 | * |
9 | * Copyright (C) 2004-2006 Ingo Molnar | 9 | * Copyright (C) 2004-2006 Ingo Molnar |
10 | * Copyright (C) 2004 William Lee Irwin III | 10 | * Copyright (C) 2004 Nadia Yvette Chambers |
11 | */ | 11 | */ |
12 | #include <linux/kallsyms.h> | 12 | #include <linux/kallsyms.h> |
13 | #include <linux/debugfs.h> | 13 | #include <linux/debugfs.h> |
@@ -136,7 +136,8 @@ static int func_prolog_dec(struct trace_array *tr, | |||
136 | * irqsoff uses its own tracer function to keep the overhead down: | 136 | * irqsoff uses its own tracer function to keep the overhead down: |
137 | */ | 137 | */ |
138 | static void | 138 | static void |
139 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | 139 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, |
140 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
140 | { | 141 | { |
141 | struct trace_array *tr = irqsoff_trace; | 142 | struct trace_array *tr = irqsoff_trace; |
142 | struct trace_array_cpu *data; | 143 | struct trace_array_cpu *data; |
@@ -153,7 +154,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
153 | static struct ftrace_ops trace_ops __read_mostly = | 154 | static struct ftrace_ops trace_ops __read_mostly = |
154 | { | 155 | { |
155 | .func = irqsoff_tracer_call, | 156 | .func = irqsoff_tracer_call, |
156 | .flags = FTRACE_OPS_FL_GLOBAL, | 157 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, |
157 | }; | 158 | }; |
158 | #endif /* CONFIG_FUNCTION_TRACER */ | 159 | #endif /* CONFIG_FUNCTION_TRACER */ |
159 | 160 | ||
@@ -603,7 +604,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
603 | .reset = irqsoff_tracer_reset, | 604 | .reset = irqsoff_tracer_reset, |
604 | .start = irqsoff_tracer_start, | 605 | .start = irqsoff_tracer_start, |
605 | .stop = irqsoff_tracer_stop, | 606 | .stop = irqsoff_tracer_stop, |
606 | .print_max = 1, | 607 | .print_max = true, |
607 | .print_header = irqsoff_print_header, | 608 | .print_header = irqsoff_print_header, |
608 | .print_line = irqsoff_print_line, | 609 | .print_line = irqsoff_print_line, |
609 | .flags = &tracer_flags, | 610 | .flags = &tracer_flags, |
@@ -613,7 +614,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
613 | #endif | 614 | #endif |
614 | .open = irqsoff_trace_open, | 615 | .open = irqsoff_trace_open, |
615 | .close = irqsoff_trace_close, | 616 | .close = irqsoff_trace_close, |
616 | .use_max_tr = 1, | 617 | .use_max_tr = true, |
617 | }; | 618 | }; |
618 | # define register_irqsoff(trace) register_tracer(&trace) | 619 | # define register_irqsoff(trace) register_tracer(&trace) |
619 | #else | 620 | #else |
@@ -636,7 +637,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
636 | .reset = irqsoff_tracer_reset, | 637 | .reset = irqsoff_tracer_reset, |
637 | .start = irqsoff_tracer_start, | 638 | .start = irqsoff_tracer_start, |
638 | .stop = irqsoff_tracer_stop, | 639 | .stop = irqsoff_tracer_stop, |
639 | .print_max = 1, | 640 | .print_max = true, |
640 | .print_header = irqsoff_print_header, | 641 | .print_header = irqsoff_print_header, |
641 | .print_line = irqsoff_print_line, | 642 | .print_line = irqsoff_print_line, |
642 | .flags = &tracer_flags, | 643 | .flags = &tracer_flags, |
@@ -646,7 +647,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
646 | #endif | 647 | #endif |
647 | .open = irqsoff_trace_open, | 648 | .open = irqsoff_trace_open, |
648 | .close = irqsoff_trace_close, | 649 | .close = irqsoff_trace_close, |
649 | .use_max_tr = 1, | 650 | .use_max_tr = true, |
650 | }; | 651 | }; |
651 | # define register_preemptoff(trace) register_tracer(&trace) | 652 | # define register_preemptoff(trace) register_tracer(&trace) |
652 | #else | 653 | #else |
@@ -671,7 +672,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
671 | .reset = irqsoff_tracer_reset, | 672 | .reset = irqsoff_tracer_reset, |
672 | .start = irqsoff_tracer_start, | 673 | .start = irqsoff_tracer_start, |
673 | .stop = irqsoff_tracer_stop, | 674 | .stop = irqsoff_tracer_stop, |
674 | .print_max = 1, | 675 | .print_max = true, |
675 | .print_header = irqsoff_print_header, | 676 | .print_header = irqsoff_print_header, |
676 | .print_line = irqsoff_print_line, | 677 | .print_line = irqsoff_print_line, |
677 | .flags = &tracer_flags, | 678 | .flags = &tracer_flags, |
@@ -681,7 +682,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
681 | #endif | 682 | #endif |
682 | .open = irqsoff_trace_open, | 683 | .open = irqsoff_trace_open, |
683 | .close = irqsoff_trace_close, | 684 | .close = irqsoff_trace_close, |
684 | .use_max_tr = 1, | 685 | .use_max_tr = true, |
685 | }; | 686 | }; |
686 | 687 | ||
687 | # define register_preemptirqsoff(trace) register_tracer(&trace) | 688 | # define register_preemptirqsoff(trace) register_tracer(&trace) |
@@ -697,4 +698,4 @@ __init static int init_irqsoff_tracer(void) | |||
697 | 698 | ||
698 | return 0; | 699 | return 0; |
699 | } | 700 | } |
700 | device_initcall(init_irqsoff_tracer); | 701 | core_initcall(init_irqsoff_tracer); |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1a2117043bb1..1865d5f76538 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -444,7 +444,7 @@ static int create_trace_probe(int argc, char **argv) | |||
444 | return -EINVAL; | 444 | return -EINVAL; |
445 | } | 445 | } |
446 | /* an address specified */ | 446 | /* an address specified */ |
447 | ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); | 447 | ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); |
448 | if (ret) { | 448 | if (ret) { |
449 | pr_info("Failed to parse address.\n"); | 449 | pr_info("Failed to parse address.\n"); |
450 | return ret; | 450 | return ret; |
@@ -751,8 +751,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
751 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 751 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
752 | 752 | ||
753 | if (!filter_current_check_discard(buffer, call, entry, event)) | 753 | if (!filter_current_check_discard(buffer, call, entry, event)) |
754 | trace_nowake_buffer_unlock_commit_regs(buffer, event, | 754 | trace_buffer_unlock_commit_regs(buffer, event, |
755 | irq_flags, pc, regs); | 755 | irq_flags, pc, regs); |
756 | } | 756 | } |
757 | 757 | ||
758 | /* Kretprobe handler */ | 758 | /* Kretprobe handler */ |
@@ -784,8 +784,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
784 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 784 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
785 | 785 | ||
786 | if (!filter_current_check_discard(buffer, call, entry, event)) | 786 | if (!filter_current_check_discard(buffer, call, entry, event)) |
787 | trace_nowake_buffer_unlock_commit_regs(buffer, event, | 787 | trace_buffer_unlock_commit_regs(buffer, event, |
788 | irq_flags, pc, regs); | 788 | irq_flags, pc, regs); |
789 | } | 789 | } |
790 | 790 | ||
791 | /* Event entry printers */ | 791 | /* Event entry printers */ |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 123b189c732c..194d79602dc7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -610,24 +610,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) | |||
610 | return trace_print_lat_fmt(s, entry); | 610 | return trace_print_lat_fmt(s, entry); |
611 | } | 611 | } |
612 | 612 | ||
613 | static unsigned long preempt_mark_thresh = 100; | 613 | static unsigned long preempt_mark_thresh_us = 100; |
614 | 614 | ||
615 | static int | 615 | static int |
616 | lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, | 616 | lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) |
617 | unsigned long rel_usecs) | ||
618 | { | 617 | { |
619 | return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, | 618 | unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; |
620 | rel_usecs > preempt_mark_thresh ? '!' : | 619 | unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; |
621 | rel_usecs > 1 ? '+' : ' '); | 620 | unsigned long long abs_ts = iter->ts - iter->tr->time_start; |
621 | unsigned long long rel_ts = next_ts - iter->ts; | ||
622 | struct trace_seq *s = &iter->seq; | ||
623 | |||
624 | if (in_ns) { | ||
625 | abs_ts = ns2usecs(abs_ts); | ||
626 | rel_ts = ns2usecs(rel_ts); | ||
627 | } | ||
628 | |||
629 | if (verbose && in_ns) { | ||
630 | unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC); | ||
631 | unsigned long abs_msec = (unsigned long)abs_ts; | ||
632 | unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); | ||
633 | unsigned long rel_msec = (unsigned long)rel_ts; | ||
634 | |||
635 | return trace_seq_printf( | ||
636 | s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", | ||
637 | ns2usecs(iter->ts), | ||
638 | abs_msec, abs_usec, | ||
639 | rel_msec, rel_usec); | ||
640 | } else if (verbose && !in_ns) { | ||
641 | return trace_seq_printf( | ||
642 | s, "[%016llx] %lld (+%lld): ", | ||
643 | iter->ts, abs_ts, rel_ts); | ||
644 | } else if (!verbose && in_ns) { | ||
645 | return trace_seq_printf( | ||
646 | s, " %4lldus%c: ", | ||
647 | abs_ts, | ||
648 | rel_ts > preempt_mark_thresh_us ? '!' : | ||
649 | rel_ts > 1 ? '+' : ' '); | ||
650 | } else { /* !verbose && !in_ns */ | ||
651 | return trace_seq_printf(s, " %4lld: ", abs_ts); | ||
652 | } | ||
622 | } | 653 | } |
623 | 654 | ||
624 | int trace_print_context(struct trace_iterator *iter) | 655 | int trace_print_context(struct trace_iterator *iter) |
625 | { | 656 | { |
626 | struct trace_seq *s = &iter->seq; | 657 | struct trace_seq *s = &iter->seq; |
627 | struct trace_entry *entry = iter->ent; | 658 | struct trace_entry *entry = iter->ent; |
628 | unsigned long long t = ns2usecs(iter->ts); | 659 | unsigned long long t; |
629 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 660 | unsigned long secs, usec_rem; |
630 | unsigned long secs = (unsigned long)t; | ||
631 | char comm[TASK_COMM_LEN]; | 661 | char comm[TASK_COMM_LEN]; |
632 | int ret; | 662 | int ret; |
633 | 663 | ||
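The non-verbose branch keeps the classic latency markers: '!' when the gap to the next event exceeds preempt_mark_thresh_us (100us), '+' when it exceeds 1us, and a space otherwise. The selection logic on its own:

    #include <stdio.h>

    #define PREEMPT_MARK_THRESH_US 100ULL

    /* '!' = long gap, '+' = noticeable gap, ' ' = negligible */
    static char delay_marker(unsigned long long rel_us)
    {
        if (rel_us > PREEMPT_MARK_THRESH_US)
            return '!';
        if (rel_us > 1)
            return '+';
        return ' ';
    }

    int main(void)
    {
        printf("[%c] [%c] [%c]\n",
               delay_marker(250), delay_marker(5), delay_marker(1)); /* ! +   */
        return 0;
    }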
@@ -644,8 +674,13 @@ int trace_print_context(struct trace_iterator *iter) | |||
644 | return 0; | 674 | return 0; |
645 | } | 675 | } |
646 | 676 | ||
647 | return trace_seq_printf(s, " %5lu.%06lu: ", | 677 | if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { |
648 | secs, usec_rem); | 678 | t = ns2usecs(iter->ts); |
679 | usec_rem = do_div(t, USEC_PER_SEC); | ||
680 | secs = (unsigned long)t; | ||
681 | return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); | ||
682 | } else | ||
683 | return trace_seq_printf(s, " %12llu: ", iter->ts); | ||
649 | } | 684 | } |
650 | 685 | ||
651 | int trace_print_lat_context(struct trace_iterator *iter) | 686 | int trace_print_lat_context(struct trace_iterator *iter) |
@@ -659,36 +694,29 @@ int trace_print_lat_context(struct trace_iterator *iter) | |||
659 | *next_entry = trace_find_next_entry(iter, NULL, | 694 | *next_entry = trace_find_next_entry(iter, NULL, |
660 | &next_ts); | 695 | &next_ts); |
661 | unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); | 696 | unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); |
662 | unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); | ||
663 | unsigned long rel_usecs; | ||
664 | 697 | ||
665 | /* Restore the original ent_size */ | 698 | /* Restore the original ent_size */ |
666 | iter->ent_size = ent_size; | 699 | iter->ent_size = ent_size; |
667 | 700 | ||
668 | if (!next_entry) | 701 | if (!next_entry) |
669 | next_ts = iter->ts; | 702 | next_ts = iter->ts; |
670 | rel_usecs = ns2usecs(next_ts - iter->ts); | ||
671 | 703 | ||
672 | if (verbose) { | 704 | if (verbose) { |
673 | char comm[TASK_COMM_LEN]; | 705 | char comm[TASK_COMM_LEN]; |
674 | 706 | ||
675 | trace_find_cmdline(entry->pid, comm); | 707 | trace_find_cmdline(entry->pid, comm); |
676 | 708 | ||
677 | ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" | 709 | ret = trace_seq_printf( |
678 | " %ld.%03ldms (+%ld.%03ldms): ", comm, | 710 | s, "%16s %5d %3d %d %08x %08lx ", |
679 | entry->pid, iter->cpu, entry->flags, | 711 | comm, entry->pid, iter->cpu, entry->flags, |
680 | entry->preempt_count, iter->idx, | 712 | entry->preempt_count, iter->idx); |
681 | ns2usecs(iter->ts), | ||
682 | abs_usecs / USEC_PER_MSEC, | ||
683 | abs_usecs % USEC_PER_MSEC, | ||
684 | rel_usecs / USEC_PER_MSEC, | ||
685 | rel_usecs % USEC_PER_MSEC); | ||
686 | } else { | 713 | } else { |
687 | ret = lat_print_generic(s, entry, iter->cpu); | 714 | ret = lat_print_generic(s, entry, iter->cpu); |
688 | if (ret) | ||
689 | ret = lat_print_timestamp(s, abs_usecs, rel_usecs); | ||
690 | } | 715 | } |
691 | 716 | ||
717 | if (ret) | ||
718 | ret = lat_print_timestamp(iter, next_ts); | ||
719 | |||
692 | return ret; | 720 | return ret; |
693 | } | 721 | } |
694 | 722 | ||
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index daa9980153af..412e959709b4 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
@@ -441,7 +441,7 @@ static const struct fetch_type *find_fetch_type(const char *type) | |||
441 | goto fail; | 441 | goto fail; |
442 | 442 | ||
443 | type++; | 443 | type++; |
444 | if (strict_strtoul(type, 0, &bs)) | 444 | if (kstrtoul(type, 0, &bs)) |
445 | goto fail; | 445 | goto fail; |
446 | 446 | ||
447 | switch (bs) { | 447 | switch (bs) { |
@@ -501,8 +501,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) | |||
501 | 501 | ||
502 | tmp = strchr(symbol, '+'); | 502 | tmp = strchr(symbol, '+'); |
503 | if (tmp) { | 503 | if (tmp) { |
504 | /* skip sign because strict_strtol doesn't accept '+' */ | 504 | /* skip sign because kstrtoul doesn't accept '+' */ |
505 | ret = strict_strtoul(tmp + 1, 0, offset); | 505 | ret = kstrtoul(tmp + 1, 0, offset); |
506 | if (ret) | 506 | if (ret) |
507 | return ret; | 507 | return ret; |
508 | 508 | ||
@@ -533,7 +533,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
533 | else | 533 | else |
534 | ret = -EINVAL; | 534 | ret = -EINVAL; |
535 | } else if (isdigit(arg[5])) { | 535 | } else if (isdigit(arg[5])) { |
536 | ret = strict_strtoul(arg + 5, 10, ¶m); | 536 | ret = kstrtoul(arg + 5, 10, ¶m); |
537 | if (ret || param > PARAM_MAX_STACK) | 537 | if (ret || param > PARAM_MAX_STACK) |
538 | ret = -EINVAL; | 538 | ret = -EINVAL; |
539 | else { | 539 | else { |
@@ -579,7 +579,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
579 | 579 | ||
580 | case '@': /* memory or symbol */ | 580 | case '@': /* memory or symbol */ |
581 | if (isdigit(arg[1])) { | 581 | if (isdigit(arg[1])) { |
582 | ret = strict_strtoul(arg + 1, 0, ¶m); | 582 | ret = kstrtoul(arg + 1, 0, ¶m); |
583 | if (ret) | 583 | if (ret) |
584 | break; | 584 | break; |
585 | 585 | ||
@@ -597,14 +597,14 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
597 | break; | 597 | break; |
598 | 598 | ||
599 | case '+': /* deref memory */ | 599 | case '+': /* deref memory */ |
600 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | 600 | arg++; /* Skip '+', because kstrtol() rejects it. */ |
601 | case '-': | 601 | case '-': |
602 | tmp = strchr(arg, '('); | 602 | tmp = strchr(arg, '('); |
603 | if (!tmp) | 603 | if (!tmp) |
604 | break; | 604 | break; |
605 | 605 | ||
606 | *tmp = '\0'; | 606 | *tmp = '\0'; |
607 | ret = strict_strtol(arg, 0, &offset); | 607 | ret = kstrtol(arg, 0, &offset); |
608 | 608 | ||
609 | if (ret) | 609 | if (ret) |
610 | break; | 610 | break; |
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 7e62c0a18456..3374c792ccd8 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
@@ -102,9 +102,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, | |||
102 | entry->next_cpu = task_cpu(wakee); | 102 | entry->next_cpu = task_cpu(wakee); |
103 | 103 | ||
104 | if (!filter_check_discard(call, entry, buffer, event)) | 104 | if (!filter_check_discard(call, entry, buffer, event)) |
105 | ring_buffer_unlock_commit(buffer, event); | 105 | trace_buffer_unlock_commit(buffer, event, flags, pc); |
106 | ftrace_trace_stack(tr->buffer, flags, 6, pc); | ||
107 | ftrace_trace_userstack(tr->buffer, flags, pc); | ||
108 | } | 106 | } |
109 | 107 | ||
110 | static void | 108 | static void |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ff791ea48b57..9fe45fcefca0 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Based on code from the latency_tracer, that is: | 7 | * Based on code from the latency_tracer, that is: |
8 | * | 8 | * |
9 | * Copyright (C) 2004-2006 Ingo Molnar | 9 | * Copyright (C) 2004-2006 Ingo Molnar |
10 | * Copyright (C) 2004 William Lee Irwin III | 10 | * Copyright (C) 2004 Nadia Yvette Chambers |
11 | */ | 11 | */ |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
@@ -108,7 +108,8 @@ out_enable: | |||
108 | * wakeup uses its own tracer function to keep the overhead down: | 108 | * wakeup uses its own tracer function to keep the overhead down: |
109 | */ | 109 | */ |
110 | static void | 110 | static void |
111 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | 111 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, |
112 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
112 | { | 113 | { |
113 | struct trace_array *tr = wakeup_trace; | 114 | struct trace_array *tr = wakeup_trace; |
114 | struct trace_array_cpu *data; | 115 | struct trace_array_cpu *data; |
@@ -129,7 +130,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
129 | static struct ftrace_ops trace_ops __read_mostly = | 130 | static struct ftrace_ops trace_ops __read_mostly = |
130 | { | 131 | { |
131 | .func = wakeup_tracer_call, | 132 | .func = wakeup_tracer_call, |
132 | .flags = FTRACE_OPS_FL_GLOBAL, | 133 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, |
133 | }; | 134 | }; |
134 | #endif /* CONFIG_FUNCTION_TRACER */ | 135 | #endif /* CONFIG_FUNCTION_TRACER */ |
135 | 136 | ||
@@ -588,7 +589,7 @@ static struct tracer wakeup_tracer __read_mostly = | |||
588 | .reset = wakeup_tracer_reset, | 589 | .reset = wakeup_tracer_reset, |
589 | .start = wakeup_tracer_start, | 590 | .start = wakeup_tracer_start, |
590 | .stop = wakeup_tracer_stop, | 591 | .stop = wakeup_tracer_stop, |
591 | .print_max = 1, | 592 | .print_max = true, |
592 | .print_header = wakeup_print_header, | 593 | .print_header = wakeup_print_header, |
593 | .print_line = wakeup_print_line, | 594 | .print_line = wakeup_print_line, |
594 | .flags = &tracer_flags, | 595 | .flags = &tracer_flags, |
@@ -598,7 +599,7 @@ static struct tracer wakeup_tracer __read_mostly = | |||
598 | #endif | 599 | #endif |
599 | .open = wakeup_trace_open, | 600 | .open = wakeup_trace_open, |
600 | .close = wakeup_trace_close, | 601 | .close = wakeup_trace_close, |
601 | .use_max_tr = 1, | 602 | .use_max_tr = true, |
602 | }; | 603 | }; |
603 | 604 | ||
604 | static struct tracer wakeup_rt_tracer __read_mostly = | 605 | static struct tracer wakeup_rt_tracer __read_mostly = |
@@ -609,7 +610,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
609 | .start = wakeup_tracer_start, | 610 | .start = wakeup_tracer_start, |
610 | .stop = wakeup_tracer_stop, | 611 | .stop = wakeup_tracer_stop, |
611 | .wait_pipe = poll_wait_pipe, | 612 | .wait_pipe = poll_wait_pipe, |
612 | .print_max = 1, | 613 | .print_max = true, |
613 | .print_header = wakeup_print_header, | 614 | .print_header = wakeup_print_header, |
614 | .print_line = wakeup_print_line, | 615 | .print_line = wakeup_print_line, |
615 | .flags = &tracer_flags, | 616 | .flags = &tracer_flags, |
@@ -619,7 +620,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
619 | #endif | 620 | #endif |
620 | .open = wakeup_trace_open, | 621 | .open = wakeup_trace_open, |
621 | .close = wakeup_trace_close, | 622 | .close = wakeup_trace_close, |
622 | .use_max_tr = 1, | 623 | .use_max_tr = true, |
623 | }; | 624 | }; |
624 | 625 | ||
625 | __init static int init_wakeup_tracer(void) | 626 | __init static int init_wakeup_tracer(void) |
@@ -636,4 +637,4 @@ __init static int init_wakeup_tracer(void) | |||
636 | 637 | ||
637 | return 0; | 638 | return 0; |
638 | } | 639 | } |
639 | device_initcall(init_wakeup_tracer); | 640 | core_initcall(init_wakeup_tracer); |
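
The wakeup-tracer hunks make two mechanical changes: the ftrace callback prototype gains a struct ftrace_ops * and a struct pt_regs * argument, and the ops marks itself FTRACE_OPS_FL_RECURSION_SAFE. A small self-contained model of that shape follows; struct my_ops, trace_func and the flag values are illustrative only, not the kernel definitions.

#include <stdio.h>

#define OPS_FL_GLOBAL         (1 << 0)
#define OPS_FL_RECURSION_SAFE (1 << 1)   /* callback handles its own recursion */

/* Four-argument callback, mirroring the wider prototype in the hunks above:
 * instruction pointer, parent ip, the registered ops, and saved registers. */
struct my_ops;
typedef void (*trace_func)(unsigned long ip, unsigned long pip,
			   struct my_ops *op, void *regs);

struct my_ops {
	trace_func    func;
	unsigned long flags;
};

static void wakeup_call(unsigned long ip, unsigned long pip,
			struct my_ops *op, void *regs)
{
	(void)regs;
	printf("hit %#lx from %#lx (flags=%#lx)\n", ip, pip, op->flags);
}

static struct my_ops trace_ops = {
	.func  = wakeup_call,
	.flags = OPS_FL_GLOBAL | OPS_FL_RECURSION_SAFE,
};

int main(void)
{
	trace_ops.func(0x1000, 0x2000, &trace_ops, NULL);
	return 0;
}
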
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 288541f977fb..47623169a815 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -103,54 +103,67 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) | |||
103 | 103 | ||
104 | static int trace_selftest_test_probe1_cnt; | 104 | static int trace_selftest_test_probe1_cnt; |
105 | static void trace_selftest_test_probe1_func(unsigned long ip, | 105 | static void trace_selftest_test_probe1_func(unsigned long ip, |
106 | unsigned long pip) | 106 | unsigned long pip, |
107 | struct ftrace_ops *op, | ||
108 | struct pt_regs *pt_regs) | ||
107 | { | 109 | { |
108 | trace_selftest_test_probe1_cnt++; | 110 | trace_selftest_test_probe1_cnt++; |
109 | } | 111 | } |
110 | 112 | ||
111 | static int trace_selftest_test_probe2_cnt; | 113 | static int trace_selftest_test_probe2_cnt; |
112 | static void trace_selftest_test_probe2_func(unsigned long ip, | 114 | static void trace_selftest_test_probe2_func(unsigned long ip, |
113 | unsigned long pip) | 115 | unsigned long pip, |
116 | struct ftrace_ops *op, | ||
117 | struct pt_regs *pt_regs) | ||
114 | { | 118 | { |
115 | trace_selftest_test_probe2_cnt++; | 119 | trace_selftest_test_probe2_cnt++; |
116 | } | 120 | } |
117 | 121 | ||
118 | static int trace_selftest_test_probe3_cnt; | 122 | static int trace_selftest_test_probe3_cnt; |
119 | static void trace_selftest_test_probe3_func(unsigned long ip, | 123 | static void trace_selftest_test_probe3_func(unsigned long ip, |
120 | unsigned long pip) | 124 | unsigned long pip, |
125 | struct ftrace_ops *op, | ||
126 | struct pt_regs *pt_regs) | ||
121 | { | 127 | { |
122 | trace_selftest_test_probe3_cnt++; | 128 | trace_selftest_test_probe3_cnt++; |
123 | } | 129 | } |
124 | 130 | ||
125 | static int trace_selftest_test_global_cnt; | 131 | static int trace_selftest_test_global_cnt; |
126 | static void trace_selftest_test_global_func(unsigned long ip, | 132 | static void trace_selftest_test_global_func(unsigned long ip, |
127 | unsigned long pip) | 133 | unsigned long pip, |
134 | struct ftrace_ops *op, | ||
135 | struct pt_regs *pt_regs) | ||
128 | { | 136 | { |
129 | trace_selftest_test_global_cnt++; | 137 | trace_selftest_test_global_cnt++; |
130 | } | 138 | } |
131 | 139 | ||
132 | static int trace_selftest_test_dyn_cnt; | 140 | static int trace_selftest_test_dyn_cnt; |
133 | static void trace_selftest_test_dyn_func(unsigned long ip, | 141 | static void trace_selftest_test_dyn_func(unsigned long ip, |
134 | unsigned long pip) | 142 | unsigned long pip, |
143 | struct ftrace_ops *op, | ||
144 | struct pt_regs *pt_regs) | ||
135 | { | 145 | { |
136 | trace_selftest_test_dyn_cnt++; | 146 | trace_selftest_test_dyn_cnt++; |
137 | } | 147 | } |
138 | 148 | ||
139 | static struct ftrace_ops test_probe1 = { | 149 | static struct ftrace_ops test_probe1 = { |
140 | .func = trace_selftest_test_probe1_func, | 150 | .func = trace_selftest_test_probe1_func, |
151 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
141 | }; | 152 | }; |
142 | 153 | ||
143 | static struct ftrace_ops test_probe2 = { | 154 | static struct ftrace_ops test_probe2 = { |
144 | .func = trace_selftest_test_probe2_func, | 155 | .func = trace_selftest_test_probe2_func, |
156 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
145 | }; | 157 | }; |
146 | 158 | ||
147 | static struct ftrace_ops test_probe3 = { | 159 | static struct ftrace_ops test_probe3 = { |
148 | .func = trace_selftest_test_probe3_func, | 160 | .func = trace_selftest_test_probe3_func, |
161 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
149 | }; | 162 | }; |
150 | 163 | ||
151 | static struct ftrace_ops test_global = { | 164 | static struct ftrace_ops test_global = { |
152 | .func = trace_selftest_test_global_func, | 165 | .func = trace_selftest_test_global_func, |
153 | .flags = FTRACE_OPS_FL_GLOBAL, | 166 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, |
154 | }; | 167 | }; |
155 | 168 | ||
156 | static void print_counts(void) | 169 | static void print_counts(void) |
@@ -307,7 +320,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
307 | int (*func)(void)) | 320 | int (*func)(void)) |
308 | { | 321 | { |
309 | int save_ftrace_enabled = ftrace_enabled; | 322 | int save_ftrace_enabled = ftrace_enabled; |
310 | int save_tracer_enabled = tracer_enabled; | ||
311 | unsigned long count; | 323 | unsigned long count; |
312 | char *func_name; | 324 | char *func_name; |
313 | int ret; | 325 | int ret; |
@@ -318,7 +330,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
318 | 330 | ||
319 | /* enable tracing, and record the filter function */ | 331 | /* enable tracing, and record the filter function */ |
320 | ftrace_enabled = 1; | 332 | ftrace_enabled = 1; |
321 | tracer_enabled = 1; | ||
322 | 333 | ||
323 | /* passed in by parameter to fool gcc from optimizing */ | 334 | /* passed in by parameter to fool gcc from optimizing */ |
324 | func(); | 335 | func(); |
@@ -382,7 +393,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
382 | 393 | ||
383 | out: | 394 | out: |
384 | ftrace_enabled = save_ftrace_enabled; | 395 | ftrace_enabled = save_ftrace_enabled; |
385 | tracer_enabled = save_tracer_enabled; | ||
386 | 396 | ||
387 | /* Enable tracing on all functions again */ | 397 | /* Enable tracing on all functions again */ |
388 | ftrace_set_global_filter(NULL, 0, 1); | 398 | ftrace_set_global_filter(NULL, 0, 1); |
@@ -393,10 +403,247 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
393 | 403 | ||
394 | return ret; | 404 | return ret; |
395 | } | 405 | } |
406 | |||
407 | static int trace_selftest_recursion_cnt; | ||
408 | static void trace_selftest_test_recursion_func(unsigned long ip, | ||
409 | unsigned long pip, | ||
410 | struct ftrace_ops *op, | ||
411 | struct pt_regs *pt_regs) | ||
412 | { | ||
413 | /* | ||
414 | * This function is registered without the recursion safe flag. | ||
415 | * The ftrace infrastructure should provide the recursion | ||
416 | * protection. If not, this will crash the kernel! | ||
417 | */ | ||
418 | trace_selftest_recursion_cnt++; | ||
419 | DYN_FTRACE_TEST_NAME(); | ||
420 | } | ||
421 | |||
422 | static void trace_selftest_test_recursion_safe_func(unsigned long ip, | ||
423 | unsigned long pip, | ||
424 | struct ftrace_ops *op, | ||
425 | struct pt_regs *pt_regs) | ||
426 | { | ||
427 | /* | ||
428 | * We said we would provide our own recursion. By calling | ||
429 | * this function again, we should recurse back into this function | ||
430 | * and count again. But this only happens if the arch supports | ||
431 | * all of ftrace features and nothing else is using the function | ||
432 | * tracing utility. | ||
433 | */ | ||
434 | if (trace_selftest_recursion_cnt++) | ||
435 | return; | ||
436 | DYN_FTRACE_TEST_NAME(); | ||
437 | } | ||
438 | |||
439 | static struct ftrace_ops test_rec_probe = { | ||
440 | .func = trace_selftest_test_recursion_func, | ||
441 | }; | ||
442 | |||
443 | static struct ftrace_ops test_recsafe_probe = { | ||
444 | .func = trace_selftest_test_recursion_safe_func, | ||
445 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
446 | }; | ||
447 | |||
448 | static int | ||
449 | trace_selftest_function_recursion(void) | ||
450 | { | ||
451 | int save_ftrace_enabled = ftrace_enabled; | ||
452 | char *func_name; | ||
453 | int len; | ||
454 | int ret; | ||
455 | int cnt; | ||
456 | |||
457 | /* The previous test PASSED */ | ||
458 | pr_cont("PASSED\n"); | ||
459 | pr_info("Testing ftrace recursion: "); | ||
460 | |||
461 | |||
462 | /* enable tracing, and record the filter function */ | ||
463 | ftrace_enabled = 1; | ||
464 | |||
465 | /* Handle PPC64 '.' name */ | ||
466 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | ||
467 | len = strlen(func_name); | ||
468 | |||
469 | ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1); | ||
470 | if (ret) { | ||
471 | pr_cont("*Could not set filter* "); | ||
472 | goto out; | ||
473 | } | ||
474 | |||
475 | ret = register_ftrace_function(&test_rec_probe); | ||
476 | if (ret) { | ||
477 | pr_cont("*could not register callback* "); | ||
478 | goto out; | ||
479 | } | ||
480 | |||
481 | DYN_FTRACE_TEST_NAME(); | ||
482 | |||
483 | unregister_ftrace_function(&test_rec_probe); | ||
484 | |||
485 | ret = -1; | ||
486 | if (trace_selftest_recursion_cnt != 1) { | ||
487 | pr_cont("*callback not called once (%d)* ", | ||
488 | trace_selftest_recursion_cnt); | ||
489 | goto out; | ||
490 | } | ||
491 | |||
492 | trace_selftest_recursion_cnt = 1; | ||
493 | |||
494 | pr_cont("PASSED\n"); | ||
495 | pr_info("Testing ftrace recursion safe: "); | ||
496 | |||
497 | ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1); | ||
498 | if (ret) { | ||
499 | pr_cont("*Could not set filter* "); | ||
500 | goto out; | ||
501 | } | ||
502 | |||
503 | ret = register_ftrace_function(&test_recsafe_probe); | ||
504 | if (ret) { | ||
505 | pr_cont("*could not register callback* "); | ||
506 | goto out; | ||
507 | } | ||
508 | |||
509 | DYN_FTRACE_TEST_NAME(); | ||
510 | |||
511 | unregister_ftrace_function(&test_recsafe_probe); | ||
512 | |||
513 | /* | ||
514 | * If arch supports all ftrace features, and no other task | ||
515 | * was on the list, we should be fine. | ||
516 | */ | ||
517 | if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) | ||
518 | cnt = 2; /* Should have recursed */ | ||
519 | else | ||
520 | cnt = 1; | ||
521 | |||
522 | ret = -1; | ||
523 | if (trace_selftest_recursion_cnt != cnt) { | ||
524 | pr_cont("*callback not called expected %d times (%d)* ", | ||
525 | cnt, trace_selftest_recursion_cnt); | ||
526 | goto out; | ||
527 | } | ||
528 | |||
529 | ret = 0; | ||
530 | out: | ||
531 | ftrace_enabled = save_ftrace_enabled; | ||
532 | |||
533 | return ret; | ||
534 | } | ||
396 | #else | 535 | #else |
397 | # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) | 536 | # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) |
537 | # define trace_selftest_function_recursion() ({ 0; }) | ||
398 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 538 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
399 | 539 | ||
540 | static enum { | ||
541 | TRACE_SELFTEST_REGS_START, | ||
542 | TRACE_SELFTEST_REGS_FOUND, | ||
543 | TRACE_SELFTEST_REGS_NOT_FOUND, | ||
544 | } trace_selftest_regs_stat; | ||
545 | |||
546 | static void trace_selftest_test_regs_func(unsigned long ip, | ||
547 | unsigned long pip, | ||
548 | struct ftrace_ops *op, | ||
549 | struct pt_regs *pt_regs) | ||
550 | { | ||
551 | if (pt_regs) | ||
552 | trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; | ||
553 | else | ||
554 | trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; | ||
555 | } | ||
556 | |||
557 | static struct ftrace_ops test_regs_probe = { | ||
558 | .func = trace_selftest_test_regs_func, | ||
559 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, | ||
560 | }; | ||
561 | |||
562 | static int | ||
563 | trace_selftest_function_regs(void) | ||
564 | { | ||
565 | int save_ftrace_enabled = ftrace_enabled; | ||
566 | char *func_name; | ||
567 | int len; | ||
568 | int ret; | ||
569 | int supported = 0; | ||
570 | |||
571 | #ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS | ||
572 | supported = 1; | ||
573 | #endif | ||
574 | |||
575 | /* The previous test PASSED */ | ||
576 | pr_cont("PASSED\n"); | ||
577 | pr_info("Testing ftrace regs%s: ", | ||
578 | !supported ? "(no arch support)" : ""); | ||
579 | |||
580 | /* enable tracing, and record the filter function */ | ||
581 | ftrace_enabled = 1; | ||
582 | |||
583 | /* Handle PPC64 '.' name */ | ||
584 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | ||
585 | len = strlen(func_name); | ||
586 | |||
587 | ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1); | ||
588 | /* | ||
589 | * If DYNAMIC_FTRACE is not set, then we just trace all functions. | ||
590 | * This test really doesn't care. | ||
591 | */ | ||
592 | if (ret && ret != -ENODEV) { | ||
593 | pr_cont("*Could not set filter* "); | ||
594 | goto out; | ||
595 | } | ||
596 | |||
597 | ret = register_ftrace_function(&test_regs_probe); | ||
598 | /* | ||
599 | * Now if the arch does not support passing regs, then this should | ||
600 | * have failed. | ||
601 | */ | ||
602 | if (!supported) { | ||
603 | if (!ret) { | ||
604 | pr_cont("*registered save-regs without arch support* "); | ||
605 | goto out; | ||
606 | } | ||
607 | test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED; | ||
608 | ret = register_ftrace_function(&test_regs_probe); | ||
609 | } | ||
610 | if (ret) { | ||
611 | pr_cont("*could not register callback* "); | ||
612 | goto out; | ||
613 | } | ||
614 | |||
615 | |||
616 | DYN_FTRACE_TEST_NAME(); | ||
617 | |||
618 | unregister_ftrace_function(&test_regs_probe); | ||
619 | |||
620 | ret = -1; | ||
621 | |||
622 | switch (trace_selftest_regs_stat) { | ||
623 | case TRACE_SELFTEST_REGS_START: | ||
624 | pr_cont("*callback never called* "); | ||
625 | goto out; | ||
626 | |||
627 | case TRACE_SELFTEST_REGS_FOUND: | ||
628 | if (supported) | ||
629 | break; | ||
630 | pr_cont("*callback received regs without arch support* "); | ||
631 | goto out; | ||
632 | |||
633 | case TRACE_SELFTEST_REGS_NOT_FOUND: | ||
634 | if (!supported) | ||
635 | break; | ||
636 | pr_cont("*callback received NULL regs* "); | ||
637 | goto out; | ||
638 | } | ||
639 | |||
640 | ret = 0; | ||
641 | out: | ||
642 | ftrace_enabled = save_ftrace_enabled; | ||
643 | |||
644 | return ret; | ||
645 | } | ||
646 | |||
400 | /* | 647 | /* |
401 | * Simple verification test of ftrace function tracer. | 648 | * Simple verification test of ftrace function tracer. |
402 | * Enable ftrace, sleep 1/10 second, and then read the trace | 649 | * Enable ftrace, sleep 1/10 second, and then read the trace |
@@ -406,7 +653,6 @@ int | |||
406 | trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | 653 | trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) |
407 | { | 654 | { |
408 | int save_ftrace_enabled = ftrace_enabled; | 655 | int save_ftrace_enabled = ftrace_enabled; |
409 | int save_tracer_enabled = tracer_enabled; | ||
410 | unsigned long count; | 656 | unsigned long count; |
411 | int ret; | 657 | int ret; |
412 | 658 | ||
@@ -415,7 +661,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | |||
415 | 661 | ||
416 | /* start the tracing */ | 662 | /* start the tracing */ |
417 | ftrace_enabled = 1; | 663 | ftrace_enabled = 1; |
418 | tracer_enabled = 1; | ||
419 | 664 | ||
420 | ret = tracer_init(trace, tr); | 665 | ret = tracer_init(trace, tr); |
421 | if (ret) { | 666 | if (ret) { |
@@ -442,10 +687,16 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | |||
442 | 687 | ||
443 | ret = trace_selftest_startup_dynamic_tracing(trace, tr, | 688 | ret = trace_selftest_startup_dynamic_tracing(trace, tr, |
444 | DYN_FTRACE_TEST_NAME); | 689 | DYN_FTRACE_TEST_NAME); |
690 | if (ret) | ||
691 | goto out; | ||
692 | |||
693 | ret = trace_selftest_function_recursion(); | ||
694 | if (ret) | ||
695 | goto out; | ||
445 | 696 | ||
697 | ret = trace_selftest_function_regs(); | ||
446 | out: | 698 | out: |
447 | ftrace_enabled = save_ftrace_enabled; | 699 | ftrace_enabled = save_ftrace_enabled; |
448 | tracer_enabled = save_tracer_enabled; | ||
449 | 700 | ||
450 | /* kill ftrace totally if we failed */ | 701 | /* kill ftrace totally if we failed */ |
451 | if (ret) | 702 | if (ret) |
@@ -778,6 +1029,8 @@ static int trace_wakeup_test_thread(void *data) | |||
778 | set_current_state(TASK_INTERRUPTIBLE); | 1029 | set_current_state(TASK_INTERRUPTIBLE); |
779 | schedule(); | 1030 | schedule(); |
780 | 1031 | ||
1032 | complete(x); | ||
1033 | |||
781 | /* we are awake, now wait to disappear */ | 1034 | /* we are awake, now wait to disappear */ |
782 | while (!kthread_should_stop()) { | 1035 | while (!kthread_should_stop()) { |
783 | /* | 1036 | /* |
@@ -821,29 +1074,27 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
821 | /* reset the max latency */ | 1074 | /* reset the max latency */ |
822 | tracing_max_latency = 0; | 1075 | tracing_max_latency = 0; |
823 | 1076 | ||
824 | /* sleep to let the RT thread sleep too */ | 1077 | while (p->on_rq) { |
825 | msleep(100); | 1078 | /* |
1079 | * Sleep to make sure the RT thread is asleep too. | ||
1080 | * On virtual machines we can't rely on timings, | ||
1081 | * but we want to make sure this test still works. | ||
1082 | */ | ||
1083 | msleep(100); | ||
1084 | } | ||
826 | 1085 | ||
827 | /* | 1086 | init_completion(&isrt); |
828 | * Yes this is slightly racy. It is possible that for some | ||
829 | * strange reason that the RT thread we created, did not | ||
830 | * call schedule for 100ms after doing the completion, | ||
831 | * and we do a wakeup on a task that already is awake. | ||
832 | * But that is extremely unlikely, and the worst thing that | ||
833 | * happens in such a case, is that we disable tracing. | ||
834 | * Honestly, if this race does happen something is horrible | ||
835 | * wrong with the system. | ||
836 | */ | ||
837 | 1087 | ||
838 | wake_up_process(p); | 1088 | wake_up_process(p); |
839 | 1089 | ||
840 | /* give a little time to let the thread wake up */ | 1090 | /* Wait for the task to wake up */ |
841 | msleep(100); | 1091 | wait_for_completion(&isrt); |
842 | 1092 | ||
843 | /* stop the tracing. */ | 1093 | /* stop the tracing. */ |
844 | tracing_stop(); | 1094 | tracing_stop(); |
845 | /* check both trace buffers */ | 1095 | /* check both trace buffers */ |
846 | ret = trace_test_buffer(tr, NULL); | 1096 | ret = trace_test_buffer(tr, NULL); |
1097 | printk("ret = %d\n", ret); | ||
847 | if (!ret) | 1098 | if (!ret) |
848 | ret = trace_test_buffer(&max_tr, &count); | 1099 | ret = trace_test_buffer(&max_tr, &count); |
849 | 1100 | ||
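
The new selftest registers one callback without FTRACE_OPS_FL_RECURSION_SAFE and expects the ftrace core to suppress the recursion, and a second callback that sets the flag and guards itself. A compact userspace model of that dispatcher-side guarantee follows, assuming a single registered ops and a plain counter in place of per-CPU state; struct ops, call_ops, traced_function and unsafe_cb are invented names, not kernel symbols.

#include <stdio.h>

#define FL_RECURSION_SAFE (1 << 0)

struct ops {
	void (*func)(unsigned long ip, struct ops *op);
	unsigned long flags;
};

static int in_callback;            /* per-CPU in the real implementation */
static struct ops *registered;

/* Dispatcher side of the guarantee the selftest checks: callbacks that do
 * not declare FL_RECURSION_SAFE are never re-entered by the core. */
static void call_ops(unsigned long ip)
{
	struct ops *op = registered;

	if (!op)
		return;
	if (op->flags & FL_RECURSION_SAFE) {
		op->func(ip, op);          /* callback guards itself */
		return;
	}
	if (in_callback)
		return;                    /* recursion suppressed here */
	in_callback = 1;
	op->func(ip, op);
	in_callback = 0;
}

/* Stand-in for a traced function: every call enters the dispatcher. */
static void traced_function(unsigned long ip)
{
	call_ops(ip);
}

static int cnt;

static void unsafe_cb(unsigned long ip, struct ops *op)
{
	(void)op;
	cnt++;
	traced_function(ip + 1);           /* recurses; the core must stop it */
}

int main(void)
{
	struct ops probe = { unsafe_cb, 0 };

	registered = &probe;
	traced_function(0x1000);
	printf("callback ran %d time(s)\n", cnt);   /* 1, matching the test */
	return 0;
}
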
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index d4545f49242e..42ca822fc701 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -33,7 +33,6 @@ static unsigned long max_stack_size; | |||
33 | static arch_spinlock_t max_stack_lock = | 33 | static arch_spinlock_t max_stack_lock = |
34 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 34 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
35 | 35 | ||
36 | static int stack_trace_disabled __read_mostly; | ||
37 | static DEFINE_PER_CPU(int, trace_active); | 36 | static DEFINE_PER_CPU(int, trace_active); |
38 | static DEFINE_MUTEX(stack_sysctl_mutex); | 37 | static DEFINE_MUTEX(stack_sysctl_mutex); |
39 | 38 | ||
@@ -111,13 +110,11 @@ static inline void check_stack(void) | |||
111 | } | 110 | } |
112 | 111 | ||
113 | static void | 112 | static void |
114 | stack_trace_call(unsigned long ip, unsigned long parent_ip) | 113 | stack_trace_call(unsigned long ip, unsigned long parent_ip, |
114 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
115 | { | 115 | { |
116 | int cpu; | 116 | int cpu; |
117 | 117 | ||
118 | if (unlikely(!ftrace_enabled || stack_trace_disabled)) | ||
119 | return; | ||
120 | |||
121 | preempt_disable_notrace(); | 118 | preempt_disable_notrace(); |
122 | 119 | ||
123 | cpu = raw_smp_processor_id(); | 120 | cpu = raw_smp_processor_id(); |
@@ -136,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
136 | static struct ftrace_ops trace_ops __read_mostly = | 133 | static struct ftrace_ops trace_ops __read_mostly = |
137 | { | 134 | { |
138 | .func = stack_trace_call, | 135 | .func = stack_trace_call, |
136 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
139 | }; | 137 | }; |
140 | 138 | ||
141 | static ssize_t | 139 | static ssize_t |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 6b245f64c8dd..7609dd6714c2 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -21,9 +21,6 @@ static int syscall_enter_register(struct ftrace_event_call *event, | |||
21 | static int syscall_exit_register(struct ftrace_event_call *event, | 21 | static int syscall_exit_register(struct ftrace_event_call *event, |
22 | enum trace_reg type, void *data); | 22 | enum trace_reg type, void *data); |
23 | 23 | ||
24 | static int syscall_enter_define_fields(struct ftrace_event_call *call); | ||
25 | static int syscall_exit_define_fields(struct ftrace_event_call *call); | ||
26 | |||
27 | static struct list_head * | 24 | static struct list_head * |
28 | syscall_get_enter_fields(struct ftrace_event_call *call) | 25 | syscall_get_enter_fields(struct ftrace_event_call *call) |
29 | { | 26 | { |
@@ -32,30 +29,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call) | |||
32 | return &entry->enter_fields; | 29 | return &entry->enter_fields; |
33 | } | 30 | } |
34 | 31 | ||
35 | struct trace_event_functions enter_syscall_print_funcs = { | ||
36 | .trace = print_syscall_enter, | ||
37 | }; | ||
38 | |||
39 | struct trace_event_functions exit_syscall_print_funcs = { | ||
40 | .trace = print_syscall_exit, | ||
41 | }; | ||
42 | |||
43 | struct ftrace_event_class event_class_syscall_enter = { | ||
44 | .system = "syscalls", | ||
45 | .reg = syscall_enter_register, | ||
46 | .define_fields = syscall_enter_define_fields, | ||
47 | .get_fields = syscall_get_enter_fields, | ||
48 | .raw_init = init_syscall_trace, | ||
49 | }; | ||
50 | |||
51 | struct ftrace_event_class event_class_syscall_exit = { | ||
52 | .system = "syscalls", | ||
53 | .reg = syscall_exit_register, | ||
54 | .define_fields = syscall_exit_define_fields, | ||
55 | .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), | ||
56 | .raw_init = init_syscall_trace, | ||
57 | }; | ||
58 | |||
59 | extern struct syscall_metadata *__start_syscalls_metadata[]; | 32 | extern struct syscall_metadata *__start_syscalls_metadata[]; |
60 | extern struct syscall_metadata *__stop_syscalls_metadata[]; | 33 | extern struct syscall_metadata *__stop_syscalls_metadata[]; |
61 | 34 | ||
@@ -432,7 +405,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) | |||
432 | mutex_unlock(&syscall_trace_lock); | 405 | mutex_unlock(&syscall_trace_lock); |
433 | } | 406 | } |
434 | 407 | ||
435 | int init_syscall_trace(struct ftrace_event_call *call) | 408 | static int init_syscall_trace(struct ftrace_event_call *call) |
436 | { | 409 | { |
437 | int id; | 410 | int id; |
438 | int num; | 411 | int num; |
@@ -457,6 +430,30 @@ int init_syscall_trace(struct ftrace_event_call *call) | |||
457 | return id; | 430 | return id; |
458 | } | 431 | } |
459 | 432 | ||
433 | struct trace_event_functions enter_syscall_print_funcs = { | ||
434 | .trace = print_syscall_enter, | ||
435 | }; | ||
436 | |||
437 | struct trace_event_functions exit_syscall_print_funcs = { | ||
438 | .trace = print_syscall_exit, | ||
439 | }; | ||
440 | |||
441 | struct ftrace_event_class event_class_syscall_enter = { | ||
442 | .system = "syscalls", | ||
443 | .reg = syscall_enter_register, | ||
444 | .define_fields = syscall_enter_define_fields, | ||
445 | .get_fields = syscall_get_enter_fields, | ||
446 | .raw_init = init_syscall_trace, | ||
447 | }; | ||
448 | |||
449 | struct ftrace_event_class event_class_syscall_exit = { | ||
450 | .system = "syscalls", | ||
451 | .reg = syscall_exit_register, | ||
452 | .define_fields = syscall_exit_define_fields, | ||
453 | .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), | ||
454 | .raw_init = init_syscall_trace, | ||
455 | }; | ||
456 | |||
460 | unsigned long __init __weak arch_syscall_addr(int nr) | 457 | unsigned long __init __weak arch_syscall_addr(int nr) |
461 | { | 458 | { |
462 | return (unsigned long)sys_call_table[nr]; | 459 | return (unsigned long)sys_call_table[nr]; |
@@ -487,7 +484,7 @@ int __init init_ftrace_syscalls(void) | |||
487 | 484 | ||
488 | return 0; | 485 | return 0; |
489 | } | 486 | } |
490 | core_initcall(init_ftrace_syscalls); | 487 | early_initcall(init_ftrace_syscalls); |
491 | 488 | ||
492 | #ifdef CONFIG_PERF_EVENTS | 489 | #ifdef CONFIG_PERF_EVENTS |
493 | 490 | ||
@@ -537,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
537 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); | 534 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
538 | } | 535 | } |
539 | 536 | ||
540 | int perf_sysenter_enable(struct ftrace_event_call *call) | 537 | static int perf_sysenter_enable(struct ftrace_event_call *call) |
541 | { | 538 | { |
542 | int ret = 0; | 539 | int ret = 0; |
543 | int num; | 540 | int num; |
@@ -558,7 +555,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call) | |||
558 | return ret; | 555 | return ret; |
559 | } | 556 | } |
560 | 557 | ||
561 | void perf_sysenter_disable(struct ftrace_event_call *call) | 558 | static void perf_sysenter_disable(struct ftrace_event_call *call) |
562 | { | 559 | { |
563 | int num; | 560 | int num; |
564 | 561 | ||
@@ -615,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
615 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); | 612 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
616 | } | 613 | } |
617 | 614 | ||
618 | int perf_sysexit_enable(struct ftrace_event_call *call) | 615 | static int perf_sysexit_enable(struct ftrace_event_call *call) |
619 | { | 616 | { |
620 | int ret = 0; | 617 | int ret = 0; |
621 | int num; | 618 | int num; |
@@ -636,7 +633,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call) | |||
636 | return ret; | 633 | return ret; |
637 | } | 634 | } |
638 | 635 | ||
639 | void perf_sysexit_disable(struct ftrace_event_call *call) | 636 | static void perf_sysexit_disable(struct ftrace_event_call *call) |
640 | { | 637 | { |
641 | int num; | 638 | int num; |
642 | 639 | ||
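
The trace_syscalls.c hunks mostly reorder the file: init_syscall_trace() and the perf enable/disable helpers become static, and the event-class structures move below the functions they reference so no forward declarations or header exports are needed. A tiny sketch of that ordering idiom, with made-up names (struct event_class, init_syscall_events):

#include <stdio.h>

struct event_class {
	const char *system;
	int  (*raw_init)(void);
};

/* Defining the helper before the structure that points at it lets the
 * helper stay static; no forward declaration or header export needed. */
static int init_syscall_events(void)
{
	puts("init");
	return 0;
}

static const struct event_class syscall_enter_class = {
	.system   = "syscalls",
	.raw_init = init_syscall_events,
};

int main(void)
{
	return syscall_enter_class.raw_init();
}
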
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 03003cd7dd96..c86e6d4f67fb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/uaccess.h> | 22 | #include <linux/uaccess.h> |
23 | #include <linux/uprobes.h> | 23 | #include <linux/uprobes.h> |
24 | #include <linux/namei.h> | 24 | #include <linux/namei.h> |
25 | #include <linux/string.h> | ||
25 | 26 | ||
26 | #include "trace_probe.h" | 27 | #include "trace_probe.h" |
27 | 28 | ||
@@ -189,7 +190,7 @@ static int create_trace_uprobe(int argc, char **argv) | |||
189 | if (argv[0][0] == '-') | 190 | if (argv[0][0] == '-') |
190 | is_delete = true; | 191 | is_delete = true; |
191 | else if (argv[0][0] != 'p') { | 192 | else if (argv[0][0] != 'p') { |
192 | pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); | 193 | pr_info("Probe definition must be started with 'p' or '-'.\n"); |
193 | return -EINVAL; | 194 | return -EINVAL; |
194 | } | 195 | } |
195 | 196 | ||
@@ -252,7 +253,7 @@ static int create_trace_uprobe(int argc, char **argv) | |||
252 | if (ret) | 253 | if (ret) |
253 | goto fail_address_parse; | 254 | goto fail_address_parse; |
254 | 255 | ||
255 | ret = strict_strtoul(arg, 0, &offset); | 256 | ret = kstrtoul(arg, 0, &offset); |
256 | if (ret) | 257 | if (ret) |
257 | goto fail_address_parse; | 258 | goto fail_address_parse; |
258 | 259 | ||
@@ -263,16 +264,15 @@ static int create_trace_uprobe(int argc, char **argv) | |||
263 | 264 | ||
264 | /* setup a probe */ | 265 | /* setup a probe */ |
265 | if (!event) { | 266 | if (!event) { |
266 | char *tail = strrchr(filename, '/'); | 267 | char *tail; |
267 | char *ptr; | 268 | char *ptr; |
268 | 269 | ||
269 | ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); | 270 | tail = kstrdup(kbasename(filename), GFP_KERNEL); |
270 | if (!ptr) { | 271 | if (!tail) { |
271 | ret = -ENOMEM; | 272 | ret = -ENOMEM; |
272 | goto fail_address_parse; | 273 | goto fail_address_parse; |
273 | } | 274 | } |
274 | 275 | ||
275 | tail = ptr; | ||
276 | ptr = strpbrk(tail, ".-_"); | 276 | ptr = strpbrk(tail, ".-_"); |
277 | if (ptr) | 277 | if (ptr) |
278 | *ptr = '\0'; | 278 | *ptr = '\0'; |
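
Two small modernizations in trace_uprobe.c: the offset is parsed with kstrtoul() instead of the deprecated strict_strtoul(), and the open-coded strrchr(filename, '/') dance is replaced by kbasename(). A userspace approximation of both helpers follows, assuming the property being relied on is "reject anything but a complete number"; basename_of and parse_ulong are illustrative names only.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Roughly what a basename helper does: point past the last '/', if any. */
static const char *basename_of(const char *path)
{
	const char *tail = strrchr(path, '/');
	return tail ? tail + 1 : path;
}

/* Whole-string numeric parse in the spirit of kstrtoul(): trailing junk
 * or overflow is an error rather than a silently truncated value. */
static int parse_ulong(const char *s, unsigned long *out)
{
	char *end;

	errno = 0;
	*out = strtoul(s, &end, 0);
	if (errno || end == s || *end != '\0')
		return -EINVAL;
	return 0;
}

int main(void)
{
	unsigned long off;

	printf("%s\n", basename_of("/lib/libc.so.6"));   /* libc.so.6 */
	printf("%d\n", parse_ulong("0x400", &off));      /* 0: ok, off = 1024 */
	printf("%d\n", parse_ulong("12abc", &off));      /* negative: rejected */
	return 0;
}
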
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 23b4d784ebdd..625df0b44690 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -26,7 +26,9 @@ | |||
26 | /* | 26 | /* |
27 | * fill in basic accounting fields | 27 | * fill in basic accounting fields |
28 | */ | 28 | */ |
29 | void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | 29 | void bacct_add_tsk(struct user_namespace *user_ns, |
30 | struct pid_namespace *pid_ns, | ||
31 | struct taskstats *stats, struct task_struct *tsk) | ||
30 | { | 32 | { |
31 | const struct cred *tcred; | 33 | const struct cred *tcred; |
32 | struct timespec uptime, ts; | 34 | struct timespec uptime, ts; |
@@ -55,13 +57,13 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | |||
55 | stats->ac_flag |= AXSIG; | 57 | stats->ac_flag |= AXSIG; |
56 | stats->ac_nice = task_nice(tsk); | 58 | stats->ac_nice = task_nice(tsk); |
57 | stats->ac_sched = tsk->policy; | 59 | stats->ac_sched = tsk->policy; |
58 | stats->ac_pid = tsk->pid; | 60 | stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); |
59 | rcu_read_lock(); | 61 | rcu_read_lock(); |
60 | tcred = __task_cred(tsk); | 62 | tcred = __task_cred(tsk); |
61 | stats->ac_uid = tcred->uid; | 63 | stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); |
62 | stats->ac_gid = tcred->gid; | 64 | stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); |
63 | stats->ac_ppid = pid_alive(tsk) ? | 65 | stats->ac_ppid = pid_alive(tsk) ? |
64 | rcu_dereference(tsk->real_parent)->tgid : 0; | 66 | task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; |
65 | rcu_read_unlock(); | 67 | rcu_read_unlock(); |
66 | stats->ac_utime = cputime_to_usecs(tsk->utime); | 68 | stats->ac_utime = cputime_to_usecs(tsk->utime); |
67 | stats->ac_stime = cputime_to_usecs(tsk->stime); | 69 | stats->ac_stime = cputime_to_usecs(tsk->stime); |
diff --git a/kernel/user.c b/kernel/user.c index b815fefbe76f..33acb5e53a5f 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | #include <linux/export.h> | 17 | #include <linux/export.h> |
18 | #include <linux/user_namespace.h> | 18 | #include <linux/user_namespace.h> |
19 | #include <linux/proc_fs.h> | ||
19 | 20 | ||
20 | /* | 21 | /* |
21 | * userns count is 1 for root user, 1 for init_uts_ns, | 22 | * userns count is 1 for root user, 1 for init_uts_ns, |
@@ -38,11 +39,20 @@ struct user_namespace init_user_ns = { | |||
38 | .count = 4294967295U, | 39 | .count = 4294967295U, |
39 | }, | 40 | }, |
40 | }, | 41 | }, |
42 | .projid_map = { | ||
43 | .nr_extents = 1, | ||
44 | .extent[0] = { | ||
45 | .first = 0, | ||
46 | .lower_first = 0, | ||
47 | .count = 4294967295U, | ||
48 | }, | ||
49 | }, | ||
41 | .kref = { | 50 | .kref = { |
42 | .refcount = ATOMIC_INIT(3), | 51 | .refcount = ATOMIC_INIT(3), |
43 | }, | 52 | }, |
44 | .owner = GLOBAL_ROOT_UID, | 53 | .owner = GLOBAL_ROOT_UID, |
45 | .group = GLOBAL_ROOT_GID, | 54 | .group = GLOBAL_ROOT_GID, |
55 | .proc_inum = PROC_USER_INIT_INO, | ||
46 | }; | 56 | }; |
47 | EXPORT_SYMBOL_GPL(init_user_ns); | 57 | EXPORT_SYMBOL_GPL(init_user_ns); |
48 | 58 | ||
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 86602316422d..2b042c42fbc4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/nsproxy.h> | 9 | #include <linux/nsproxy.h> |
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/user_namespace.h> | 11 | #include <linux/user_namespace.h> |
12 | #include <linux/proc_fs.h> | ||
12 | #include <linux/highuid.h> | 13 | #include <linux/highuid.h> |
13 | #include <linux/cred.h> | 14 | #include <linux/cred.h> |
14 | #include <linux/securebits.h> | 15 | #include <linux/securebits.h> |
@@ -19,12 +20,31 @@ | |||
19 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
20 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
21 | #include <linux/ctype.h> | 22 | #include <linux/ctype.h> |
23 | #include <linux/projid.h> | ||
22 | 24 | ||
23 | static struct kmem_cache *user_ns_cachep __read_mostly; | 25 | static struct kmem_cache *user_ns_cachep __read_mostly; |
24 | 26 | ||
25 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | 27 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, |
26 | struct uid_gid_map *map); | 28 | struct uid_gid_map *map); |
27 | 29 | ||
30 | static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) | ||
31 | { | ||
32 | /* Start with the same capabilities as init but useless for doing | ||
33 | * anything as the capabilities are bound to the new user namespace. | ||
34 | */ | ||
35 | cred->securebits = SECUREBITS_DEFAULT; | ||
36 | cred->cap_inheritable = CAP_EMPTY_SET; | ||
37 | cred->cap_permitted = CAP_FULL_SET; | ||
38 | cred->cap_effective = CAP_FULL_SET; | ||
39 | cred->cap_bset = CAP_FULL_SET; | ||
40 | #ifdef CONFIG_KEYS | ||
41 | key_put(cred->request_key_auth); | ||
42 | cred->request_key_auth = NULL; | ||
43 | #endif | ||
44 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ | ||
45 | cred->user_ns = user_ns; | ||
46 | } | ||
47 | |||
28 | /* | 48 | /* |
29 | * Create a new user namespace, deriving the creator from the user in the | 49 | * Create a new user namespace, deriving the creator from the user in the |
30 | * passed credentials, and replacing that user with the new root user for the | 50 | * passed credentials, and replacing that user with the new root user for the |
@@ -38,6 +58,7 @@ int create_user_ns(struct cred *new) | |||
38 | struct user_namespace *ns, *parent_ns = new->user_ns; | 58 | struct user_namespace *ns, *parent_ns = new->user_ns; |
39 | kuid_t owner = new->euid; | 59 | kuid_t owner = new->euid; |
40 | kgid_t group = new->egid; | 60 | kgid_t group = new->egid; |
61 | int ret; | ||
41 | 62 | ||
42 | /* The creator needs a mapping in the parent user namespace | 63 | /* The creator needs a mapping in the parent user namespace |
43 | * or else we won't be able to reasonably tell userspace who | 64 | * or else we won't be able to reasonably tell userspace who |
@@ -51,38 +72,45 @@ int create_user_ns(struct cred *new) | |||
51 | if (!ns) | 72 | if (!ns) |
52 | return -ENOMEM; | 73 | return -ENOMEM; |
53 | 74 | ||
75 | ret = proc_alloc_inum(&ns->proc_inum); | ||
76 | if (ret) { | ||
77 | kmem_cache_free(user_ns_cachep, ns); | ||
78 | return ret; | ||
79 | } | ||
80 | |||
54 | kref_init(&ns->kref); | 81 | kref_init(&ns->kref); |
82 | /* Leave the new->user_ns reference with the new user namespace. */ | ||
55 | ns->parent = parent_ns; | 83 | ns->parent = parent_ns; |
56 | ns->owner = owner; | 84 | ns->owner = owner; |
57 | ns->group = group; | 85 | ns->group = group; |
58 | 86 | ||
59 | /* Start with the same capabilities as init but useless for doing | 87 | set_cred_user_ns(new, ns); |
60 | * anything as the capabilities are bound to the new user namespace. | ||
61 | */ | ||
62 | new->securebits = SECUREBITS_DEFAULT; | ||
63 | new->cap_inheritable = CAP_EMPTY_SET; | ||
64 | new->cap_permitted = CAP_FULL_SET; | ||
65 | new->cap_effective = CAP_FULL_SET; | ||
66 | new->cap_bset = CAP_FULL_SET; | ||
67 | #ifdef CONFIG_KEYS | ||
68 | key_put(new->request_key_auth); | ||
69 | new->request_key_auth = NULL; | ||
70 | #endif | ||
71 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ | ||
72 | |||
73 | /* Leave the new->user_ns reference with the new user namespace. */ | ||
74 | /* Leave the reference to our user_ns with the new cred. */ | ||
75 | new->user_ns = ns; | ||
76 | 88 | ||
77 | return 0; | 89 | return 0; |
78 | } | 90 | } |
79 | 91 | ||
92 | int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) | ||
93 | { | ||
94 | struct cred *cred; | ||
95 | |||
96 | if (!(unshare_flags & CLONE_NEWUSER)) | ||
97 | return 0; | ||
98 | |||
99 | cred = prepare_creds(); | ||
100 | if (!cred) | ||
101 | return -ENOMEM; | ||
102 | |||
103 | *new_cred = cred; | ||
104 | return create_user_ns(cred); | ||
105 | } | ||
106 | |||
80 | void free_user_ns(struct kref *kref) | 107 | void free_user_ns(struct kref *kref) |
81 | { | 108 | { |
82 | struct user_namespace *parent, *ns = | 109 | struct user_namespace *parent, *ns = |
83 | container_of(kref, struct user_namespace, kref); | 110 | container_of(kref, struct user_namespace, kref); |
84 | 111 | ||
85 | parent = ns->parent; | 112 | parent = ns->parent; |
113 | proc_free_inum(ns->proc_inum); | ||
86 | kmem_cache_free(user_ns_cachep, ns); | 114 | kmem_cache_free(user_ns_cachep, ns); |
87 | put_user_ns(parent); | 115 | put_user_ns(parent); |
88 | } | 116 | } |
@@ -295,6 +323,75 @@ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) | |||
295 | } | 323 | } |
296 | EXPORT_SYMBOL(from_kgid_munged); | 324 | EXPORT_SYMBOL(from_kgid_munged); |
297 | 325 | ||
326 | /** | ||
327 | * make_kprojid - Map a user-namespace projid pair into a kprojid. | ||
328 | * @ns: User namespace that the projid is in | ||
329 | * @projid: Project identifier | ||
330 | * | ||
331 | * Maps a user-namespace projid pair into a kernel internal kprojid, | ||
332 | * and returns that kprojid. | ||
333 | * | ||
334 | * When there is no mapping defined for the user-namespace projid | ||
335 | * pair INVALID_PROJID is returned. Callers are expected to test | ||
336 | * for and handle INVALID_PROJID being returned. INVALID_PROJID | ||
337 | * may be tested for using projid_valid(). | ||
338 | */ | ||
339 | kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) | ||
340 | { | ||
341 | /* Map the projid to a global kernel projid */ | ||
342 | return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); | ||
343 | } | ||
344 | EXPORT_SYMBOL(make_kprojid); | ||
345 | |||
346 | /** | ||
347 | * from_kprojid - Create a projid from a kprojid user-namespace pair. | ||
348 | * @targ: The user namespace we want a projid in. | ||
349 | * @kprojid: The kernel internal project identifier to start with. | ||
350 | * | ||
351 | * Map @kprojid into the user-namespace specified by @targ and | ||
352 | * return the resulting projid. | ||
353 | * | ||
354 | * There is always a mapping into the initial user_namespace. | ||
355 | * | ||
356 | * If @kprojid has no mapping in @targ (projid_t)-1 is returned. | ||
357 | */ | ||
358 | projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) | ||
359 | { | ||
360 | /* Map the projid from a global kernel projid */ | ||
361 | return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); | ||
362 | } | ||
363 | EXPORT_SYMBOL(from_kprojid); | ||
364 | |||
365 | /** | ||
366 | * from_kprojid_munged - Create a projid from a kprojid user-namespace pair. | ||
367 | * @targ: The user namespace we want a projid in. | ||
368 | * @kprojid: The kernel internal projid to start with. | ||
369 | * | ||
370 | * Map @kprojid into the user-namespace specified by @targ and | ||
371 | * return the resulting projid. | ||
372 | * | ||
373 | * There is always a mapping into the initial user_namespace. | ||
374 | * | ||
375 | * Unlike from_kprojid, from_kprojid_munged never fails and always | ||
376 | * returns a valid projid. This makes from_kprojid_munged | ||
377 | * appropriate for use in syscalls like stat, where failing the | ||
378 | * system call or failing to provide a valid projid is not an | ||
379 | * option. | ||
380 | * | ||
381 | * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. | ||
382 | */ | ||
383 | projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) | ||
384 | { | ||
385 | projid_t projid; | ||
386 | projid = from_kprojid(targ, kprojid); | ||
387 | |||
388 | if (projid == (projid_t) -1) | ||
389 | projid = OVERFLOW_PROJID; | ||
390 | return projid; | ||
391 | } | ||
392 | EXPORT_SYMBOL(from_kprojid_munged); | ||
393 | |||
394 | |||
298 | static int uid_m_show(struct seq_file *seq, void *v) | 395 | static int uid_m_show(struct seq_file *seq, void *v) |
299 | { | 396 | { |
300 | struct user_namespace *ns = seq->private; | 397 | struct user_namespace *ns = seq->private; |
@@ -302,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v) | |||
302 | struct user_namespace *lower_ns; | 399 | struct user_namespace *lower_ns; |
303 | uid_t lower; | 400 | uid_t lower; |
304 | 401 | ||
305 | lower_ns = current_user_ns(); | 402 | lower_ns = seq_user_ns(seq); |
306 | if ((lower_ns == ns) && lower_ns->parent) | 403 | if ((lower_ns == ns) && lower_ns->parent) |
307 | lower_ns = lower_ns->parent; | 404 | lower_ns = lower_ns->parent; |
308 | 405 | ||
@@ -323,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v) | |||
323 | struct user_namespace *lower_ns; | 420 | struct user_namespace *lower_ns; |
324 | gid_t lower; | 421 | gid_t lower; |
325 | 422 | ||
326 | lower_ns = current_user_ns(); | 423 | lower_ns = seq_user_ns(seq); |
327 | if ((lower_ns == ns) && lower_ns->parent) | 424 | if ((lower_ns == ns) && lower_ns->parent) |
328 | lower_ns = lower_ns->parent; | 425 | lower_ns = lower_ns->parent; |
329 | 426 | ||
@@ -337,6 +434,27 @@ static int gid_m_show(struct seq_file *seq, void *v) | |||
337 | return 0; | 434 | return 0; |
338 | } | 435 | } |
339 | 436 | ||
437 | static int projid_m_show(struct seq_file *seq, void *v) | ||
438 | { | ||
439 | struct user_namespace *ns = seq->private; | ||
440 | struct uid_gid_extent *extent = v; | ||
441 | struct user_namespace *lower_ns; | ||
442 | projid_t lower; | ||
443 | |||
444 | lower_ns = seq_user_ns(seq); | ||
445 | if ((lower_ns == ns) && lower_ns->parent) | ||
446 | lower_ns = lower_ns->parent; | ||
447 | |||
448 | lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); | ||
449 | |||
450 | seq_printf(seq, "%10u %10u %10u\n", | ||
451 | extent->first, | ||
452 | lower, | ||
453 | extent->count); | ||
454 | |||
455 | return 0; | ||
456 | } | ||
457 | |||
340 | static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) | 458 | static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) |
341 | { | 459 | { |
342 | struct uid_gid_extent *extent = NULL; | 460 | struct uid_gid_extent *extent = NULL; |
@@ -362,6 +480,13 @@ static void *gid_m_start(struct seq_file *seq, loff_t *ppos) | |||
362 | return m_start(seq, ppos, &ns->gid_map); | 480 | return m_start(seq, ppos, &ns->gid_map); |
363 | } | 481 | } |
364 | 482 | ||
483 | static void *projid_m_start(struct seq_file *seq, loff_t *ppos) | ||
484 | { | ||
485 | struct user_namespace *ns = seq->private; | ||
486 | |||
487 | return m_start(seq, ppos, &ns->projid_map); | ||
488 | } | ||
489 | |||
365 | static void *m_next(struct seq_file *seq, void *v, loff_t *pos) | 490 | static void *m_next(struct seq_file *seq, void *v, loff_t *pos) |
366 | { | 491 | { |
367 | (*pos)++; | 492 | (*pos)++; |
@@ -387,6 +512,13 @@ struct seq_operations proc_gid_seq_operations = { | |||
387 | .show = gid_m_show, | 512 | .show = gid_m_show, |
388 | }; | 513 | }; |
389 | 514 | ||
515 | struct seq_operations proc_projid_seq_operations = { | ||
516 | .start = projid_m_start, | ||
517 | .stop = m_stop, | ||
518 | .next = m_next, | ||
519 | .show = projid_m_show, | ||
520 | }; | ||
521 | |||
390 | static DEFINE_MUTEX(id_map_mutex); | 522 | static DEFINE_MUTEX(id_map_mutex); |
391 | 523 | ||
392 | static ssize_t map_write(struct file *file, const char __user *buf, | 524 | static ssize_t map_write(struct file *file, const char __user *buf, |
@@ -434,7 +566,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
434 | /* Require the appropriate privilege CAP_SETUID or CAP_SETGID | 566 | /* Require the appropriate privilege CAP_SETUID or CAP_SETGID |
435 | * over the user namespace in order to set the id mapping. | 567 | * over the user namespace in order to set the id mapping. |
436 | */ | 568 | */ |
437 | if (!ns_capable(ns, cap_setid)) | 569 | if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) |
438 | goto out; | 570 | goto out; |
439 | 571 | ||
440 | /* Get a buffer */ | 572 | /* Get a buffer */ |
@@ -564,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz | |||
564 | { | 696 | { |
565 | struct seq_file *seq = file->private_data; | 697 | struct seq_file *seq = file->private_data; |
566 | struct user_namespace *ns = seq->private; | 698 | struct user_namespace *ns = seq->private; |
699 | struct user_namespace *seq_ns = seq_user_ns(seq); | ||
567 | 700 | ||
568 | if (!ns->parent) | 701 | if (!ns->parent) |
569 | return -EPERM; | 702 | return -EPERM; |
570 | 703 | ||
704 | if ((seq_ns != ns) && (seq_ns != ns->parent)) | ||
705 | return -EPERM; | ||
706 | |||
571 | return map_write(file, buf, size, ppos, CAP_SETUID, | 707 | return map_write(file, buf, size, ppos, CAP_SETUID, |
572 | &ns->uid_map, &ns->parent->uid_map); | 708 | &ns->uid_map, &ns->parent->uid_map); |
573 | } | 709 | } |
@@ -576,17 +712,57 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz | |||
576 | { | 712 | { |
577 | struct seq_file *seq = file->private_data; | 713 | struct seq_file *seq = file->private_data; |
578 | struct user_namespace *ns = seq->private; | 714 | struct user_namespace *ns = seq->private; |
715 | struct user_namespace *seq_ns = seq_user_ns(seq); | ||
579 | 716 | ||
580 | if (!ns->parent) | 717 | if (!ns->parent) |
581 | return -EPERM; | 718 | return -EPERM; |
582 | 719 | ||
720 | if ((seq_ns != ns) && (seq_ns != ns->parent)) | ||
721 | return -EPERM; | ||
722 | |||
583 | return map_write(file, buf, size, ppos, CAP_SETGID, | 723 | return map_write(file, buf, size, ppos, CAP_SETGID, |
584 | &ns->gid_map, &ns->parent->gid_map); | 724 | &ns->gid_map, &ns->parent->gid_map); |
585 | } | 725 | } |
586 | 726 | ||
727 | ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | ||
728 | { | ||
729 | struct seq_file *seq = file->private_data; | ||
730 | struct user_namespace *ns = seq->private; | ||
731 | struct user_namespace *seq_ns = seq_user_ns(seq); | ||
732 | |||
733 | if (!ns->parent) | ||
734 | return -EPERM; | ||
735 | |||
736 | if ((seq_ns != ns) && (seq_ns != ns->parent)) | ||
737 | return -EPERM; | ||
738 | |||
739 | /* Anyone can set any valid project id no capability needed */ | ||
740 | return map_write(file, buf, size, ppos, -1, | ||
741 | &ns->projid_map, &ns->parent->projid_map); | ||
742 | } | ||
743 | |||
587 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | 744 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, |
588 | struct uid_gid_map *new_map) | 745 | struct uid_gid_map *new_map) |
589 | { | 746 | { |
747 | /* Allow mapping to your own filesystem ids */ | ||
748 | if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { | ||
749 | u32 id = new_map->extent[0].lower_first; | ||
750 | if (cap_setid == CAP_SETUID) { | ||
751 | kuid_t uid = make_kuid(ns->parent, id); | ||
752 | if (uid_eq(uid, current_fsuid())) | ||
753 | return true; | ||
754 | } | ||
755 | else if (cap_setid == CAP_SETGID) { | ||
756 | kgid_t gid = make_kgid(ns->parent, id); | ||
757 | if (gid_eq(gid, current_fsgid())) | ||
758 | return true; | ||
759 | } | ||
760 | } | ||
761 | |||
762 | /* Allow anyone to set a mapping that doesn't require privilege */ | ||
763 | if (!cap_valid(cap_setid)) | ||
764 | return true; | ||
765 | |||
590 | /* Allow the specified ids if we have the appropriate capability | 766 | /* Allow the specified ids if we have the appropriate capability |
591 | * (CAP_SETUID or CAP_SETGID) over the parent user namespace. | 767 | * (CAP_SETUID or CAP_SETGID) over the parent user namespace. |
592 | */ | 768 | */ |
@@ -596,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | |||
596 | return false; | 772 | return false; |
597 | } | 773 | } |
598 | 774 | ||
775 | static void *userns_get(struct task_struct *task) | ||
776 | { | ||
777 | struct user_namespace *user_ns; | ||
778 | |||
779 | rcu_read_lock(); | ||
780 | user_ns = get_user_ns(__task_cred(task)->user_ns); | ||
781 | rcu_read_unlock(); | ||
782 | |||
783 | return user_ns; | ||
784 | } | ||
785 | |||
786 | static void userns_put(void *ns) | ||
787 | { | ||
788 | put_user_ns(ns); | ||
789 | } | ||
790 | |||
791 | static int userns_install(struct nsproxy *nsproxy, void *ns) | ||
792 | { | ||
793 | struct user_namespace *user_ns = ns; | ||
794 | struct cred *cred; | ||
795 | |||
796 | /* Don't allow gaining capabilities by reentering | ||
797 | * the same user namespace. | ||
798 | */ | ||
799 | if (user_ns == current_user_ns()) | ||
800 | return -EINVAL; | ||
801 | |||
802 | /* Threaded processes may not enter a different user namespace */ | ||
803 | if (atomic_read(¤t->mm->mm_users) > 1) | ||
804 | return -EINVAL; | ||
805 | |||
806 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | ||
807 | return -EPERM; | ||
808 | |||
809 | cred = prepare_creds(); | ||
810 | if (!cred) | ||
811 | return -ENOMEM; | ||
812 | |||
813 | put_user_ns(cred->user_ns); | ||
814 | set_cred_user_ns(cred, get_user_ns(user_ns)); | ||
815 | |||
816 | return commit_creds(cred); | ||
817 | } | ||
818 | |||
819 | static unsigned int userns_inum(void *ns) | ||
820 | { | ||
821 | struct user_namespace *user_ns = ns; | ||
822 | return user_ns->proc_inum; | ||
823 | } | ||
824 | |||
825 | const struct proc_ns_operations userns_operations = { | ||
826 | .name = "user", | ||
827 | .type = CLONE_NEWUSER, | ||
828 | .get = userns_get, | ||
829 | .put = userns_put, | ||
830 | .install = userns_install, | ||
831 | .inum = userns_inum, | ||
832 | }; | ||
833 | |||
599 | static __init int user_namespaces_init(void) | 834 | static __init int user_namespaces_init(void) |
600 | { | 835 | { |
601 | user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); | 836 | user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); |
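
The bulk of the user_namespace.c change wires project ids through the same extent-based id map already used for uids and gids: make_kprojid() walks the map downward, from_kprojid() walks it upward, and the _munged variant substitutes an overflow id when no extent matches. A self-contained model of that lookup follows; struct id_map, map_down, map_up and OVERFLOW_ID are simplified stand-ins for the kernel's uid_gid_map machinery, not its actual types.

#include <stdio.h>

#define OVERFLOW_ID 65534u     /* like the kernel's overflow(uid|gid|projid) */

struct extent { unsigned first, lower_first, count; };
struct id_map { int nr_extents; struct extent extent[5]; };

/* Namespace id -> global id, the map_id_down() direction. */
static unsigned map_down(const struct id_map *m, unsigned id)
{
	for (int i = 0; i < m->nr_extents; i++) {
		const struct extent *e = &m->extent[i];
		if (id >= e->first && id - e->first < e->count)
			return e->lower_first + (id - e->first);
	}
	return (unsigned)-1;                     /* no mapping */
}

/* Global id -> namespace id, the map_id_up() direction. */
static unsigned map_up(const struct id_map *m, unsigned id)
{
	for (int i = 0; i < m->nr_extents; i++) {
		const struct extent *e = &m->extent[i];
		if (id >= e->lower_first && id - e->lower_first < e->count)
			return e->first + (id - e->lower_first);
	}
	return (unsigned)-1;
}

/* "Munged" variant never fails: unmapped ids collapse to the overflow id. */
static unsigned map_up_munged(const struct id_map *m, unsigned id)
{
	unsigned v = map_up(m, id);
	return v == (unsigned)-1 ? OVERFLOW_ID : v;
}

int main(void)
{
	/* namespace ids 0..999 map to global ids 100000..100999 */
	struct id_map m = { 1, { { 0, 100000, 1000 } } };

	printf("%u\n", map_down(&m, 5));          /* 100005 */
	printf("%u\n", map_up(&m, 100005));       /* 5 */
	printf("%u\n", map_up_munged(&m, 42));    /* 65534: not mapped */
	return 0;
}
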
diff --git a/kernel/utsname.c b/kernel/utsname.c index 679d97a5d3fd..08b197e8c485 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -32,18 +32,25 @@ static struct uts_namespace *create_uts_ns(void) | |||
32 | * @old_ns: namespace to clone | 32 | * @old_ns: namespace to clone |
33 | * Return NULL on error (failure to kmalloc), new ns otherwise | 33 | * Return NULL on error (failure to kmalloc), new ns otherwise |
34 | */ | 34 | */ |
35 | static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, | 35 | static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, |
36 | struct uts_namespace *old_ns) | 36 | struct uts_namespace *old_ns) |
37 | { | 37 | { |
38 | struct uts_namespace *ns; | 38 | struct uts_namespace *ns; |
39 | int err; | ||
39 | 40 | ||
40 | ns = create_uts_ns(); | 41 | ns = create_uts_ns(); |
41 | if (!ns) | 42 | if (!ns) |
42 | return ERR_PTR(-ENOMEM); | 43 | return ERR_PTR(-ENOMEM); |
43 | 44 | ||
45 | err = proc_alloc_inum(&ns->proc_inum); | ||
46 | if (err) { | ||
47 | kfree(ns); | ||
48 | return ERR_PTR(err); | ||
49 | } | ||
50 | |||
44 | down_read(&uts_sem); | 51 | down_read(&uts_sem); |
45 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 52 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
46 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); | 53 | ns->user_ns = get_user_ns(user_ns); |
47 | up_read(&uts_sem); | 54 | up_read(&uts_sem); |
48 | return ns; | 55 | return ns; |
49 | } | 56 | } |
@@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, | |||
55 | * versa. | 62 | * versa. |
56 | */ | 63 | */ |
57 | struct uts_namespace *copy_utsname(unsigned long flags, | 64 | struct uts_namespace *copy_utsname(unsigned long flags, |
58 | struct task_struct *tsk) | 65 | struct user_namespace *user_ns, struct uts_namespace *old_ns) |
59 | { | 66 | { |
60 | struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; | ||
61 | struct uts_namespace *new_ns; | 67 | struct uts_namespace *new_ns; |
62 | 68 | ||
63 | BUG_ON(!old_ns); | 69 | BUG_ON(!old_ns); |
@@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, | |||
66 | if (!(flags & CLONE_NEWUTS)) | 72 | if (!(flags & CLONE_NEWUTS)) |
67 | return old_ns; | 73 | return old_ns; |
68 | 74 | ||
69 | new_ns = clone_uts_ns(tsk, old_ns); | 75 | new_ns = clone_uts_ns(user_ns, old_ns); |
70 | 76 | ||
71 | put_uts_ns(old_ns); | 77 | put_uts_ns(old_ns); |
72 | return new_ns; | 78 | return new_ns; |
@@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref) | |||
78 | 84 | ||
79 | ns = container_of(kref, struct uts_namespace, kref); | 85 | ns = container_of(kref, struct uts_namespace, kref); |
80 | put_user_ns(ns->user_ns); | 86 | put_user_ns(ns->user_ns); |
87 | proc_free_inum(ns->proc_inum); | ||
81 | kfree(ns); | 88 | kfree(ns); |
82 | } | 89 | } |
83 | 90 | ||
@@ -102,19 +109,32 @@ static void utsns_put(void *ns) | |||
102 | put_uts_ns(ns); | 109 | put_uts_ns(ns); |
103 | } | 110 | } |
104 | 111 | ||
105 | static int utsns_install(struct nsproxy *nsproxy, void *ns) | 112 | static int utsns_install(struct nsproxy *nsproxy, void *new) |
106 | { | 113 | { |
114 | struct uts_namespace *ns = new; | ||
115 | |||
116 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || | ||
117 | !nsown_capable(CAP_SYS_ADMIN)) | ||
118 | return -EPERM; | ||
119 | |||
107 | get_uts_ns(ns); | 120 | get_uts_ns(ns); |
108 | put_uts_ns(nsproxy->uts_ns); | 121 | put_uts_ns(nsproxy->uts_ns); |
109 | nsproxy->uts_ns = ns; | 122 | nsproxy->uts_ns = ns; |
110 | return 0; | 123 | return 0; |
111 | } | 124 | } |
112 | 125 | ||
126 | static unsigned int utsns_inum(void *vp) | ||
127 | { | ||
128 | struct uts_namespace *ns = vp; | ||
129 | |||
130 | return ns->proc_inum; | ||
131 | } | ||
132 | |||
113 | const struct proc_ns_operations utsns_operations = { | 133 | const struct proc_ns_operations utsns_operations = { |
114 | .name = "uts", | 134 | .name = "uts", |
115 | .type = CLONE_NEWUTS, | 135 | .type = CLONE_NEWUTS, |
116 | .get = utsns_get, | 136 | .get = utsns_get, |
117 | .put = utsns_put, | 137 | .put = utsns_put, |
118 | .install = utsns_install, | 138 | .install = utsns_install, |
139 | .inum = utsns_inum, | ||
119 | }; | 140 | }; |
120 | |||
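
clone_uts_ns() now allocates a /proc inode number right after the namespace object and unwinds the first allocation if the second fails, with free_uts_ns() later releasing both. A minimal sketch of that acquire-two-resources-or-none pattern, using invented names (struct ns, alloc_inum, clone_ns) rather than the kernel interfaces:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ns {
	char     name[16];
	unsigned inum;        /* identity handed out by a separate allocator */
};

static unsigned next_inum = 0xf0000000u;

static int alloc_inum(unsigned *out)
{
	if (!next_inum)
		return -ENOSPC;
	*out = next_inum++;
	return 0;
}

/* Clone pattern from the hunk above: if the second allocation fails,
 * unwind the first before reporting the error to the caller. */
static struct ns *clone_ns(const struct ns *old, int *err)
{
	struct ns *ns = malloc(sizeof(*ns));

	if (!ns) {
		*err = -ENOMEM;
		return NULL;
	}
	*err = alloc_inum(&ns->inum);
	if (*err) {
		free(ns);
		return NULL;
	}
	memcpy(ns->name, old->name, sizeof(ns->name));
	return ns;
}

static void free_ns(struct ns *ns)
{
	/* the inum would be returned to the allocator here, then the object */
	free(ns);
}

int main(void)
{
	struct ns init = { "init-uts", 1 };
	int err;
	struct ns *child = clone_ns(&init, &err);

	if (!child)
		return 1;
	printf("%s inum=%#x\n", child->name, child->inum);
	free_ns(child);
	return 0;
}
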
diff --git a/kernel/wait.c b/kernel/wait.c index 7fdd9eaca2c3..6698e0c04ead 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Generic waiting primitives. | 2 | * Generic waiting primitives. |
3 | * | 3 | * |
4 | * (C) 2004 William Irwin, Oracle | 4 | * (C) 2004 Nadia Yvette Chambers, Oracle |
5 | */ | 5 | */ |
6 | #include <linux/init.h> | 6 | #include <linux/init.h> |
7 | #include <linux/export.h> | 7 | #include <linux/export.h> |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4b1dfba70f7c..75a2ab3d0b02 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/notifier.h> | 22 | #include <linux/notifier.h> |
23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
24 | #include <linux/sysctl.h> | 24 | #include <linux/sysctl.h> |
25 | #include <linux/smpboot.h> | ||
25 | 26 | ||
26 | #include <asm/irq_regs.h> | 27 | #include <asm/irq_regs.h> |
27 | #include <linux/kvm_para.h> | 28 | #include <linux/kvm_para.h> |
@@ -29,16 +30,19 @@ | |||
29 | 30 | ||
30 | int watchdog_enabled = 1; | 31 | int watchdog_enabled = 1; |
31 | int __read_mostly watchdog_thresh = 10; | 32 | int __read_mostly watchdog_thresh = 10; |
33 | static int __read_mostly watchdog_disabled; | ||
34 | static u64 __read_mostly sample_period; | ||
32 | 35 | ||
33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 36 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | 37 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); |
35 | static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); | 38 | static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); |
36 | static DEFINE_PER_CPU(bool, softlockup_touch_sync); | 39 | static DEFINE_PER_CPU(bool, softlockup_touch_sync); |
37 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); | 40 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); |
41 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | ||
42 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); | ||
38 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 43 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
39 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | 44 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); |
40 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | 45 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); |
41 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | ||
42 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | 46 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); |
43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 47 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
44 | #endif | 48 | #endif |
@@ -113,7 +117,7 @@ static unsigned long get_timestamp(int this_cpu) | |||
113 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ | 117 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ |
114 | } | 118 | } |
115 | 119 | ||
116 | static unsigned long get_sample_period(void) | 120 | static void set_sample_period(void) |
117 | { | 121 | { |
118 | /* | 122 | /* |
119 | * convert watchdog_thresh from seconds to ns | 123 | * convert watchdog_thresh from seconds to ns |
@@ -122,7 +126,7 @@ static unsigned long get_sample_period(void) | |||
122 | * and hard thresholds) to increment before the | 126 | * and hard thresholds) to increment before the |
123 | * hardlockup detector generates a warning | 127 | * hardlockup detector generates a warning |
124 | */ | 128 | */ |
125 | return get_softlockup_thresh() * (NSEC_PER_SEC / 5); | 129 | sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); |
126 | } | 130 | } |
127 | 131 | ||
128 | /* Commands for resetting the watchdog */ | 132 | /* Commands for resetting the watchdog */ |
@@ -248,13 +252,15 @@ static void watchdog_overflow_callback(struct perf_event *event, | |||
248 | __this_cpu_write(hard_watchdog_warn, false); | 252 | __this_cpu_write(hard_watchdog_warn, false); |
249 | return; | 253 | return; |
250 | } | 254 | } |
255 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
256 | |||
251 | static void watchdog_interrupt_count(void) | 257 | static void watchdog_interrupt_count(void) |
252 | { | 258 | { |
253 | __this_cpu_inc(hrtimer_interrupts); | 259 | __this_cpu_inc(hrtimer_interrupts); |
254 | } | 260 | } |
255 | #else | 261 | |
256 | static inline void watchdog_interrupt_count(void) { return; } | 262 | static int watchdog_nmi_enable(unsigned int cpu); |
257 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 263 | static void watchdog_nmi_disable(unsigned int cpu); |
258 | 264 | ||
259 | /* watchdog kicker functions */ | 265 | /* watchdog kicker functions */ |
260 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | 266 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) |
@@ -270,7 +276,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
270 | wake_up_process(__this_cpu_read(softlockup_watchdog)); | 276 | wake_up_process(__this_cpu_read(softlockup_watchdog)); |
271 | 277 | ||
272 | /* .. and repeat */ | 278 | /* .. and repeat */ |
273 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); | 279 | hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); |
274 | 280 | ||
275 | if (touch_ts == 0) { | 281 | if (touch_ts == 0) { |
276 | if (unlikely(__this_cpu_read(softlockup_touch_sync))) { | 282 | if (unlikely(__this_cpu_read(softlockup_touch_sync))) { |
@@ -327,49 +333,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
327 | return HRTIMER_RESTART; | 333 | return HRTIMER_RESTART; |
328 | } | 334 | } |
329 | 335 | ||
336 | static void watchdog_set_prio(unsigned int policy, unsigned int prio) | ||
337 | { | ||
338 | struct sched_param param = { .sched_priority = prio }; | ||
330 | 339 | ||
331 | /* | 340 | sched_setscheduler(current, policy, ¶m); |
332 | * The watchdog thread - touches the timestamp. | 341 | } |
333 | */ | 342 | |
334 | static int watchdog(void *unused) | 343 | static void watchdog_enable(unsigned int cpu) |
335 | { | 344 | { |
336 | struct sched_param param = { .sched_priority = 0 }; | ||
337 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 345 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
338 | 346 | ||
339 | /* initialize timestamp */ | ||
340 | __touch_watchdog(); | ||
341 | |||
342 | /* kick off the timer for the hardlockup detector */ | 347 | /* kick off the timer for the hardlockup detector */ |
348 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
349 | hrtimer->function = watchdog_timer_fn; | ||
350 | |||
351 | if (!watchdog_enabled) { | ||
352 | kthread_park(current); | ||
353 | return; | ||
354 | } | ||
355 | |||
356 | /* Enable the perf event */ | ||
357 | watchdog_nmi_enable(cpu); | ||
358 | |||
343 | /* done here because hrtimer_start can only pin to smp_processor_id() */ | 359 | /* done here because hrtimer_start can only pin to smp_processor_id() */ |
344 | hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), | 360 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), |
345 | HRTIMER_MODE_REL_PINNED); | 361 | HRTIMER_MODE_REL_PINNED); |
346 | 362 | ||
347 | set_current_state(TASK_INTERRUPTIBLE); | 363 | /* initialize timestamp */ |
348 | /* | 364 | watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); |
349 | * Run briefly (kicked by the hrtimer callback function) once every | 365 | __touch_watchdog(); |
350 | * get_sample_period() seconds (4 seconds by default) to reset the | 366 | } |
351 | * softlockup timestamp. If this gets delayed for more than | ||
352 | * 2*watchdog_thresh seconds then the debug-printout triggers in | ||
353 | * watchdog_timer_fn(). | ||
354 | */ | ||
355 | while (!kthread_should_stop()) { | ||
356 | __touch_watchdog(); | ||
357 | schedule(); | ||
358 | 367 | ||
359 | if (kthread_should_stop()) | 368 | static void watchdog_disable(unsigned int cpu) |
360 | break; | 369 | { |
370 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | ||
361 | 371 | ||
362 | set_current_state(TASK_INTERRUPTIBLE); | 372 | watchdog_set_prio(SCHED_NORMAL, 0); |
363 | } | 373 | hrtimer_cancel(hrtimer); |
364 | /* | 374 | /* disable the perf event */ |
365 | * Drop the policy/priority elevation during thread exit to avoid a | 375 | watchdog_nmi_disable(cpu); |
366 | * scheduling latency spike. | 376 | } |
367 | */ | 377 | |
368 | __set_current_state(TASK_RUNNING); | 378 | static int watchdog_should_run(unsigned int cpu) |
369 | sched_setscheduler(current, SCHED_NORMAL, ¶m); | 379 | { |
370 | return 0; | 380 | return __this_cpu_read(hrtimer_interrupts) != |
381 | __this_cpu_read(soft_lockup_hrtimer_cnt); | ||
371 | } | 382 | } |
372 | 383 | ||
384 | /* | ||
385 | * The watchdog thread function - touches the timestamp. | ||
386 | * | ||
387 | * It only runs once every sample_period seconds (4 seconds by | ||
388 | * default) to reset the softlockup timestamp. If this gets delayed | ||
389 | * for more than 2*watchdog_thresh seconds then the debug-printout | ||
390 | * triggers in watchdog_timer_fn(). | ||
391 | */ | ||
392 | static void watchdog(unsigned int cpu) | ||
393 | { | ||
394 | __this_cpu_write(soft_lockup_hrtimer_cnt, | ||
395 | __this_cpu_read(hrtimer_interrupts)); | ||
396 | __touch_watchdog(); | ||
397 | } | ||
373 | 398 | ||
374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 399 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
375 | /* | 400 | /* |
@@ -379,7 +404,7 @@ static int watchdog(void *unused) | |||
379 | */ | 404 | */ |
380 | static unsigned long cpu0_err; | 405 | static unsigned long cpu0_err; |
381 | 406 | ||
382 | static int watchdog_nmi_enable(int cpu) | 407 | static int watchdog_nmi_enable(unsigned int cpu) |
383 | { | 408 | { |
384 | struct perf_event_attr *wd_attr; | 409 | struct perf_event_attr *wd_attr; |
385 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 410 | struct perf_event *event = per_cpu(watchdog_ev, cpu); |
@@ -433,7 +458,7 @@ out: | |||
433 | return 0; | 458 | return 0; |
434 | } | 459 | } |
435 | 460 | ||
436 | static void watchdog_nmi_disable(int cpu) | 461 | static void watchdog_nmi_disable(unsigned int cpu) |
437 | { | 462 | { |
438 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 463 | struct perf_event *event = per_cpu(watchdog_ev, cpu); |
439 | 464 | ||
@@ -447,107 +472,35 @@ static void watchdog_nmi_disable(int cpu) | |||
447 | return; | 472 | return; |
448 | } | 473 | } |
449 | #else | 474 | #else |
450 | static int watchdog_nmi_enable(int cpu) { return 0; } | 475 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } |
451 | static void watchdog_nmi_disable(int cpu) { return; } | 476 | static void watchdog_nmi_disable(unsigned int cpu) { return; } |
452 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 477 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ |
453 | 478 | ||
454 | /* prepare/enable/disable routines */ | 479 | /* prepare/enable/disable routines */ |
455 | static void watchdog_prepare_cpu(int cpu) | ||
456 | { | ||
457 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | ||
458 | |||
459 | WARN_ON(per_cpu(softlockup_watchdog, cpu)); | ||
460 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
461 | hrtimer->function = watchdog_timer_fn; | ||
462 | } | ||
463 | |||
464 | static int watchdog_enable(int cpu) | ||
465 | { | ||
466 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | ||
467 | int err = 0; | ||
468 | |||
469 | /* enable the perf event */ | ||
470 | err = watchdog_nmi_enable(cpu); | ||
471 | |||
472 | /* Regardless of err above, fall through and start softlockup */ | ||
473 | |||
474 | /* create the watchdog thread */ | ||
475 | if (!p) { | ||
476 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
477 | p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); | ||
478 | if (IS_ERR(p)) { | ||
479 | pr_err("softlockup watchdog for %i failed\n", cpu); | ||
480 | if (!err) { | ||
481 | /* if hardlockup hasn't already set this */ | ||
482 | err = PTR_ERR(p); | ||
483 | /* and disable the perf event */ | ||
484 | watchdog_nmi_disable(cpu); | ||
485 | } | ||
486 | goto out; | ||
487 | } | ||
488 | sched_setscheduler(p, SCHED_FIFO, ¶m); | ||
489 | kthread_bind(p, cpu); | ||
490 | per_cpu(watchdog_touch_ts, cpu) = 0; | ||
491 | per_cpu(softlockup_watchdog, cpu) = p; | ||
492 | wake_up_process(p); | ||
493 | } | ||
494 | |||
495 | out: | ||
496 | return err; | ||
497 | } | ||
498 | |||
499 | static void watchdog_disable(int cpu) | ||
500 | { | ||
501 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | ||
502 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | ||
503 | |||
504 | /* | ||
505 | * cancel the timer first to stop incrementing the stats | ||
506 | * and waking up the kthread | ||
507 | */ | ||
508 | hrtimer_cancel(hrtimer); | ||
509 | |||
510 | /* disable the perf event */ | ||
511 | watchdog_nmi_disable(cpu); | ||
512 | |||
513 | /* stop the watchdog thread */ | ||
514 | if (p) { | ||
515 | per_cpu(softlockup_watchdog, cpu) = NULL; | ||
516 | kthread_stop(p); | ||
517 | } | ||
518 | } | ||
519 | |||
520 | /* sysctl functions */ | 480 | /* sysctl functions */ |
521 | #ifdef CONFIG_SYSCTL | 481 | #ifdef CONFIG_SYSCTL |
522 | static void watchdog_enable_all_cpus(void) | 482 | static void watchdog_enable_all_cpus(void) |
523 | { | 483 | { |
524 | int cpu; | 484 | unsigned int cpu; |
525 | |||
526 | watchdog_enabled = 0; | ||
527 | |||
528 | for_each_online_cpu(cpu) | ||
529 | if (!watchdog_enable(cpu)) | ||
530 | /* if any cpu succeeds, watchdog is considered | ||
531 | enabled for the system */ | ||
532 | watchdog_enabled = 1; | ||
533 | |||
534 | if (!watchdog_enabled) | ||
535 | pr_err("failed to be enabled on some cpus\n"); | ||
536 | 485 | ||
486 | if (watchdog_disabled) { | ||
487 | watchdog_disabled = 0; | ||
488 | for_each_online_cpu(cpu) | ||
489 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | ||
490 | } | ||
537 | } | 491 | } |
538 | 492 | ||
539 | static void watchdog_disable_all_cpus(void) | 493 | static void watchdog_disable_all_cpus(void) |
540 | { | 494 | { |
541 | int cpu; | 495 | unsigned int cpu; |
542 | |||
543 | for_each_online_cpu(cpu) | ||
544 | watchdog_disable(cpu); | ||
545 | 496 | ||
546 | /* if all watchdogs are disabled, then they are disabled for the system */ | 497 | if (!watchdog_disabled) { |
547 | watchdog_enabled = 0; | 498 | watchdog_disabled = 1; |
499 | for_each_online_cpu(cpu) | ||
500 | kthread_park(per_cpu(softlockup_watchdog, cpu)); | ||
501 | } | ||
548 | } | 502 | } |
549 | 503 | ||
550 | |||
551 | /* | 504 | /* |
552 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh | 505 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh |
553 | */ | 506 | */ |
@@ -557,73 +510,38 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
557 | { | 510 | { |
558 | int ret; | 511 | int ret; |
559 | 512 | ||
513 | if (watchdog_disabled < 0) | ||
514 | return -ENODEV; | ||
515 | |||
560 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 516 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
561 | if (ret || !write) | 517 | if (ret || !write) |
562 | goto out; | 518 | return ret; |
563 | 519 | ||
520 | set_sample_period(); | ||
564 | if (watchdog_enabled && watchdog_thresh) | 521 | if (watchdog_enabled && watchdog_thresh) |
565 | watchdog_enable_all_cpus(); | 522 | watchdog_enable_all_cpus(); |
566 | else | 523 | else |
567 | watchdog_disable_all_cpus(); | 524 | watchdog_disable_all_cpus(); |
568 | 525 | ||
569 | out: | ||
570 | return ret; | 526 | return ret; |
571 | } | 527 | } |
572 | #endif /* CONFIG_SYSCTL */ | 528 | #endif /* CONFIG_SYSCTL */ |
573 | 529 | ||
574 | 530 | static struct smp_hotplug_thread watchdog_threads = { | |
575 | /* | 531 | .store = &softlockup_watchdog, |
576 | * Create/destroy watchdog threads as CPUs come and go: | 532 | .thread_should_run = watchdog_should_run, |
577 | */ | 533 | .thread_fn = watchdog, |
578 | static int __cpuinit | 534 | .thread_comm = "watchdog/%u", |
579 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 535 | .setup = watchdog_enable, |
580 | { | 536 | .park = watchdog_disable, |
581 | int hotcpu = (unsigned long)hcpu; | 537 | .unpark = watchdog_enable, |
582 | |||
583 | switch (action) { | ||
584 | case CPU_UP_PREPARE: | ||
585 | case CPU_UP_PREPARE_FROZEN: | ||
586 | watchdog_prepare_cpu(hotcpu); | ||
587 | break; | ||
588 | case CPU_ONLINE: | ||
589 | case CPU_ONLINE_FROZEN: | ||
590 | if (watchdog_enabled) | ||
591 | watchdog_enable(hotcpu); | ||
592 | break; | ||
593 | #ifdef CONFIG_HOTPLUG_CPU | ||
594 | case CPU_UP_CANCELED: | ||
595 | case CPU_UP_CANCELED_FROZEN: | ||
596 | watchdog_disable(hotcpu); | ||
597 | break; | ||
598 | case CPU_DEAD: | ||
599 | case CPU_DEAD_FROZEN: | ||
600 | watchdog_disable(hotcpu); | ||
601 | break; | ||
602 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
603 | } | ||
604 | |||
605 | /* | ||
606 | * hardlockup and softlockup are not important enough | ||
607 | * to block cpu bring up. Just always succeed and | ||
608 | * rely on printk output to flag problems. | ||
609 | */ | ||
610 | return NOTIFY_OK; | ||
611 | } | ||
612 | |||
613 | static struct notifier_block __cpuinitdata cpu_nfb = { | ||
614 | .notifier_call = cpu_callback | ||
615 | }; | 538 | }; |
616 | 539 | ||
617 | void __init lockup_detector_init(void) | 540 | void __init lockup_detector_init(void) |
618 | { | 541 | { |
619 | void *cpu = (void *)(long)smp_processor_id(); | 542 | set_sample_period(); |
620 | int err; | 543 | if (smpboot_register_percpu_thread(&watchdog_threads)) { |
621 | 544 | pr_err("Failed to create watchdog threads, disabled\n"); | |
622 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 545 | watchdog_disabled = -ENODEV; |
623 | WARN_ON(notifier_to_errno(err)); | 546 | } |
624 | |||
625 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
626 | register_cpu_notifier(&cpu_nfb); | ||
627 | |||
628 | return; | ||
629 | } | 547 | } |
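The watchdog conversion above replaces the open-coded per-CPU kthread and CPU-hotplug notifier with the smpboot per-CPU thread API. A rough sketch of that registration pattern follows, using a made-up "demo" subsystem; none of the demo_* names exist in the tree, only the smp_hotplug_thread fields and smpboot_register_percpu_thread() do.

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, demo_thread);
static DEFINE_PER_CPU(unsigned long, demo_pending);

static int demo_should_run(unsigned int cpu)
{
	/* run the thread function only when this CPU has work queued */
	return __this_cpu_read(demo_pending) != 0;
}

static void demo_fn(unsigned int cpu)
{
	/* runs in the per-CPU kthread, bound to @cpu */
	__this_cpu_write(demo_pending, 0);
	pr_info("demo thread ran on CPU %u\n", cpu);
}

static void demo_setup(unsigned int cpu)
{
	/* per-CPU init; the watchdog starts its hrtimer here */
}

static void demo_park(unsigned int cpu)
{
	/* CPU going down or thread disabled; the watchdog cancels its timer here */
}

static struct smp_hotplug_thread demo_threads = {
	.store			= &demo_thread,
	.thread_should_run	= demo_should_run,
	.thread_fn		= demo_fn,
	.thread_comm		= "demo/%u",
	.setup			= demo_setup,
	.park			= demo_park,
};

static int __init demo_init(void)
{
	/* creates one "demo/N" kthread per CPU; hotplug is handled for us */
	return smpboot_register_percpu_thread(&demo_threads);
}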
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3c5a79e2134c..fbc6576a83c3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -58,7 +58,7 @@ enum { | |||
58 | * be executing on any CPU. The gcwq behaves as an unbound one. | 58 | * be executing on any CPU. The gcwq behaves as an unbound one. |
59 | * | 59 | * |
60 | * Note that DISASSOCIATED can be flipped only while holding | 60 | * Note that DISASSOCIATED can be flipped only while holding |
61 | * managership of all pools on the gcwq to avoid changing binding | 61 | * assoc_mutex of all pools on the gcwq to avoid changing binding |
62 | * state while create_worker() is in progress. | 62 | * state while create_worker() is in progress. |
63 | */ | 63 | */ |
64 | GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ | 64 | GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ |
@@ -73,11 +73,10 @@ enum { | |||
73 | WORKER_DIE = 1 << 1, /* die die die */ | 73 | WORKER_DIE = 1 << 1, /* die die die */ |
74 | WORKER_IDLE = 1 << 2, /* is idle */ | 74 | WORKER_IDLE = 1 << 2, /* is idle */ |
75 | WORKER_PREP = 1 << 3, /* preparing to run works */ | 75 | WORKER_PREP = 1 << 3, /* preparing to run works */ |
76 | WORKER_REBIND = 1 << 5, /* mom is home, come back */ | ||
77 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ | 76 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ |
78 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ | 77 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ |
79 | 78 | ||
80 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | | 79 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | |
81 | WORKER_CPU_INTENSIVE, | 80 | WORKER_CPU_INTENSIVE, |
82 | 81 | ||
83 | NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ | 82 | NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ |
@@ -126,7 +125,6 @@ enum { | |||
126 | 125 | ||
127 | struct global_cwq; | 126 | struct global_cwq; |
128 | struct worker_pool; | 127 | struct worker_pool; |
129 | struct idle_rebind; | ||
130 | 128 | ||
131 | /* | 129 | /* |
132 | * The poor guys doing the actual heavy lifting. All on-duty workers | 130 | * The poor guys doing the actual heavy lifting. All on-duty workers |
@@ -150,7 +148,6 @@ struct worker { | |||
150 | int id; /* I: worker id */ | 148 | int id; /* I: worker id */ |
151 | 149 | ||
152 | /* for rebinding worker to CPU */ | 150 | /* for rebinding worker to CPU */ |
153 | struct idle_rebind *idle_rebind; /* L: for idle worker */ | ||
154 | struct work_struct rebind_work; /* L: for busy worker */ | 151 | struct work_struct rebind_work; /* L: for busy worker */ |
155 | }; | 152 | }; |
156 | 153 | ||
@@ -160,13 +157,15 @@ struct worker_pool { | |||
160 | 157 | ||
161 | struct list_head worklist; /* L: list of pending works */ | 158 | struct list_head worklist; /* L: list of pending works */ |
162 | int nr_workers; /* L: total number of workers */ | 159 | int nr_workers; /* L: total number of workers */ |
160 | |||
161 | /* nr_idle includes the ones off idle_list for rebinding */ | ||
163 | int nr_idle; /* L: currently idle ones */ | 162 | int nr_idle; /* L: currently idle ones */ |
164 | 163 | ||
165 | struct list_head idle_list; /* X: list of idle workers */ | 164 | struct list_head idle_list; /* X: list of idle workers */ |
166 | struct timer_list idle_timer; /* L: worker idle timeout */ | 165 | struct timer_list idle_timer; /* L: worker idle timeout */ |
167 | struct timer_list mayday_timer; /* L: SOS timer for workers */ | 166 | struct timer_list mayday_timer; /* L: SOS timer for workers */ |
168 | 167 | ||
169 | struct mutex manager_mutex; /* mutex manager should hold */ | 168 | struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ |
170 | struct ida worker_ida; /* L: for worker IDs */ | 169 | struct ida worker_ida; /* L: for worker IDs */ |
171 | }; | 170 | }; |
172 | 171 | ||
@@ -184,9 +183,8 @@ struct global_cwq { | |||
184 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; | 183 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; |
185 | /* L: hash of busy workers */ | 184 | /* L: hash of busy workers */ |
186 | 185 | ||
187 | struct worker_pool pools[2]; /* normal and highpri pools */ | 186 | struct worker_pool pools[NR_WORKER_POOLS]; |
188 | 187 | /* normal and highpri pools */ | |
189 | wait_queue_head_t rebind_hold; /* rebind hold wait */ | ||
190 | } ____cacheline_aligned_in_smp; | 188 | } ____cacheline_aligned_in_smp; |
191 | 189 | ||
192 | /* | 190 | /* |
@@ -269,17 +267,15 @@ struct workqueue_struct { | |||
269 | }; | 267 | }; |
270 | 268 | ||
271 | struct workqueue_struct *system_wq __read_mostly; | 269 | struct workqueue_struct *system_wq __read_mostly; |
272 | struct workqueue_struct *system_long_wq __read_mostly; | ||
273 | struct workqueue_struct *system_nrt_wq __read_mostly; | ||
274 | struct workqueue_struct *system_unbound_wq __read_mostly; | ||
275 | struct workqueue_struct *system_freezable_wq __read_mostly; | ||
276 | struct workqueue_struct *system_nrt_freezable_wq __read_mostly; | ||
277 | EXPORT_SYMBOL_GPL(system_wq); | 270 | EXPORT_SYMBOL_GPL(system_wq); |
271 | struct workqueue_struct *system_highpri_wq __read_mostly; | ||
272 | EXPORT_SYMBOL_GPL(system_highpri_wq); | ||
273 | struct workqueue_struct *system_long_wq __read_mostly; | ||
278 | EXPORT_SYMBOL_GPL(system_long_wq); | 274 | EXPORT_SYMBOL_GPL(system_long_wq); |
279 | EXPORT_SYMBOL_GPL(system_nrt_wq); | 275 | struct workqueue_struct *system_unbound_wq __read_mostly; |
280 | EXPORT_SYMBOL_GPL(system_unbound_wq); | 276 | EXPORT_SYMBOL_GPL(system_unbound_wq); |
277 | struct workqueue_struct *system_freezable_wq __read_mostly; | ||
281 | EXPORT_SYMBOL_GPL(system_freezable_wq); | 278 | EXPORT_SYMBOL_GPL(system_freezable_wq); |
282 | EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); | ||
283 | 279 | ||
284 | #define CREATE_TRACE_POINTS | 280 | #define CREATE_TRACE_POINTS |
285 | #include <trace/events/workqueue.h> | 281 | #include <trace/events/workqueue.h> |
@@ -534,18 +530,24 @@ static int work_next_color(int color) | |||
534 | } | 530 | } |
535 | 531 | ||
536 | /* | 532 | /* |
537 | * A work's data points to the cwq with WORK_STRUCT_CWQ set while the | 533 | * While queued, %WORK_STRUCT_CWQ is set and non-flag bits of a work's data |
538 | * work is on queue. Once execution starts, WORK_STRUCT_CWQ is | 534 | * contain the pointer to the queued cwq. Once execution starts, the flag |
539 | * cleared and the work data contains the cpu number it was last on. | 535 | * is cleared and the high bits contain OFFQ flags and CPU number. |
540 | * | 536 | * |
541 | * set_work_{cwq|cpu}() and clear_work_data() can be used to set the | 537 | * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() |
542 | * cwq, cpu or clear work->data. These functions should only be | 538 | * and clear_work_data() can be used to set the cwq, cpu or clear |
543 | * called while the work is owned - ie. while the PENDING bit is set. | 539 | * work->data. These functions should only be called while the work is |
540 | * owned - ie. while the PENDING bit is set. | ||
544 | * | 541 | * |
545 | * get_work_[g]cwq() can be used to obtain the gcwq or cwq | 542 | * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to |
546 | * corresponding to a work. gcwq is available once the work has been | 543 | * a work. gcwq is available once the work has been queued anywhere after |
547 | * queued anywhere after initialization. cwq is available only from | 544 | * initialization until it is sync canceled. cwq is available only while |
548 | * queueing until execution starts. | 545 | * the work item is queued. |
546 | * | ||
547 | * %WORK_OFFQ_CANCELING is used to mark a work item which is being | ||
548 | * canceled. While being canceled, a work item may have its PENDING set | ||
549 | * but stay off timer and worklist for arbitrarily long and nobody should | ||
550 | * try to steal the PENDING bit. | ||
549 | */ | 551 | */ |
550 | static inline void set_work_data(struct work_struct *work, unsigned long data, | 552 | static inline void set_work_data(struct work_struct *work, unsigned long data, |
551 | unsigned long flags) | 553 | unsigned long flags) |
@@ -562,13 +564,22 @@ static void set_work_cwq(struct work_struct *work, | |||
562 | WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); | 564 | WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); |
563 | } | 565 | } |
564 | 566 | ||
565 | static void set_work_cpu(struct work_struct *work, unsigned int cpu) | 567 | static void set_work_cpu_and_clear_pending(struct work_struct *work, |
568 | unsigned int cpu) | ||
566 | { | 569 | { |
567 | set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); | 570 | /* |
571 | * The following wmb is paired with the implied mb in | ||
572 | * test_and_set_bit(PENDING) and ensures all updates to @work made | ||
573 | * here are visible to and precede any updates by the next PENDING | ||
574 | * owner. | ||
575 | */ | ||
576 | smp_wmb(); | ||
577 | set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); | ||
568 | } | 578 | } |
569 | 579 | ||
570 | static void clear_work_data(struct work_struct *work) | 580 | static void clear_work_data(struct work_struct *work) |
571 | { | 581 | { |
582 | smp_wmb(); /* see set_work_cpu_and_clear_pending() */ | ||
572 | set_work_data(work, WORK_STRUCT_NO_CPU, 0); | 583 | set_work_data(work, WORK_STRUCT_NO_CPU, 0); |
573 | } | 584 | } |
574 | 585 | ||
@@ -591,7 +602,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) | |||
591 | return ((struct cpu_workqueue_struct *) | 602 | return ((struct cpu_workqueue_struct *) |
592 | (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; | 603 | (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; |
593 | 604 | ||
594 | cpu = data >> WORK_STRUCT_FLAG_BITS; | 605 | cpu = data >> WORK_OFFQ_CPU_SHIFT; |
595 | if (cpu == WORK_CPU_NONE) | 606 | if (cpu == WORK_CPU_NONE) |
596 | return NULL; | 607 | return NULL; |
597 | 608 | ||
@@ -599,6 +610,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) | |||
599 | return get_gcwq(cpu); | 610 | return get_gcwq(cpu); |
600 | } | 611 | } |
601 | 612 | ||
613 | static void mark_work_canceling(struct work_struct *work) | ||
614 | { | ||
615 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
616 | unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE; | ||
617 | |||
618 | set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, | ||
619 | WORK_STRUCT_PENDING); | ||
620 | } | ||
621 | |||
622 | static bool work_is_canceling(struct work_struct *work) | ||
623 | { | ||
624 | unsigned long data = atomic_long_read(&work->data); | ||
625 | |||
626 | return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); | ||
627 | } | ||
628 | |||
602 | /* | 629 | /* |
603 | * Policy functions. These define the policies on how the global worker | 630 | * Policy functions. These define the policies on how the global worker |
604 | * pools are managed. Unless noted otherwise, these functions assume that | 631 | * pools are managed. Unless noted otherwise, these functions assume that |
@@ -657,6 +684,13 @@ static bool too_many_workers(struct worker_pool *pool) | |||
657 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ | 684 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ |
658 | int nr_busy = pool->nr_workers - nr_idle; | 685 | int nr_busy = pool->nr_workers - nr_idle; |
659 | 686 | ||
687 | /* | ||
688 | * nr_idle and idle_list may disagree if idle rebinding is in | ||
689 | * progress. Never return %true if idle_list is empty. | ||
690 | */ | ||
691 | if (list_empty(&pool->idle_list)) | ||
692 | return false; | ||
693 | |||
660 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; | 694 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; |
661 | } | 695 | } |
662 | 696 | ||
@@ -705,8 +739,10 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | |||
705 | { | 739 | { |
706 | struct worker *worker = kthread_data(task); | 740 | struct worker *worker = kthread_data(task); |
707 | 741 | ||
708 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 742 | if (!(worker->flags & WORKER_NOT_RUNNING)) { |
743 | WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); | ||
709 | atomic_inc(get_pool_nr_running(worker->pool)); | 744 | atomic_inc(get_pool_nr_running(worker->pool)); |
745 | } | ||
710 | } | 746 | } |
711 | 747 | ||
712 | /** | 748 | /** |
@@ -903,6 +939,206 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, | |||
903 | } | 939 | } |
904 | 940 | ||
905 | /** | 941 | /** |
942 | * move_linked_works - move linked works to a list | ||
943 | * @work: start of series of works to be scheduled | ||
944 | * @head: target list to append @work to | ||
945 | * @nextp: out parameter for nested worklist walking | ||
946 | * | ||
947 | * Schedule linked works starting from @work to @head. Work series to | ||
948 | * be scheduled starts at @work and includes any consecutive work with | ||
949 | * WORK_STRUCT_LINKED set in its predecessor. | ||
950 | * | ||
951 | * If @nextp is not NULL, it's updated to point to the next work of | ||
952 | * the last scheduled work. This allows move_linked_works() to be | ||
953 | * nested inside outer list_for_each_entry_safe(). | ||
954 | * | ||
955 | * CONTEXT: | ||
956 | * spin_lock_irq(gcwq->lock). | ||
957 | */ | ||
958 | static void move_linked_works(struct work_struct *work, struct list_head *head, | ||
959 | struct work_struct **nextp) | ||
960 | { | ||
961 | struct work_struct *n; | ||
962 | |||
963 | /* | ||
964 | * Linked worklist will always end before the end of the list, | ||
965 | * use NULL for list head. | ||
966 | */ | ||
967 | list_for_each_entry_safe_from(work, n, NULL, entry) { | ||
968 | list_move_tail(&work->entry, head); | ||
969 | if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) | ||
970 | break; | ||
971 | } | ||
972 | |||
973 | /* | ||
974 | * If we're already inside safe list traversal and have moved | ||
975 | * multiple works to the scheduled queue, the next position | ||
976 | * needs to be updated. | ||
977 | */ | ||
978 | if (nextp) | ||
979 | *nextp = n; | ||
980 | } | ||
981 | |||
982 | static void cwq_activate_delayed_work(struct work_struct *work) | ||
983 | { | ||
984 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | ||
985 | |||
986 | trace_workqueue_activate_work(work); | ||
987 | move_linked_works(work, &cwq->pool->worklist, NULL); | ||
988 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); | ||
989 | cwq->nr_active++; | ||
990 | } | ||
991 | |||
992 | static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | ||
993 | { | ||
994 | struct work_struct *work = list_first_entry(&cwq->delayed_works, | ||
995 | struct work_struct, entry); | ||
996 | |||
997 | cwq_activate_delayed_work(work); | ||
998 | } | ||
999 | |||
1000 | /** | ||
1001 | * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight | ||
1002 | * @cwq: cwq of interest | ||
1003 | * @color: color of work which left the queue | ||
1004 | * | ||
1005 | * A work either has completed or is removed from pending queue, | ||
1006 | * decrement nr_in_flight of its cwq and handle workqueue flushing. | ||
1007 | * | ||
1008 | * CONTEXT: | ||
1009 | * spin_lock_irq(gcwq->lock). | ||
1010 | */ | ||
1011 | static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) | ||
1012 | { | ||
1013 | /* ignore uncolored works */ | ||
1014 | if (color == WORK_NO_COLOR) | ||
1015 | return; | ||
1016 | |||
1017 | cwq->nr_in_flight[color]--; | ||
1018 | |||
1019 | cwq->nr_active--; | ||
1020 | if (!list_empty(&cwq->delayed_works)) { | ||
1021 | /* one down, submit a delayed one */ | ||
1022 | if (cwq->nr_active < cwq->max_active) | ||
1023 | cwq_activate_first_delayed(cwq); | ||
1024 | } | ||
1025 | |||
1026 | /* is flush in progress and are we at the flushing tip? */ | ||
1027 | if (likely(cwq->flush_color != color)) | ||
1028 | return; | ||
1029 | |||
1030 | /* are there still in-flight works? */ | ||
1031 | if (cwq->nr_in_flight[color]) | ||
1032 | return; | ||
1033 | |||
1034 | /* this cwq is done, clear flush_color */ | ||
1035 | cwq->flush_color = -1; | ||
1036 | |||
1037 | /* | ||
1038 | * If this was the last cwq, wake up the first flusher. It | ||
1039 | * will handle the rest. | ||
1040 | */ | ||
1041 | if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) | ||
1042 | complete(&cwq->wq->first_flusher->done); | ||
1043 | } | ||
1044 | |||
1045 | /** | ||
1046 | * try_to_grab_pending - steal work item from worklist and disable irq | ||
1047 | * @work: work item to steal | ||
1048 | * @is_dwork: @work is a delayed_work | ||
1049 | * @flags: place to store irq state | ||
1050 | * | ||
1051 | * Try to grab PENDING bit of @work. This function can handle @work in any | ||
1052 | * stable state - idle, on timer or on worklist. Return values are | ||
1053 | * | ||
1054 | * 1 if @work was pending and we successfully stole PENDING | ||
1055 | * 0 if @work was idle and we claimed PENDING | ||
1056 | * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry | ||
1057 | * -ENOENT if someone else is canceling @work, this state may persist | ||
1058 | * for arbitrarily long | ||
1059 | * | ||
1060 | * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting | ||
1061 | * interrupted while holding PENDING and @work off queue, irq must be | ||
1062 | * disabled on entry. This, combined with delayed_work->timer being | ||
1063 | * irqsafe, ensures that we return -EAGAIN for a finite short period of time. | ||
1064 | * | ||
1065 | * On successful return, >= 0, irq is disabled and the caller is | ||
1066 | * responsible for releasing it using local_irq_restore(*@flags). | ||
1067 | * | ||
1068 | * This function is safe to call from any context including IRQ handler. | ||
1069 | */ | ||
1070 | static int try_to_grab_pending(struct work_struct *work, bool is_dwork, | ||
1071 | unsigned long *flags) | ||
1072 | { | ||
1073 | struct global_cwq *gcwq; | ||
1074 | |||
1075 | local_irq_save(*flags); | ||
1076 | |||
1077 | /* try to steal the timer if it exists */ | ||
1078 | if (is_dwork) { | ||
1079 | struct delayed_work *dwork = to_delayed_work(work); | ||
1080 | |||
1081 | /* | ||
1082 | * dwork->timer is irqsafe. If del_timer() fails, it's | ||
1083 | * guaranteed that the timer is not queued anywhere and not | ||
1084 | * running on the local CPU. | ||
1085 | */ | ||
1086 | if (likely(del_timer(&dwork->timer))) | ||
1087 | return 1; | ||
1088 | } | ||
1089 | |||
1090 | /* try to claim PENDING the normal way */ | ||
1091 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) | ||
1092 | return 0; | ||
1093 | |||
1094 | /* | ||
1095 | * The queueing is in progress, or it is already queued. Try to | ||
1096 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. | ||
1097 | */ | ||
1098 | gcwq = get_work_gcwq(work); | ||
1099 | if (!gcwq) | ||
1100 | goto fail; | ||
1101 | |||
1102 | spin_lock(&gcwq->lock); | ||
1103 | if (!list_empty(&work->entry)) { | ||
1104 | /* | ||
1105 | * This work is queued, but perhaps we locked the wrong gcwq. | ||
1106 | * In that case we must see the new value after rmb(), see | ||
1107 | * insert_work()->wmb(). | ||
1108 | */ | ||
1109 | smp_rmb(); | ||
1110 | if (gcwq == get_work_gcwq(work)) { | ||
1111 | debug_work_deactivate(work); | ||
1112 | |||
1113 | /* | ||
1114 | * A delayed work item cannot be grabbed directly | ||
1115 | * because it might have linked NO_COLOR work items | ||
1116 | * which, if left on the delayed_list, will confuse | ||
1117 | * cwq->nr_active management later on and cause | ||
1118 | * stall. Make sure the work item is activated | ||
1119 | * before grabbing. | ||
1120 | */ | ||
1121 | if (*work_data_bits(work) & WORK_STRUCT_DELAYED) | ||
1122 | cwq_activate_delayed_work(work); | ||
1123 | |||
1124 | list_del_init(&work->entry); | ||
1125 | cwq_dec_nr_in_flight(get_work_cwq(work), | ||
1126 | get_work_color(work)); | ||
1127 | |||
1128 | spin_unlock(&gcwq->lock); | ||
1129 | return 1; | ||
1130 | } | ||
1131 | } | ||
1132 | spin_unlock(&gcwq->lock); | ||
1133 | fail: | ||
1134 | local_irq_restore(*flags); | ||
1135 | if (work_is_canceling(work)) | ||
1136 | return -ENOENT; | ||
1137 | cpu_relax(); | ||
1138 | return -EAGAIN; | ||
1139 | } | ||
1140 | |||
1141 | /** | ||
906 | * insert_work - insert a work into gcwq | 1142 | * insert_work - insert a work into gcwq |
907 | * @cwq: cwq @work belongs to | 1143 | * @cwq: cwq @work belongs to |
908 | * @work: work to insert | 1144 | * @work: work to insert |
@@ -982,7 +1218,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
982 | struct cpu_workqueue_struct *cwq; | 1218 | struct cpu_workqueue_struct *cwq; |
983 | struct list_head *worklist; | 1219 | struct list_head *worklist; |
984 | unsigned int work_flags; | 1220 | unsigned int work_flags; |
985 | unsigned long flags; | 1221 | unsigned int req_cpu = cpu; |
1222 | |||
1223 | /* | ||
1224 | * While a work item is PENDING && off queue, a task trying to | ||
1225 | * steal the PENDING will busy-loop waiting for it to either get | ||
1226 | * queued or lose PENDING. Grabbing PENDING and queueing should | ||
1227 | * happen with IRQ disabled. | ||
1228 | */ | ||
1229 | WARN_ON_ONCE(!irqs_disabled()); | ||
986 | 1230 | ||
987 | debug_work_activate(work); | 1231 | debug_work_activate(work); |
988 | 1232 | ||
@@ -995,21 +1239,22 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
995 | if (!(wq->flags & WQ_UNBOUND)) { | 1239 | if (!(wq->flags & WQ_UNBOUND)) { |
996 | struct global_cwq *last_gcwq; | 1240 | struct global_cwq *last_gcwq; |
997 | 1241 | ||
998 | if (unlikely(cpu == WORK_CPU_UNBOUND)) | 1242 | if (cpu == WORK_CPU_UNBOUND) |
999 | cpu = raw_smp_processor_id(); | 1243 | cpu = raw_smp_processor_id(); |
1000 | 1244 | ||
1001 | /* | 1245 | /* |
1002 | * It's multi cpu. If @wq is non-reentrant and @work | 1246 | * It's multi cpu. If @work was previously on a different |
1003 | * was previously on a different cpu, it might still | 1247 | * cpu, it might still be running there, in which case the |
1004 | * be running there, in which case the work needs to | 1248 | * work needs to be queued on that cpu to guarantee |
1005 | * be queued on that cpu to guarantee non-reentrance. | 1249 | * non-reentrancy. |
1006 | */ | 1250 | */ |
1007 | gcwq = get_gcwq(cpu); | 1251 | gcwq = get_gcwq(cpu); |
1008 | if (wq->flags & WQ_NON_REENTRANT && | 1252 | last_gcwq = get_work_gcwq(work); |
1009 | (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { | 1253 | |
1254 | if (last_gcwq && last_gcwq != gcwq) { | ||
1010 | struct worker *worker; | 1255 | struct worker *worker; |
1011 | 1256 | ||
1012 | spin_lock_irqsave(&last_gcwq->lock, flags); | 1257 | spin_lock(&last_gcwq->lock); |
1013 | 1258 | ||
1014 | worker = find_worker_executing_work(last_gcwq, work); | 1259 | worker = find_worker_executing_work(last_gcwq, work); |
1015 | 1260 | ||
@@ -1017,22 +1262,23 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1017 | gcwq = last_gcwq; | 1262 | gcwq = last_gcwq; |
1018 | else { | 1263 | else { |
1019 | /* meh... not running there, queue here */ | 1264 | /* meh... not running there, queue here */ |
1020 | spin_unlock_irqrestore(&last_gcwq->lock, flags); | 1265 | spin_unlock(&last_gcwq->lock); |
1021 | spin_lock_irqsave(&gcwq->lock, flags); | 1266 | spin_lock(&gcwq->lock); |
1022 | } | 1267 | } |
1023 | } else | 1268 | } else { |
1024 | spin_lock_irqsave(&gcwq->lock, flags); | 1269 | spin_lock(&gcwq->lock); |
1270 | } | ||
1025 | } else { | 1271 | } else { |
1026 | gcwq = get_gcwq(WORK_CPU_UNBOUND); | 1272 | gcwq = get_gcwq(WORK_CPU_UNBOUND); |
1027 | spin_lock_irqsave(&gcwq->lock, flags); | 1273 | spin_lock(&gcwq->lock); |
1028 | } | 1274 | } |
1029 | 1275 | ||
1030 | /* gcwq determined, get cwq and queue */ | 1276 | /* gcwq determined, get cwq and queue */ |
1031 | cwq = get_cwq(gcwq->cpu, wq); | 1277 | cwq = get_cwq(gcwq->cpu, wq); |
1032 | trace_workqueue_queue_work(cpu, cwq, work); | 1278 | trace_workqueue_queue_work(req_cpu, cwq, work); |
1033 | 1279 | ||
1034 | if (WARN_ON(!list_empty(&work->entry))) { | 1280 | if (WARN_ON(!list_empty(&work->entry))) { |
1035 | spin_unlock_irqrestore(&gcwq->lock, flags); | 1281 | spin_unlock(&gcwq->lock); |
1036 | return; | 1282 | return; |
1037 | } | 1283 | } |
1038 | 1284 | ||
@@ -1050,134 +1296,220 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1050 | 1296 | ||
1051 | insert_work(cwq, work, worklist, work_flags); | 1297 | insert_work(cwq, work, worklist, work_flags); |
1052 | 1298 | ||
1053 | spin_unlock_irqrestore(&gcwq->lock, flags); | 1299 | spin_unlock(&gcwq->lock); |
1054 | } | 1300 | } |
1055 | 1301 | ||
1056 | /** | 1302 | /** |
1057 | * queue_work - queue work on a workqueue | 1303 | * queue_work_on - queue work on specific cpu |
1304 | * @cpu: CPU number to execute work on | ||
1058 | * @wq: workqueue to use | 1305 | * @wq: workqueue to use |
1059 | * @work: work to queue | 1306 | * @work: work to queue |
1060 | * | 1307 | * |
1061 | * Returns 0 if @work was already on a queue, non-zero otherwise. | 1308 | * Returns %false if @work was already on a queue, %true otherwise. |
1062 | * | 1309 | * |
1063 | * We queue the work to the CPU on which it was submitted, but if the CPU dies | 1310 | * We queue the work to a specific CPU; the caller must ensure it |
1064 | * it can be processed by another CPU. | 1311 | * can't go away. |
1065 | */ | 1312 | */ |
1066 | int queue_work(struct workqueue_struct *wq, struct work_struct *work) | 1313 | bool queue_work_on(int cpu, struct workqueue_struct *wq, |
1314 | struct work_struct *work) | ||
1067 | { | 1315 | { |
1068 | int ret; | 1316 | bool ret = false; |
1317 | unsigned long flags; | ||
1069 | 1318 | ||
1070 | ret = queue_work_on(get_cpu(), wq, work); | 1319 | local_irq_save(flags); |
1071 | put_cpu(); | 1320 | |
1321 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { | ||
1322 | __queue_work(cpu, wq, work); | ||
1323 | ret = true; | ||
1324 | } | ||
1072 | 1325 | ||
1326 | local_irq_restore(flags); | ||
1073 | return ret; | 1327 | return ret; |
1074 | } | 1328 | } |
1075 | EXPORT_SYMBOL_GPL(queue_work); | 1329 | EXPORT_SYMBOL_GPL(queue_work_on); |
1076 | 1330 | ||
1077 | /** | 1331 | /** |
1078 | * queue_work_on - queue work on specific cpu | 1332 | * queue_work - queue work on a workqueue |
1079 | * @cpu: CPU number to execute work on | ||
1080 | * @wq: workqueue to use | 1333 | * @wq: workqueue to use |
1081 | * @work: work to queue | 1334 | * @work: work to queue |
1082 | * | 1335 | * |
1083 | * Returns 0 if @work was already on a queue, non-zero otherwise. | 1336 | * Returns %false if @work was already on a queue, %true otherwise. |
1084 | * | 1337 | * |
1085 | * We queue the work to a specific CPU, the caller must ensure it | 1338 | * We queue the work to the CPU on which it was submitted, but if the CPU dies |
1086 | * can't go away. | 1339 | * it can be processed by another CPU. |
1087 | */ | 1340 | */ |
1088 | int | 1341 | bool queue_work(struct workqueue_struct *wq, struct work_struct *work) |
1089 | queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) | ||
1090 | { | 1342 | { |
1091 | int ret = 0; | 1343 | return queue_work_on(WORK_CPU_UNBOUND, wq, work); |
1092 | |||
1093 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { | ||
1094 | __queue_work(cpu, wq, work); | ||
1095 | ret = 1; | ||
1096 | } | ||
1097 | return ret; | ||
1098 | } | 1344 | } |
1099 | EXPORT_SYMBOL_GPL(queue_work_on); | 1345 | EXPORT_SYMBOL_GPL(queue_work); |
1100 | 1346 | ||
1101 | static void delayed_work_timer_fn(unsigned long __data) | 1347 | void delayed_work_timer_fn(unsigned long __data) |
1102 | { | 1348 | { |
1103 | struct delayed_work *dwork = (struct delayed_work *)__data; | 1349 | struct delayed_work *dwork = (struct delayed_work *)__data; |
1104 | struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); | 1350 | struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); |
1105 | 1351 | ||
1106 | __queue_work(smp_processor_id(), cwq->wq, &dwork->work); | 1352 | /* should have been called from irqsafe timer with irq already off */ |
1353 | __queue_work(dwork->cpu, cwq->wq, &dwork->work); | ||
1354 | } | ||
1355 | EXPORT_SYMBOL_GPL(delayed_work_timer_fn); | ||
1356 | |||
1357 | static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, | ||
1358 | struct delayed_work *dwork, unsigned long delay) | ||
1359 | { | ||
1360 | struct timer_list *timer = &dwork->timer; | ||
1361 | struct work_struct *work = &dwork->work; | ||
1362 | unsigned int lcpu; | ||
1363 | |||
1364 | WARN_ON_ONCE(timer->function != delayed_work_timer_fn || | ||
1365 | timer->data != (unsigned long)dwork); | ||
1366 | WARN_ON_ONCE(timer_pending(timer)); | ||
1367 | WARN_ON_ONCE(!list_empty(&work->entry)); | ||
1368 | |||
1369 | /* | ||
1370 | * If @delay is 0, queue @dwork->work immediately. This is for | ||
1371 | * both optimization and correctness. The earliest @timer can | ||
1372 | * expire is on the closest next tick, and delayed_work users rely | ||
1373 | * on there being no such delay when @delay is 0. | ||
1374 | */ | ||
1375 | if (!delay) { | ||
1376 | __queue_work(cpu, wq, &dwork->work); | ||
1377 | return; | ||
1378 | } | ||
1379 | |||
1380 | timer_stats_timer_set_start_info(&dwork->timer); | ||
1381 | |||
1382 | /* | ||
1383 | * This stores cwq for the moment, for the timer_fn. Note that the | ||
1384 | * work's gcwq is preserved to allow reentrance detection for | ||
1385 | * delayed works. | ||
1386 | */ | ||
1387 | if (!(wq->flags & WQ_UNBOUND)) { | ||
1388 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
1389 | |||
1390 | /* | ||
1391 | * If we cannot get the last gcwq from @work directly, | ||
1392 | * select the last CPU such that it avoids unnecessarily | ||
1393 | * triggering non-reentrancy check in __queue_work(). | ||
1394 | */ | ||
1395 | lcpu = cpu; | ||
1396 | if (gcwq) | ||
1397 | lcpu = gcwq->cpu; | ||
1398 | if (lcpu == WORK_CPU_UNBOUND) | ||
1399 | lcpu = raw_smp_processor_id(); | ||
1400 | } else { | ||
1401 | lcpu = WORK_CPU_UNBOUND; | ||
1402 | } | ||
1403 | |||
1404 | set_work_cwq(work, get_cwq(lcpu, wq), 0); | ||
1405 | |||
1406 | dwork->cpu = cpu; | ||
1407 | timer->expires = jiffies + delay; | ||
1408 | |||
1409 | if (unlikely(cpu != WORK_CPU_UNBOUND)) | ||
1410 | add_timer_on(timer, cpu); | ||
1411 | else | ||
1412 | add_timer(timer); | ||
1107 | } | 1413 | } |
1108 | 1414 | ||
1109 | /** | 1415 | /** |
1416 | * queue_delayed_work_on - queue work on specific CPU after delay | ||
1417 | * @cpu: CPU number to execute work on | ||
1418 | * @wq: workqueue to use | ||
1419 | * @dwork: work to queue | ||
1420 | * @delay: number of jiffies to wait before queueing | ||
1421 | * | ||
1422 | * Returns %false if @work was already on a queue, %true otherwise. If | ||
1423 | * @delay is zero and @dwork is idle, it will be scheduled for immediate | ||
1424 | * execution. | ||
1425 | */ | ||
1426 | bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | ||
1427 | struct delayed_work *dwork, unsigned long delay) | ||
1428 | { | ||
1429 | struct work_struct *work = &dwork->work; | ||
1430 | bool ret = false; | ||
1431 | unsigned long flags; | ||
1432 | |||
1433 | /* read the comment in __queue_work() */ | ||
1434 | local_irq_save(flags); | ||
1435 | |||
1436 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { | ||
1437 | __queue_delayed_work(cpu, wq, dwork, delay); | ||
1438 | ret = true; | ||
1439 | } | ||
1440 | |||
1441 | local_irq_restore(flags); | ||
1442 | return ret; | ||
1443 | } | ||
1444 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); | ||
1445 | |||
1446 | /** | ||
1110 | * queue_delayed_work - queue work on a workqueue after delay | 1447 | * queue_delayed_work - queue work on a workqueue after delay |
1111 | * @wq: workqueue to use | 1448 | * @wq: workqueue to use |
1112 | * @dwork: delayable work to queue | 1449 | * @dwork: delayable work to queue |
1113 | * @delay: number of jiffies to wait before queueing | 1450 | * @delay: number of jiffies to wait before queueing |
1114 | * | 1451 | * |
1115 | * Returns 0 if @work was already on a queue, non-zero otherwise. | 1452 | * Equivalent to queue_delayed_work_on() but tries to use the local CPU. |
1116 | */ | 1453 | */ |
1117 | int queue_delayed_work(struct workqueue_struct *wq, | 1454 | bool queue_delayed_work(struct workqueue_struct *wq, |
1118 | struct delayed_work *dwork, unsigned long delay) | 1455 | struct delayed_work *dwork, unsigned long delay) |
1119 | { | 1456 | { |
1120 | if (delay == 0) | 1457 | return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); |
1121 | return queue_work(wq, &dwork->work); | ||
1122 | |||
1123 | return queue_delayed_work_on(-1, wq, dwork, delay); | ||
1124 | } | 1458 | } |
1125 | EXPORT_SYMBOL_GPL(queue_delayed_work); | 1459 | EXPORT_SYMBOL_GPL(queue_delayed_work); |
1126 | 1460 | ||
1127 | /** | 1461 | /** |
1128 | * queue_delayed_work_on - queue work on specific CPU after delay | 1462 | * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU |
1129 | * @cpu: CPU number to execute work on | 1463 | * @cpu: CPU number to execute work on |
1130 | * @wq: workqueue to use | 1464 | * @wq: workqueue to use |
1131 | * @dwork: work to queue | 1465 | * @dwork: work to queue |
1132 | * @delay: number of jiffies to wait before queueing | 1466 | * @delay: number of jiffies to wait before queueing |
1133 | * | 1467 | * |
1134 | * Returns 0 if @work was already on a queue, non-zero otherwise. | 1468 | * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, |
1469 | * modify @dwork's timer so that it expires after @delay. If @delay is | ||
1470 | * zero, @work is guaranteed to be scheduled immediately regardless of its | ||
1471 | * current state. | ||
1472 | * | ||
1473 | * Returns %false if @dwork was idle and queued, %true if @dwork was | ||
1474 | * pending and its timer was modified. | ||
1475 | * | ||
1476 | * This function is safe to call from any context including IRQ handler. | ||
1477 | * See try_to_grab_pending() for details. | ||
1135 | */ | 1478 | */ |
1136 | int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | 1479 | bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, |
1137 | struct delayed_work *dwork, unsigned long delay) | 1480 | struct delayed_work *dwork, unsigned long delay) |
1138 | { | 1481 | { |
1139 | int ret = 0; | 1482 | unsigned long flags; |
1140 | struct timer_list *timer = &dwork->timer; | 1483 | int ret; |
1141 | struct work_struct *work = &dwork->work; | ||
1142 | |||
1143 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { | ||
1144 | unsigned int lcpu; | ||
1145 | |||
1146 | BUG_ON(timer_pending(timer)); | ||
1147 | BUG_ON(!list_empty(&work->entry)); | ||
1148 | |||
1149 | timer_stats_timer_set_start_info(&dwork->timer); | ||
1150 | |||
1151 | /* | ||
1152 | * This stores cwq for the moment, for the timer_fn. | ||
1153 | * Note that the work's gcwq is preserved to allow | ||
1154 | * reentrance detection for delayed works. | ||
1155 | */ | ||
1156 | if (!(wq->flags & WQ_UNBOUND)) { | ||
1157 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
1158 | |||
1159 | if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) | ||
1160 | lcpu = gcwq->cpu; | ||
1161 | else | ||
1162 | lcpu = raw_smp_processor_id(); | ||
1163 | } else | ||
1164 | lcpu = WORK_CPU_UNBOUND; | ||
1165 | |||
1166 | set_work_cwq(work, get_cwq(lcpu, wq), 0); | ||
1167 | 1484 | ||
1168 | timer->expires = jiffies + delay; | 1485 | do { |
1169 | timer->data = (unsigned long)dwork; | 1486 | ret = try_to_grab_pending(&dwork->work, true, &flags); |
1170 | timer->function = delayed_work_timer_fn; | 1487 | } while (unlikely(ret == -EAGAIN)); |
1171 | 1488 | ||
1172 | if (unlikely(cpu >= 0)) | 1489 | if (likely(ret >= 0)) { |
1173 | add_timer_on(timer, cpu); | 1490 | __queue_delayed_work(cpu, wq, dwork, delay); |
1174 | else | 1491 | local_irq_restore(flags); |
1175 | add_timer(timer); | ||
1176 | ret = 1; | ||
1177 | } | 1492 | } |
1493 | |||
1494 | /* -ENOENT from try_to_grab_pending() becomes %true */ | ||
1178 | return ret; | 1495 | return ret; |
1179 | } | 1496 | } |
1180 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); | 1497 | EXPORT_SYMBOL_GPL(mod_delayed_work_on); |
1498 | |||
1499 | /** | ||
1500 | * mod_delayed_work - modify delay of or queue a delayed work | ||
1501 | * @wq: workqueue to use | ||
1502 | * @dwork: work to queue | ||
1503 | * @delay: number of jiffies to wait before queueing | ||
1504 | * | ||
1505 | * mod_delayed_work_on() on local CPU. | ||
1506 | */ | ||
1507 | bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, | ||
1508 | unsigned long delay) | ||
1509 | { | ||
1510 | return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); | ||
1511 | } | ||
1512 | EXPORT_SYMBOL_GPL(mod_delayed_work); | ||
1181 | 1513 | ||
1182 | /** | 1514 | /** |
1183 | * worker_enter_idle - enter idle state | 1515 | * worker_enter_idle - enter idle state |
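The queue_work()/queue_delayed_work() reshuffle above also adds mod_delayed_work() and mod_delayed_work_on(), which either queue an idle delayed work or re-arm a pending one. A hypothetical debounce user of that API (the demo_* names are invented for illustration) might look like this:

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void demo_debounce_fn(struct work_struct *work)
{
	pr_info("no new events for 100ms, processing the batch\n");
}

static DECLARE_DELAYED_WORK(demo_debounce, demo_debounce_fn);

/* may be called from any context, including hard IRQ handlers */
static void demo_event(void)
{
	/*
	 * If demo_debounce is idle it gets queued; if it is already
	 * pending, only its timer is pushed back, so demo_debounce_fn()
	 * runs once per burst of events rather than once per event.
	 */
	mod_delayed_work(system_wq, &demo_debounce, msecs_to_jiffies(100));
}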
@@ -1305,37 +1637,21 @@ __acquires(&gcwq->lock) | |||
1305 | } | 1637 | } |
1306 | } | 1638 | } |
1307 | 1639 | ||
1308 | struct idle_rebind { | ||
1309 | int cnt; /* # workers to be rebound */ | ||
1310 | struct completion done; /* all workers rebound */ | ||
1311 | }; | ||
1312 | |||
1313 | /* | 1640 | /* |
1314 | * Rebind an idle @worker to its CPU. During CPU onlining, this has to | 1641 | * Rebind an idle @worker to its CPU. worker_thread() will test |
1315 | * happen synchronously for idle workers. worker_thread() will test | 1642 | * list_empty(@worker->entry) before leaving idle and call this function. |
1316 | * %WORKER_REBIND before leaving idle and call this function. | ||
1317 | */ | 1643 | */ |
1318 | static void idle_worker_rebind(struct worker *worker) | 1644 | static void idle_worker_rebind(struct worker *worker) |
1319 | { | 1645 | { |
1320 | struct global_cwq *gcwq = worker->pool->gcwq; | 1646 | struct global_cwq *gcwq = worker->pool->gcwq; |
1321 | 1647 | ||
1322 | /* CPU must be online at this point */ | 1648 | /* CPU may go down again in between, clear UNBOUND only on success */ |
1323 | WARN_ON(!worker_maybe_bind_and_lock(worker)); | 1649 | if (worker_maybe_bind_and_lock(worker)) |
1324 | if (!--worker->idle_rebind->cnt) | 1650 | worker_clr_flags(worker, WORKER_UNBOUND); |
1325 | complete(&worker->idle_rebind->done); | ||
1326 | spin_unlock_irq(&worker->pool->gcwq->lock); | ||
1327 | 1651 | ||
1328 | /* we did our part, wait for rebind_workers() to finish up */ | 1652 | /* rebind complete, become available again */ |
1329 | wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); | 1653 | list_add(&worker->entry, &worker->pool->idle_list); |
1330 | 1654 | spin_unlock_irq(&gcwq->lock); | |
1331 | /* | ||
1332 | * rebind_workers() shouldn't finish until all workers passed the | ||
1333 | * above WORKER_REBIND wait. Tell it when done. | ||
1334 | */ | ||
1335 | spin_lock_irq(&worker->pool->gcwq->lock); | ||
1336 | if (!--worker->idle_rebind->cnt) | ||
1337 | complete(&worker->idle_rebind->done); | ||
1338 | spin_unlock_irq(&worker->pool->gcwq->lock); | ||
1339 | } | 1655 | } |
1340 | 1656 | ||
1341 | /* | 1657 | /* |
@@ -1349,16 +1665,8 @@ static void busy_worker_rebind_fn(struct work_struct *work) | |||
1349 | struct worker *worker = container_of(work, struct worker, rebind_work); | 1665 | struct worker *worker = container_of(work, struct worker, rebind_work); |
1350 | struct global_cwq *gcwq = worker->pool->gcwq; | 1666 | struct global_cwq *gcwq = worker->pool->gcwq; |
1351 | 1667 | ||
1352 | worker_maybe_bind_and_lock(worker); | 1668 | if (worker_maybe_bind_and_lock(worker)) |
1353 | 1669 | worker_clr_flags(worker, WORKER_UNBOUND); | |
1354 | /* | ||
1355 | * %WORKER_REBIND must be cleared even if the above binding failed; | ||
1356 | * otherwise, we may confuse the next CPU_UP cycle or oops / get | ||
1357 | * stuck by calling idle_worker_rebind() prematurely. If CPU went | ||
1358 | * down again in between, %WORKER_UNBOUND would be set, so clearing | ||
1359 | * %WORKER_REBIND is always safe. | ||
1360 | */ | ||
1361 | worker_clr_flags(worker, WORKER_REBIND); | ||
1362 | 1670 | ||
1363 | spin_unlock_irq(&gcwq->lock); | 1671 | spin_unlock_irq(&gcwq->lock); |
1364 | } | 1672 | } |
@@ -1370,123 +1678,74 @@ static void busy_worker_rebind_fn(struct work_struct *work) | |||
1370 | * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding | 1678 | * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding |
1371 | * is different for idle and busy ones. | 1679 | * is different for idle and busy ones. |
1372 | * | 1680 | * |
1373 | * The idle ones should be rebound synchronously and idle rebinding should | 1681 | * Idle ones will be removed from the idle_list and woken up. They will |
1374 | * be complete before any worker starts executing work items with | 1682 | * add themselves back after completing rebind. This ensures that the |
1375 | * concurrency management enabled; otherwise, scheduler may oops trying to | 1683 | * idle_list doesn't contain any unbound workers when re-bound busy workers |
1376 | * wake up non-local idle worker from wq_worker_sleeping(). | 1684 | * try to perform local wake-ups for concurrency management. |
1377 | * | 1685 | * |
1378 | * This is achieved by repeatedly requesting rebinding until all idle | 1686 | * Busy workers can rebind after they finish their current work items. |
1379 | * workers are known to have been rebound under @gcwq->lock and holding all | 1687 | * Queueing the rebind work item at the head of the scheduled list is |
1380 | * idle workers from becoming busy until idle rebinding is complete. | 1688 | * enough. Note that nr_running will be properly bumped as busy workers |
1689 | * rebind. | ||
1381 | * | 1690 | * |
1382 | * Once idle workers are rebound, busy workers can be rebound as they | 1691 | * On return, all non-manager workers are scheduled for rebind - see |
1383 | * finish executing their current work items. Queueing the rebind work at | 1692 | * manage_workers() for the manager special case. Any idle worker |
1384 | * the head of their scheduled lists is enough. Note that nr_running will | 1693 | * including the manager will not appear on @idle_list until rebind is |
1385 | * be properly bumped as busy workers rebind. | 1694 | * complete, making local wake-ups safe. |
1386 | * | ||
1387 | * On return, all workers are guaranteed to either be bound or have rebind | ||
1388 | * work item scheduled. | ||
1389 | */ | 1695 | */ |
1390 | static void rebind_workers(struct global_cwq *gcwq) | 1696 | static void rebind_workers(struct global_cwq *gcwq) |
1391 | __releases(&gcwq->lock) __acquires(&gcwq->lock) | ||
1392 | { | 1697 | { |
1393 | struct idle_rebind idle_rebind; | ||
1394 | struct worker_pool *pool; | 1698 | struct worker_pool *pool; |
1395 | struct worker *worker; | 1699 | struct worker *worker, *n; |
1396 | struct hlist_node *pos; | 1700 | struct hlist_node *pos; |
1397 | int i; | 1701 | int i; |
1398 | 1702 | ||
1399 | lockdep_assert_held(&gcwq->lock); | 1703 | lockdep_assert_held(&gcwq->lock); |
1400 | 1704 | ||
1401 | for_each_worker_pool(pool, gcwq) | 1705 | for_each_worker_pool(pool, gcwq) |
1402 | lockdep_assert_held(&pool->manager_mutex); | 1706 | lockdep_assert_held(&pool->assoc_mutex); |
1403 | 1707 | ||
1404 | /* | 1708 | /* dequeue and kick idle ones */ |
1405 | * Rebind idle workers. Interlocked both ways. We wait for | ||
1406 | * workers to rebind via @idle_rebind.done. Workers will wait for | ||
1407 | * us to finish up by watching %WORKER_REBIND. | ||
1408 | */ | ||
1409 | init_completion(&idle_rebind.done); | ||
1410 | retry: | ||
1411 | idle_rebind.cnt = 1; | ||
1412 | INIT_COMPLETION(idle_rebind.done); | ||
1413 | |||
1414 | /* set REBIND and kick idle ones, we'll wait for these later */ | ||
1415 | for_each_worker_pool(pool, gcwq) { | 1709 | for_each_worker_pool(pool, gcwq) { |
1416 | list_for_each_entry(worker, &pool->idle_list, entry) { | 1710 | list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { |
1417 | unsigned long worker_flags = worker->flags; | 1711 | /* |
1418 | 1712 | * idle workers should be off @pool->idle_list | |
1419 | if (worker->flags & WORKER_REBIND) | 1713 | * until rebind is complete to avoid receiving |
1420 | continue; | 1714 | * premature local wake-ups. |
1421 | 1715 | */ | |
1422 | /* morph UNBOUND to REBIND atomically */ | 1716 | list_del_init(&worker->entry); |
1423 | worker_flags &= ~WORKER_UNBOUND; | ||
1424 | worker_flags |= WORKER_REBIND; | ||
1425 | ACCESS_ONCE(worker->flags) = worker_flags; | ||
1426 | |||
1427 | idle_rebind.cnt++; | ||
1428 | worker->idle_rebind = &idle_rebind; | ||
1429 | 1717 | ||
1430 | /* worker_thread() will call idle_worker_rebind() */ | 1718 | /* |
1719 | * worker_thread() will see the above dequeuing | ||
1720 | * and call idle_worker_rebind(). | ||
1721 | */ | ||
1431 | wake_up_process(worker->task); | 1722 | wake_up_process(worker->task); |
1432 | } | 1723 | } |
1433 | } | 1724 | } |
1434 | 1725 | ||
1435 | if (--idle_rebind.cnt) { | 1726 | /* rebind busy workers */ |
1436 | spin_unlock_irq(&gcwq->lock); | ||
1437 | wait_for_completion(&idle_rebind.done); | ||
1438 | spin_lock_irq(&gcwq->lock); | ||
1439 | /* busy ones might have become idle while waiting, retry */ | ||
1440 | goto retry; | ||
1441 | } | ||
1442 | |||
1443 | /* all idle workers are rebound, rebind busy workers */ | ||
1444 | for_each_busy_worker(worker, i, pos, gcwq) { | 1727 | for_each_busy_worker(worker, i, pos, gcwq) { |
1445 | struct work_struct *rebind_work = &worker->rebind_work; | 1728 | struct work_struct *rebind_work = &worker->rebind_work; |
1446 | unsigned long worker_flags = worker->flags; | 1729 | struct workqueue_struct *wq; |
1447 | |||
1448 | /* morph UNBOUND to REBIND atomically */ | ||
1449 | worker_flags &= ~WORKER_UNBOUND; | ||
1450 | worker_flags |= WORKER_REBIND; | ||
1451 | ACCESS_ONCE(worker->flags) = worker_flags; | ||
1452 | 1730 | ||
1453 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | 1731 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, |
1454 | work_data_bits(rebind_work))) | 1732 | work_data_bits(rebind_work))) |
1455 | continue; | 1733 | continue; |
1456 | 1734 | ||
1457 | /* wq doesn't matter, use the default one */ | ||
1458 | debug_work_activate(rebind_work); | 1735 | debug_work_activate(rebind_work); |
1459 | insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, | ||
1460 | worker->scheduled.next, | ||
1461 | work_color_to_flags(WORK_NO_COLOR)); | ||
1462 | } | ||
1463 | |||
1464 | /* | ||
1465 | * All idle workers are rebound and waiting for %WORKER_REBIND to | ||
1466 | * be cleared inside idle_worker_rebind(). Clear and release. | ||
1467 | * Clearing %WORKER_REBIND from this foreign context is safe | ||
1468 | * because these workers are still guaranteed to be idle. | ||
1469 | * | ||
1470 | * We need to make sure all idle workers passed WORKER_REBIND wait | ||
1471 | * in idle_worker_rebind() before returning; otherwise, workers can | ||
1472 | * get stuck at the wait if hotplug cycle repeats. | ||
1473 | */ | ||
1474 | idle_rebind.cnt = 1; | ||
1475 | INIT_COMPLETION(idle_rebind.done); | ||
1476 | 1736 | ||
1477 | for_each_worker_pool(pool, gcwq) { | 1737 | /* |
1478 | list_for_each_entry(worker, &pool->idle_list, entry) { | 1738 | * wq doesn't really matter but let's keep @worker->pool |
1479 | worker->flags &= ~WORKER_REBIND; | 1739 | * and @cwq->pool consistent for sanity. |
1480 | idle_rebind.cnt++; | 1740 | */ |
1481 | } | 1741 | if (worker_pool_pri(worker->pool)) |
1482 | } | 1742 | wq = system_highpri_wq; |
1483 | 1743 | else | |
1484 | wake_up_all(&gcwq->rebind_hold); | 1744 | wq = system_wq; |
1485 | 1745 | ||
1486 | if (--idle_rebind.cnt) { | 1746 | insert_work(get_cwq(gcwq->cpu, wq), rebind_work, |
1487 | spin_unlock_irq(&gcwq->lock); | 1747 | worker->scheduled.next, |
1488 | wait_for_completion(&idle_rebind.done); | 1748 | work_color_to_flags(WORK_NO_COLOR)); |
1489 | spin_lock_irq(&gcwq->lock); | ||
1490 | } | 1749 | } |
1491 | } | 1750 | } |
1492 | 1751 | ||
@@ -1844,22 +2103,22 @@ static bool manage_workers(struct worker *worker) | |||
1844 | * grab %POOL_MANAGING_WORKERS to achieve this because that can | 2103 | * grab %POOL_MANAGING_WORKERS to achieve this because that can |
1845 | * lead to idle worker depletion (all become busy thinking someone | 2104 | * lead to idle worker depletion (all become busy thinking someone |
1846 | * else is managing) which in turn can result in deadlock under | 2105 | * else is managing) which in turn can result in deadlock under |
1847 | * extreme circumstances. Use @pool->manager_mutex to synchronize | 2106 | * extreme circumstances. Use @pool->assoc_mutex to synchronize |
1848 | * manager against CPU hotplug. | 2107 | * manager against CPU hotplug. |
1849 | * | 2108 | * |
1850 | * manager_mutex would always be free unless CPU hotplug is in | 2109 | * assoc_mutex would always be free unless CPU hotplug is in |
1851 | * progress. trylock first without dropping @gcwq->lock. | 2110 | * progress. trylock first without dropping @gcwq->lock. |
1852 | */ | 2111 | */ |
1853 | if (unlikely(!mutex_trylock(&pool->manager_mutex))) { | 2112 | if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { |
1854 | spin_unlock_irq(&pool->gcwq->lock); | 2113 | spin_unlock_irq(&pool->gcwq->lock); |
1855 | mutex_lock(&pool->manager_mutex); | 2114 | mutex_lock(&pool->assoc_mutex); |
1856 | /* | 2115 | /* |
1857 | * CPU hotplug could have happened while we were waiting | 2116 | * CPU hotplug could have happened while we were waiting |
1858 | * for manager_mutex. Hotplug itself can't handle us | 2117 | * for assoc_mutex. Hotplug itself can't handle us |
1859 | * because manager isn't either on idle or busy list, and | 2118 | * because manager isn't either on idle or busy list, and |
1860 | * @gcwq's state and ours could have deviated. | 2119 | * @gcwq's state and ours could have deviated. |
1861 | * | 2120 | * |
1862 | * As hotplug is now excluded via manager_mutex, we can | 2121 | * As hotplug is now excluded via assoc_mutex, we can |
1863 | * simply try to bind. It will succeed or fail depending | 2122 | * simply try to bind. It will succeed or fail depending |
1864 | * on @gcwq's current state. Try it and adjust | 2123 | * on @gcwq's current state. Try it and adjust |
1865 | * %WORKER_UNBOUND accordingly. | 2124 | * %WORKER_UNBOUND accordingly. |
@@ -1882,112 +2141,11 @@ static bool manage_workers(struct worker *worker) | |||
1882 | ret |= maybe_create_worker(pool); | 2141 | ret |= maybe_create_worker(pool); |
1883 | 2142 | ||
1884 | pool->flags &= ~POOL_MANAGING_WORKERS; | 2143 | pool->flags &= ~POOL_MANAGING_WORKERS; |
1885 | mutex_unlock(&pool->manager_mutex); | 2144 | mutex_unlock(&pool->assoc_mutex); |
1886 | return ret; | 2145 | return ret; |
1887 | } | 2146 | } |
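The trylock-then-relock dance described above is a general pattern whenever a sleeping lock has to be taken while a spinlock is already held. A rough standalone sketch of the same ordering, with hypothetical cfg_mutex/state_lock standing in for assoc_mutex and gcwq->lock:

#include <linux/mutex.h>
#include <linux/spinlock.h>

static DEFINE_MUTEX(cfg_mutex);		/* sleeping lock, like assoc_mutex */
static DEFINE_SPINLOCK(state_lock);	/* spinlock, like gcwq->lock */
static int state;

static void update_state(int v)
{
	spin_lock_irq(&state_lock);
	if (!mutex_trylock(&cfg_mutex)) {
		/* can't sleep under the spinlock, so drop it first */
		spin_unlock_irq(&state_lock);
		mutex_lock(&cfg_mutex);
		spin_lock_irq(&state_lock);
		/* state may have changed while unlocked - revalidate here */
	}
	state = v;
	spin_unlock_irq(&state_lock);
	mutex_unlock(&cfg_mutex);
}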
1888 | 2147 | ||
1889 | /** | 2148 | /** |
1890 | * move_linked_works - move linked works to a list | ||
1891 | * @work: start of series of works to be scheduled | ||
1892 | * @head: target list to append @work to | ||
1893 | * @nextp: out parameter for nested worklist walking | ||
1894 | * | ||
1895 | * Schedule linked works starting from @work to @head. Work series to | ||
1896 | * be scheduled starts at @work and includes any consecutive work with | ||
1897 | * WORK_STRUCT_LINKED set in its predecessor. | ||
1898 | * | ||
1899 | * If @nextp is not NULL, it's updated to point to the next work of | ||
1900 | * the last scheduled work. This allows move_linked_works() to be | ||
1901 | * nested inside outer list_for_each_entry_safe(). | ||
1902 | * | ||
1903 | * CONTEXT: | ||
1904 | * spin_lock_irq(gcwq->lock). | ||
1905 | */ | ||
1906 | static void move_linked_works(struct work_struct *work, struct list_head *head, | ||
1907 | struct work_struct **nextp) | ||
1908 | { | ||
1909 | struct work_struct *n; | ||
1910 | |||
1911 | /* | ||
1912 | * Linked worklist will always end before the end of the list, | ||
1913 | * use NULL for list head. | ||
1914 | */ | ||
1915 | list_for_each_entry_safe_from(work, n, NULL, entry) { | ||
1916 | list_move_tail(&work->entry, head); | ||
1917 | if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) | ||
1918 | break; | ||
1919 | } | ||
1920 | |||
1921 | /* | ||
1922 | * If we're already inside safe list traversal and have moved | ||
1923 | * multiple works to the scheduled queue, the next position | ||
1924 | * needs to be updated. | ||
1925 | */ | ||
1926 | if (nextp) | ||
1927 | *nextp = n; | ||
1928 | } | ||
1929 | |||
1930 | static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | ||
1931 | { | ||
1932 | struct work_struct *work = list_first_entry(&cwq->delayed_works, | ||
1933 | struct work_struct, entry); | ||
1934 | |||
1935 | trace_workqueue_activate_work(work); | ||
1936 | move_linked_works(work, &cwq->pool->worklist, NULL); | ||
1937 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); | ||
1938 | cwq->nr_active++; | ||
1939 | } | ||
1940 | |||
1941 | /** | ||
1942 | * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight | ||
1943 | * @cwq: cwq of interest | ||
1944 | * @color: color of work which left the queue | ||
1945 | * @delayed: for a delayed work | ||
1946 | * | ||
1947 | * A work either has completed or is removed from pending queue, | ||
1948 | * decrement nr_in_flight of its cwq and handle workqueue flushing. | ||
1949 | * | ||
1950 | * CONTEXT: | ||
1951 | * spin_lock_irq(gcwq->lock). | ||
1952 | */ | ||
1953 | static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, | ||
1954 | bool delayed) | ||
1955 | { | ||
1956 | /* ignore uncolored works */ | ||
1957 | if (color == WORK_NO_COLOR) | ||
1958 | return; | ||
1959 | |||
1960 | cwq->nr_in_flight[color]--; | ||
1961 | |||
1962 | if (!delayed) { | ||
1963 | cwq->nr_active--; | ||
1964 | if (!list_empty(&cwq->delayed_works)) { | ||
1965 | /* one down, submit a delayed one */ | ||
1966 | if (cwq->nr_active < cwq->max_active) | ||
1967 | cwq_activate_first_delayed(cwq); | ||
1968 | } | ||
1969 | } | ||
1970 | |||
1971 | /* is flush in progress and are we at the flushing tip? */ | ||
1972 | if (likely(cwq->flush_color != color)) | ||
1973 | return; | ||
1974 | |||
1975 | /* are there still in-flight works? */ | ||
1976 | if (cwq->nr_in_flight[color]) | ||
1977 | return; | ||
1978 | |||
1979 | /* this cwq is done, clear flush_color */ | ||
1980 | cwq->flush_color = -1; | ||
1981 | |||
1982 | /* | ||
1983 | * If this was the last cwq, wake up the first flusher. It | ||
1984 | * will handle the rest. | ||
1985 | */ | ||
1986 | if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) | ||
1987 | complete(&cwq->wq->first_flusher->done); | ||
1988 | } | ||
1989 | |||
1990 | /** | ||
1991 | * process_one_work - process single work | 2149 | * process_one_work - process single work |
1992 | * @worker: self | 2150 | * @worker: self |
1993 | * @work: work to process | 2151 | * @work: work to process |
@@ -2030,7 +2188,7 @@ __acquires(&gcwq->lock) | |||
2030 | * necessary to avoid spurious warnings from rescuers servicing the | 2188 | * necessary to avoid spurious warnings from rescuers servicing the |
2031 | * unbound or a disassociated gcwq. | 2189 | * unbound or a disassociated gcwq. |
2032 | */ | 2190 | */ |
2033 | WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && | 2191 | WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && |
2034 | !(gcwq->flags & GCWQ_DISASSOCIATED) && | 2192 | !(gcwq->flags & GCWQ_DISASSOCIATED) && |
2035 | raw_smp_processor_id() != gcwq->cpu); | 2193 | raw_smp_processor_id() != gcwq->cpu); |
2036 | 2194 | ||
@@ -2046,15 +2204,13 @@ __acquires(&gcwq->lock) | |||
2046 | return; | 2204 | return; |
2047 | } | 2205 | } |
2048 | 2206 | ||
2049 | /* claim and process */ | 2207 | /* claim and dequeue */ |
2050 | debug_work_deactivate(work); | 2208 | debug_work_deactivate(work); |
2051 | hlist_add_head(&worker->hentry, bwh); | 2209 | hlist_add_head(&worker->hentry, bwh); |
2052 | worker->current_work = work; | 2210 | worker->current_work = work; |
2053 | worker->current_cwq = cwq; | 2211 | worker->current_cwq = cwq; |
2054 | work_color = get_work_color(work); | 2212 | work_color = get_work_color(work); |
2055 | 2213 | ||
2056 | /* record the current cpu number in the work data and dequeue */ | ||
2057 | set_work_cpu(work, gcwq->cpu); | ||
2058 | list_del_init(&work->entry); | 2214 | list_del_init(&work->entry); |
2059 | 2215 | ||
2060 | /* | 2216 | /* |
@@ -2071,9 +2227,16 @@ __acquires(&gcwq->lock) | |||
2071 | if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) | 2227 | if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) |
2072 | wake_up_worker(pool); | 2228 | wake_up_worker(pool); |
2073 | 2229 | ||
2230 | /* | ||
2231 | * Record the last CPU and clear PENDING which should be the last | ||
2232 | * update to @work. Also, do this inside @gcwq->lock so that | ||
2233 | * PENDING and queued state changes happen together while IRQ is | ||
2234 | * disabled. | ||
2235 | */ | ||
2236 | set_work_cpu_and_clear_pending(work, gcwq->cpu); | ||
2237 | |||
2074 | spin_unlock_irq(&gcwq->lock); | 2238 | spin_unlock_irq(&gcwq->lock); |
2075 | 2239 | ||
2076 | work_clear_pending(work); | ||
2077 | lock_map_acquire_read(&cwq->wq->lockdep_map); | 2240 | lock_map_acquire_read(&cwq->wq->lockdep_map); |
2078 | lock_map_acquire(&lockdep_map); | 2241 | lock_map_acquire(&lockdep_map); |
2079 | trace_workqueue_execute_start(work); | 2242 | trace_workqueue_execute_start(work); |
@@ -2087,11 +2250,9 @@ __acquires(&gcwq->lock) | |||
2087 | lock_map_release(&cwq->wq->lockdep_map); | 2250 | lock_map_release(&cwq->wq->lockdep_map); |
2088 | 2251 | ||
2089 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | 2252 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { |
2090 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " | 2253 | pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" |
2091 | "%s/0x%08x/%d\n", | 2254 | " last function: %pf\n", |
2092 | current->comm, preempt_count(), task_pid_nr(current)); | 2255 | current->comm, preempt_count(), task_pid_nr(current), f); |
2093 | printk(KERN_ERR " last function: "); | ||
2094 | print_symbol("%s\n", (unsigned long)f); | ||
2095 | debug_show_held_locks(current); | 2256 | debug_show_held_locks(current); |
2096 | dump_stack(); | 2257 | dump_stack(); |
2097 | } | 2258 | } |
@@ -2106,7 +2267,7 @@ __acquires(&gcwq->lock) | |||
2106 | hlist_del_init(&worker->hentry); | 2267 | hlist_del_init(&worker->hentry); |
2107 | worker->current_work = NULL; | 2268 | worker->current_work = NULL; |
2108 | worker->current_cwq = NULL; | 2269 | worker->current_cwq = NULL; |
2109 | cwq_dec_nr_in_flight(cwq, work_color, false); | 2270 | cwq_dec_nr_in_flight(cwq, work_color); |
2110 | } | 2271 | } |
2111 | 2272 | ||
2112 | /** | 2273 | /** |
@@ -2151,18 +2312,17 @@ static int worker_thread(void *__worker) | |||
2151 | woke_up: | 2312 | woke_up: |
2152 | spin_lock_irq(&gcwq->lock); | 2313 | spin_lock_irq(&gcwq->lock); |
2153 | 2314 | ||
2154 | /* | 2315 | /* we are off idle list if destruction or rebind is requested */ |
2155 | * DIE can be set only while idle and REBIND set while busy has | 2316 | if (unlikely(list_empty(&worker->entry))) { |
2156 | * @worker->rebind_work scheduled. Checking here is enough. | ||
2157 | */ | ||
2158 | if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { | ||
2159 | spin_unlock_irq(&gcwq->lock); | 2317 | spin_unlock_irq(&gcwq->lock); |
2160 | 2318 | ||
2319 | /* if DIE is set, destruction is requested */ | ||
2161 | if (worker->flags & WORKER_DIE) { | 2320 | if (worker->flags & WORKER_DIE) { |
2162 | worker->task->flags &= ~PF_WQ_WORKER; | 2321 | worker->task->flags &= ~PF_WQ_WORKER; |
2163 | return 0; | 2322 | return 0; |
2164 | } | 2323 | } |
2165 | 2324 | ||
2325 | /* otherwise, rebind */ | ||
2166 | idle_worker_rebind(worker); | 2326 | idle_worker_rebind(worker); |
2167 | goto woke_up; | 2327 | goto woke_up; |
2168 | } | 2328 | } |
@@ -2257,8 +2417,10 @@ static int rescuer_thread(void *__wq) | |||
2257 | repeat: | 2417 | repeat: |
2258 | set_current_state(TASK_INTERRUPTIBLE); | 2418 | set_current_state(TASK_INTERRUPTIBLE); |
2259 | 2419 | ||
2260 | if (kthread_should_stop()) | 2420 | if (kthread_should_stop()) { |
2421 | __set_current_state(TASK_RUNNING); | ||
2261 | return 0; | 2422 | return 0; |
2423 | } | ||
2262 | 2424 | ||
2263 | /* | 2425 | /* |
2264 | * See whether any cpu is asking for help. Unbounded | 2426 | * See whether any cpu is asking for help. Unbounded |
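The added __set_current_state(TASK_RUNNING) matters because the stop check happens after the task has already marked itself TASK_INTERRUPTIBLE; returning while still nominally asleep would leave the exiting task in an inconsistent state. A generic kthread loop showing the same pattern (names are hypothetical):

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_thread_fn(void *data)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop()) {
			/* undo the sleep state before exiting */
			__set_current_state(TASK_RUNNING);
			break;
		}
		/* a real loop would recheck its wakeup condition here */
		schedule();
		/* ... handle one round of work ... */
	}
	return 0;
}

Such a thread would typically be started with kthread_run() and torn down with kthread_stop(), which is what makes kthread_should_stop() return true.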
@@ -2645,8 +2807,8 @@ reflush: | |||
2645 | 2807 | ||
2646 | if (++flush_cnt == 10 || | 2808 | if (++flush_cnt == 10 || |
2647 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) | 2809 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) |
2648 | pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", | 2810 | pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", |
2649 | wq->name, flush_cnt); | 2811 | wq->name, flush_cnt); |
2650 | goto reflush; | 2812 | goto reflush; |
2651 | } | 2813 | } |
2652 | 2814 | ||
@@ -2657,8 +2819,7 @@ reflush: | |||
2657 | } | 2819 | } |
2658 | EXPORT_SYMBOL_GPL(drain_workqueue); | 2820 | EXPORT_SYMBOL_GPL(drain_workqueue); |
2659 | 2821 | ||
2660 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, | 2822 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) |
2661 | bool wait_executing) | ||
2662 | { | 2823 | { |
2663 | struct worker *worker = NULL; | 2824 | struct worker *worker = NULL; |
2664 | struct global_cwq *gcwq; | 2825 | struct global_cwq *gcwq; |
@@ -2680,13 +2841,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, | |||
2680 | cwq = get_work_cwq(work); | 2841 | cwq = get_work_cwq(work); |
2681 | if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) | 2842 | if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) |
2682 | goto already_gone; | 2843 | goto already_gone; |
2683 | } else if (wait_executing) { | 2844 | } else { |
2684 | worker = find_worker_executing_work(gcwq, work); | 2845 | worker = find_worker_executing_work(gcwq, work); |
2685 | if (!worker) | 2846 | if (!worker) |
2686 | goto already_gone; | 2847 | goto already_gone; |
2687 | cwq = worker->current_cwq; | 2848 | cwq = worker->current_cwq; |
2688 | } else | 2849 | } |
2689 | goto already_gone; | ||
2690 | 2850 | ||
2691 | insert_wq_barrier(cwq, barr, work, worker); | 2851 | insert_wq_barrier(cwq, barr, work, worker); |
2692 | spin_unlock_irq(&gcwq->lock); | 2852 | spin_unlock_irq(&gcwq->lock); |
@@ -2713,15 +2873,8 @@ already_gone: | |||
2713 | * flush_work - wait for a work to finish executing the last queueing instance | 2873 | * flush_work - wait for a work to finish executing the last queueing instance |
2714 | * @work: the work to flush | 2874 | * @work: the work to flush |
2715 | * | 2875 | * |
2716 | * Wait until @work has finished execution. This function considers | 2876 | * Wait until @work has finished execution. @work is guaranteed to be idle |
2717 | * only the last queueing instance of @work. If @work has been | 2877 | * on return if it hasn't been requeued since flush started. |
2718 | * enqueued across different CPUs on a non-reentrant workqueue or on | ||
2719 | * multiple workqueues, @work might still be executing on return on | ||
2720 | * some of the CPUs from earlier queueing. | ||
2721 | * | ||
2722 | * If @work was queued only on a non-reentrant, ordered or unbound | ||
2723 | * workqueue, @work is guaranteed to be idle on return if it hasn't | ||
2724 | * been requeued since flush started. | ||
2725 | * | 2878 | * |
2726 | * RETURNS: | 2879 | * RETURNS: |
2727 | * %true if flush_work() waited for the work to finish execution, | 2880 | * %true if flush_work() waited for the work to finish execution, |
@@ -2734,140 +2887,36 @@ bool flush_work(struct work_struct *work) | |||
2734 | lock_map_acquire(&work->lockdep_map); | 2887 | lock_map_acquire(&work->lockdep_map); |
2735 | lock_map_release(&work->lockdep_map); | 2888 | lock_map_release(&work->lockdep_map); |
2736 | 2889 | ||
2737 | if (start_flush_work(work, &barr, true)) { | 2890 | if (start_flush_work(work, &barr)) { |
2738 | wait_for_completion(&barr.done); | 2891 | wait_for_completion(&barr.done); |
2739 | destroy_work_on_stack(&barr.work); | 2892 | destroy_work_on_stack(&barr.work); |
2740 | return true; | 2893 | return true; |
2741 | } else | 2894 | } else { |
2742 | return false; | ||
2743 | } | ||
2744 | EXPORT_SYMBOL_GPL(flush_work); | ||
2745 | |||
2746 | static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) | ||
2747 | { | ||
2748 | struct wq_barrier barr; | ||
2749 | struct worker *worker; | ||
2750 | |||
2751 | spin_lock_irq(&gcwq->lock); | ||
2752 | |||
2753 | worker = find_worker_executing_work(gcwq, work); | ||
2754 | if (unlikely(worker)) | ||
2755 | insert_wq_barrier(worker->current_cwq, &barr, work, worker); | ||
2756 | |||
2757 | spin_unlock_irq(&gcwq->lock); | ||
2758 | |||
2759 | if (unlikely(worker)) { | ||
2760 | wait_for_completion(&barr.done); | ||
2761 | destroy_work_on_stack(&barr.work); | ||
2762 | return true; | ||
2763 | } else | ||
2764 | return false; | 2895 | return false; |
2765 | } | ||
2766 | |||
2767 | static bool wait_on_work(struct work_struct *work) | ||
2768 | { | ||
2769 | bool ret = false; | ||
2770 | int cpu; | ||
2771 | |||
2772 | might_sleep(); | ||
2773 | |||
2774 | lock_map_acquire(&work->lockdep_map); | ||
2775 | lock_map_release(&work->lockdep_map); | ||
2776 | |||
2777 | for_each_gcwq_cpu(cpu) | ||
2778 | ret |= wait_on_cpu_work(get_gcwq(cpu), work); | ||
2779 | return ret; | ||
2780 | } | ||
2781 | |||
2782 | /** | ||
2783 | * flush_work_sync - wait until a work has finished execution | ||
2784 | * @work: the work to flush | ||
2785 | * | ||
2786 | * Wait until @work has finished execution. On return, it's | ||
2787 | * guaranteed that all queueing instances of @work which happened | ||
2788 | * before this function is called are finished. In other words, if | ||
2789 | * @work hasn't been requeued since this function was called, @work is | ||
2790 | * guaranteed to be idle on return. | ||
2791 | * | ||
2792 | * RETURNS: | ||
2793 | * %true if flush_work_sync() waited for the work to finish execution, | ||
2794 | * %false if it was already idle. | ||
2795 | */ | ||
2796 | bool flush_work_sync(struct work_struct *work) | ||
2797 | { | ||
2798 | struct wq_barrier barr; | ||
2799 | bool pending, waited; | ||
2800 | |||
2801 | /* we'll wait for executions separately, queue barr only if pending */ | ||
2802 | pending = start_flush_work(work, &barr, false); | ||
2803 | |||
2804 | /* wait for executions to finish */ | ||
2805 | waited = wait_on_work(work); | ||
2806 | |||
2807 | /* wait for the pending one */ | ||
2808 | if (pending) { | ||
2809 | wait_for_completion(&barr.done); | ||
2810 | destroy_work_on_stack(&barr.work); | ||
2811 | } | ||
2812 | |||
2813 | return pending || waited; | ||
2814 | } | ||
2815 | EXPORT_SYMBOL_GPL(flush_work_sync); | ||
2816 | |||
2817 | /* | ||
2818 | * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, | ||
2819 | * so this work can't be re-armed in any way. | ||
2820 | */ | ||
2821 | static int try_to_grab_pending(struct work_struct *work) | ||
2822 | { | ||
2823 | struct global_cwq *gcwq; | ||
2824 | int ret = -1; | ||
2825 | |||
2826 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) | ||
2827 | return 0; | ||
2828 | |||
2829 | /* | ||
2830 | * The queueing is in progress, or it is already queued. Try to | ||
2831 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. | ||
2832 | */ | ||
2833 | gcwq = get_work_gcwq(work); | ||
2834 | if (!gcwq) | ||
2835 | return ret; | ||
2836 | |||
2837 | spin_lock_irq(&gcwq->lock); | ||
2838 | if (!list_empty(&work->entry)) { | ||
2839 | /* | ||
2840 | * This work is queued, but perhaps we locked the wrong gcwq. | ||
2841 | * In that case we must see the new value after rmb(), see | ||
2842 | * insert_work()->wmb(). | ||
2843 | */ | ||
2844 | smp_rmb(); | ||
2845 | if (gcwq == get_work_gcwq(work)) { | ||
2846 | debug_work_deactivate(work); | ||
2847 | list_del_init(&work->entry); | ||
2848 | cwq_dec_nr_in_flight(get_work_cwq(work), | ||
2849 | get_work_color(work), | ||
2850 | *work_data_bits(work) & WORK_STRUCT_DELAYED); | ||
2851 | ret = 1; | ||
2852 | } | ||
2853 | } | 2896 | } |
2854 | spin_unlock_irq(&gcwq->lock); | ||
2855 | |||
2856 | return ret; | ||
2857 | } | 2897 | } |
2898 | EXPORT_SYMBOL_GPL(flush_work); | ||
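With reentrancy now handled globally, flush_work() waits for the last queueing instance wherever it ended up running. A minimal sketch of the common producer/consumer use, assuming hypothetical names:

#include <linux/workqueue.h>

static int scan_result;

static void scan_fn(struct work_struct *work)
{
	scan_result = 42;		/* stand-in for the real scan */
}
static DECLARE_WORK(scan_work, scan_fn);

static int read_scan_result(void)
{
	schedule_work(&scan_work);	/* no-op if already pending */
	flush_work(&scan_work);		/* scan_fn() has finished on return */
	return scan_result;
}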
2858 | 2899 | ||
2859 | static bool __cancel_work_timer(struct work_struct *work, | 2900 | static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) |
2860 | struct timer_list* timer) | ||
2861 | { | 2901 | { |
2902 | unsigned long flags; | ||
2862 | int ret; | 2903 | int ret; |
2863 | 2904 | ||
2864 | do { | 2905 | do { |
2865 | ret = (timer && likely(del_timer(timer))); | 2906 | ret = try_to_grab_pending(work, is_dwork, &flags); |
2866 | if (!ret) | 2907 | /* |
2867 | ret = try_to_grab_pending(work); | 2908 | * If someone else is canceling, wait for the same event it |
2868 | wait_on_work(work); | 2909 | * would be waiting for before retrying. |
2910 | */ | ||
2911 | if (unlikely(ret == -ENOENT)) | ||
2912 | flush_work(work); | ||
2869 | } while (unlikely(ret < 0)); | 2913 | } while (unlikely(ret < 0)); |
2870 | 2914 | ||
2915 | /* tell other tasks trying to grab @work to back off */ | ||
2916 | mark_work_canceling(work); | ||
2917 | local_irq_restore(flags); | ||
2918 | |||
2919 | flush_work(work); | ||
2871 | clear_work_data(work); | 2920 | clear_work_data(work); |
2872 | return ret; | 2921 | return ret; |
2873 | } | 2922 | } |
@@ -2892,7 +2941,7 @@ static bool __cancel_work_timer(struct work_struct *work, | |||
2892 | */ | 2941 | */ |
2893 | bool cancel_work_sync(struct work_struct *work) | 2942 | bool cancel_work_sync(struct work_struct *work) |
2894 | { | 2943 | { |
2895 | return __cancel_work_timer(work, NULL); | 2944 | return __cancel_work_timer(work, false); |
2896 | } | 2945 | } |
2897 | EXPORT_SYMBOL_GPL(cancel_work_sync); | 2946 | EXPORT_SYMBOL_GPL(cancel_work_sync); |
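cancel_work_sync() still both removes a pending instance and waits for a running one, which is what teardown paths want. A hedged sketch with made-up driver structures:

#include <linux/workqueue.h>
#include <linux/slab.h>

struct mydev {
	struct work_struct irq_work;
	/* ... */
};

static void mydev_remove(struct mydev *dev)
{
	/*
	 * On return irq_work is neither pending nor running, so freeing
	 * @dev is safe - provided nothing can requeue it anymore.
	 */
	cancel_work_sync(&dev->irq_work);
	kfree(dev);
}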
2898 | 2947 | ||
@@ -2910,33 +2959,44 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); | |||
2910 | */ | 2959 | */ |
2911 | bool flush_delayed_work(struct delayed_work *dwork) | 2960 | bool flush_delayed_work(struct delayed_work *dwork) |
2912 | { | 2961 | { |
2962 | local_irq_disable(); | ||
2913 | if (del_timer_sync(&dwork->timer)) | 2963 | if (del_timer_sync(&dwork->timer)) |
2914 | __queue_work(raw_smp_processor_id(), | 2964 | __queue_work(dwork->cpu, |
2915 | get_work_cwq(&dwork->work)->wq, &dwork->work); | 2965 | get_work_cwq(&dwork->work)->wq, &dwork->work); |
2966 | local_irq_enable(); | ||
2916 | return flush_work(&dwork->work); | 2967 | return flush_work(&dwork->work); |
2917 | } | 2968 | } |
2918 | EXPORT_SYMBOL(flush_delayed_work); | 2969 | EXPORT_SYMBOL(flush_delayed_work); |
2919 | 2970 | ||
2920 | /** | 2971 | /** |
2921 | * flush_delayed_work_sync - wait for a dwork to finish | 2972 | * cancel_delayed_work - cancel a delayed work |
2922 | * @dwork: the delayed work to flush | 2973 | * @dwork: delayed_work to cancel |
2923 | * | 2974 | * |
2924 | * Delayed timer is cancelled and the pending work is queued for | 2975 | * Kill off a pending delayed_work. Returns %true if @dwork was pending |
2925 | * execution immediately. Other than timer handling, its behavior | 2976 | * and canceled; %false if it wasn't pending. Note that the work callback |
2926 | * is identical to flush_work_sync(). | 2977 | * function may still be running on return, unless it returns %true and the |
2978 | * work doesn't re-arm itself. Explicitly flush or use | ||
2979 | * cancel_delayed_work_sync() to wait on it. | ||
2927 | * | 2980 | * |
2928 | * RETURNS: | 2981 | * This function is safe to call from any context including IRQ handler. |
2929 | * %true if flush_work_sync() waited for the work to finish execution, | ||
2930 | * %false if it was already idle. | ||
2931 | */ | 2982 | */ |
2932 | bool flush_delayed_work_sync(struct delayed_work *dwork) | 2983 | bool cancel_delayed_work(struct delayed_work *dwork) |
2933 | { | 2984 | { |
2934 | if (del_timer_sync(&dwork->timer)) | 2985 | unsigned long flags; |
2935 | __queue_work(raw_smp_processor_id(), | 2986 | int ret; |
2936 | get_work_cwq(&dwork->work)->wq, &dwork->work); | 2987 | |
2937 | return flush_work_sync(&dwork->work); | 2988 | do { |
2989 | ret = try_to_grab_pending(&dwork->work, true, &flags); | ||
2990 | } while (unlikely(ret == -EAGAIN)); | ||
2991 | |||
2992 | if (unlikely(ret < 0)) | ||
2993 | return false; | ||
2994 | |||
2995 | set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); | ||
2996 | local_irq_restore(flags); | ||
2997 | return ret; | ||
2938 | } | 2998 | } |
2939 | EXPORT_SYMBOL(flush_delayed_work_sync); | 2999 | EXPORT_SYMBOL(cancel_delayed_work); |
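Per the new kernel-doc, cancel_delayed_work() may now be called from any context, including hard IRQ. A watchdog-style sketch (hypothetical device and handler names) that kicks the timeout from an interrupt handler:

#include <linux/workqueue.h>
#include <linux/interrupt.h>

static void timeout_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(timeout_work, timeout_fn);

static void timeout_fn(struct work_struct *work)
{
	/* fires only if no completion interrupt arrived in time */
}

static irqreturn_t done_irq(int irq, void *dev_id)
{
	/*
	 * Async cancel is fine here; if timeout_fn() is already running
	 * it simply finishes, which a pure watchdog can tolerate.
	 */
	cancel_delayed_work(&timeout_work);
	return IRQ_HANDLED;
}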
2940 | 3000 | ||
2941 | /** | 3001 | /** |
2942 | * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish | 3002 | * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish |
@@ -2949,54 +3009,39 @@ EXPORT_SYMBOL(flush_delayed_work_sync); | |||
2949 | */ | 3009 | */ |
2950 | bool cancel_delayed_work_sync(struct delayed_work *dwork) | 3010 | bool cancel_delayed_work_sync(struct delayed_work *dwork) |
2951 | { | 3011 | { |
2952 | return __cancel_work_timer(&dwork->work, &dwork->timer); | 3012 | return __cancel_work_timer(&dwork->work, true); |
2953 | } | 3013 | } |
2954 | EXPORT_SYMBOL(cancel_delayed_work_sync); | 3014 | EXPORT_SYMBOL(cancel_delayed_work_sync); |
2955 | 3015 | ||
2956 | /** | 3016 | /** |
2957 | * schedule_work - put work task in global workqueue | ||
2958 | * @work: job to be done | ||
2959 | * | ||
2960 | * Returns zero if @work was already on the kernel-global workqueue and | ||
2961 | * non-zero otherwise. | ||
2962 | * | ||
2963 | * This puts a job in the kernel-global workqueue if it was not already | ||
2964 | * queued and leaves it in the same position on the kernel-global | ||
2965 | * workqueue otherwise. | ||
2966 | */ | ||
2967 | int schedule_work(struct work_struct *work) | ||
2968 | { | ||
2969 | return queue_work(system_wq, work); | ||
2970 | } | ||
2971 | EXPORT_SYMBOL(schedule_work); | ||
2972 | |||
2973 | /* | ||
2974 | * schedule_work_on - put work task on a specific cpu | 3017 | * schedule_work_on - put work task on a specific cpu |
2975 | * @cpu: cpu to put the work task on | 3018 | * @cpu: cpu to put the work task on |
2976 | * @work: job to be done | 3019 | * @work: job to be done |
2977 | * | 3020 | * |
2978 | * This puts a job on a specific cpu | 3021 | * This puts a job on a specific cpu |
2979 | */ | 3022 | */ |
2980 | int schedule_work_on(int cpu, struct work_struct *work) | 3023 | bool schedule_work_on(int cpu, struct work_struct *work) |
2981 | { | 3024 | { |
2982 | return queue_work_on(cpu, system_wq, work); | 3025 | return queue_work_on(cpu, system_wq, work); |
2983 | } | 3026 | } |
2984 | EXPORT_SYMBOL(schedule_work_on); | 3027 | EXPORT_SYMBOL(schedule_work_on); |
2985 | 3028 | ||
2986 | /** | 3029 | /** |
2987 | * schedule_delayed_work - put work task in global workqueue after delay | 3030 | * schedule_work - put work task in global workqueue |
2988 | * @dwork: job to be done | 3031 | * @work: job to be done |
2989 | * @delay: number of jiffies to wait or 0 for immediate execution | ||
2990 | * | 3032 | * |
2991 | * After waiting for a given time this puts a job in the kernel-global | 3033 | * Returns %false if @work was already on the kernel-global workqueue and |
2992 | * workqueue. | 3034 | * %true otherwise. |
3035 | * | ||
3036 | * This puts a job in the kernel-global workqueue if it was not already | ||
3037 | * queued and leaves it in the same position on the kernel-global | ||
3038 | * workqueue otherwise. | ||
2993 | */ | 3039 | */ |
2994 | int schedule_delayed_work(struct delayed_work *dwork, | 3040 | bool schedule_work(struct work_struct *work) |
2995 | unsigned long delay) | ||
2996 | { | 3041 | { |
2997 | return queue_delayed_work(system_wq, dwork, delay); | 3042 | return queue_work(system_wq, work); |
2998 | } | 3043 | } |
2999 | EXPORT_SYMBOL(schedule_delayed_work); | 3044 | EXPORT_SYMBOL(schedule_work); |
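Both helpers now return bool: %true when the work was newly queued, %false when it was already pending. A small sketch that uses the return value to count coalesced requests (hypothetical names):

#include <linux/workqueue.h>
#include <linux/atomic.h>

static void refresh_fn(struct work_struct *work);
static DECLARE_WORK(refresh_work, refresh_fn);
static atomic_t coalesced = ATOMIC_INIT(0);

static void refresh_fn(struct work_struct *work)
{
	/* one pass serves every request made since it was queued */
}

static void request_refresh(void)
{
	if (!schedule_work(&refresh_work))
		atomic_inc(&coalesced);	/* merged into the pending run */
}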
3000 | 3045 | ||
3001 | /** | 3046 | /** |
3002 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay | 3047 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay |
@@ -3007,14 +3052,28 @@ EXPORT_SYMBOL(schedule_delayed_work); | |||
3007 | * After waiting for a given time this puts a job in the kernel-global | 3052 | * After waiting for a given time this puts a job in the kernel-global |
3008 | * workqueue on the specified CPU. | 3053 | * workqueue on the specified CPU. |
3009 | */ | 3054 | */ |
3010 | int schedule_delayed_work_on(int cpu, | 3055 | bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, |
3011 | struct delayed_work *dwork, unsigned long delay) | 3056 | unsigned long delay) |
3012 | { | 3057 | { |
3013 | return queue_delayed_work_on(cpu, system_wq, dwork, delay); | 3058 | return queue_delayed_work_on(cpu, system_wq, dwork, delay); |
3014 | } | 3059 | } |
3015 | EXPORT_SYMBOL(schedule_delayed_work_on); | 3060 | EXPORT_SYMBOL(schedule_delayed_work_on); |
3016 | 3061 | ||
3017 | /** | 3062 | /** |
3063 | * schedule_delayed_work - put work task in global workqueue after delay | ||
3064 | * @dwork: job to be done | ||
3065 | * @delay: number of jiffies to wait or 0 for immediate execution | ||
3066 | * | ||
3067 | * After waiting for a given time this puts a job in the kernel-global | ||
3068 | * workqueue. | ||
3069 | */ | ||
3070 | bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) | ||
3071 | { | ||
3072 | return queue_delayed_work(system_wq, dwork, delay); | ||
3073 | } | ||
3074 | EXPORT_SYMBOL(schedule_delayed_work); | ||
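schedule_delayed_work() remains the usual building block for self-rearming pollers on the kernel-global workqueue. A minimal sketch, with poll_work/poll_fn as illustrative names:

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_work, poll_fn);

static void poll_fn(struct work_struct *work)
{
	/* ... sample the hardware ... */

	/* re-arm: run again in roughly one second */
	schedule_delayed_work(&poll_work, HZ);
}

static void poll_start(void)
{
	schedule_delayed_work(&poll_work, HZ);
}

Stopping such a self-rearming work is exactly the cancel_delayed_work_sync() case shown earlier in this hunk.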
3075 | |||
3076 | /** | ||
3018 | * schedule_on_each_cpu - execute a function synchronously on each online CPU | 3077 | * schedule_on_each_cpu - execute a function synchronously on each online CPU |
3019 | * @func: the function to call | 3078 | * @func: the function to call |
3020 | * | 3079 | * |
@@ -3161,9 +3220,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, | |||
3161 | int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; | 3220 | int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; |
3162 | 3221 | ||
3163 | if (max_active < 1 || max_active > lim) | 3222 | if (max_active < 1 || max_active > lim) |
3164 | printk(KERN_WARNING "workqueue: max_active %d requested for %s " | 3223 | pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", |
3165 | "is out of range, clamping between %d and %d\n", | 3224 | max_active, name, 1, lim); |
3166 | max_active, name, 1, lim); | ||
3167 | 3225 | ||
3168 | return clamp_val(max_active, 1, lim); | 3226 | return clamp_val(max_active, 1, lim); |
3169 | } | 3227 | } |
@@ -3319,6 +3377,26 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
3319 | EXPORT_SYMBOL_GPL(destroy_workqueue); | 3377 | EXPORT_SYMBOL_GPL(destroy_workqueue); |
3320 | 3378 | ||
3321 | /** | 3379 | /** |
3380 | * cwq_set_max_active - adjust max_active of a cwq | ||
3381 | * @cwq: target cpu_workqueue_struct | ||
3382 | * @max_active: new max_active value. | ||
3383 | * | ||
3384 | * Set @cwq->max_active to @max_active and activate delayed works if | ||
3385 | * increased. | ||
3386 | * | ||
3387 | * CONTEXT: | ||
3388 | * spin_lock_irq(gcwq->lock). | ||
3389 | */ | ||
3390 | static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) | ||
3391 | { | ||
3392 | cwq->max_active = max_active; | ||
3393 | |||
3394 | while (!list_empty(&cwq->delayed_works) && | ||
3395 | cwq->nr_active < cwq->max_active) | ||
3396 | cwq_activate_first_delayed(cwq); | ||
3397 | } | ||
3398 | |||
3399 | /** | ||
3322 | * workqueue_set_max_active - adjust max_active of a workqueue | 3400 | * workqueue_set_max_active - adjust max_active of a workqueue |
3323 | * @wq: target workqueue | 3401 | * @wq: target workqueue |
3324 | * @max_active: new max_active value. | 3402 | * @max_active: new max_active value. |
@@ -3345,7 +3423,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) | |||
3345 | 3423 | ||
3346 | if (!(wq->flags & WQ_FREEZABLE) || | 3424 | if (!(wq->flags & WQ_FREEZABLE) || |
3347 | !(gcwq->flags & GCWQ_FREEZING)) | 3425 | !(gcwq->flags & GCWQ_FREEZING)) |
3348 | get_cwq(gcwq->cpu, wq)->max_active = max_active; | 3426 | cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); |
3349 | 3427 | ||
3350 | spin_unlock_irq(&gcwq->lock); | 3428 | spin_unlock_irq(&gcwq->lock); |
3351 | } | 3429 | } |
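workqueue_set_max_active() is the exported path into the new cwq_set_max_active() helper; with this change, raising the limit immediately releases work items parked on the delayed list instead of waiting for the next completion. A hedged sketch with a made-up workqueue and limits:

#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *io_wq;

static int io_wq_init(void)
{
	/* start throttled: at most one in-flight work item per CPU */
	io_wq = alloc_workqueue("io_wq", 0, 1);
	if (!io_wq)
		return -ENOMEM;
	return 0;
}

static void io_wq_boost(void)
{
	/* raising the limit also kicks anything held on the delayed list */
	workqueue_set_max_active(io_wq, 4);
}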
@@ -3409,7 +3487,7 @@ unsigned int work_busy(struct work_struct *work) | |||
3409 | unsigned int ret = 0; | 3487 | unsigned int ret = 0; |
3410 | 3488 | ||
3411 | if (!gcwq) | 3489 | if (!gcwq) |
3412 | return false; | 3490 | return 0; |
3413 | 3491 | ||
3414 | spin_lock_irqsave(&gcwq->lock, flags); | 3492 | spin_lock_irqsave(&gcwq->lock, flags); |
3415 | 3493 | ||
@@ -3440,23 +3518,23 @@ EXPORT_SYMBOL_GPL(work_busy); | |||
3440 | */ | 3518 | */ |
3441 | 3519 | ||
3442 | /* claim manager positions of all pools */ | 3520 | /* claim manager positions of all pools */ |
3443 | static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) | 3521 | static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) |
3444 | { | 3522 | { |
3445 | struct worker_pool *pool; | 3523 | struct worker_pool *pool; |
3446 | 3524 | ||
3447 | for_each_worker_pool(pool, gcwq) | 3525 | for_each_worker_pool(pool, gcwq) |
3448 | mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); | 3526 | mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); |
3449 | spin_lock_irq(&gcwq->lock); | 3527 | spin_lock_irq(&gcwq->lock); |
3450 | } | 3528 | } |
3451 | 3529 | ||
3452 | /* release manager positions */ | 3530 | /* release manager positions */ |
3453 | static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) | 3531 | static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) |
3454 | { | 3532 | { |
3455 | struct worker_pool *pool; | 3533 | struct worker_pool *pool; |
3456 | 3534 | ||
3457 | spin_unlock_irq(&gcwq->lock); | 3535 | spin_unlock_irq(&gcwq->lock); |
3458 | for_each_worker_pool(pool, gcwq) | 3536 | for_each_worker_pool(pool, gcwq) |
3459 | mutex_unlock(&pool->manager_mutex); | 3537 | mutex_unlock(&pool->assoc_mutex); |
3460 | } | 3538 | } |
3461 | 3539 | ||
3462 | static void gcwq_unbind_fn(struct work_struct *work) | 3540 | static void gcwq_unbind_fn(struct work_struct *work) |
@@ -3469,7 +3547,7 @@ static void gcwq_unbind_fn(struct work_struct *work) | |||
3469 | 3547 | ||
3470 | BUG_ON(gcwq->cpu != smp_processor_id()); | 3548 | BUG_ON(gcwq->cpu != smp_processor_id()); |
3471 | 3549 | ||
3472 | gcwq_claim_management_and_lock(gcwq); | 3550 | gcwq_claim_assoc_and_lock(gcwq); |
3473 | 3551 | ||
3474 | /* | 3552 | /* |
3475 | * We've claimed all manager positions. Make all workers unbound | 3553 | * We've claimed all manager positions. Make all workers unbound |
@@ -3486,7 +3564,7 @@ static void gcwq_unbind_fn(struct work_struct *work) | |||
3486 | 3564 | ||
3487 | gcwq->flags |= GCWQ_DISASSOCIATED; | 3565 | gcwq->flags |= GCWQ_DISASSOCIATED; |
3488 | 3566 | ||
3489 | gcwq_release_management_and_unlock(gcwq); | 3567 | gcwq_release_assoc_and_unlock(gcwq); |
3490 | 3568 | ||
3491 | /* | 3569 | /* |
3492 | * Call schedule() so that we cross rq->lock and thus can guarantee | 3570 | * Call schedule() so that we cross rq->lock and thus can guarantee |
@@ -3514,7 +3592,7 @@ static void gcwq_unbind_fn(struct work_struct *work) | |||
3514 | * Workqueues should be brought up before normal priority CPU notifiers. | 3592 | * Workqueues should be brought up before normal priority CPU notifiers. |
3515 | * This will be registered high priority CPU notifier. | 3593 | * This will be registered high priority CPU notifier. |
3516 | */ | 3594 | */ |
3517 | static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, | 3595 | static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, |
3518 | unsigned long action, | 3596 | unsigned long action, |
3519 | void *hcpu) | 3597 | void *hcpu) |
3520 | { | 3598 | { |
@@ -3542,10 +3620,10 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
3542 | 3620 | ||
3543 | case CPU_DOWN_FAILED: | 3621 | case CPU_DOWN_FAILED: |
3544 | case CPU_ONLINE: | 3622 | case CPU_ONLINE: |
3545 | gcwq_claim_management_and_lock(gcwq); | 3623 | gcwq_claim_assoc_and_lock(gcwq); |
3546 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | 3624 | gcwq->flags &= ~GCWQ_DISASSOCIATED; |
3547 | rebind_workers(gcwq); | 3625 | rebind_workers(gcwq); |
3548 | gcwq_release_management_and_unlock(gcwq); | 3626 | gcwq_release_assoc_and_unlock(gcwq); |
3549 | break; | 3627 | break; |
3550 | } | 3628 | } |
3551 | return NOTIFY_OK; | 3629 | return NOTIFY_OK; |
@@ -3555,7 +3633,7 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
3555 | * Workqueues should be brought down after normal priority CPU notifiers. | 3633 | * Workqueues should be brought down after normal priority CPU notifiers. |
3556 | * This will be registered as low priority CPU notifier. | 3634 | * This will be registered as low priority CPU notifier. |
3557 | */ | 3635 | */ |
3558 | static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, | 3636 | static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, |
3559 | unsigned long action, | 3637 | unsigned long action, |
3560 | void *hcpu) | 3638 | void *hcpu) |
3561 | { | 3639 | { |
@@ -3566,7 +3644,7 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, | |||
3566 | case CPU_DOWN_PREPARE: | 3644 | case CPU_DOWN_PREPARE: |
3567 | /* unbinding should happen on the local CPU */ | 3645 | /* unbinding should happen on the local CPU */ |
3568 | INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); | 3646 | INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); |
3569 | schedule_work_on(cpu, &unbind_work); | 3647 | queue_work_on(cpu, system_highpri_wq, &unbind_work); |
3570 | flush_work(&unbind_work); | 3648 | flush_work(&unbind_work); |
3571 | break; | 3649 | break; |
3572 | } | 3650 | } |
@@ -3735,11 +3813,7 @@ void thaw_workqueues(void) | |||
3735 | continue; | 3813 | continue; |
3736 | 3814 | ||
3737 | /* restore max_active and repopulate worklist */ | 3815 | /* restore max_active and repopulate worklist */ |
3738 | cwq->max_active = wq->saved_max_active; | 3816 | cwq_set_max_active(cwq, wq->saved_max_active); |
3739 | |||
3740 | while (!list_empty(&cwq->delayed_works) && | ||
3741 | cwq->nr_active < cwq->max_active) | ||
3742 | cwq_activate_first_delayed(cwq); | ||
3743 | } | 3817 | } |
3744 | 3818 | ||
3745 | for_each_worker_pool(pool, gcwq) | 3819 | for_each_worker_pool(pool, gcwq) |
@@ -3759,8 +3833,12 @@ static int __init init_workqueues(void) | |||
3759 | unsigned int cpu; | 3833 | unsigned int cpu; |
3760 | int i; | 3834 | int i; |
3761 | 3835 | ||
3836 | /* make sure we have enough bits for OFFQ CPU number */ | ||
3837 | BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < | ||
3838 | WORK_CPU_LAST); | ||
3839 | |||
3762 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); | 3840 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); |
3763 | cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); | 3841 | hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); |
3764 | 3842 | ||
3765 | /* initialize gcwqs */ | 3843 | /* initialize gcwqs */ |
3766 | for_each_gcwq_cpu(cpu) { | 3844 | for_each_gcwq_cpu(cpu) { |
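The BUILD_BUG_ON() above turns the "enough bits left for the off-queue CPU number" assumption into a compile-time failure rather than a runtime surprise. The same idiom in isolation, with hypothetical constants:

#include <linux/bug.h>

#define MY_FLAG_BITS	4
#define MY_MAX_FLAGS	16

static void check_layout(void)
{
	/* refuses to compile if the flag field can no longer hold all flags */
	BUILD_BUG_ON((1 << MY_FLAG_BITS) < MY_MAX_FLAGS);
}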
@@ -3786,11 +3864,9 @@ static int __init init_workqueues(void) | |||
3786 | setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, | 3864 | setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, |
3787 | (unsigned long)pool); | 3865 | (unsigned long)pool); |
3788 | 3866 | ||
3789 | mutex_init(&pool->manager_mutex); | 3867 | mutex_init(&pool->assoc_mutex); |
3790 | ida_init(&pool->worker_ida); | 3868 | ida_init(&pool->worker_ida); |
3791 | } | 3869 | } |
3792 | |||
3793 | init_waitqueue_head(&gcwq->rebind_hold); | ||
3794 | } | 3870 | } |
3795 | 3871 | ||
3796 | /* create the initial worker */ | 3872 | /* create the initial worker */ |
@@ -3813,17 +3889,14 @@ static int __init init_workqueues(void) | |||
3813 | } | 3889 | } |
3814 | 3890 | ||
3815 | system_wq = alloc_workqueue("events", 0, 0); | 3891 | system_wq = alloc_workqueue("events", 0, 0); |
3892 | system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); | ||
3816 | system_long_wq = alloc_workqueue("events_long", 0, 0); | 3893 | system_long_wq = alloc_workqueue("events_long", 0, 0); |
3817 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); | ||
3818 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, | 3894 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, |
3819 | WQ_UNBOUND_MAX_ACTIVE); | 3895 | WQ_UNBOUND_MAX_ACTIVE); |
3820 | system_freezable_wq = alloc_workqueue("events_freezable", | 3896 | system_freezable_wq = alloc_workqueue("events_freezable", |
3821 | WQ_FREEZABLE, 0); | 3897 | WQ_FREEZABLE, 0); |
3822 | system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", | 3898 | BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || |
3823 | WQ_NON_REENTRANT | WQ_FREEZABLE, 0); | 3899 | !system_unbound_wq || !system_freezable_wq); |
3824 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || | ||
3825 | !system_unbound_wq || !system_freezable_wq || | ||
3826 | !system_nrt_freezable_wq); | ||
3827 | return 0; | 3900 | return 0; |
3828 | } | 3901 | } |
3829 | early_initcall(init_workqueues); | 3902 | early_initcall(init_workqueues); |
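With system_nrt_wq and system_nrt_freezable_wq gone (non-reentrancy is now the default behaviour) and system_highpri_wq added, callers choose among the remaining system workqueues. A hedged sketch of typical picks, with made-up work items:

#include <linux/workqueue.h>

static void urgent_fn(struct work_struct *work) { }
static void slow_fn(struct work_struct *work) { }
static DECLARE_WORK(urgent_work, urgent_fn);
static DECLARE_WORK(slow_work, slow_fn);

static void kick_examples(void)
{
	/* latency-sensitive: runs from the new WQ_HIGHPRI pool */
	queue_work(system_highpri_wq, &urgent_work);

	/* may run for a long time: keep it off system_wq */
	queue_work(system_long_wq, &slow_work);
}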